Update clang-tools to ab/10774124 am: 820ba880a1

Original change: https://android-review.googlesource.com/c/platform/prebuilts/clang-tools/+/2768905

Change-Id: Ic1906d8b3da08869045da3a8834966065cfc9a56
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
diff --git a/darwin-x86/bin/versioner b/darwin-x86/bin/versioner
index dde2e25..efe1d39 100755
--- a/darwin-x86/bin/versioner
+++ b/darwin-x86/bin/versioner
Binary files differ
diff --git a/darwin-x86/lib64/clang/17/include/__clang_hip_math.h b/darwin-x86/lib64/clang/17/include/__clang_hip_math.h
index 537dd0f..a4e557e 100644
--- a/darwin-x86/lib64/clang/17/include/__clang_hip_math.h
+++ b/darwin-x86/lib64/clang/17/include/__clang_hip_math.h
@@ -243,7 +243,7 @@
 
 __DEVICE__
 float fmaf(float __x, float __y, float __z) {
-  return __ocml_fma_f32(__x, __y, __z);
+  return __builtin_fmaf(__x, __y, __z);
 }
 
 __DEVICE__
@@ -621,7 +621,7 @@
 #else
 __DEVICE__
 float __fmaf_rn(float __x, float __y, float __z) {
-  return __ocml_fma_f32(__x, __y, __z);
+  return __builtin_fmaf(__x, __y, __z);
 }
 #endif
 
@@ -799,7 +799,7 @@
 
 __DEVICE__
 double fma(double __x, double __y, double __z) {
-  return __ocml_fma_f64(__x, __y, __z);
+  return __builtin_fma(__x, __y, __z);
 }
 
 __DEVICE__
@@ -1258,7 +1258,7 @@
 #else
 __DEVICE__
 double __fma_rn(double __x, double __y, double __z) {
-  return __ocml_fma_f64(__x, __y, __z);
+  return __builtin_fma(__x, __y, __z);
 }
 #endif
 // END INTRINSICS
diff --git a/darwin-x86/lib64/clang/17/include/__clang_hip_runtime_wrapper.h b/darwin-x86/lib64/clang/17/include/__clang_hip_runtime_wrapper.h
index 0508731..e881707 100644
--- a/darwin-x86/lib64/clang/17/include/__clang_hip_runtime_wrapper.h
+++ b/darwin-x86/lib64/clang/17/include/__clang_hip_runtime_wrapper.h
@@ -80,12 +80,25 @@
 #if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
 extern "C" __device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
 extern "C" __device__ void __ockl_dm_dealloc(unsigned long long __addr);
+#if __has_feature(address_sanitizer)
+extern "C" __device__ unsigned long long __asan_malloc_impl(unsigned long long __size, unsigned long long __pc);
+extern "C" __device__ void __asan_free_impl(unsigned long long __addr, unsigned long long __pc);
+__attribute__((noinline, weak)) __device__ void *malloc(__hip_size_t __size) {
+  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
+  return (void *)__asan_malloc_impl(__size, __pc);
+}
+__attribute__((noinline, weak)) __device__ void free(void *__ptr) {
+  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
+  __asan_free_impl((unsigned long long)__ptr, __pc);
+}
+#else
 __attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
   return (void *) __ockl_dm_alloc(__size);
 }
 __attribute__((weak)) inline __device__ void free(void *__ptr) {
   __ockl_dm_dealloc((unsigned long long)__ptr);
 }
+#endif // __has_feature(address_sanitizer)
 #else  // HIP version check
 #if __HIP_ENABLE_DEVICE_MALLOC__
 __device__ void *__hip_malloc(__hip_size_t __size);
diff --git a/darwin-x86/lib64/clang/17/include/adxintrin.h b/darwin-x86/lib64/clang/17/include/adxintrin.h
index 72b9ed0..4382530 100644
--- a/darwin-x86/lib64/clang/17/include/adxintrin.h
+++ b/darwin-x86/lib64/clang/17/include/adxintrin.h
@@ -17,56 +17,69 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
 
+/* Use C++ inline semantics in C++, GNU inline for C mode. */
+#if defined(__cplusplus)
+#define __INLINE __inline
+#else
+#define __INLINE static __inline
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
 /* Intrinsics that are available only if __ADX__ defined */
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
-_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
-               unsigned int *__p)
-{
+__INLINE unsigned char
+    __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+    _addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+                   unsigned int *__p) {
   return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
 }
 
 #ifdef __x86_64__
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
-_addcarryx_u64(unsigned char __cf, unsigned long long __x,
-               unsigned long long __y, unsigned long long  *__p)
-{
+__INLINE unsigned char
+    __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+    _addcarryx_u64(unsigned char __cf, unsigned long long __x,
+                   unsigned long long __y, unsigned long long *__p) {
   return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
 }
 #endif
 
 /* Intrinsics that are also available if __ADX__ undefined */
-static __inline unsigned char __DEFAULT_FN_ATTRS
-_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
-              unsigned int *__p)
-{
+__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
+                                                        unsigned int __x,
+                                                        unsigned int __y,
+                                                        unsigned int *__p) {
   return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
 }
 
 #ifdef __x86_64__
-static __inline unsigned char __DEFAULT_FN_ATTRS
+__INLINE unsigned char __DEFAULT_FN_ATTRS
 _addcarry_u64(unsigned char __cf, unsigned long long __x,
-              unsigned long long __y, unsigned long long  *__p)
-{
+              unsigned long long __y, unsigned long long *__p) {
   return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
 }
 #endif
 
-static __inline unsigned char __DEFAULT_FN_ATTRS
-_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
-              unsigned int *__p)
-{
+__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
+                                                         unsigned int __x,
+                                                         unsigned int __y,
+                                                         unsigned int *__p) {
   return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
 }
 
 #ifdef __x86_64__
-static __inline unsigned char __DEFAULT_FN_ATTRS
+__INLINE unsigned char __DEFAULT_FN_ATTRS
 _subborrow_u64(unsigned char __cf, unsigned long long __x,
-               unsigned long long __y, unsigned long long  *__p)
-{
+               unsigned long long __y, unsigned long long *__p) {
   return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
 }
 #endif
 
+#if defined(__cplusplus)
+}
+#endif
+
 #undef __DEFAULT_FN_ATTRS
 
 #endif /* __ADXINTRIN_H */
diff --git a/darwin-x86/lib64/clang/17/include/amxcomplexintrin.h b/darwin-x86/lib64/clang/17/include/amxcomplexintrin.h
new file mode 100644
index 0000000..84ef972
--- /dev/null
+++ b/darwin-x86/lib64/clang/17/include/amxcomplexintrin.h
@@ -0,0 +1,169 @@
+/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXINTRIN_H
+#define __AMX_COMPLEXINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_COMPLEX                                             \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+///    of (row of \a a, column of \a b), it performs a set of multiplication
+///    and accumulations on all corresponding complex numbers (one from \a a
+///    and one from \a b). The imaginary part of the \a a element is multiplied
+///    with the real part of the corresponding \a b element, and the real part
+///    of the \a a element is multiplied with the imaginary part of the
+///    corresponding \a b elements. The two accumulated results are added, and
+///    then accumulated into the corresponding row and column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO (a.colsb / 4) - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+///    of (row of \a a, column of \a b), it performs a set of multiplication
+///    and accumulations on all corresponding complex numbers (one from \a a
+///    and one from \a b). The real part of the \a a element is multiplied
+///    with the real part of the corresponding \a b element, and the negated
+///    imaginary part of the \a a element is multiplied with the imaginary
+///    part of the corresponding \a b elements. The two accumulated results
+///    are added, and then accumulated into the corresponding row and column
+///    of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO (a.colsb / 4) - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
+///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b)
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
+_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
+_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number with
+/// FP16 real part and FP16 imaginary part.
+/// This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_COMPLEX
+static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+                               __tile1024i src1) {
+  dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
+                                         dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number with
+/// FP16 real part and FP16 imaginary part.
+/// This function calculates the real part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_COMPLEX
+static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
+                               __tile1024i src1) {
+  dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
+                                         dst->tile, src0.tile, src1.tile);
+}
+
+#endif // __x86_64__
+#endif // __AMX_COMPLEXINTRIN_H
diff --git a/darwin-x86/lib64/clang/17/include/arm_neon.h b/darwin-x86/lib64/clang/17/include/arm_neon.h
index 23d26a0..90ebf7d 100644
--- a/darwin-x86/lib64/clang/17/include/arm_neon.h
+++ b/darwin-x86/lib64/clang/17/include/arm_neon.h
@@ -35,7 +35,6 @@
 #include <stdint.h>
 
 #include <arm_bf16.h>
-typedef __bf16 bfloat16_t;
 typedef float float32_t;
 typedef __fp16 float16_t;
 #ifdef __aarch64__
diff --git a/darwin-x86/lib64/clang/17/include/arm_sve.h b/darwin-x86/lib64/clang/17/include/arm_sve.h
index 64362b8..f7bbc7f 100644
--- a/darwin-x86/lib64/clang/17/include/arm_sve.h
+++ b/darwin-x86/lib64/clang/17/include/arm_sve.h
@@ -37,7 +37,6 @@
 
 typedef __SVBFloat16_t svbfloat16_t;
 #include <arm_bf16.h>
-typedef __bf16 bfloat16_t;
 typedef __SVFloat32_t svfloat32_t;
 typedef __SVFloat64_t svfloat64_t;
 typedef __clang_svint8x2_t svint8x2_t;
@@ -74,6 +73,8 @@
 typedef __clang_svfloat32x4_t svfloat32x4_t;
 typedef __clang_svfloat64x4_t svfloat64x4_t;
 typedef __SVBool_t  svbool_t;
+typedef __clang_svboolx2_t  svboolx2_t;
+typedef __clang_svboolx4_t  svboolx4_t;
 
 typedef __clang_svbfloat16x2_t svbfloat16x2_t;
 typedef __clang_svbfloat16x3_t svbfloat16x3_t;
@@ -2914,6 +2915,10 @@
 svint64_t svdup_lane_s64(svint64_t, uint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s16)))
 svint16_t svdup_lane_s16(svint16_t, uint16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u8)))
+svuint8_t svdupq_n_u8(uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s8)))
+svint8_t svdupq_n_s8(int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u16)))
 svuint16_t svdupq_n_u16(uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_f16)))
@@ -2932,18 +2937,14 @@
 svfloat64_t svdupq_n_f64(float64_t, float64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s64)))
 svint64_t svdupq_n_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u8)))
-svuint8_t svdupq_n_u8(uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s8)))
-svint8_t svdupq_n_s8(int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b8)))
+svbool_t svdupq_n_b8(bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b16)))
 svbool_t svdupq_n_b16(bool, bool, bool, bool, bool, bool, bool, bool);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b32)))
 svbool_t svdupq_n_b32(bool, bool, bool, bool);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b64)))
 svbool_t svdupq_n_b64(bool, bool);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b8)))
-svbool_t svdupq_n_b8(bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u8)))
 svuint8_t svdupq_lane_u8(svuint8_t, uint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u32)))
@@ -6958,14 +6959,14 @@
 svint64_t svrev_s64(svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_s16)))
 svint16_t svrev_s16(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b8)))
-svbool_t svrev_b8(svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b16)))
+svbool_t svrev_b16(svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b32)))
 svbool_t svrev_b32(svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b64)))
 svbool_t svrev_b64(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b16)))
-svbool_t svrev_b16(svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b8)))
+svbool_t svrev_b8(svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u32_m)))
 svuint32_t svrevb_u32_m(svuint32_t, svbool_t, svuint32_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u64_m)))
@@ -8126,14 +8127,14 @@
 svint64_t svtrn1_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_s16)))
 svint16_t svtrn1_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b8)))
-svbool_t svtrn1_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b16)))
+svbool_t svtrn1_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b32)))
 svbool_t svtrn1_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b64)))
 svbool_t svtrn1_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b16)))
-svbool_t svtrn1_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b8)))
+svbool_t svtrn1_b8(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u8)))
 svuint8_t svtrn2_u8(svuint8_t, svuint8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u32)))
@@ -8156,14 +8157,14 @@
 svint64_t svtrn2_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_s16)))
 svint16_t svtrn2_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b8)))
-svbool_t svtrn2_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b16)))
+svbool_t svtrn2_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b32)))
 svbool_t svtrn2_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b64)))
 svbool_t svtrn2_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b16)))
-svbool_t svtrn2_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b8)))
+svbool_t svtrn2_b8(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtsmul_f64)))
 svfloat64_t svtsmul_f64(svfloat64_t, svuint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtsmul_f32)))
@@ -8314,14 +8315,14 @@
 svint64_t svuzp1_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_s16)))
 svint16_t svuzp1_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b8)))
-svbool_t svuzp1_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b16)))
+svbool_t svuzp1_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b32)))
 svbool_t svuzp1_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b64)))
 svbool_t svuzp1_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b16)))
-svbool_t svuzp1_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b8)))
+svbool_t svuzp1_b8(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u8)))
 svuint8_t svuzp2_u8(svuint8_t, svuint8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u32)))
@@ -8344,14 +8345,14 @@
 svint64_t svuzp2_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_s16)))
 svint16_t svuzp2_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b8)))
-svbool_t svuzp2_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b16)))
+svbool_t svuzp2_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b32)))
 svbool_t svuzp2_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b64)))
 svbool_t svuzp2_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b16)))
-svbool_t svuzp2_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b8)))
+svbool_t svuzp2_b8(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_s32)))
 svbool_t svwhilele_b8_s32(int32_t, int32_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_s32)))
@@ -8440,14 +8441,14 @@
 svint64_t svzip1_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_s16)))
 svint16_t svzip1_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b8)))
-svbool_t svzip1_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b16)))
+svbool_t svzip1_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b32)))
 svbool_t svzip1_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b64)))
 svbool_t svzip1_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b16)))
-svbool_t svzip1_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b8)))
+svbool_t svzip1_b8(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u8)))
 svuint8_t svzip2_u8(svuint8_t, svuint8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u32)))
@@ -8470,14 +8471,14 @@
 svint64_t svzip2_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s16)))
 svint16_t svzip2_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b8)))
-svbool_t svzip2_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b16)))
+svbool_t svzip2_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b32)))
 svbool_t svzip2_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b64)))
 svbool_t svzip2_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b16)))
-svbool_t svzip2_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b8)))
+svbool_t svzip2_b8(svbool_t, svbool_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f64_m)))
 svfloat64_t svabd_m(svbool_t, svfloat64_t, float64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f32_m)))
@@ -10528,6 +10529,10 @@
 svint64_t svdup_lane(svint64_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s16)))
 svint16_t svdup_lane(svint16_t, uint16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u8)))
+svuint8_t svdupq_u8(uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s8)))
+svint8_t svdupq_s8(int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u16)))
 svuint16_t svdupq_u16(uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_f16)))
@@ -10546,18 +10551,14 @@
 svfloat64_t svdupq_f64(float64_t, float64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s64)))
 svint64_t svdupq_s64(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u8)))
-svuint8_t svdupq_u8(uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s8)))
-svint8_t svdupq_s8(int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b8)))
+svbool_t svdupq_b8(bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b16)))
 svbool_t svdupq_b16(bool, bool, bool, bool, bool, bool, bool, bool);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b32)))
 svbool_t svdupq_b32(bool, bool, bool, bool);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b64)))
 svbool_t svdupq_b64(bool, bool);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b8)))
-svbool_t svdupq_b8(bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u8)))
 svuint8_t svdupq_lane(svuint8_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u32)))
diff --git a/darwin-x86/lib64/clang/17/include/avx2intrin.h b/darwin-x86/lib64/clang/17/include/avx2intrin.h
index f8521e7..33f24f2 100644
--- a/darwin-x86/lib64/clang/17/include/avx2intrin.h
+++ b/darwin-x86/lib64/clang/17/include/avx2intrin.h
@@ -935,102 +935,810 @@
   return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
 }
 
+/// Conditionally gathers two 64-bit floating-point values, either from the
+///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
+///    of [2 x double] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*32
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
+///                               __m128d mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPD instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
+///    the first two elements are used.
+/// \param mask
+///    A 128-bit vector of [2 x double] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x double] containing the gathered values.
 #define _mm_mask_i32gather_pd(a, m, i, mask, s) \
   ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
                                       (double const *)(m), \
                                       (__v4si)(__m128i)(i), \
                                       (__v2df)(__m128d)(mask), (s)))
 
+/// Conditionally gathers four 64-bit floating-point values, either from the
+///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
+///    of [4 x double] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*32
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
+///                                  __m256d mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPD instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [4 x double] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x double] containing the gathered values.
 #define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                          (double const *)(m), \
                                          (__v4si)(__m128i)(i), \
                                          (__v4df)(__m256d)(mask), (s)))
 
+/// Conditionally gathers two 64-bit floating-point values, either from the
+///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
+///    of [2 x double] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*64
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
+///                               __m128d mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPD instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [2 x double] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x double] containing the gathered values.
 #define _mm_mask_i64gather_pd(a, m, i, mask, s) \
   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                       (double const *)(m), \
                                       (__v2di)(__m128i)(i), \
                                       (__v2df)(__m128d)(mask), (s)))
 
+/// Conditionally gathers four 64-bit floating-point values, either from the
+///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
+///    of [4 x double] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*64
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
+///                                  __m256d mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPD instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [4 x double] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x double] containing the gathered values.
 #define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                          (double const *)(m), \
                                          (__v4di)(__m256i)(i), \
                                          (__v4df)(__m256d)(mask), (s)))
 
+/// Conditionally gathers four 32-bit floating-point values, either from the
+///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
+///    of [4 x float] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*32
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
+///                              __m128 mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x float] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm_mask_i32gather_ps(a, m, i, mask, s) \
   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                      (float const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v4sf)(__m128)(mask), (s)))
 
+/// Conditionally gathers eight 32-bit floating-point values, either from the
+///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
+///    of [8 x float] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 7
+///   j := element*32
+///   k := element*32
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
+///                                 __m256 mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPS instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x float] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [8 x float] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [8 x float] containing the gathered values.
 #define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                         (float const *)(m), \
                                         (__v8si)(__m256i)(i), \
                                         (__v8sf)(__m256)(mask), (s)))
 
+/// Conditionally gathers two 32-bit floating-point values, either from the
+///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
+///    of [4 x float] in \a mask determines the source for the lower two
+///    elements. The upper two elements of the result are zeroed.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*32
+///   k := element*64
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// result[127:64] := 0
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
+///                              __m128 mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float] used as the source when a mask bit is
+///    zero. Only the first two elements are used.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x float] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory. Only the first
+///    two elements are used.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm_mask_i64gather_ps(a, m, i, mask, s) \
   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
                                      (float const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v4sf)(__m128)(mask), (s)))
 
+/// Conditionally gathers four 32-bit floating-point values, either from the
+///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
+///    of [4 x float] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*64
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
+///                                 __m128 mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float] used as the source when a mask bit is
+///   zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x float] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
                                         (float const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4sf)(__m128)(mask), (s)))
 
+/// Conditionally gathers four 32-bit integer values, either from the
+///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
+///    of [4 x i32] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*32
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
+///                                  __m128i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDD instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x i32] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
   ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
                                      (int const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v4si)(__m128i)(mask), (s)))
 
+/// Conditionally gathers eight 32-bit integer values, either from the
+///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
+///    of [8 x i32] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 7
+///   j := element*32
+///   k := element*32
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
+///                                     __m256i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDD instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [8 x i32] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
 #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
                                         (int const *)(m), \
                                         (__v8si)(__m256i)(i), \
                                         (__v8si)(__m256i)(mask), (s)))
 
+/// Conditionally gathers two 32-bit integer values, either from the
+///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
+///    of [4 x i32] in \a mask determines the source for the lower two
+///    elements. The upper two elements of the result are zeroed.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*32
+///   k := element*64
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// result[127:64] := 0
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
+///                                  __m128i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQD instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
+///   zero. Only the first two elements are used.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x i32] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory. Only the first two elements
+///    are used.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
   ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                      (int const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v4si)(__m128i)(mask), (s)))
 
+/// Conditionally gathers four 32-bit integer values, either from the
+///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
+///    of [4 x i32] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*64
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
+///                                     __m128i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQD instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x i32] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
                                         (int const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4si)(__m128i)(mask), (s)))
 
+/// Conditionally gathers two 64-bit integer values, either from the
+///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
+///    of [2 x i64] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*32
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
+///                                  __m128i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
+///    the first two elements are used.
+/// \param mask
+///    A 128-bit vector of [2 x i64] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
 #define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
   ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
                                      (long long const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2di)(__m128i)(mask), (s)))
 
+/// Conditionally gathers four 64-bit integer values, either from the
+///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
+///    of [4 x i64] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*32
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
+///                                     __m128i i, __m256i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [4 x i64] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
 #define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
                                         (long long const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4di)(__m256i)(mask), (s)))
 
+/// Conditionally gathers two 64-bit integer values, either from the
+///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
+///    of [2 x i64] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*64
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
+///                                  __m128i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [2 x i64] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
 #define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
   ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
                                      (long long const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2di)(__m128i)(mask), (s)))
 
+/// Conditionally gathers four 64-bit integer values, either from the
+///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
+///    of [4 x i64] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*64
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
+///                                     __m256i i, __m256i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [4 x i64] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
                                         (long long const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4di)(__m256i)(mask), (s)))
 
+/// Gathers two 64-bit floating-point values from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*32
+///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
+///    the first two elements are used.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x double] containing the gathered values.
 #define _mm_i32gather_pd(m, i, s) \
   ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
                                       (double const *)(m), \
@@ -1039,6 +1747,33 @@
                                                            _mm_setzero_pd()), \
                                       (s)))
 
+/// Gathers four 64-bit floating-point values from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*32
+///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x double] containing the gathered values.
 #define _mm256_i32gather_pd(m, i, s) \
   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
                                          (double const *)(m), \
@@ -1048,6 +1783,33 @@
                                                                _CMP_EQ_OQ), \
                                          (s)))
 
+/// Gathers two 64-bit floating-point values from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*64
+///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x double] containing the gathered values.
 #define _mm_i64gather_pd(m, i, s) \
   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
                                       (double const *)(m), \
@@ -1056,6 +1818,33 @@
                                                            _mm_setzero_pd()), \
                                       (s)))
 
+/// Gathers four 64-bit floating-point values from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*64
+///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x double] containing the gathered values.
 #define _mm256_i64gather_pd(m, i, s) \
   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
                                          (double const *)(m), \
@@ -1065,6 +1854,33 @@
                                                                _CMP_EQ_OQ), \
                                          (s)))
 
+/// Gathers four 32-bit floating-point values from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*32
+///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPS instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm_i32gather_ps(m, i, s) \
   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
                                      (float const *)(m), \
@@ -1073,6 +1889,33 @@
                                                           _mm_setzero_ps()), \
                                      (s)))
 
+/// Gathers eight 32-bit floating-point values from memory \a m using scaled
+///    indexes from the 256-bit vector of [8 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 7
+///   j := element*32
+///   k := element*32
+///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPS instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [8 x float] containing the gathered values.
 #define _mm256_i32gather_ps(m, i, s) \
   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
                                         (float const *)(m), \
@@ -1082,6 +1925,35 @@
                                                               _CMP_EQ_OQ), \
                                         (s)))
 
+/// Gathers two 32-bit floating-point values from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
+///    elements of the result are zeroed.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*32
+///   k := element*64
+///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// result[127:64] := 0
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPS instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm_i64gather_ps(m, i, s) \
   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
                                      (float const *)(m), \
@@ -1090,6 +1962,33 @@
                                                           _mm_setzero_ps()), \
                                      (s)))
 
+/// Gathers four 32-bit floating-point values from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*64
+///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPS instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm256_i64gather_ps(m, i, s) \
   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
                                         (float const *)(m), \
@@ -1098,44 +1997,263 @@
                                                              _mm_setzero_ps()), \
                                         (s)))
 
+/// Gathers four 32-bit integer values from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*32
+///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm_i32gather_epi32(m, i, s) \
   ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
                                      (int const *)(m), (__v4si)(__m128i)(i), \
                                      (__v4si)_mm_set1_epi32(-1), (s)))
 
+/// Gathers eight 32-bit integer values from memory \a m using scaled
+///    indexes from the 256-bit vector of [8 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 7
+///   j := element*32
+///   k := element*32
+///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
 #define _mm256_i32gather_epi32(m, i, s) \
   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
                                         (int const *)(m), (__v8si)(__m256i)(i), \
                                         (__v8si)_mm256_set1_epi32(-1), (s)))
 
+/// Gathers two 32-bit integer values from memory \a m using scaled indexes
+///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
+///    of the result are zeroed.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*32
+///   k := element*64
+///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// result[127:64] := 0
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm_i64gather_epi32(m, i, s) \
   ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
                                      (int const *)(m), (__v2di)(__m128i)(i), \
                                      (__v4si)_mm_set1_epi32(-1), (s)))
 
+/// Gathers four 32-bit integer values from memory \a m using scaled indexes
+///    from the 256-bit vector of [4 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*64
+///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm256_i64gather_epi32(m, i, s) \
   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
                                         (int const *)(m), (__v4di)(__m256i)(i), \
                                         (__v4si)_mm_set1_epi32(-1), (s)))
 
+/// Gathers two 64-bit integer values from memory \a m using scaled indexes
+///    from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*32
+///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
+///    the first two elements are used.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
 #define _mm_i32gather_epi64(m, i, s) \
   ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
                                      (long long const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2di)_mm_set1_epi64x(-1), (s)))
 
+/// Gathers four 64-bit integer values from memory \a m using scaled indexes
+///    from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*32
+///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
 #define _mm256_i32gather_epi64(m, i, s) \
   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
                                         (long long const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4di)_mm256_set1_epi64x(-1), (s)))
 
+/// Gathers two 64-bit integer values from memory \a m using scaled indexes
+///    from the 128-bit vector of [2 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*64
+///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
 #define _mm_i64gather_epi64(m, i, s) \
   ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
                                      (long long const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2di)_mm_set1_epi64x(-1), (s)))
 
+/// Gathers four 64-bit integer values from memory \a m using scaled indexes
+///    from the 256-bit vector of [4 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*64
+///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
 #define _mm256_i64gather_epi64(m, i, s) \
   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
                                         (long long const *)(m), \
diff --git a/darwin-x86/lib64/clang/17/include/avx512fintrin.h b/darwin-x86/lib64/clang/17/include/avx512fintrin.h
index b19d2fb..88a8ceb 100644
--- a/darwin-x86/lib64/clang/17/include/avx512fintrin.h
+++ b/darwin-x86/lib64/clang/17/include/avx512fintrin.h
@@ -397,14 +397,15 @@
 static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_castpd256_pd512(__m256d __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
+                                 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_castps256_ps512(__m256 __a)
 {
-  return __builtin_shufflevector(__a, __a, 0,  1,  2,  3,  4,  5,  6,  7,
-                                          -1, -1, -1, -1, -1, -1, -1, -1);
+  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
+                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 }
 
 static __inline __m128d __DEFAULT_FN_ATTRS512
@@ -446,7 +447,10 @@
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_castpd128_pd512 (__m128d __A)
 {
-  return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+  __m256d __B = __builtin_nondeterministic_value(__B);
+  return __builtin_shufflevector(
+      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
+      __B, 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS512
@@ -464,19 +468,25 @@
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_castps128_ps512 (__m128 __A)
 {
-    return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __m256 __B = __builtin_nondeterministic_value(__B);
+  return __builtin_shufflevector(
+      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7),
+      __B, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_castsi128_si512 (__m128i __A)
 {
-   return  __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+  __m256i __B = __builtin_nondeterministic_value(__B);
+  return __builtin_shufflevector(
+      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
+      __B, 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_castsi256_si512 (__m256i __A)
 {
-   return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
+   return  __builtin_shufflevector( __A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS512
diff --git a/darwin-x86/lib64/clang/17/include/avx512fp16intrin.h b/darwin-x86/lib64/clang/17/include/avx512fp16intrin.h
index 5cdc37f..d326586 100644
--- a/darwin-x86/lib64/clang/17/include/avx512fp16intrin.h
+++ b/darwin-x86/lib64/clang/17/include/avx512fp16intrin.h
@@ -192,22 +192,26 @@
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_castph128_ph256(__m128h __a) {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
-                                 -1, -1, -1, -1, -1);
+  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
+                                  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 }
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
 _mm512_castph128_ph512(__m128h __a) {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
-                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                                 -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __m256h __b = __builtin_nondeterministic_value(__b);
+  return __builtin_shufflevector(
+      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
+                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
+      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
 }
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
 _mm512_castph256_ph512(__m256h __a) {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
-                                 -1, -1, -1, -1, -1, -1, -1, -1);
+  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
+                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+                                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
+                                 27, 28, 29, 30, 31);
 }
 
 /// Constructs a 256-bit floating-point vector of [16 x half] from a
diff --git a/darwin-x86/lib64/clang/17/include/avxintrin.h b/darwin-x86/lib64/clang/17/include/avxintrin.h
index ee31569..bd11922 100644
--- a/darwin-x86/lib64/clang/17/include/avxintrin.h
+++ b/darwin-x86/lib64/clang/17/include/avxintrin.h
@@ -4499,7 +4499,8 @@
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castpd128_pd256(__m128d __a)
 {
-  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
+  return __builtin_shufflevector(
+      (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
 }
 
 /// Constructs a 256-bit floating-point vector of [8 x float] from a
@@ -4520,7 +4521,9 @@
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castps128_ps256(__m128 __a)
 {
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
+  return __builtin_shufflevector((__v4sf)__a,
+                                 (__v4sf)__builtin_nondeterministic_value(__a),
+                                 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 /// Constructs a 256-bit integer vector from a 128-bit integer vector.
@@ -4539,7 +4542,8 @@
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castsi128_si256(__m128i __a)
 {
-  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
+  return __builtin_shufflevector(
+      (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
 }
 
 /// Constructs a 256-bit floating-point vector of [4 x double] from a
diff --git a/darwin-x86/lib64/clang/17/include/bits/stdatomic.h b/darwin-x86/lib64/clang/17/include/bits/stdatomic.h
index 2ce6ee6..fe3d68d 100644
--- a/darwin-x86/lib64/clang/17/include/bits/stdatomic.h
+++ b/darwin-x86/lib64/clang/17/include/bits/stdatomic.h
@@ -269,18 +269,18 @@
 
 #define	ATOMIC_FLAG_INIT		{ ATOMIC_VAR_INIT(false) }
 
-static __inline bool atomic_flag_test_and_set_explicit(volatile atomic_flag *__object, memory_order __order) {
+static __inline bool atomic_flag_test_and_set_explicit(volatile atomic_flag * _Nonnull __object, memory_order __order) {
 	return (atomic_exchange_explicit(&__object->__flag, 1, __order));
 }
 
-static __inline void atomic_flag_clear_explicit(volatile atomic_flag *__object, memory_order __order) {
+static __inline void atomic_flag_clear_explicit(volatile atomic_flag * _Nonnull __object, memory_order __order) {
 	atomic_store_explicit(&__object->__flag, 0, __order);
 }
 
-static __inline bool atomic_flag_test_and_set(volatile atomic_flag *__object) {
+static __inline bool atomic_flag_test_and_set(volatile atomic_flag * _Nonnull __object) {
 	return (atomic_flag_test_and_set_explicit(__object, memory_order_seq_cst));
 }
 
-static __inline void atomic_flag_clear(volatile atomic_flag *__object) {
+static __inline void atomic_flag_clear(volatile atomic_flag * _Nonnull __object) {
 	atomic_flag_clear_explicit(__object, memory_order_seq_cst);
 }
diff --git a/darwin-x86/lib64/clang/17/include/cuda_wrappers/shared_ptr_base.h b/darwin-x86/lib64/clang/17/include/cuda_wrappers/shared_ptr_base.h
new file mode 100644
index 0000000..10028dd
--- /dev/null
+++ b/darwin-x86/lib64/clang/17/include/cuda_wrappers/shared_ptr_base.h
@@ -0,0 +1,9 @@
+// CUDA headers define __noinline__ which interferes with libstdc++'s use of
+// `__attribute((__noinline__))`. In order to avoid compilation error,
+// temporarily unset __noinline__ when we include affected libstdc++ header.
+
+#pragma push_macro("__noinline__")
+#undef __noinline__
+#include_next "bits/shared_ptr_base.h"
+
+#pragma pop_macro("__noinline__")
diff --git a/darwin-x86/lib64/clang/17/include/fmaintrin.h b/darwin-x86/lib64/clang/17/include/fmaintrin.h
index d889b7c..ea832fa 100644
--- a/darwin-x86/lib64/clang/17/include/fmaintrin.h
+++ b/darwin-x86/lib64/clang/17/include/fmaintrin.h
@@ -18,192 +18,756 @@
 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
 
+/// Computes a multiply-add of 128-bit vectors of [4 x float].
+///    For each element, computes <c> (__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend.
+/// \returns A 128-bit vector of [4 x float] containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
+/// Computes a multiply-add of 128-bit vectors of [2 x double].
+///    For each element, computes <c> (__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend.
+/// \returns A 128-bit [2 x double] vector containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
+/// Computes a scalar multiply-add of the single-precision values in the
+///    low 32 bits of 128-bit vectors of [4 x float].
+/// \code
+/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
+/// result[127:32] = __A[127:32]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213SS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand in the low
+///    32 bits.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier in the low
+///    32 bits.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend in the low
+///    32 bits.
+/// \returns A 128-bit vector of [4 x float] containing the result in the low
+///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
+/// Computes a scalar multiply-add of the double-precision values in the
+///    low 64 bits of 128-bit vectors of [2 x double].
+/// \code
+/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
+/// result[127:64] = __A[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213SD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand in the low
+///    64 bits.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier in the low
+///    64 bits.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend in the low
+///    64 bits.
+/// \returns A 128-bit vector of [2 x double] containing the result in the low
+///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
+/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
+///    For each element, computes <c> (__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
+/// Computes a multiply-subtract of 128-bit vectors of [2 x double].
+///    For each element, computes <c> (__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
+/// Computes a scalar multiply-subtract of the single-precision values in
+///    the low 32 bits of 128-bit vectors of [4 x float].
+/// \code
+/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
+/// result[127:32] = __A[127:32]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213SS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand in the low
+///    32 bits.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier in the low
+///    32 bits.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the subtrahend in the low
+///    32 bits.
+/// \returns A 128-bit vector of [4 x float] containing the result in the low
+///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
+/// Computes a scalar multiply-subtract of the double-precision values in
+///    the low 64 bits of 128-bit vectors of [2 x double].
+/// \code
+/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
+/// result[127:64] = __A[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213SD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand in the low
+///    64 bits.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier in the low
+///    64 bits.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the subtrahend in the low
+///    64 bits.
+/// \returns A 128-bit vector of [2 x double] containing the result in the low
+///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
+/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
+///    For each element, computes <c> -(__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend.
+/// \returns A 128-bit [4 x float] vector containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
+/// Computes a negated multiply-add of 128-bit vectors of [2 x double].
+///    For each element, computes <c> -(__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend.
+/// \returns A 128-bit vector of [2 x double] containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
+/// Computes a scalar negated multiply-add of the single-precision values in
+///    the low 32 bits of 128-bit vectors of [4 x float].
+/// \code
+/// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
+/// result[127:32] = __A[127:32]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213SS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand in the low
+///    32 bits.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier in the low
+///    32 bits.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend in the low
+///    32 bits.
+/// \returns A 128-bit vector of [4 x float] containing the result in the low
+///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
 }
 
+/// Computes a scalar negated multiply-add of the double-precision values
+///    in the low 64 bits of 128-bit vectors of [2 x double].
+/// \code
+/// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
+/// result[127:64] = __A[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213SD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand in the low
+///    64 bits.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier in the low
+///    64 bits.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend in the low
+///    64 bits.
+/// \returns A 128-bit vector of [2 x double] containing the result in the low
+///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
 }
 
+/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
+///    For each element, computes <c> -(__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
+/// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
+///    For each element, computes <c> -(__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
+/// Computes a scalar negated multiply-subtract of the single-precision
+///    values in the low 32 bits of 128-bit vectors of [4 x float].
+/// \code
+/// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
+/// result[127:32] = __A[127:32]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand in the low
+///    32 bits.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier in the low
+///    32 bits.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the subtrahend in the low
+///    32 bits.
+/// \returns A 128-bit vector of [4 x float] containing the result in the low
+///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
 }
 
+/// Computes a scalar negated multiply-subtract of the double-precision
+///    values in the low 64 bits of 128-bit vectors of [2 x double].
+/// \code
+/// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
+/// result[127:64] = __A[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand in the low
+///    64 bits.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier in the low
+///    64 bits.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the subtrahend in the low
+///    64 bits.
+/// \returns A 128-bit vector of [2 x double] containing the result in the low
+///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 128-bit vectors of
+///    [4 x float].
+/// \code
+/// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
+/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
+/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
+/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 128-bit vectors of
+///    [2 x double].
+/// \code
+/// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
+/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 128-bit vectors of
+///    [4 x float].
+/// \code
+/// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
+/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
+/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
+/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 128-bit vectors of
+///    [2 x double].
+/// \code
+/// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
+/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
+/// Computes a multiply-add of 256-bit vectors of [8 x float].
+///    For each element, computes <c> (__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the addend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
+/// Computes a multiply-add of 256-bit vectors of [4 x double].
+///    For each element, computes <c> (__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the addend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
+/// Computes a multiply-subtract of 256-bit vectors of [8 x float].
+///    For each element, computes <c> (__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
+/// Computes a multiply-subtract of 256-bit vectors of [4 x double].
+///    For each element, computes <c> (__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }
 
+/// Computes a negated multiply-add of 256-bit vectors of [8 x float].
+///    For each element, computes <c> -(__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the addend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
+/// Computes a negated multiply-add of 256-bit vectors of [4 x double].
+///    For each element, computes <c> -(__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the addend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
+/// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
+///    For each element, computes <c> -(__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
+/// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
+///    For each element, computes <c> -(__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 256-bit vectors of
+///    [8 x float].
+/// \code
+/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
+/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
+/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
+/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
+/// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
+/// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
+/// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
+/// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 256-bit vectors of
+///    [4 x double].
+/// \code
+/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
+/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
+/// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
+/// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
+/// Computes a vector multiply with alternating add/subtract of 256-bit
+///    vectors of [8 x float].
+/// \code
+/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
+/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
+/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
+/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
+/// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
+/// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
+/// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
+/// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
+/// Computes a vector multiply with alternating add/subtract of 256-bit
+///    vectors of [4 x double].
+/// \code
+/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
+/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
+/// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
+/// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
diff --git a/darwin-x86/lib64/clang/17/include/immintrin.h b/darwin-x86/lib64/clang/17/include/immintrin.h
index 6967b46..c5f84ae 100644
--- a/darwin-x86/lib64/clang/17/include/immintrin.h
+++ b/darwin-x86/lib64/clang/17/include/immintrin.h
@@ -284,30 +284,53 @@
 
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__RDRND__)
+/// Returns a 16-bit hardware-generated random value.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
+///
+/// \param __p
+///    A pointer to a 16-bit memory location to place the random value.
+/// \returns 1 if the value was successfully generated, 0 otherwise.
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand16_step(unsigned short *__p)
 {
   return (int)__builtin_ia32_rdrand16_step(__p);
 }
 
+/// Returns a 32-bit hardware-generated random value.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
+///
+/// \param __p
+///    A pointer to a 32-bit memory location to place the random value.
+/// \returns 1 if the value was successfully generated, 0 otherwise.
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand32_step(unsigned int *__p)
 {
   return (int)__builtin_ia32_rdrand32_step(__p);
 }
 
+/// Returns a 64-bit hardware-generated random value.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location to place the random value.
+/// \returns 1 if the value was successfully generated, 0 otherwise.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
 #ifdef __x86_64__
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand64_step(unsigned long long *__p)
-{
   return (int)__builtin_ia32_rdrand64_step(__p);
-}
 #else
-// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
-// rdrand instructions.
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand64_step(unsigned long long *__p)
-{
+  // We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+  // rdrand instructions.
   unsigned int __lo, __hi;
   unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
   unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
@@ -318,55 +341,115 @@
     *__p = 0;
     return 0;
   }
-}
 #endif
+}
 #endif /* __RDRND__ */
 
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__FSGSBASE__)
 #ifdef __x86_64__
+/// Reads the FS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
+///
+/// \returns The lower 32 bits of the FS base register.
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readfsbase_u32(void)
 {
   return __builtin_ia32_rdfsbase32();
 }
 
+/// Reads the FS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
+///
+/// \returns The contents of the FS base register.
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readfsbase_u64(void)
 {
   return __builtin_ia32_rdfsbase64();
 }
 
+/// Reads the GS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
+///
+/// \returns The lower 32 bits of the GS base register.
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readgsbase_u32(void)
 {
   return __builtin_ia32_rdgsbase32();
 }
 
+/// Reads the GS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
+///
+/// \returns The contents of the GS base register.
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readgsbase_u64(void)
 {
   return __builtin_ia32_rdgsbase64();
 }
 
+/// Modifies the FS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
+///
+/// \param __V
+///    Value to use for the lower 32 bits of the FS base register.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writefsbase_u32(unsigned int __V)
 {
   __builtin_ia32_wrfsbase32(__V);
 }
 
+/// Modifies the FS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
+///
+/// \param __V
+///    Value to use for the FS base register.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writefsbase_u64(unsigned long long __V)
 {
   __builtin_ia32_wrfsbase64(__V);
 }
 
+/// Modifies the GS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> WRGSBASE </c> instruction.
+///
+/// \param __V
+///    Value to use for the lower 32 bits of the GS base register.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writegsbase_u32(unsigned int __V)
 {
   __builtin_ia32_wrgsbase32(__V);
 }
 
+/// Modifies the GS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
+///
+/// \param __V
+///    Value to use for GS base register.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writegsbase_u64(unsigned long long __V)
 {
@@ -524,7 +607,7 @@
 #include <invpcidintrin.h>
 #endif
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMXFP16__)
+    defined(__AMX_FP16__)
 #include <amxfp16intrin.h>
 #endif
 
@@ -534,11 +617,16 @@
 #endif
 
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__)
+    defined(__AMX_TILE__) || defined(__AMX_INT8__) || defined(__AMX_BF16__)
 #include <amxintrin.h>
 #endif
 
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AMX_COMPLEX__)
+#include <amxcomplexintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__AVX512VP2INTERSECT__)
 #include <avx512vp2intersectintrin.h>
 #endif
diff --git a/darwin-x86/lib64/clang/17/include/limits.h b/darwin-x86/lib64/clang/17/include/limits.h
index 32cc901..354e031 100644
--- a/darwin-x86/lib64/clang/17/include/limits.h
+++ b/darwin-x86/lib64/clang/17/include/limits.h
@@ -52,7 +52,11 @@
 #define LONG_MIN  (-__LONG_MAX__ -1L)
 
 #define UCHAR_MAX (__SCHAR_MAX__*2  +1)
-#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#if __SHRT_WIDTH__ < __INT_WIDTH__
+#define USHRT_MAX (__SHRT_MAX__ * 2 + 1)
+#else
+#define USHRT_MAX (__SHRT_MAX__ * 2U + 1U)
+#endif
 #define UINT_MAX  (__INT_MAX__  *2U +1U)
 #define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
 
diff --git a/darwin-x86/lib64/clang/17/include/mwaitxintrin.h b/darwin-x86/lib64/clang/17/include/mwaitxintrin.h
index ed48538..65f4271 100644
--- a/darwin-x86/lib64/clang/17/include/mwaitxintrin.h
+++ b/darwin-x86/lib64/clang/17/include/mwaitxintrin.h
@@ -16,12 +16,41 @@
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("mwaitx")))
+
+/// Establishes a linear address memory range to be monitored and puts
+///    the processor in the monitor event pending state. Data stored in the
+///    monitored address range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MONITORX instruction.
+///
+/// \param __p
+///    The memory range to be monitored. The size of the range is determined by
+///    CPUID function 0000_0005h.
+/// \param __extensions
+///    Optional extensions for the monitoring state.
+/// \param __hints
+///    Optional hints for the monitoring state.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_monitorx(void * __p, unsigned __extensions, unsigned __hints)
 {
   __builtin_ia32_monitorx(__p, __extensions, __hints);
 }
 
+/// Used with the \c MONITORX instruction to wait while the processor is in
+///    the monitor event pending state. Data stored in the monitored address
+///    range, or an interrupt, causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MWAITX instruction.
+///
+/// \param __extensions
+///    Optional extensions for the monitoring state, which can vary by
+///    processor.
+/// \param __hints
+///    Optional hints for the monitoring state, which can vary by processor.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
 {
diff --git a/darwin-x86/lib64/clang/17/include/orc/c_api.h b/darwin-x86/lib64/clang/17/include/orc/c_api.h
index 96d01df..628c5cd 100644
--- a/darwin-x86/lib64/clang/17/include/orc/c_api.h
+++ b/darwin-x86/lib64/clang/17/include/orc/c_api.h
@@ -48,17 +48,17 @@
 typedef union {
   char *ValuePtr;
   char Value[sizeof(char *)];
-} __orc_rt_CWrapperFunctionResultDataUnion;
+} orc_rt_CWrapperFunctionResultDataUnion;
 
 /**
- * __orc_rt_CWrapperFunctionResult is a kind of C-SmallVector with an
+ * orc_rt_CWrapperFunctionResult is a kind of C-SmallVector with an
  * out-of-band error state.
  *
  * If Size == 0 and Data.ValuePtr is non-zero then the value is in the
  * 'out-of-band error' state, and Data.ValuePtr points at a malloc-allocated,
  * null-terminated string error message.
  *
- * If Size <= sizeof(__orc_rt_CWrapperFunctionResultData) then the value is in
+ * If Size <= sizeof(orc_rt_CWrapperFunctionResultData) then the value is in
  * the 'small' state and the content is held in the first Size bytes of
  * Data.Value.
  *
@@ -68,29 +68,29 @@
  * malloc, and will be freed with free when this value is destroyed.
  */
 typedef struct {
-  __orc_rt_CWrapperFunctionResultDataUnion Data;
+  orc_rt_CWrapperFunctionResultDataUnion Data;
   size_t Size;
-} __orc_rt_CWrapperFunctionResult;
+} orc_rt_CWrapperFunctionResult;
 
-typedef struct __orc_rt_CSharedOpaqueJITProcessControl
-    *__orc_rt_SharedJITProcessControlRef;
+typedef struct orc_rt_CSharedOpaqueJITProcessControl
+    *orc_rt_SharedJITProcessControlRef;
 
 /**
- * Zero-initialize an __orc_rt_CWrapperFunctionResult.
+ * Zero-initialize an orc_rt_CWrapperFunctionResult.
  */
 static inline void
-__orc_rt_CWrapperFunctionResultInit(__orc_rt_CWrapperFunctionResult *R) {
+orc_rt_CWrapperFunctionResultInit(orc_rt_CWrapperFunctionResult *R) {
   R->Size = 0;
   R->Data.ValuePtr = 0;
 }
 
 /**
- * Create an __orc_rt_CWrapperFunctionResult with an uninitialized buffer of
+ * Create an orc_rt_CWrapperFunctionResult with an uninitialized buffer of
  * size Size. The buffer is returned via the DataPtr argument.
  */
-static inline __orc_rt_CWrapperFunctionResult
-__orc_rt_CWrapperFunctionResultAllocate(size_t Size) {
-  __orc_rt_CWrapperFunctionResult R;
+static inline orc_rt_CWrapperFunctionResult
+orc_rt_CWrapperFunctionResultAllocate(size_t Size) {
+  orc_rt_CWrapperFunctionResult R;
   R.Size = Size;
   // If Size is 0 ValuePtr must be 0 or it is considered an out-of-band error.
   R.Data.ValuePtr = 0;
@@ -100,11 +100,11 @@
 }
 
 /**
- * Create an __orc_rt_WrapperFunctionResult from the given data range.
+ * Create an orc_rt_WrapperFunctionResult from the given data range.
  */
-static inline __orc_rt_CWrapperFunctionResult
-__orc_rt_CreateCWrapperFunctionResultFromRange(const char *Data, size_t Size) {
-  __orc_rt_CWrapperFunctionResult R;
+static inline orc_rt_CWrapperFunctionResult
+orc_rt_CreateCWrapperFunctionResultFromRange(const char *Data, size_t Size) {
+  orc_rt_CWrapperFunctionResult R;
   R.Size = Size;
   if (R.Size > sizeof(R.Data.Value)) {
     char *Tmp = (char *)malloc(Size);
@@ -116,28 +116,28 @@
 }
 
 /**
- * Create an __orc_rt_CWrapperFunctionResult by copying the given string,
+ * Create an orc_rt_CWrapperFunctionResult by copying the given string,
  * including the null-terminator.
  *
  * This function copies the input string. The client is responsible for freeing
  * the ErrMsg arg.
  */
-static inline __orc_rt_CWrapperFunctionResult
-__orc_rt_CreateCWrapperFunctionResultFromString(const char *Source) {
-  return __orc_rt_CreateCWrapperFunctionResultFromRange(Source,
-                                                        strlen(Source) + 1);
+static inline orc_rt_CWrapperFunctionResult
+orc_rt_CreateCWrapperFunctionResultFromString(const char *Source) {
+  return orc_rt_CreateCWrapperFunctionResultFromRange(Source,
+                                                      strlen(Source) + 1);
 }
 
 /**
- * Create an __orc_rt_CWrapperFunctionResult representing an out-of-band
+ * Create an orc_rt_CWrapperFunctionResult representing an out-of-band
  * error.
  *
  * This function copies the input string. The client is responsible for freeing
  * the ErrMsg arg.
  */
-static inline __orc_rt_CWrapperFunctionResult
-__orc_rt_CreateCWrapperFunctionResultFromOutOfBandError(const char *ErrMsg) {
-  __orc_rt_CWrapperFunctionResult R;
+static inline orc_rt_CWrapperFunctionResult
+orc_rt_CreateCWrapperFunctionResultFromOutOfBandError(const char *ErrMsg) {
+  orc_rt_CWrapperFunctionResult R;
   R.Size = 0;
   char *Tmp = (char *)malloc(strlen(ErrMsg) + 1);
   strcpy(Tmp, ErrMsg);
@@ -146,11 +146,11 @@
 }
 
 /**
- * This should be called to destroy __orc_rt_CWrapperFunctionResult values
+ * This should be called to destroy orc_rt_CWrapperFunctionResult values
  * regardless of their state.
  */
 static inline void
-__orc_rt_DisposeCWrapperFunctionResult(__orc_rt_CWrapperFunctionResult *R) {
+orc_rt_DisposeCWrapperFunctionResult(orc_rt_CWrapperFunctionResult *R) {
   if (R->Size > sizeof(R->Data.Value) ||
       (R->Size == 0 && R->Data.ValuePtr))
     free(R->Data.ValuePtr);
@@ -158,22 +158,22 @@
 
 /**
  * Get a pointer to the data contained in the given
- * __orc_rt_CWrapperFunctionResult.
+ * orc_rt_CWrapperFunctionResult.
  */
 static inline char *
-__orc_rt_CWrapperFunctionResultData(__orc_rt_CWrapperFunctionResult *R) {
+orc_rt_CWrapperFunctionResultData(orc_rt_CWrapperFunctionResult *R) {
   assert((R->Size != 0 || R->Data.ValuePtr == NULL) &&
          "Cannot get data for out-of-band error value");
   return R->Size > sizeof(R->Data.Value) ? R->Data.ValuePtr : R->Data.Value;
 }
 
 /**
- * Safely get the size of the given __orc_rt_CWrapperFunctionResult.
+ * Safely get the size of the given orc_rt_CWrapperFunctionResult.
  *
  * Asserts that we're not trying to access the size of an error value.
  */
 static inline size_t
-__orc_rt_CWrapperFunctionResultSize(const __orc_rt_CWrapperFunctionResult *R) {
+orc_rt_CWrapperFunctionResultSize(const orc_rt_CWrapperFunctionResult *R) {
   assert((R->Size != 0 || R->Data.ValuePtr == NULL) &&
          "Cannot get size for out-of-band error value");
   return R->Size;
@@ -181,22 +181,22 @@
 
 /**
  * Returns 1 if this value is equivalent to a value just initialized by
- * __orc_rt_CWrapperFunctionResultInit, 0 otherwise.
+ * orc_rt_CWrapperFunctionResultInit, 0 otherwise.
  */
 static inline size_t
-__orc_rt_CWrapperFunctionResultEmpty(const __orc_rt_CWrapperFunctionResult *R) {
+orc_rt_CWrapperFunctionResultEmpty(const orc_rt_CWrapperFunctionResult *R) {
   return R->Size == 0 && R->Data.ValuePtr == 0;
 }
 
 /**
  * Returns a pointer to the out-of-band error string for this
- * __orc_rt_CWrapperFunctionResult, or null if there is no error.
+ * orc_rt_CWrapperFunctionResult, or null if there is no error.
  *
- * The __orc_rt_CWrapperFunctionResult retains ownership of the error
+ * The orc_rt_CWrapperFunctionResult retains ownership of the error
  * string, so it should be copied if the caller wishes to preserve it.
  */
-static inline const char *__orc_rt_CWrapperFunctionResultGetOutOfBandError(
-    const __orc_rt_CWrapperFunctionResult *R) {
+static inline const char *orc_rt_CWrapperFunctionResultGetOutOfBandError(
+    const orc_rt_CWrapperFunctionResult *R) {
   return R->Size == 0 ? R->Data.ValuePtr : 0;
 }
 
diff --git a/darwin-x86/lib64/clang/17/include/pmmintrin.h b/darwin-x86/lib64/clang/17/include/pmmintrin.h
index ee660e9..203c0aa 100644
--- a/darwin-x86/lib64/clang/17/include/pmmintrin.h
+++ b/darwin-x86/lib64/clang/17/include/pmmintrin.h
@@ -253,9 +253,12 @@
 ///    the processor in the monitor event pending state. Data stored in the
 ///    monitored address range causes the processor to exit the pending state.
 ///
+/// The \c MONITOR instruction can be used in kernel mode, and in other modes
+/// if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
+/// This intrinsic corresponds to the \c MONITOR instruction.
 ///
 /// \param __p
 ///    The memory range to be monitored. The size of the range is determined by
@@ -270,19 +273,22 @@
   __builtin_ia32_monitor(__p, __extensions, __hints);
 }
 
-/// Used with the MONITOR instruction to wait while the processor is in
+/// Used with the \c MONITOR instruction to wait while the processor is in
 ///    the monitor event pending state. Data stored in the monitored address
-///    range causes the processor to exit the pending state.
+///    range, or an interrupt, causes the processor to exit the pending state.
+///
+/// The \c MWAIT instruction can be used in kernel mode, and in other modes if
+/// MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
+/// This intrinsic corresponds to the \c MWAIT instruction.
 ///
 /// \param __extensions
-///    Optional extensions for the monitoring state, which may vary by
+///    Optional extensions for the monitoring state, which can vary by
 ///    processor.
 /// \param __hints
-///    Optional hints for the monitoring state, which may vary by processor.
+///    Optional hints for the monitoring state, which can vary by processor.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mwait(unsigned __extensions, unsigned __hints)
 {
diff --git a/darwin-x86/lib64/clang/17/include/ppc_wrappers/emmintrin.h b/darwin-x86/lib64/clang/17/include/ppc_wrappers/emmintrin.h
index 0814ea5..fc18ab9 100644
--- a/darwin-x86/lib64/clang/17/include/ppc_wrappers/emmintrin.h
+++ b/darwin-x86/lib64/clang/17/include/ppc_wrappers/emmintrin.h
@@ -46,6 +46,7 @@
 
 /* SSE2 */
 typedef __vector double __v2df;
+typedef __vector float __v4f;
 typedef __vector long long __v2di;
 typedef __vector unsigned long long __v2du;
 typedef __vector int __v4si;
@@ -951,7 +952,7 @@
     _mm_cvtpi32_pd(__m64 __A) {
   __v4si __temp;
   __v2di __tmp2;
-  __v2df __result;
+  __v4f __result;
 
   __temp = (__v4si)vec_splats(__A);
   __tmp2 = (__v2di)vec_unpackl(__temp);
diff --git a/darwin-x86/lib64/clang/17/include/ppc_wrappers/smmintrin.h b/darwin-x86/lib64/clang/17/include/ppc_wrappers/smmintrin.h
index 6fe6d2a..349b395 100644
--- a/darwin-x86/lib64/clang/17/include/ppc_wrappers/smmintrin.h
+++ b/darwin-x86/lib64/clang/17/include/ppc_wrappers/smmintrin.h
@@ -305,9 +305,9 @@
 extern __inline __m128i
     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
-  __v16qi __charmask = vec_splats((signed char)__imm8);
+  __v16qu __charmask = vec_splats((unsigned char)__imm8);
   __charmask = vec_gb(__charmask);
-  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
+  __v8hu __shortmask = (__v8hu)vec_unpackh((__v16qi)__charmask);
 #ifdef __BIG_ENDIAN__
   __shortmask = vec_reve(__shortmask);
 #endif
diff --git a/darwin-x86/lib64/clang/17/include/profile/InstrProfData.inc b/darwin-x86/lib64/clang/17/include/profile/InstrProfData.inc
index 05419bf..94261f4 100644
--- a/darwin-x86/lib64/clang/17/include/profile/InstrProfData.inc
+++ b/darwin-x86/lib64/clang/17/include/profile/InstrProfData.inc
@@ -650,7 +650,7 @@
 /* Raw profile format version (start from 1). */
 #define INSTR_PROF_RAW_VERSION 8
 /* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 9
+#define INSTR_PROF_INDEX_VERSION 10
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 5
 
@@ -663,6 +663,7 @@
  * The 60th bit indicates single byte coverage instrumentation.
  * The 61st bit indicates function entry instrumentation only.
  * The 62nd bit indicates whether memory profile information is present.
+ * The 63rd bit indicates if this is a temporal profile.
  */
 #define VARIANT_MASKS_ALL 0xff00000000000000ULL
 #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL)
@@ -673,9 +674,11 @@
 #define VARIANT_MASK_BYTE_COVERAGE (0x1ULL << 60)
 #define VARIANT_MASK_FUNCTION_ENTRY_ONLY (0x1ULL << 61)
 #define VARIANT_MASK_MEMPROF (0x1ULL << 62)
+#define VARIANT_MASK_TEMPORAL_PROF (0x1ULL << 63)
 #define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version
 #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime
 #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias
+#define INSTR_PROF_PROFILE_SET_TIMESTAMP __llvm_profile_set_timestamp
 
 /* The variable that holds the name of the profile data
  * specified via command line. */
diff --git a/darwin-x86/lib64/clang/17/include/riscv_ntlh.h b/darwin-x86/lib64/clang/17/include/riscv_ntlh.h
new file mode 100644
index 0000000..9ce1709
--- /dev/null
+++ b/darwin-x86/lib64/clang/17/include/riscv_ntlh.h
@@ -0,0 +1,28 @@
+/*===---- riscv_ntlh.h - RISC-V NTLH intrinsics ----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_NTLH_H
+#define __RISCV_NTLH_H
+
+#ifndef __riscv_zihintntl
+#error "NTLH intrinsics require the NTLH extension."
+#endif
+
+enum {
+  __RISCV_NTLH_INNERMOST_PRIVATE = 2,
+  __RISCV_NTLH_ALL_PRIVATE,
+  __RISCV_NTLH_INNERMOST_SHARED,
+  __RISCV_NTLH_ALL
+};
+
+#define __riscv_ntl_load(PTR, DOMAIN) __builtin_riscv_ntl_load((PTR), (DOMAIN))
+#define __riscv_ntl_store(PTR, VAL, DOMAIN)                                    \
+  __builtin_riscv_ntl_store((PTR), (VAL), (DOMAIN))
+
+#endif
\ No newline at end of file
diff --git a/darwin-x86/lib64/clang/17/include/riscv_vector.h b/darwin-x86/lib64/clang/17/include/riscv_vector.h
index 2a9598e..5c5480b 100644
--- a/darwin-x86/lib64/clang/17/include/riscv_vector.h
+++ b/darwin-x86/lib64/clang/17/include/riscv_vector.h
@@ -25,7 +25,7 @@
 #pragma clang riscv intrinsic vector
 
 
-#define vlenb() __builtin_rvv_vlenb()
+#define __riscv_vlenb() __builtin_rvv_vlenb()
 
 enum RVV_CSR {
   RVV_VSTART = 0,
@@ -35,7 +35,7 @@
 };
 
 static __inline__ __attribute__((__always_inline__, __nodebug__))
-unsigned long vread_csr(enum RVV_CSR __csr) {
+unsigned long __riscv_vread_csr(enum RVV_CSR __csr) {
   unsigned long __rv = 0;
   switch (__csr) {
     case RVV_VSTART:
@@ -55,7 +55,7 @@
 }
 
 static __inline__ __attribute__((__always_inline__, __nodebug__))
-void vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
+void __riscv_vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
   switch (__csr) {
     case RVV_VSTART:
       __asm__ __volatile__ ("csrw\tvstart, %z0" : : "rJ"(__value) : "memory");
@@ -72,62 +72,62 @@
   }
 }
 
-#define vsetvl_e8mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 6)
-#define vsetvl_e8mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 7)
-#define vsetvl_e8m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 0)
-#define vsetvl_e8m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 1)
-#define vsetvl_e8m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 2)
-#define vsetvl_e8m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 3)
+#define __riscv_vsetvl_e8mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 6)
+#define __riscv_vsetvl_e8mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 7)
+#define __riscv_vsetvl_e8m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 0)
+#define __riscv_vsetvl_e8m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 1)
+#define __riscv_vsetvl_e8m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 2)
+#define __riscv_vsetvl_e8m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 3)
 
-#define vsetvl_e16mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 7)
-#define vsetvl_e16m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 0)
-#define vsetvl_e16m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 1)
-#define vsetvl_e16m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 2)
-#define vsetvl_e16m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 3)
+#define __riscv_vsetvl_e16mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 7)
+#define __riscv_vsetvl_e16m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 0)
+#define __riscv_vsetvl_e16m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 1)
+#define __riscv_vsetvl_e16m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 2)
+#define __riscv_vsetvl_e16m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 3)
 
-#define vsetvl_e32m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 0)
-#define vsetvl_e32m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 1)
-#define vsetvl_e32m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 2)
-#define vsetvl_e32m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 3)
+#define __riscv_vsetvl_e32m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 0)
+#define __riscv_vsetvl_e32m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 1)
+#define __riscv_vsetvl_e32m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 2)
+#define __riscv_vsetvl_e32m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 3)
 
 #if __riscv_v_elen >= 64
-#define vsetvl_e8mf8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 5)
-#define vsetvl_e16mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 6)
-#define vsetvl_e32mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 7)
+#define __riscv_vsetvl_e8mf8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 5)
+#define __riscv_vsetvl_e16mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 6)
+#define __riscv_vsetvl_e32mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 7)
 
-#define vsetvl_e64m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 0)
-#define vsetvl_e64m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 1)
-#define vsetvl_e64m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 2)
-#define vsetvl_e64m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 3)
+#define __riscv_vsetvl_e64m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 0)
+#define __riscv_vsetvl_e64m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 1)
+#define __riscv_vsetvl_e64m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 2)
+#define __riscv_vsetvl_e64m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 3)
 #endif
 
-#define vsetvlmax_e8mf4() __builtin_rvv_vsetvlimax(0, 6)
-#define vsetvlmax_e8mf2() __builtin_rvv_vsetvlimax(0, 7)
-#define vsetvlmax_e8m1() __builtin_rvv_vsetvlimax(0, 0)
-#define vsetvlmax_e8m2() __builtin_rvv_vsetvlimax(0, 1)
-#define vsetvlmax_e8m4() __builtin_rvv_vsetvlimax(0, 2)
-#define vsetvlmax_e8m8() __builtin_rvv_vsetvlimax(0, 3)
+#define __riscv_vsetvlmax_e8mf4() __builtin_rvv_vsetvlimax(0, 6)
+#define __riscv_vsetvlmax_e8mf2() __builtin_rvv_vsetvlimax(0, 7)
+#define __riscv_vsetvlmax_e8m1() __builtin_rvv_vsetvlimax(0, 0)
+#define __riscv_vsetvlmax_e8m2() __builtin_rvv_vsetvlimax(0, 1)
+#define __riscv_vsetvlmax_e8m4() __builtin_rvv_vsetvlimax(0, 2)
+#define __riscv_vsetvlmax_e8m8() __builtin_rvv_vsetvlimax(0, 3)
 
-#define vsetvlmax_e16mf2() __builtin_rvv_vsetvlimax(1, 7)
-#define vsetvlmax_e16m1() __builtin_rvv_vsetvlimax(1, 0)
-#define vsetvlmax_e16m2() __builtin_rvv_vsetvlimax(1, 1)
-#define vsetvlmax_e16m4() __builtin_rvv_vsetvlimax(1, 2)
-#define vsetvlmax_e16m8() __builtin_rvv_vsetvlimax(1, 3)
+#define __riscv_vsetvlmax_e16mf2() __builtin_rvv_vsetvlimax(1, 7)
+#define __riscv_vsetvlmax_e16m1() __builtin_rvv_vsetvlimax(1, 0)
+#define __riscv_vsetvlmax_e16m2() __builtin_rvv_vsetvlimax(1, 1)
+#define __riscv_vsetvlmax_e16m4() __builtin_rvv_vsetvlimax(1, 2)
+#define __riscv_vsetvlmax_e16m8() __builtin_rvv_vsetvlimax(1, 3)
 
-#define vsetvlmax_e32m1() __builtin_rvv_vsetvlimax(2, 0)
-#define vsetvlmax_e32m2() __builtin_rvv_vsetvlimax(2, 1)
-#define vsetvlmax_e32m4() __builtin_rvv_vsetvlimax(2, 2)
-#define vsetvlmax_e32m8() __builtin_rvv_vsetvlimax(2, 3)
+#define __riscv_vsetvlmax_e32m1() __builtin_rvv_vsetvlimax(2, 0)
+#define __riscv_vsetvlmax_e32m2() __builtin_rvv_vsetvlimax(2, 1)
+#define __riscv_vsetvlmax_e32m4() __builtin_rvv_vsetvlimax(2, 2)
+#define __riscv_vsetvlmax_e32m8() __builtin_rvv_vsetvlimax(2, 3)
 
 #if __riscv_v_elen >= 64
-#define vsetvlmax_e8mf8() __builtin_rvv_vsetvlimax(0, 5)
-#define vsetvlmax_e16mf4() __builtin_rvv_vsetvlimax(1, 6)
-#define vsetvlmax_e32mf2() __builtin_rvv_vsetvlimax(2, 7)
+#define __riscv_vsetvlmax_e8mf8() __builtin_rvv_vsetvlimax(0, 5)
+#define __riscv_vsetvlmax_e16mf4() __builtin_rvv_vsetvlimax(1, 6)
+#define __riscv_vsetvlmax_e32mf2() __builtin_rvv_vsetvlimax(2, 7)
 
-#define vsetvlmax_e64m1() __builtin_rvv_vsetvlimax(3, 0)
-#define vsetvlmax_e64m2() __builtin_rvv_vsetvlimax(3, 1)
-#define vsetvlmax_e64m4() __builtin_rvv_vsetvlimax(3, 2)
-#define vsetvlmax_e64m8() __builtin_rvv_vsetvlimax(3, 3)
+#define __riscv_vsetvlmax_e64m1() __builtin_rvv_vsetvlimax(3, 0)
+#define __riscv_vsetvlmax_e64m2() __builtin_rvv_vsetvlimax(3, 1)
+#define __riscv_vsetvlmax_e64m4() __builtin_rvv_vsetvlimax(3, 2)
+#define __riscv_vsetvlmax_e64m8() __builtin_rvv_vsetvlimax(3, 3)
 #endif
 
 typedef __rvv_bool64_t vbool64_t;
@@ -181,28 +181,21 @@
 typedef __rvv_uint64m4_t vuint64m4_t;
 typedef __rvv_int64m8_t vint64m8_t;
 typedef __rvv_uint64m8_t vuint64m8_t;
-#if defined(__riscv_zvfh)
 typedef __rvv_float16mf4_t vfloat16mf4_t;
 typedef __rvv_float16mf2_t vfloat16mf2_t;
 typedef __rvv_float16m1_t vfloat16m1_t;
 typedef __rvv_float16m2_t vfloat16m2_t;
 typedef __rvv_float16m4_t vfloat16m4_t;
 typedef __rvv_float16m8_t vfloat16m8_t;
-#endif
-#if (__riscv_v_elen_fp >= 32)
 typedef __rvv_float32mf2_t vfloat32mf2_t;
 typedef __rvv_float32m1_t vfloat32m1_t;
 typedef __rvv_float32m2_t vfloat32m2_t;
 typedef __rvv_float32m4_t vfloat32m4_t;
 typedef __rvv_float32m8_t vfloat32m8_t;
-#endif
-#if (__riscv_v_elen_fp >= 64)
 typedef __rvv_float64m1_t vfloat64m1_t;
 typedef __rvv_float64m2_t vfloat64m2_t;
 typedef __rvv_float64m4_t vfloat64m4_t;
 typedef __rvv_float64m8_t vfloat64m8_t;
-#endif
-
 #define __riscv_v_intrinsic_overloading 1
 
 #ifdef __cplusplus
diff --git a/darwin-x86/lib64/clang/17/include/sanitizer/allocator_interface.h b/darwin-x86/lib64/clang/17/include/sanitizer/allocator_interface.h
index 6226135..d0cfce7 100644
--- a/darwin-x86/lib64/clang/17/include/sanitizer/allocator_interface.h
+++ b/darwin-x86/lib64/clang/17/include/sanitizer/allocator_interface.h
@@ -26,6 +26,10 @@
      is not yet freed. */
   int __sanitizer_get_ownership(const volatile void *p);
 
+  /* If a pointer lies within an allocation, it will return the start address
+     of the allocation. Otherwise, it returns nullptr. */
+  const void *__sanitizer_get_allocated_begin(const void *p);
+
   /* Returns the number of bytes reserved for the pointer p.
      Requires (get_ownership(p) == true) or (p == 0). */
   size_t __sanitizer_get_allocated_size(const volatile void *p);
diff --git a/darwin-x86/lib64/clang/17/include/sanitizer/common_interface_defs.h b/darwin-x86/lib64/clang/17/include/sanitizer/common_interface_defs.h
index 2f415bd..983df7c 100644
--- a/darwin-x86/lib64/clang/17/include/sanitizer/common_interface_defs.h
+++ b/darwin-x86/lib64/clang/17/include/sanitizer/common_interface_defs.h
@@ -129,26 +129,23 @@
 /// state <c>mid == end</c>, so that should be the final state when the
 /// container is destroyed or when the container reallocates the storage.
 ///
-/// For ASan, <c><i>beg</i></c> should be 8-aligned and <c><i>end</i></c>
-/// should be either 8-aligned or it should point to the end of a separate
-/// heap-, stack-, or global-allocated buffer. So the following example will
-/// not work:
+/// For ASan, <c><i>beg</i></c> no longer needs to be 8-aligned,
+/// first and last granule may be shared with other objects
+/// and therefore the function can be used for any allocator.
+///
+/// The following example shows how to use the function:
 ///
 /// \code
-///   int64_t x[2]; // 16 bytes, 8-aligned
-///   char *beg = (char *)&x[0];
-///   char *end = beg + 12; // Not 8-aligned, not the end of the buffer
-/// \endcode
-///
-/// The following, however, will work:
-/// \code
-///   int32_t x[3]; // 12 bytes, but 8-aligned under ASan.
+///   int32_t x[3]; // 12 bytes
 ///   char *beg = (char*)&x[0];
-///   char *end = beg + 12; // Not 8-aligned, but is the end of the buffer
+///   char *end = beg + 12;
+///   __sanitizer_annotate_contiguous_container(beg, end, beg, end);
 /// \endcode
 ///
 /// \note  Use this function with caution and do not use for anything other
 /// than vector-like classes.
+/// \note  Unaligned <c><i>beg</i></c> or <c><i>end</i></c> may miss bugs in
+/// these granules.
 ///
 /// \param beg Beginning of memory region.
 /// \param end End of memory region.
diff --git a/darwin-x86/lib64/clang/17/include/sanitizer/hwasan_interface.h b/darwin-x86/lib64/clang/17/include/sanitizer/hwasan_interface.h
index 14035c0..ee742c7 100644
--- a/darwin-x86/lib64/clang/17/include/sanitizer/hwasan_interface.h
+++ b/darwin-x86/lib64/clang/17/include/sanitizer/hwasan_interface.h
@@ -1,4 +1,4 @@
-//===-- sanitizer/asan_interface.h ------------------------------*- C++ -*-===//
+//===-- sanitizer/hwasan_interface.h ----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/darwin-x86/lib64/clang/17/include/sanitizer/tsan_interface.h b/darwin-x86/lib64/clang/17/include/sanitizer/tsan_interface.h
index 2782e61..58f2513 100644
--- a/darwin-x86/lib64/clang/17/include/sanitizer/tsan_interface.h
+++ b/darwin-x86/lib64/clang/17/include/sanitizer/tsan_interface.h
@@ -172,6 +172,12 @@
 // Release TSan internal memory in a best-effort manner.
 void __tsan_flush_memory();
 
+// User-provided default TSAN options.
+const char* __tsan_default_options(void);
+
+// User-provided default TSAN suppressions.
+const char* __tsan_default_suppressions(void);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/darwin-x86/lib64/clang/17/include/sifive_vector.h b/darwin-x86/lib64/clang/17/include/sifive_vector.h
new file mode 100644
index 0000000..42d7224
--- /dev/null
+++ b/darwin-x86/lib64/clang/17/include/sifive_vector.h
@@ -0,0 +1,16 @@
+//===----- sifive_vector.h - SiFive Vector definitions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _SIFIVE_VECTOR_H_
+#define _SIFIVE_VECTOR_H_
+
+#include "riscv_vector.h"
+
+#pragma clang riscv intrinsic sifive_vector
+
+#endif //_SIFIVE_VECTOR_H_
diff --git a/darwin-x86/lib64/clang/17/include/stdalign.h b/darwin-x86/lib64/clang/17/include/stdalign.h
index 6ad25db..8ae6e65 100644
--- a/darwin-x86/lib64/clang/17/include/stdalign.h
+++ b/darwin-x86/lib64/clang/17/include/stdalign.h
@@ -10,6 +10,10 @@
 #ifndef __STDALIGN_H
 #define __STDALIGN_H
 
+/* FIXME: This is using the placeholder dates Clang produces for these macros
+   in C2x mode; switch to the correct values once they've been published. */
+#if defined(__cplusplus) ||                                                    \
+    (defined(__STDC_VERSION__) && __STDC_VERSION__ < 202000L)
 #ifndef __cplusplus
 #define alignas _Alignas
 #define alignof _Alignof
@@ -17,5 +21,6 @@
 
 #define __alignas_is_defined 1
 #define __alignof_is_defined 1
+#endif /* __STDC_VERSION__ */
 
 #endif /* __STDALIGN_H */
diff --git a/darwin-x86/lib64/clang/17/include/stddef.h b/darwin-x86/lib64/clang/17/include/stddef.h
index 4281517..539541f 100644
--- a/darwin-x86/lib64/clang/17/include/stddef.h
+++ b/darwin-x86/lib64/clang/17/include/stddef.h
@@ -103,6 +103,11 @@
 typedef typeof(nullptr) nullptr_t;
 #endif /* defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L */
 
+#if defined(__need_STDDEF_H_misc) && defined(__STDC_VERSION__) &&              \
+    __STDC_VERSION__ >= 202000L
+#define unreachable() __builtin_unreachable()
+#endif /* defined(__need_STDDEF_H_misc) && >= C23 */
+
 #if defined(__need_STDDEF_H_misc)
 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) ||              \
     (defined(__cplusplus) && __cplusplus >= 201103L)
diff --git a/darwin-x86/lib64/clang/17/include/wasm_simd128.h b/darwin-x86/lib64/clang/17/include/wasm_simd128.h
index f93de12..a099ab5 100644
--- a/darwin-x86/lib64/clang/17/include/wasm_simd128.h
+++ b/darwin-x86/lib64/clang/17/include/wasm_simd128.h
@@ -961,17 +961,17 @@
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i8x16)__a << __b);
+  return (v128_t)((__i8x16)__a << (__b & 0x7));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i8x16)__a >> __b);
+  return (v128_t)((__i8x16)__a >> (__b & 0x7));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__u8x16)__a >> __b);
+  return (v128_t)((__u8x16)__a >> (__b & 0x7));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_add(v128_t __a,
@@ -1047,17 +1047,17 @@
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shl(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i16x8)__a << __b);
+  return (v128_t)((__i16x8)__a << (__b & 0xF));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i16x8)__a >> __b);
+  return (v128_t)((__i16x8)__a >> (__b & 0xF));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__u16x8)__a >> __b);
+  return (v128_t)((__u16x8)__a >> (__b & 0xF));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_add(v128_t __a,
@@ -1138,17 +1138,17 @@
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shl(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i32x4)__a << __b);
+  return (v128_t)((__i32x4)__a << (__b & 0x1F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i32x4)__a >> __b);
+  return (v128_t)((__i32x4)__a >> (__b & 0x1F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__u32x4)__a >> __b);
+  return (v128_t)((__u32x4)__a >> (__b & 0x1F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_add(v128_t __a,
@@ -1209,17 +1209,17 @@
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shl(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i64x2)__a << (int64_t)__b);
+  return (v128_t)((__i64x2)__a << ((int64_t)__b & 0x3F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i64x2)__a >> (int64_t)__b);
+  return (v128_t)((__i64x2)__a >> ((int64_t)__b & 0x3F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__u64x2)__a >> (int64_t)__b);
+  return (v128_t)((__u64x2)__a >> ((int64_t)__b & 0x3F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_add(v128_t __a,
diff --git a/darwin-x86/lib64/libbase.dylib b/darwin-x86/lib64/libbase.dylib
index e1e913c..5822ebf 100755
--- a/darwin-x86/lib64/libbase.dylib
+++ b/darwin-x86/lib64/libbase.dylib
Binary files differ
diff --git a/darwin-x86/lib64/libc++.1.dylib b/darwin-x86/lib64/libc++.1.dylib
deleted file mode 100755
index 93d9d6f..0000000
--- a/darwin-x86/lib64/libc++.1.dylib
+++ /dev/null
Binary files differ
diff --git a/darwin-x86/lib64/libc++.dylib b/darwin-x86/lib64/libc++.dylib
index c8718c4..059534b 100755
--- a/darwin-x86/lib64/libc++.dylib
+++ b/darwin-x86/lib64/libc++.dylib
Binary files differ
diff --git a/darwin-x86/lib64/libc++abi.1.dylib b/darwin-x86/lib64/libc++abi.1.dylib
deleted file mode 100755
index 6c8c81f..0000000
--- a/darwin-x86/lib64/libc++abi.1.dylib
+++ /dev/null
Binary files differ
diff --git a/darwin-x86/lib64/libc++abi.dylib b/darwin-x86/lib64/libc++abi.dylib
new file mode 100755
index 0000000..464f8db
--- /dev/null
+++ b/darwin-x86/lib64/libc++abi.dylib
Binary files differ
diff --git a/darwin-x86/lib64/libclang-cpp.dylib b/darwin-x86/lib64/libclang-cpp.dylib
index c04b49e..9f84531 100755
--- a/darwin-x86/lib64/libclang-cpp.dylib
+++ b/darwin-x86/lib64/libclang-cpp.dylib
Binary files differ
diff --git a/darwin-x86/lib64/liblog.dylib b/darwin-x86/lib64/liblog.dylib
index 54e1667..cea7e68 100755
--- a/darwin-x86/lib64/liblog.dylib
+++ b/darwin-x86/lib64/liblog.dylib
Binary files differ
diff --git a/linux-x86/bin/bindgen b/linux-x86/bin/bindgen
index 44ccc6c..a06c6f1 100755
--- a/linux-x86/bin/bindgen
+++ b/linux-x86/bin/bindgen
Binary files differ
diff --git a/linux-x86/bin/cxx_extractor b/linux-x86/bin/cxx_extractor
index 3950786..f145513 100755
--- a/linux-x86/bin/cxx_extractor
+++ b/linux-x86/bin/cxx_extractor
Binary files differ
diff --git a/linux-x86/bin/header-abi-diff b/linux-x86/bin/header-abi-diff
index 02c6318..0c325f6 100755
--- a/linux-x86/bin/header-abi-diff
+++ b/linux-x86/bin/header-abi-diff
Binary files differ
diff --git a/linux-x86/bin/header-abi-dumper b/linux-x86/bin/header-abi-dumper
index ac18f7a..1b3694a 100755
--- a/linux-x86/bin/header-abi-dumper
+++ b/linux-x86/bin/header-abi-dumper
Binary files differ
diff --git a/linux-x86/bin/header-abi-linker b/linux-x86/bin/header-abi-linker
index a16c929..f6cf733 100755
--- a/linux-x86/bin/header-abi-linker
+++ b/linux-x86/bin/header-abi-linker
Binary files differ
diff --git a/linux-x86/bin/proto_metadata_plugin b/linux-x86/bin/proto_metadata_plugin
index 11c94c5..c5f9f87 100755
--- a/linux-x86/bin/proto_metadata_plugin
+++ b/linux-x86/bin/proto_metadata_plugin
Binary files differ
diff --git a/linux-x86/bin/protoc_extractor b/linux-x86/bin/protoc_extractor
index 95cfe3f..c21c19a 100755
--- a/linux-x86/bin/protoc_extractor
+++ b/linux-x86/bin/protoc_extractor
Binary files differ
diff --git a/linux-x86/bin/versioner b/linux-x86/bin/versioner
index 66602a5..be981e4 100755
--- a/linux-x86/bin/versioner
+++ b/linux-x86/bin/versioner
Binary files differ
diff --git a/linux-x86/lib64/clang/17/include/__clang_hip_math.h b/linux-x86/lib64/clang/17/include/__clang_hip_math.h
index 537dd0f..a4e557e 100644
--- a/linux-x86/lib64/clang/17/include/__clang_hip_math.h
+++ b/linux-x86/lib64/clang/17/include/__clang_hip_math.h
@@ -243,7 +243,7 @@
 
 __DEVICE__
 float fmaf(float __x, float __y, float __z) {
-  return __ocml_fma_f32(__x, __y, __z);
+  return __builtin_fmaf(__x, __y, __z);
 }
 
 __DEVICE__
@@ -621,7 +621,7 @@
 #else
 __DEVICE__
 float __fmaf_rn(float __x, float __y, float __z) {
-  return __ocml_fma_f32(__x, __y, __z);
+  return __builtin_fmaf(__x, __y, __z);
 }
 #endif
 
@@ -799,7 +799,7 @@
 
 __DEVICE__
 double fma(double __x, double __y, double __z) {
-  return __ocml_fma_f64(__x, __y, __z);
+  return __builtin_fma(__x, __y, __z);
 }
 
 __DEVICE__
@@ -1258,7 +1258,7 @@
 #else
 __DEVICE__
 double __fma_rn(double __x, double __y, double __z) {
-  return __ocml_fma_f64(__x, __y, __z);
+  return __builtin_fma(__x, __y, __z);
 }
 #endif
 // END INTRINSICS
diff --git a/linux-x86/lib64/clang/17/include/__clang_hip_runtime_wrapper.h b/linux-x86/lib64/clang/17/include/__clang_hip_runtime_wrapper.h
index 0508731..e881707 100644
--- a/linux-x86/lib64/clang/17/include/__clang_hip_runtime_wrapper.h
+++ b/linux-x86/lib64/clang/17/include/__clang_hip_runtime_wrapper.h
@@ -80,12 +80,25 @@
 #if HIP_VERSION_MAJOR * 100 + HIP_VERSION_MINOR >= 405
 extern "C" __device__ unsigned long long __ockl_dm_alloc(unsigned long long __size);
 extern "C" __device__ void __ockl_dm_dealloc(unsigned long long __addr);
+#if __has_feature(address_sanitizer)
+extern "C" __device__ unsigned long long __asan_malloc_impl(unsigned long long __size, unsigned long long __pc);
+extern "C" __device__ void __asan_free_impl(unsigned long long __addr, unsigned long long __pc);
+__attribute__((noinline, weak)) __device__ void *malloc(__hip_size_t __size) {
+  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
+  return (void *)__asan_malloc_impl(__size, __pc);
+}
+__attribute__((noinline, weak)) __device__ void free(void *__ptr) {
+  unsigned long long __pc = (unsigned long long)__builtin_return_address(0);
+  __asan_free_impl((unsigned long long)__ptr, __pc);
+}
+#else
 __attribute__((weak)) inline __device__ void *malloc(__hip_size_t __size) {
   return (void *) __ockl_dm_alloc(__size);
 }
 __attribute__((weak)) inline __device__ void free(void *__ptr) {
   __ockl_dm_dealloc((unsigned long long)__ptr);
 }
+#endif // __has_feature(address_sanitizer)
 #else  // HIP version check
 #if __HIP_ENABLE_DEVICE_MALLOC__
 __device__ void *__hip_malloc(__hip_size_t __size);
diff --git a/linux-x86/lib64/clang/17/include/adxintrin.h b/linux-x86/lib64/clang/17/include/adxintrin.h
index 72b9ed0..4382530 100644
--- a/linux-x86/lib64/clang/17/include/adxintrin.h
+++ b/linux-x86/lib64/clang/17/include/adxintrin.h
@@ -17,56 +17,69 @@
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
 
+/* Use C++ inline semantics in C++, GNU inline for C mode. */
+#if defined(__cplusplus)
+#define __INLINE __inline
+#else
+#define __INLINE static __inline
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
 /* Intrinsics that are available only if __ADX__ defined */
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
-_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
-               unsigned int *__p)
-{
+__INLINE unsigned char
+    __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+    _addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+                   unsigned int *__p) {
   return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
 }
 
 #ifdef __x86_64__
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
-_addcarryx_u64(unsigned char __cf, unsigned long long __x,
-               unsigned long long __y, unsigned long long  *__p)
-{
+__INLINE unsigned char
+    __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+    _addcarryx_u64(unsigned char __cf, unsigned long long __x,
+                   unsigned long long __y, unsigned long long *__p) {
   return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
 }
 #endif
 
 /* Intrinsics that are also available if __ADX__ undefined */
-static __inline unsigned char __DEFAULT_FN_ATTRS
-_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
-              unsigned int *__p)
-{
+__INLINE unsigned char __DEFAULT_FN_ATTRS _addcarry_u32(unsigned char __cf,
+                                                        unsigned int __x,
+                                                        unsigned int __y,
+                                                        unsigned int *__p) {
   return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
 }
 
 #ifdef __x86_64__
-static __inline unsigned char __DEFAULT_FN_ATTRS
+__INLINE unsigned char __DEFAULT_FN_ATTRS
 _addcarry_u64(unsigned char __cf, unsigned long long __x,
-              unsigned long long __y, unsigned long long  *__p)
-{
+              unsigned long long __y, unsigned long long *__p) {
   return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
 }
 #endif
 
-static __inline unsigned char __DEFAULT_FN_ATTRS
-_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
-              unsigned int *__p)
-{
+__INLINE unsigned char __DEFAULT_FN_ATTRS _subborrow_u32(unsigned char __cf,
+                                                         unsigned int __x,
+                                                         unsigned int __y,
+                                                         unsigned int *__p) {
   return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
 }
 
 #ifdef __x86_64__
-static __inline unsigned char __DEFAULT_FN_ATTRS
+__INLINE unsigned char __DEFAULT_FN_ATTRS
 _subborrow_u64(unsigned char __cf, unsigned long long __x,
-               unsigned long long __y, unsigned long long  *__p)
-{
+               unsigned long long __y, unsigned long long *__p) {
   return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
 }
 #endif
 
+#if defined(__cplusplus)
+}
+#endif
+
 #undef __DEFAULT_FN_ATTRS
 
 #endif /* __ADXINTRIN_H */
diff --git a/linux-x86/lib64/clang/17/include/amxcomplexintrin.h b/linux-x86/lib64/clang/17/include/amxcomplexintrin.h
new file mode 100644
index 0000000..84ef972
--- /dev/null
+++ b/linux-x86/lib64/clang/17/include/amxcomplexintrin.h
@@ -0,0 +1,169 @@
+/*===--------- amxcomplexintrin.h - AMXCOMPLEX intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AMX_COMPLEXINTRIN_H
+#define __AMX_COMPLEXINTRIN_H
+#ifdef __x86_64__
+
+#define __DEFAULT_FN_ATTRS_COMPLEX                                             \
+  __attribute__((__always_inline__, __nodebug__, __target__("amx-complex")))
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the imaginary part of the result. For each possible combination
+///    of (row of \a a, column of \a b), it performs a set of multiplication
+///    and accumulations on all corresponding complex numbers (one from \a a
+///    and one from \a b). The imaginary part of the \a a element is multiplied
+///    with the real part of the corresponding \a b element, and the real part
+///    of the \a a element is multiplied with the imaginary part of the
+///    corresponding \a b elements. The two accumulated results are added, and
+///    then accumulated into the corresponding row and column of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_cmmimfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO (a.colsb / 4) - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+1])
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+0])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_cmmimfp16ps(dst, a, b) __builtin_ia32_tcmmimfp16ps(dst, a, b)
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+///    accumulate the results into a packed single precision tile. Each dword
+///    element in input tiles \a a and \a b is interpreted as a complex number
+///    with FP16 real part and FP16 imaginary part.
+/// Calculates the real part of the result. For each possible combination
+///    of (row of \a a, column of \a b), it performs a set of multiplication
+///    and accumulations on all corresponding complex numbers (one from \a a
+///    and one from \a b). The real part of the \a a element is multiplied
+///    with the real part of the corresponding \a b element, and the negated
+///    imaginary part of the \a a element is multiplied with the imaginary
+///    part of the corresponding \a b elements. The two accumulated results
+///    are added, and then accumulated into the corresponding row and column
+///    of \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _tile_cmmrlfp16ps(__tile dst, __tile a, __tile b);
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+///	tmp := dst.row[m]
+///	FOR k := 0 TO (a.colsb / 4) - 1
+///		FOR n := 0 TO (dst.colsb / 4) - 1
+///			tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) * FP32(b.row[k].fp16[2*n+0])
+///			tmp.fp32[n] += FP32(-a.row[m].fp16[2*k+1]) * FP32(b.row[k].fp16[2*n+1])
+///		ENDFOR
+///	ENDFOR
+///	write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TCMMIMFP16PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_cmmrlfp16ps(dst, a, b) __builtin_ia32_tcmmrlfp16ps(dst, a, b)
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
+_tile_cmmimfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tcmmimfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_COMPLEX
+_tile_cmmrlfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+                           _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+  return __builtin_ia32_tcmmrlfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number with
+/// FP16 real part and FP16 imaginary part.
+/// This function calculates the imaginary part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCMMIMFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_COMPLEX
+static void __tile_cmmimfp16ps(__tile1024i *dst, __tile1024i src0,
+                               __tile1024i src1) {
+  dst->tile = _tile_cmmimfp16ps_internal(src0.row, src1.col, src0.col,
+                                         dst->tile, src0.tile, src1.tile);
+}
+
+/// Perform matrix multiplication of two tiles containing complex elements and
+/// accumulate the results into a packed single precision tile. Each dword
+/// element in input tiles src0 and src1 is interpreted as a complex number with
+/// FP16 real part and FP16 imaginary part.
+/// This function calculates the real part of the result.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TCMMRLFP16PS </c> instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param src0
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+///    The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_COMPLEX
+static void __tile_cmmrlfp16ps(__tile1024i *dst, __tile1024i src0,
+                               __tile1024i src1) {
+  dst->tile = _tile_cmmrlfp16ps_internal(src0.row, src1.col, src0.col,
+                                         dst->tile, src0.tile, src1.tile);
+}
+
+#endif // __x86_64__
+#endif // __AMX_COMPLEXINTRIN_H
diff --git a/linux-x86/lib64/clang/17/include/arm_neon.h b/linux-x86/lib64/clang/17/include/arm_neon.h
index 23d26a0..90ebf7d 100644
--- a/linux-x86/lib64/clang/17/include/arm_neon.h
+++ b/linux-x86/lib64/clang/17/include/arm_neon.h
@@ -35,7 +35,6 @@
 #include <stdint.h>
 
 #include <arm_bf16.h>
-typedef __bf16 bfloat16_t;
 typedef float float32_t;
 typedef __fp16 float16_t;
 #ifdef __aarch64__
diff --git a/linux-x86/lib64/clang/17/include/arm_sve.h b/linux-x86/lib64/clang/17/include/arm_sve.h
index 64362b8..f7bbc7f 100644
--- a/linux-x86/lib64/clang/17/include/arm_sve.h
+++ b/linux-x86/lib64/clang/17/include/arm_sve.h
@@ -37,7 +37,6 @@
 
 typedef __SVBFloat16_t svbfloat16_t;
 #include <arm_bf16.h>
-typedef __bf16 bfloat16_t;
 typedef __SVFloat32_t svfloat32_t;
 typedef __SVFloat64_t svfloat64_t;
 typedef __clang_svint8x2_t svint8x2_t;
@@ -74,6 +73,8 @@
 typedef __clang_svfloat32x4_t svfloat32x4_t;
 typedef __clang_svfloat64x4_t svfloat64x4_t;
 typedef __SVBool_t  svbool_t;
+typedef __clang_svboolx2_t  svboolx2_t;
+typedef __clang_svboolx4_t  svboolx4_t;
 
 typedef __clang_svbfloat16x2_t svbfloat16x2_t;
 typedef __clang_svbfloat16x3_t svbfloat16x3_t;
@@ -2914,6 +2915,10 @@
 svint64_t svdup_lane_s64(svint64_t, uint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s16)))
 svint16_t svdup_lane_s16(svint16_t, uint16_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u8)))
+svuint8_t svdupq_n_u8(uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s8)))
+svint8_t svdupq_n_s8(int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u16)))
 svuint16_t svdupq_n_u16(uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_f16)))
@@ -2932,18 +2937,14 @@
 svfloat64_t svdupq_n_f64(float64_t, float64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s64)))
 svint64_t svdupq_n_s64(int64_t, int64_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u8)))
-svuint8_t svdupq_n_u8(uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s8)))
-svint8_t svdupq_n_s8(int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b8)))
+svbool_t svdupq_n_b8(bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b16)))
 svbool_t svdupq_n_b16(bool, bool, bool, bool, bool, bool, bool, bool);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b32)))
 svbool_t svdupq_n_b32(bool, bool, bool, bool);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b64)))
 svbool_t svdupq_n_b64(bool, bool);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b8)))
-svbool_t svdupq_n_b8(bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u8)))
 svuint8_t svdupq_lane_u8(svuint8_t, uint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u32)))
@@ -6958,14 +6959,14 @@
 svint64_t svrev_s64(svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_s16)))
 svint16_t svrev_s16(svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b8)))
-svbool_t svrev_b8(svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b16)))
+svbool_t svrev_b16(svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b32)))
 svbool_t svrev_b32(svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b64)))
 svbool_t svrev_b64(svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b16)))
-svbool_t svrev_b16(svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrev_b8)))
+svbool_t svrev_b8(svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u32_m)))
 svuint32_t svrevb_u32_m(svuint32_t, svbool_t, svuint32_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svrevb_u64_m)))
@@ -8126,14 +8127,14 @@
 svint64_t svtrn1_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_s16)))
 svint16_t svtrn1_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b8)))
-svbool_t svtrn1_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b16)))
+svbool_t svtrn1_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b32)))
 svbool_t svtrn1_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b64)))
 svbool_t svtrn1_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b16)))
-svbool_t svtrn1_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn1_b8)))
+svbool_t svtrn1_b8(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u8)))
 svuint8_t svtrn2_u8(svuint8_t, svuint8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_u32)))
@@ -8156,14 +8157,14 @@
 svint64_t svtrn2_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_s16)))
 svint16_t svtrn2_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b8)))
-svbool_t svtrn2_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b16)))
+svbool_t svtrn2_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b32)))
 svbool_t svtrn2_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b64)))
 svbool_t svtrn2_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b16)))
-svbool_t svtrn2_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtrn2_b8)))
+svbool_t svtrn2_b8(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtsmul_f64)))
 svfloat64_t svtsmul_f64(svfloat64_t, svuint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svtsmul_f32)))
@@ -8314,14 +8315,14 @@
 svint64_t svuzp1_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_s16)))
 svint16_t svuzp1_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b8)))
-svbool_t svuzp1_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b16)))
+svbool_t svuzp1_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b32)))
 svbool_t svuzp1_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b64)))
 svbool_t svuzp1_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b16)))
-svbool_t svuzp1_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp1_b8)))
+svbool_t svuzp1_b8(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u8)))
 svuint8_t svuzp2_u8(svuint8_t, svuint8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_u32)))
@@ -8344,14 +8345,14 @@
 svint64_t svuzp2_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_s16)))
 svint16_t svuzp2_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b8)))
-svbool_t svuzp2_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b16)))
+svbool_t svuzp2_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b32)))
 svbool_t svuzp2_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b64)))
 svbool_t svuzp2_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b16)))
-svbool_t svuzp2_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svuzp2_b8)))
+svbool_t svuzp2_b8(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b8_s32)))
 svbool_t svwhilele_b8_s32(int32_t, int32_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svwhilele_b32_s32)))
@@ -8440,14 +8441,14 @@
 svint64_t svzip1_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_s16)))
 svint16_t svzip1_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b8)))
-svbool_t svzip1_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b16)))
+svbool_t svzip1_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b32)))
 svbool_t svzip1_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b64)))
 svbool_t svzip1_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b16)))
-svbool_t svzip1_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip1_b8)))
+svbool_t svzip1_b8(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u8)))
 svuint8_t svzip2_u8(svuint8_t, svuint8_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_u32)))
@@ -8470,14 +8471,14 @@
 svint64_t svzip2_s64(svint64_t, svint64_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_s16)))
 svint16_t svzip2_s16(svint16_t, svint16_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b8)))
-svbool_t svzip2_b8(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b16)))
+svbool_t svzip2_b16(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b32)))
 svbool_t svzip2_b32(svbool_t, svbool_t);
 __ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b64)))
 svbool_t svzip2_b64(svbool_t, svbool_t);
-__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b16)))
-svbool_t svzip2_b16(svbool_t, svbool_t);
+__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svzip2_b8)))
+svbool_t svzip2_b8(svbool_t, svbool_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f64_m)))
 svfloat64_t svabd_m(svbool_t, svfloat64_t, float64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svabd_n_f32_m)))
@@ -10528,6 +10529,10 @@
 svint64_t svdup_lane(svint64_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_lane_s16)))
 svint16_t svdup_lane(svint16_t, uint16_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u8)))
+svuint8_t svdupq_u8(uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s8)))
+svint8_t svdupq_s8(int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u16)))
 svuint16_t svdupq_u16(uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_f16)))
@@ -10546,18 +10551,14 @@
 svfloat64_t svdupq_f64(float64_t, float64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s64)))
 svint64_t svdupq_s64(int64_t, int64_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_u8)))
-svuint8_t svdupq_u8(uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_s8)))
-svint8_t svdupq_s8(int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t, int8_t);
+__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b8)))
+svbool_t svdupq_b8(bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b16)))
 svbool_t svdupq_b16(bool, bool, bool, bool, bool, bool, bool, bool);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b32)))
 svbool_t svdupq_b32(bool, bool, bool, bool);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b64)))
 svbool_t svdupq_b64(bool, bool);
-__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_n_b8)))
-svbool_t svdupq_b8(bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u8)))
 svuint8_t svdupq_lane(svuint8_t, uint64_t);
 __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdupq_lane_u32)))
diff --git a/linux-x86/lib64/clang/17/include/avx2intrin.h b/linux-x86/lib64/clang/17/include/avx2intrin.h
index f8521e7..33f24f2 100644
--- a/linux-x86/lib64/clang/17/include/avx2intrin.h
+++ b/linux-x86/lib64/clang/17/include/avx2intrin.h
@@ -935,102 +935,810 @@
   return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
 }
 
+/// Conditionally gathers two 64-bit floating-point values, either from the
+///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
+///    of [2 x double] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*32
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128d _mm_mask_i32gather_pd(__m128d a, const double *m, __m128i i,
+///                               __m128d mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPD instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
+///    the first two elements are used.
+/// \param mask
+///    A 128-bit vector of [2 x double] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x double] containing the gathered values.
 #define _mm_mask_i32gather_pd(a, m, i, mask, s) \
   ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
                                       (double const *)(m), \
                                       (__v4si)(__m128i)(i), \
                                       (__v2df)(__m128d)(mask), (s)))
 
+/// Conditionally gathers four 64-bit floating-point values, either from the
+///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
+///    of [4 x double] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*32
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256d _mm256_mask_i32gather_pd(__m256d a, const double *m, __m128i i,
+///                                  __m256d mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPD instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [4 x double] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x double] containing the gathered values.
 #define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
                                          (double const *)(m), \
                                          (__v4si)(__m128i)(i), \
                                          (__v4df)(__m256d)(mask), (s)))
 
+/// Conditionally gathers two 64-bit floating-point values, either from the
+///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
+///    of [2 x double] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*64
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128d _mm_mask_i64gather_pd(__m128d a, const double *m, __m128i i,
+///                               __m128d mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPD instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [2 x double] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x double] containing the gathered values.
 #define _mm_mask_i64gather_pd(a, m, i, mask, s) \
   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
                                       (double const *)(m), \
                                       (__v2di)(__m128i)(i), \
                                       (__v2df)(__m128d)(mask), (s)))
 
+/// Conditionally gathers four 64-bit floating-point values, either from the
+///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
+///    of [4 x double] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*64
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256d _mm256_mask_i64gather_pd(__m256d a, const double *m, __m256i i,
+///                                  __m256d mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPD instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x double] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [4 x double] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x double] containing the gathered values.
 #define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
                                          (double const *)(m), \
                                          (__v4di)(__m256i)(i), \
                                          (__v4df)(__m256d)(mask), (s)))
 
+/// Conditionally gathers four 32-bit floating-point values, either from the
+///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
+///    of [4 x float] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*32
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm_mask_i32gather_ps(__m128 a, const float *m, __m128i i,
+///                              __m128 mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x float] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm_mask_i32gather_ps(a, m, i, mask, s) \
   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
                                      (float const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v4sf)(__m128)(mask), (s)))
 
+/// Conditionally gathers eight 32-bit floating-point values, either from the
+///    256-bit vector of [8 x float] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
+///    of [8 x float] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 7
+///   j := element*32
+///   k := element*32
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256 _mm256_mask_i32gather_ps(__m256 a, const float *m, __m256i i,
+///                                 __m256 mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPS instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x float] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [8 x float] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [8 x float] containing the gathered values.
 #define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
                                         (float const *)(m), \
                                         (__v8si)(__m256i)(i), \
                                         (__v8sf)(__m256)(mask), (s)))
 
+/// Conditionally gathers two 32-bit floating-point values, either from the
+///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
+///    of [4 x float] in \a mask determines the source for the lower two
+///    elements. The upper two elements of the result are zeroed.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*32
+///   k := element*64
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// result[127:64] := 0
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm_mask_i64gather_ps(__m128 a, const float *m, __m128i i,
+///                              __m128 mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float] used as the source when a mask bit is
+///    zero. Only the first two elements are used.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x float] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory. Only the first
+///    two elements are used.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm_mask_i64gather_ps(a, m, i, mask, s) \
   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
                                      (float const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v4sf)(__m128)(mask), (s)))
 
+/// Conditionally gathers four 32-bit floating-point values, either from the
+///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
+///    of [4 x float] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*64
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm256_mask_i64gather_ps(__m128 a, const float *m, __m256i i,
+///                                 __m128 mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPS instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x float] used as the source when a mask bit is
+///   zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x float] containing the mask. The most
+///    significant bit of each element in the mask vector represents the mask
+///    bits. If a mask bit is zero, the corresponding value from vector \a a
+///    is gathered; otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
                                         (float const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4sf)(__m128)(mask), (s)))
 
+/// Conditionally gathers four 32-bit integer values, either from the
+///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
+///    of [4 x i32] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*32
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_mask_i32gather_epi32(__m128i a, const int *m, __m128i i,
+///                                  __m128i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDD instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x i32] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
   ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
                                      (int const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v4si)(__m128i)(mask), (s)))
 
+/// Conditionally gathers eight 32-bit integer values, either from the
+///    256-bit vector of [8 x i32] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [8 x i32] in \a i. The 256-bit vector
+///    of [8 x i32] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 7
+///   j := element*32
+///   k := element*32
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_mask_i32gather_epi32(__m256i a, const int *m, __m256i i,
+///                                     __m256i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDD instruction.
+///
+/// \param a
+///    A 256-bit vector of [8 x i32] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [8 x i32] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
 #define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
                                         (int const *)(m), \
                                         (__v8si)(__m256i)(i), \
                                         (__v8si)(__m256i)(mask), (s)))
 
+/// Conditionally gathers two 32-bit integer values, either from the
+///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
+///    of [4 x i32] in \a mask determines the source for the lower two
+///    elements. The upper two elements of the result are zeroed.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*32
+///   k := element*64
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// result[127:64] := 0
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_mask_i64gather_epi32(__m128i a, const int *m, __m128i i,
+///                                  __m128i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQD instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
+///   zero. Only the first two elements are used.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x i32] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory. Only the first two elements
+///    are used.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
   ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
                                      (int const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v4si)(__m128i)(mask), (s)))
 
+/// Conditionally gathers four 32-bit integer values, either from the
+///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i. The 128-bit vector
+///    of [4 x i32] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*64
+///   IF mask[j+31] == 0
+///     result[j+31:j] := a[j+31:j]
+///   ELSE
+///     result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm256_mask_i64gather_epi32(__m128i a, const int *m, __m256i i,
+///                                     __m128i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQD instruction.
+///
+/// \param a
+///    A 128-bit vector of [4 x i32] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [4 x i32] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
                                         (int const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4si)(__m128i)(mask), (s)))
 
+/// Conditionally gathers two 64-bit integer values, either from the
+///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 128-bit vector
+///    of [2 x i64] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*32
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_mask_i32gather_epi64(__m128i a, const long long *m, __m128i i,
+///                                  __m128i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
+///    the first two elements are used.
+/// \param mask
+///    A 128-bit vector of [2 x i64] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
 #define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
   ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
                                      (long long const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2di)(__m128i)(mask), (s)))
 
+/// Conditionally gathers four 64-bit integer values, either from the
+///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i. The 256-bit vector
+///    of [4 x i64] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*32
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_mask_i32gather_epi64(__m256i a, const long long *m,
+///                                     __m128i i, __m256i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [4 x i64] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
 #define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
                                         (long long const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4di)(__m256i)(mask), (s)))
 
+/// Conditionally gathers two 64-bit integer values, either from the
+///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i. The 128-bit vector
+///    of [2 x i64] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*64
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_mask_i64gather_epi64(__m128i a, const long long *m, __m128i i,
+///                                  __m128i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x i64] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 128-bit vector of [2 x i64] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
 #define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
   ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
                                      (long long const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2di)(__m128i)(mask), (s)))
 
+/// Conditionally gathers four 64-bit integer values, either from the
+///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i. The 256-bit vector
+///    of [4 x i64] in \a mask determines the source for each element.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*64
+///   IF mask[j+63] == 0
+///     result[j+63:j] := a[j+63:j]
+///   ELSE
+///     result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+///   FI
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_mask_i64gather_epi64(__m256i a, const long long *m,
+///                                     __m256i i, __m256i mask, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
+///
+/// \param a
+///    A 256-bit vector of [4 x i64] used as the source when a mask bit is
+///    zero.
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param mask
+///    A 256-bit vector of [4 x i64] containing the mask. The most significant
+///    bit of each element in the mask vector represents the mask bits. If a
+///    mask bit is zero, the corresponding value from vector \a a is gathered;
+///    otherwise the value is loaded from memory.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
 #define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
                                         (long long const *)(m), \
                                         (__v4di)(__m256i)(i), \
                                         (__v4di)(__m256i)(mask), (s)))
 
+/// Gathers two 64-bit floating-point values from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*32
+///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128d _mm_i32gather_pd(const double *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
+///    the first two elements are used.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x double] containing the gathered values.
 #define _mm_i32gather_pd(m, i, s) \
   ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
                                       (double const *)(m), \
@@ -1039,6 +1747,33 @@
                                                            _mm_setzero_pd()), \
                                       (s)))
 
+/// Gathers four 64-bit floating-point values from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*32
+///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256d _mm256_i32gather_pd(const double *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x double] containing the gathered values.
 #define _mm256_i32gather_pd(m, i, s) \
   ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
                                          (double const *)(m), \
@@ -1048,6 +1783,33 @@
                                                                _CMP_EQ_OQ), \
                                          (s)))
 
+/// Gathers two 64-bit floating-point values from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*64
+///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128d _mm_i64gather_pd(const double *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x double] containing the gathered values.
 #define _mm_i64gather_pd(m, i, s) \
   ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
                                       (double const *)(m), \
@@ -1056,6 +1818,33 @@
                                                            _mm_setzero_pd()), \
                                       (s)))
 
+/// Gathers four 64-bit floating-point values from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*64
+///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256d _mm256_i64gather_pd(const double *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x double] containing the gathered values.
 #define _mm256_i64gather_pd(m, i, s) \
   ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
                                          (double const *)(m), \
@@ -1065,6 +1854,33 @@
                                                                _CMP_EQ_OQ), \
                                          (s)))
 
+/// Gathers four 32-bit floating-point values from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*32
+///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm_i32gather_ps(const float *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPS instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm_i32gather_ps(m, i, s) \
   ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
                                      (float const *)(m), \
@@ -1073,6 +1889,33 @@
                                                           _mm_setzero_ps()), \
                                      (s)))
 
+/// Gathers eight 32-bit floating-point values from memory \a m using scaled
+///    indexes from the 256-bit vector of [8 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 7
+///   j := element*32
+///   k := element*32
+///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256 _mm256_i32gather_ps(const float *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERDPS instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [8 x float] containing the gathered values.
 #define _mm256_i32gather_ps(m, i, s) \
   ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
                                         (float const *)(m), \
@@ -1082,6 +1925,35 @@
                                                               _CMP_EQ_OQ), \
                                         (s)))
 
+/// Gathers two 32-bit floating-point values from memory \a m using scaled
+///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
+///    elements of the result are zeroed.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*32
+///   k := element*64
+///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// result[127:64] := 0
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm_i64gather_ps(const float *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPS instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm_i64gather_ps(m, i, s) \
   ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
                                      (float const *)(m), \
@@ -1090,6 +1962,33 @@
                                                           _mm_setzero_ps()), \
                                      (s)))
 
+/// Gathers four 32-bit floating-point values from memory \a m using scaled
+///    indexes from the 256-bit vector of [4 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*64
+///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128 _mm256_i64gather_ps(const float *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VGATHERQPS instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x float] containing the gathered values.
 #define _mm256_i64gather_ps(m, i, s) \
   ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
                                         (float const *)(m), \
@@ -1098,44 +1997,263 @@
                                                              _mm_setzero_ps()), \
                                         (s)))
 
+/// Gathers four 32-bit integer values from memory \a m using scaled
+///    indexes from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*32
+///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_i32gather_epi32(const int *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm_i32gather_epi32(m, i, s) \
   ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
                                      (int const *)(m), (__v4si)(__m128i)(i), \
                                      (__v4si)_mm_set1_epi32(-1), (s)))
 
+/// Gathers eight 32-bit integer values from memory \a m using scaled
+///    indexes from the 256-bit vector of [8 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 7
+///   j := element*32
+///   k := element*32
+///   result[j+31:j] := Load32(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_i32gather_epi32(const int *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [8 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [8 x i32] containing the gathered values.
 #define _mm256_i32gather_epi32(m, i, s) \
   ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
                                         (int const *)(m), (__v8si)(__m256i)(i), \
                                         (__v8si)_mm256_set1_epi32(-1), (s)))
 
+/// Gathers two 32-bit integer values from memory \a m using scaled indexes
+///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
+///    of the result are zeroed.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*32
+///   k := element*64
+///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// result[127:64] := 0
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_i64gather_epi32(const int *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm_i64gather_epi32(m, i, s) \
   ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
                                      (int const *)(m), (__v2di)(__m128i)(i), \
                                      (__v4si)_mm_set1_epi32(-1), (s)))
 
+/// Gathers four 32-bit integer values from memory \a m using scaled indexes
+///    from the 256-bit vector of [4 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*32
+///   k := element*64
+///   result[j+31:j] := Load32(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm256_i64gather_epi32(const int *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQD instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [4 x i32] containing the gathered values.
 #define _mm256_i64gather_epi32(m, i, s) \
   ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
                                         (int const *)(m), (__v4di)(__m256i)(i), \
                                         (__v4si)_mm_set1_epi32(-1), (s)))
 
+/// Gathers two 64-bit integer values from memory \a m using scaled indexes
+///    from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*32
+///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_i32gather_epi64(const long long *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m. Only
+///    the first two elements are used.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
 #define _mm_i32gather_epi64(m, i, s) \
   ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
                                      (long long const *)(m), \
                                      (__v4si)(__m128i)(i), \
                                      (__v2di)_mm_set1_epi64x(-1), (s)))
 
+/// Gathers four 64-bit integer values from memory \a m using scaled indexes
+///    from the 128-bit vector of [4 x i32] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*32
+///   result[j+63:j] := Load64(m + SignExtend(i[k+31:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_i32gather_epi64(const long long *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERDQ instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [4 x i32] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
 #define _mm256_i32gather_epi64(m, i, s) \
   ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
                                         (long long const *)(m), \
                                         (__v4si)(__m128i)(i), \
                                         (__v4di)_mm256_set1_epi64x(-1), (s)))
 
+/// Gathers two 64-bit integer values from memory \a m using scaled indexes
+///    from the 128-bit vector of [2 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 1
+///   j := element*64
+///   k := element*64
+///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i _mm_i64gather_epi64(const long long *m, __m128i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 128-bit vector of [2 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 128-bit vector of [2 x i64] containing the gathered values.
 #define _mm_i64gather_epi64(m, i, s) \
   ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
                                      (long long const *)(m), \
                                      (__v2di)(__m128i)(i), \
                                      (__v2di)_mm_set1_epi64x(-1), (s)))
 
+/// Gathers four 64-bit integer values from memory \a m using scaled indexes
+///    from the 256-bit vector of [4 x i64] in \a i.
+///
+/// \code{.operation}
+/// FOR element := 0 to 3
+///   j := element*64
+///   k := element*64
+///   result[j+63:j] := Load64(m + SignExtend(i[k+63:k])*s)
+/// ENDFOR
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i _mm256_i64gather_epi64(const long long *m, __m256i i, const int s);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPGATHERQQ instruction.
+///
+/// \param m
+///    A pointer to the memory used for loading values.
+/// \param i
+///    A 256-bit vector of [4 x i64] containing signed indexes into \a m.
+/// \param s
+///    A literal constant scale factor for the indexes in \a i. Must be
+///    1, 2, 4, or 8.
+/// \returns A 256-bit vector of [4 x i64] containing the gathered values.
 #define _mm256_i64gather_epi64(m, i, s) \
   ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
                                         (long long const *)(m), \
diff --git a/linux-x86/lib64/clang/17/include/avx512fintrin.h b/linux-x86/lib64/clang/17/include/avx512fintrin.h
index b19d2fb..88a8ceb 100644
--- a/linux-x86/lib64/clang/17/include/avx512fintrin.h
+++ b/linux-x86/lib64/clang/17/include/avx512fintrin.h
@@ -397,14 +397,15 @@
 static __inline __m512d __DEFAULT_FN_ATTRS512
 _mm512_castpd256_pd512(__m256d __a)
 {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
+                                 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS512
 _mm512_castps256_ps512(__m256 __a)
 {
-  return __builtin_shufflevector(__a, __a, 0,  1,  2,  3,  4,  5,  6,  7,
-                                          -1, -1, -1, -1, -1, -1, -1, -1);
+  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
+                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 }
 
 static __inline __m128d __DEFAULT_FN_ATTRS512
@@ -446,7 +447,10 @@
 static __inline__ __m512d __DEFAULT_FN_ATTRS512
 _mm512_castpd128_pd512 (__m128d __A)
 {
-  return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+  __m256d __B = __builtin_nondeterministic_value(__B);
+  return __builtin_shufflevector(
+      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
+      __B, 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline __m512d __DEFAULT_FN_ATTRS512
@@ -464,19 +468,25 @@
 static __inline__ __m512 __DEFAULT_FN_ATTRS512
 _mm512_castps128_ps512 (__m128 __A)
 {
-    return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __m256 __B = __builtin_nondeterministic_value(__B);
+  return __builtin_shufflevector(
+      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7),
+      __B, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_castsi128_si512 (__m128i __A)
 {
-   return  __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+  __m256i __B = __builtin_nondeterministic_value(__B);
+  return __builtin_shufflevector(
+      __builtin_shufflevector(__A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3),
+      __B, 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS512
 _mm512_castsi256_si512 (__m256i __A)
 {
-   return  __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
+   return  __builtin_shufflevector( __A, __builtin_nondeterministic_value(__A), 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 static __inline __m512 __DEFAULT_FN_ATTRS512
diff --git a/linux-x86/lib64/clang/17/include/avx512fp16intrin.h b/linux-x86/lib64/clang/17/include/avx512fp16intrin.h
index 5cdc37f..d326586 100644
--- a/linux-x86/lib64/clang/17/include/avx512fp16intrin.h
+++ b/linux-x86/lib64/clang/17/include/avx512fp16intrin.h
@@ -192,22 +192,26 @@
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
 _mm256_castph128_ph256(__m128h __a) {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
-                                 -1, -1, -1, -1, -1);
+  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
+                                  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
 }
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
 _mm512_castph128_ph512(__m128h __a) {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
-                                 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-                                 -1, -1, -1, -1, -1, -1, -1, -1, -1);
+  __m256h __b = __builtin_nondeterministic_value(__b);
+  return __builtin_shufflevector(
+      __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a),
+                              0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15),
+      __b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+      20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
 }
 
 static __inline__ __m512h __DEFAULT_FN_ATTRS512
 _mm512_castph256_ph512(__m256h __a) {
-  return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
-                                 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
-                                 -1, -1, -1, -1, -1, -1, -1, -1);
+  return __builtin_shufflevector(__a, __builtin_nondeterministic_value(__a), 0,
+                                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+                                 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
+                                 27, 28, 29, 30, 31);
 }
 
 /// Constructs a 256-bit floating-point vector of [16 x half] from a
diff --git a/linux-x86/lib64/clang/17/include/avxintrin.h b/linux-x86/lib64/clang/17/include/avxintrin.h
index ee31569..bd11922 100644
--- a/linux-x86/lib64/clang/17/include/avxintrin.h
+++ b/linux-x86/lib64/clang/17/include/avxintrin.h
@@ -4499,7 +4499,8 @@
 static __inline __m256d __DEFAULT_FN_ATTRS
 _mm256_castpd128_pd256(__m128d __a)
 {
-  return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
+  return __builtin_shufflevector(
+      (__v2df)__a, (__v2df)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
 }
 
 /// Constructs a 256-bit floating-point vector of [8 x float] from a
@@ -4520,7 +4521,9 @@
 static __inline __m256 __DEFAULT_FN_ATTRS
 _mm256_castps128_ps256(__m128 __a)
 {
-  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
+  return __builtin_shufflevector((__v4sf)__a,
+                                 (__v4sf)__builtin_nondeterministic_value(__a),
+                                 0, 1, 2, 3, 4, 5, 6, 7);
 }
 
 /// Constructs a 256-bit integer vector from a 128-bit integer vector.
@@ -4539,7 +4542,8 @@
 static __inline __m256i __DEFAULT_FN_ATTRS
 _mm256_castsi128_si256(__m128i __a)
 {
-  return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
+  return __builtin_shufflevector(
+      (__v2di)__a, (__v2di)__builtin_nondeterministic_value(__a), 0, 1, 2, 3);
 }
 
 /// Constructs a 256-bit floating-point vector of [4 x double] from a
diff --git a/linux-x86/lib64/clang/17/include/bits/stdatomic.h b/linux-x86/lib64/clang/17/include/bits/stdatomic.h
index 2ce6ee6..fe3d68d 100644
--- a/linux-x86/lib64/clang/17/include/bits/stdatomic.h
+++ b/linux-x86/lib64/clang/17/include/bits/stdatomic.h
@@ -269,18 +269,18 @@
 
 #define	ATOMIC_FLAG_INIT		{ ATOMIC_VAR_INIT(false) }
 
-static __inline bool atomic_flag_test_and_set_explicit(volatile atomic_flag *__object, memory_order __order) {
+static __inline bool atomic_flag_test_and_set_explicit(volatile atomic_flag * _Nonnull __object, memory_order __order) {
 	return (atomic_exchange_explicit(&__object->__flag, 1, __order));
 }
 
-static __inline void atomic_flag_clear_explicit(volatile atomic_flag *__object, memory_order __order) {
+static __inline void atomic_flag_clear_explicit(volatile atomic_flag * _Nonnull __object, memory_order __order) {
 	atomic_store_explicit(&__object->__flag, 0, __order);
 }
 
-static __inline bool atomic_flag_test_and_set(volatile atomic_flag *__object) {
+static __inline bool atomic_flag_test_and_set(volatile atomic_flag * _Nonnull __object) {
 	return (atomic_flag_test_and_set_explicit(__object, memory_order_seq_cst));
 }
 
-static __inline void atomic_flag_clear(volatile atomic_flag *__object) {
+static __inline void atomic_flag_clear(volatile atomic_flag * _Nonnull __object) {
 	atomic_flag_clear_explicit(__object, memory_order_seq_cst);
 }
diff --git a/linux-x86/lib64/clang/17/include/cuda_wrappers/shared_ptr_base.h b/linux-x86/lib64/clang/17/include/cuda_wrappers/shared_ptr_base.h
new file mode 100644
index 0000000..10028dd
--- /dev/null
+++ b/linux-x86/lib64/clang/17/include/cuda_wrappers/shared_ptr_base.h
@@ -0,0 +1,9 @@
+// CUDA headers define __noinline__ which interferes with libstdc++'s use of
+// `__attribute((__noinline__))`. In order to avoid compilation error,
+// temporarily unset __noinline__ when we include affected libstdc++ header.
+
+#pragma push_macro("__noinline__")
+#undef __noinline__
+#include_next "bits/shared_ptr_base.h"
+
+#pragma pop_macro("__noinline__")
diff --git a/linux-x86/lib64/clang/17/include/fmaintrin.h b/linux-x86/lib64/clang/17/include/fmaintrin.h
index d889b7c..ea832fa 100644
--- a/linux-x86/lib64/clang/17/include/fmaintrin.h
+++ b/linux-x86/lib64/clang/17/include/fmaintrin.h
@@ -18,192 +18,756 @@
 #define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
 #define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
 
+/// Computes a multiply-add of 128-bit vectors of [4 x float].
+///    For each element, computes <c> (__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend.
+/// \returns A 128-bit vector of [4 x float] containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
+/// Computes a multiply-add of 128-bit vectors of [2 x double].
+///    For each element, computes <c> (__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend.
+/// \returns A 128-bit [2 x double] vector containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
+/// Computes a scalar multiply-add of the single-precision values in the
+///    low 32 bits of 128-bit vectors of [4 x float].
+/// \code
+/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
+/// result[127:32] = __A[127:32]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213SS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand in the low
+///    32 bits.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier in the low
+///    32 bits.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend in the low
+///    32 bits.
+/// \returns A 128-bit vector of [4 x float] containing the result in the low
+///    32 bits and a copy of \a __A[127:32] in the upper 96 bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
+/// Computes a scalar multiply-add of the double-precision values in the
+///    low 64 bits of 128-bit vectors of [2 x double].
+/// \code
+/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
+/// result[127:64] = __A[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213SD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand in the low
+///    64 bits.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier in the low
+///    64 bits.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend in the low
+///    64 bits.
+/// \returns A 128-bit vector of [2 x double] containing the result in the low
+///    64 bits and a copy of \a __A[127:64] in the upper 64 bits.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
+/// Computes a multiply-subtract of 128-bit vectors of [4 x float].
+///    For each element, computes <c> (__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
+/// Computes a multiply-subtract of 128-bit vectors of [2 x double].
+///    For each element, computes <c> (__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
+/// Computes a scalar multiply-subtract of the single-precision values in
+///    the low 32 bits of 128-bit vectors of [4 x float].
+/// \code
+/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
+/// result[127:32] = __A[127:32]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213SS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand in the low
+///    32 bits.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier in the low
+///    32 bits.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the subtrahend in the low
+///    32 bits.
+/// \returns A 128-bit vector of [4 x float] containing the result in the low
+///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
+/// Computes a scalar multiply-subtract of the double-precision values in
+///    the low 64 bits of 128-bit vectors of [2 x double].
+/// \code
+/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
+/// result[127:64] = __A[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213SD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand in the low
+///    64 bits.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier in the low
+///    64 bits.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the subtrahend in the low
+///    64 bits.
+/// \returns A 128-bit vector of [2 x double] containing the result in the low
+///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
+/// Computes a negated multiply-add of 128-bit vectors of [4 x float].
+///    For each element, computes <c> -(__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend.
+/// \returns A 128-bit [4 x float] vector containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
+/// Computes a negated multiply-add of 128-bit vectors of [2 x double].
+///    For each element, computes <c> -(__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend.
+/// \returns A 128-bit vector of [2 x double] containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
+/// Computes a scalar negated multiply-add of the single-precision values in
+///    the low 32 bits of 128-bit vectors of [4 x float].
+/// \code
+/// result[31:0] = -(__A[31:0] * __B[31:0]) + __C[31:0]
+/// result[127:32] = __A[127:32]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213SS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand in the low
+///    32 bits.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier in the low
+///    32 bits.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend in the low
+///    32 bits.
+/// \returns A 128-bit vector of [4 x float] containing the result in the low
+///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
 }
 
+/// Computes a scalar negated multiply-add of the double-precision values
+///    in the low 64 bits of 128-bit vectors of [2 x double].
+/// \code
+/// result[63:0] = -(__A[63:0] * __B[63:0]) + __C[63:0]
+/// result[127:64] = __A[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213SD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand in the low
+///    64 bits.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier in the low
+///    64 bits.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend in the low
+///    64 bits.
+/// \returns A 128-bit vector of [2 x double] containing the result in the low
+///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
 }
 
+/// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
+///    For each element, computes <c> -(__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
+/// Computes a negated multiply-subtract of 128-bit vectors of [2 x double].
+///    For each element, computes <c> -(__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
+/// Computes a scalar negated multiply-subtract of the single-precision
+///    values in the low 32 bits of 128-bit vectors of [4 x float].
+/// \code
+/// result[31:0] = -(__A[31:0] * __B[31:0]) - __C[31:0]
+/// result[127:32] = __A[127:32]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213SS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand in the low
+///    32 bits.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier in the low
+///    32 bits.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the subtrahend in the low
+///    32 bits.
+/// \returns A 128-bit vector of [4 x float] containing the result in the low
+///    32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
 }
 
+/// Computes a scalar negated multiply-subtract of the double-precision
+///    values in the low 64 bits of 128-bit vectors of [2 x double].
+/// \code
+/// result[63:0] = -(__A[63:0] * __B[63:0]) - __C[63:0]
+/// result[127:64] = __A[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213SD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand in the low
+///    64 bits.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier in the low
+///    64 bits.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the subtrahend in the low
+///    64 bits.
+/// \returns A 128-bit vector of [2 x double] containing the result in the low
+///    64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 128-bit vectors of
+///    [4 x float].
+/// \code
+/// result[31:0]  = (__A[31:0] * __B[31:0]) - __C[31:0]
+/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
+/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
+/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 128-bit vectors of
+///    [2 x double].
+/// \code
+/// result[63:0]  = (__A[63:0] * __B[63:0]) - __C[63:0]
+/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 128-bit vectors of
+///    [4 x float].
+/// \code
+/// result[31:0]  = (__A[31:0] * __B[31:0]) + __C[31:0]
+/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
+/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
+/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
+///
+/// \param __A
+///    A 128-bit vector of [4 x float] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [4 x float] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [4 x float] containing the addend/subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the result.
 static __inline__ __m128 __DEFAULT_FN_ATTRS128
 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
 {
   return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 128-bit vectors of
+///    [2 x double].
+/// \code
+/// result[63:0]  = (__A[63:0] * __B[63:0]) + __C[63:0]
+/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
+///
+/// \param __A
+///    A 128-bit vector of [2 x double] containing the multiplicand.
+/// \param __B
+///    A 128-bit vector of [2 x double] containing the multiplier.
+/// \param __C
+///    A 128-bit vector of [2 x double] containing the addend/subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the result.
 static __inline__ __m128d __DEFAULT_FN_ATTRS128
 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
 {
   return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
 }
 
+/// Computes a multiply-add of 256-bit vectors of [8 x float].
+///    For each element, computes <c> (__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the addend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
+/// Computes a multiply-add of 256-bit vectors of [4 x double].
+///    For each element, computes <c> (__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADD213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the addend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
+/// Computes a multiply-subtract of 256-bit vectors of [8 x float].
+///    For each element, computes <c> (__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
+/// Computes a multiply-subtract of 256-bit vectors of [4 x double].
+///    For each element, computes <c> (__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUB213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }
 
+/// Computes a negated multiply-add of 256-bit vectors of [8 x float].
+///    For each element, computes <c> -(__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the addend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
+/// Computes a negated multiply-add of 256-bit vectors of [4 x double].
+///    For each element, computes <c> -(__A * __B) + __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMADD213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the addend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
+/// Computes a negated multiply-subtract of 256-bit vectors of [8 x float].
+///    For each element, computes <c> -(__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
+/// Computes a negated multiply-subtract of 256-bit vectors of [4 x double].
+///    For each element, computes <c> -(__A * __B) - __C </c>.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFNMSUB213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 256-bit vectors of
+///    [8 x float].
+/// \code
+/// result[31:0] = (__A[31:0] * __B[31:0]) - __C[31:0]
+/// result[63:32] = (__A[63:32] * __B[63:32]) + __C[63:32]
+/// result[95:64] = (__A[95:64] * __B[95:64]) - __C[95:64]
+/// result[127:96] = (__A[127:96] * __B[127:96]) + __C[127:96]
+/// result[159:128] = (__A[159:128] * __B[159:128]) - __C[159:128]
+/// result[191:160] = (__A[191:160] * __B[191:160]) + __C[191:160]
+/// result[223:192] = (__A[223:192] * __B[223:192]) - __C[223:192]
+/// result[255:224] = (__A[255:224] * __B[255:224]) + __C[255:224]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADDSUB213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
 }
 
+/// Computes a multiply with alternating add/subtract of 256-bit vectors of
+///    [4 x double].
+/// \code
+/// result[63:0] = (__A[63:0] * __B[63:0]) - __C[63:0]
+/// result[127:64] = (__A[127:64] * __B[127:64]) + __C[127:64]
+/// result[191:128] = (__A[191:128] * __B[191:128]) - __C[191:128]
+/// result[255:192] = (__A[255:192] * __B[255:192]) + __C[255:192]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMADDSUB213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
 {
   return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
 }
 
+/// Computes a vector multiply with alternating add/subtract of 256-bit
+///    vectors of [8 x float].
+/// \code
+/// result[31:0] = (__A[31:0] * __B[31:0]) + __C[31:0]
+/// result[63:32] = (__A[63:32] * __B[63:32]) - __C[63:32]
+/// result[95:64] = (__A[95:64] * __B[95:64]) + __C[95:64]
+/// result[127:96] = (__A[127:96] * __B[127:96]) - __C[127:96]
+/// result[159:128] = (__A[159:128] * __B[159:128]) + __C[159:128]
+/// result[191:160] = (__A[191:160] * __B[191:160]) - __C[191:160]
+/// result[223:192] = (__A[223:192] * __B[223:192]) + __C[223:192]
+/// result[255:224] = (__A[255:224] * __B[255:224]) - __C[255:224]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUBADD213PS instruction.
+///
+/// \param __A
+///    A 256-bit vector of [8 x float] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [8 x float] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [8 x float] containing the addend/subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
 {
   return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
 }
 
+/// Computes a vector multiply with alternating add/subtract of 256-bit
+///    vectors of [4 x double].
+/// \code
+/// result[63:0] = (__A[63:0] * __B[63:0]) + __C[63:0]
+/// result[127:64] = (__A[127:64] * __B[127:64]) - __C[127:64]
+/// result[191:128] = (__A[191:128] * __B[191:128]) + __C[191:128]
+/// result[255:192] = (__A[255:192] * __B[255:192]) - __C[255:192]
+/// \endcode
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c VFMSUBADD213PD instruction.
+///
+/// \param __A
+///    A 256-bit vector of [4 x double] containing the multiplicand.
+/// \param __B
+///    A 256-bit vector of [4 x double] containing the multiplier.
+/// \param __C
+///    A 256-bit vector of [4 x double] containing the addend/subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the result.
 static __inline__ __m256d __DEFAULT_FN_ATTRS256
 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
 {
diff --git a/linux-x86/lib64/clang/17/include/immintrin.h b/linux-x86/lib64/clang/17/include/immintrin.h
index 6967b46..c5f84ae 100644
--- a/linux-x86/lib64/clang/17/include/immintrin.h
+++ b/linux-x86/lib64/clang/17/include/immintrin.h
@@ -284,30 +284,53 @@
 
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__RDRND__)
+/// Returns a 16-bit hardware-generated random value.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
+///
+/// \param __p
+///    A pointer to a 16-bit memory location to place the random value.
+/// \returns 1 if the value was successfully generated, 0 otherwise.
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand16_step(unsigned short *__p)
 {
   return (int)__builtin_ia32_rdrand16_step(__p);
 }
 
+/// Returns a 32-bit hardware-generated random value.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
+///
+/// \param __p
+///    A pointer to a 32-bit memory location to place the random value.
+/// \returns 1 if the value was successfully generated, 0 otherwise.
 static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
 _rdrand32_step(unsigned int *__p)
 {
   return (int)__builtin_ia32_rdrand32_step(__p);
 }
 
+/// Returns a 64-bit hardware-generated random value.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDRAND </c> instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location to place the random value.
+/// \returns 1 if the value was successfully generated, 0 otherwise.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
 #ifdef __x86_64__
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand64_step(unsigned long long *__p)
-{
   return (int)__builtin_ia32_rdrand64_step(__p);
-}
 #else
-// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
-// rdrand instructions.
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand64_step(unsigned long long *__p)
-{
+  // We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+  // rdrand instructions.
   unsigned int __lo, __hi;
   unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
   unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
@@ -318,55 +341,115 @@
     *__p = 0;
     return 0;
   }
-}
 #endif
+}
 #endif /* __RDRND__ */
 
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__FSGSBASE__)
 #ifdef __x86_64__
+/// Reads the FS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
+///
+/// \returns The lower 32 bits of the FS base register.
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readfsbase_u32(void)
 {
   return __builtin_ia32_rdfsbase32();
 }
 
+/// Reads the FS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDFSBASE </c> instruction.
+///
+/// \returns The contents of the FS base register.
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readfsbase_u64(void)
 {
   return __builtin_ia32_rdfsbase64();
 }
 
+/// Reads the GS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
+///
+/// \returns The lower 32 bits of the GS base register.
 static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readgsbase_u32(void)
 {
   return __builtin_ia32_rdgsbase32();
 }
 
+/// Reads the GS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDGSBASE </c> instruction.
+///
+/// \returns The contents of the GS base register.
 static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _readgsbase_u64(void)
 {
   return __builtin_ia32_rdgsbase64();
 }
 
+/// Modifies the FS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
+///
+/// \param __V
+///    Value to use for the lower 32 bits of the FS base register.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writefsbase_u32(unsigned int __V)
 {
   __builtin_ia32_wrfsbase32(__V);
 }
 
+/// Modifies the FS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> WRFSBASE </c> instruction.
+///
+/// \param __V
+///    Value to use for the FS base register.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writefsbase_u64(unsigned long long __V)
 {
   __builtin_ia32_wrfsbase64(__V);
 }
 
+/// Modifies the GS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> WRGSBASE </c> instruction.
+///
+/// \param __V
+///    Value to use for the lower 32 bits of the GS base register.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writegsbase_u32(unsigned int __V)
 {
   __builtin_ia32_wrgsbase32(__V);
 }
 
+/// Modifies the GS base register.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> WRGSBASE </c> instruction.
+///
+/// \param __V
+///    Value to use for the GS base register.
 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
 _writegsbase_u64(unsigned long long __V)
 {
@@ -524,7 +607,7 @@
 #include <invpcidintrin.h>
 #endif
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMXFP16__)
+    defined(__AMX_FP16__)
 #include <amxfp16intrin.h>
 #endif
 
@@ -534,11 +617,16 @@
 #endif
 
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
-    defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__)
+    defined(__AMX_TILE__) || defined(__AMX_INT8__) || defined(__AMX_BF16__)
 #include <amxintrin.h>
 #endif
 
 #if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
+    defined(__AMX_COMPLEX__)
+#include <amxcomplexintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) ||      \
     defined(__AVX512VP2INTERSECT__)
 #include <avx512vp2intersectintrin.h>
 #endif
diff --git a/linux-x86/lib64/clang/17/include/limits.h b/linux-x86/lib64/clang/17/include/limits.h
index 32cc901..354e031 100644
--- a/linux-x86/lib64/clang/17/include/limits.h
+++ b/linux-x86/lib64/clang/17/include/limits.h
@@ -52,7 +52,11 @@
 #define LONG_MIN  (-__LONG_MAX__ -1L)
 
 #define UCHAR_MAX (__SCHAR_MAX__*2  +1)
-#define USHRT_MAX (__SHRT_MAX__ *2  +1)
+#if __SHRT_WIDTH__ < __INT_WIDTH__
+#define USHRT_MAX (__SHRT_MAX__ * 2 + 1)
+#else
+#define USHRT_MAX (__SHRT_MAX__ * 2U + 1U)
+#endif
 #define UINT_MAX  (__INT_MAX__  *2U +1U)
 #define ULONG_MAX (__LONG_MAX__ *2UL+1UL)
 
diff --git a/linux-x86/lib64/clang/17/include/mwaitxintrin.h b/linux-x86/lib64/clang/17/include/mwaitxintrin.h
index ed48538..65f4271 100644
--- a/linux-x86/lib64/clang/17/include/mwaitxintrin.h
+++ b/linux-x86/lib64/clang/17/include/mwaitxintrin.h
@@ -16,12 +16,41 @@
 
 /* Define the default attributes for the functions in this file. */
 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__,  __target__("mwaitx")))
+
+/// Establishes a linear address memory range to be monitored and puts
+///    the processor in the monitor event pending state. Data stored in the
+///    monitored address range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MONITORX instruction.
+///
+/// \param __p
+///    The memory range to be monitored. The size of the range is determined by
+///    CPUID function 0000_0005h.
+/// \param __extensions
+///    Optional extensions for the monitoring state.
+/// \param __hints
+///    Optional hints for the monitoring state.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_monitorx(void * __p, unsigned __extensions, unsigned __hints)
 {
   __builtin_ia32_monitorx(__p, __extensions, __hints);
 }
 
+/// Used with the \c MONITORX instruction to wait while the processor is in
+///    the monitor event pending state. Data stored in the monitored address
+///    range, or an interrupt, causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MWAITX instruction.
+///
+/// \param __extensions
+///    Optional extensions for the monitoring state, which can vary by
+///    processor.
+/// \param __hints
+///    Optional hints for the monitoring state, which can vary by processor.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
 {
diff --git a/linux-x86/lib64/clang/17/include/orc/c_api.h b/linux-x86/lib64/clang/17/include/orc/c_api.h
index 96d01df..628c5cd 100644
--- a/linux-x86/lib64/clang/17/include/orc/c_api.h
+++ b/linux-x86/lib64/clang/17/include/orc/c_api.h
@@ -48,17 +48,17 @@
 typedef union {
   char *ValuePtr;
   char Value[sizeof(char *)];
-} __orc_rt_CWrapperFunctionResultDataUnion;
+} orc_rt_CWrapperFunctionResultDataUnion;
 
 /**
- * __orc_rt_CWrapperFunctionResult is a kind of C-SmallVector with an
+ * orc_rt_CWrapperFunctionResult is a kind of C-SmallVector with an
  * out-of-band error state.
  *
  * If Size == 0 and Data.ValuePtr is non-zero then the value is in the
  * 'out-of-band error' state, and Data.ValuePtr points at a malloc-allocated,
  * null-terminated string error message.
  *
- * If Size <= sizeof(__orc_rt_CWrapperFunctionResultData) then the value is in
+ * If Size <= sizeof(orc_rt_CWrapperFunctionResultData) then the value is in
  * the 'small' state and the content is held in the first Size bytes of
  * Data.Value.
  *
@@ -68,29 +68,29 @@
  * malloc, and will be freed with free when this value is destroyed.
  */
 typedef struct {
-  __orc_rt_CWrapperFunctionResultDataUnion Data;
+  orc_rt_CWrapperFunctionResultDataUnion Data;
   size_t Size;
-} __orc_rt_CWrapperFunctionResult;
+} orc_rt_CWrapperFunctionResult;
 
-typedef struct __orc_rt_CSharedOpaqueJITProcessControl
-    *__orc_rt_SharedJITProcessControlRef;
+typedef struct orc_rt_CSharedOpaqueJITProcessControl
+    *orc_rt_SharedJITProcessControlRef;
 
 /**
- * Zero-initialize an __orc_rt_CWrapperFunctionResult.
+ * Zero-initialize an orc_rt_CWrapperFunctionResult.
  */
 static inline void
-__orc_rt_CWrapperFunctionResultInit(__orc_rt_CWrapperFunctionResult *R) {
+orc_rt_CWrapperFunctionResultInit(orc_rt_CWrapperFunctionResult *R) {
   R->Size = 0;
   R->Data.ValuePtr = 0;
 }
 
 /**
- * Create an __orc_rt_CWrapperFunctionResult with an uninitialized buffer of
+ * Create an orc_rt_CWrapperFunctionResult with an uninitialized buffer of
  * size Size. The buffer is returned via the DataPtr argument.
  */
-static inline __orc_rt_CWrapperFunctionResult
-__orc_rt_CWrapperFunctionResultAllocate(size_t Size) {
-  __orc_rt_CWrapperFunctionResult R;
+static inline orc_rt_CWrapperFunctionResult
+orc_rt_CWrapperFunctionResultAllocate(size_t Size) {
+  orc_rt_CWrapperFunctionResult R;
   R.Size = Size;
   // If Size is 0 ValuePtr must be 0 or it is considered an out-of-band error.
   R.Data.ValuePtr = 0;
@@ -100,11 +100,11 @@
 }
 
 /**
- * Create an __orc_rt_WrapperFunctionResult from the given data range.
+ * Create an orc_rt_WrapperFunctionResult from the given data range.
  */
-static inline __orc_rt_CWrapperFunctionResult
-__orc_rt_CreateCWrapperFunctionResultFromRange(const char *Data, size_t Size) {
-  __orc_rt_CWrapperFunctionResult R;
+static inline orc_rt_CWrapperFunctionResult
+orc_rt_CreateCWrapperFunctionResultFromRange(const char *Data, size_t Size) {
+  orc_rt_CWrapperFunctionResult R;
   R.Size = Size;
   if (R.Size > sizeof(R.Data.Value)) {
     char *Tmp = (char *)malloc(Size);
@@ -116,28 +116,28 @@
 }
 
 /**
- * Create an __orc_rt_CWrapperFunctionResult by copying the given string,
+ * Create an orc_rt_CWrapperFunctionResult by copying the given string,
  * including the null-terminator.
  *
  * This function copies the input string. The client is responsible for freeing
  * the ErrMsg arg.
  */
-static inline __orc_rt_CWrapperFunctionResult
-__orc_rt_CreateCWrapperFunctionResultFromString(const char *Source) {
-  return __orc_rt_CreateCWrapperFunctionResultFromRange(Source,
-                                                        strlen(Source) + 1);
+static inline orc_rt_CWrapperFunctionResult
+orc_rt_CreateCWrapperFunctionResultFromString(const char *Source) {
+  return orc_rt_CreateCWrapperFunctionResultFromRange(Source,
+                                                      strlen(Source) + 1);
 }
 
 /**
- * Create an __orc_rt_CWrapperFunctionResult representing an out-of-band
+ * Create an orc_rt_CWrapperFunctionResult representing an out-of-band
  * error.
  *
  * This function copies the input string. The client is responsible for freeing
  * the ErrMsg arg.
  */
-static inline __orc_rt_CWrapperFunctionResult
-__orc_rt_CreateCWrapperFunctionResultFromOutOfBandError(const char *ErrMsg) {
-  __orc_rt_CWrapperFunctionResult R;
+static inline orc_rt_CWrapperFunctionResult
+orc_rt_CreateCWrapperFunctionResultFromOutOfBandError(const char *ErrMsg) {
+  orc_rt_CWrapperFunctionResult R;
   R.Size = 0;
   char *Tmp = (char *)malloc(strlen(ErrMsg) + 1);
   strcpy(Tmp, ErrMsg);
@@ -146,11 +146,11 @@
 }
 
 /**
- * This should be called to destroy __orc_rt_CWrapperFunctionResult values
+ * This should be called to destroy orc_rt_CWrapperFunctionResult values
  * regardless of their state.
  */
 static inline void
-__orc_rt_DisposeCWrapperFunctionResult(__orc_rt_CWrapperFunctionResult *R) {
+orc_rt_DisposeCWrapperFunctionResult(orc_rt_CWrapperFunctionResult *R) {
   if (R->Size > sizeof(R->Data.Value) ||
       (R->Size == 0 && R->Data.ValuePtr))
     free(R->Data.ValuePtr);
@@ -158,22 +158,22 @@
 
 /**
  * Get a pointer to the data contained in the given
- * __orc_rt_CWrapperFunctionResult.
+ * orc_rt_CWrapperFunctionResult.
  */
 static inline char *
-__orc_rt_CWrapperFunctionResultData(__orc_rt_CWrapperFunctionResult *R) {
+orc_rt_CWrapperFunctionResultData(orc_rt_CWrapperFunctionResult *R) {
   assert((R->Size != 0 || R->Data.ValuePtr == NULL) &&
          "Cannot get data for out-of-band error value");
   return R->Size > sizeof(R->Data.Value) ? R->Data.ValuePtr : R->Data.Value;
 }
 
 /**
- * Safely get the size of the given __orc_rt_CWrapperFunctionResult.
+ * Safely get the size of the given orc_rt_CWrapperFunctionResult.
  *
  * Asserts that we're not trying to access the size of an error value.
  */
 static inline size_t
-__orc_rt_CWrapperFunctionResultSize(const __orc_rt_CWrapperFunctionResult *R) {
+orc_rt_CWrapperFunctionResultSize(const orc_rt_CWrapperFunctionResult *R) {
   assert((R->Size != 0 || R->Data.ValuePtr == NULL) &&
          "Cannot get size for out-of-band error value");
   return R->Size;
@@ -181,22 +181,22 @@
 
 /**
  * Returns 1 if this value is equivalent to a value just initialized by
- * __orc_rt_CWrapperFunctionResultInit, 0 otherwise.
+ * orc_rt_CWrapperFunctionResultInit, 0 otherwise.
  */
 static inline size_t
-__orc_rt_CWrapperFunctionResultEmpty(const __orc_rt_CWrapperFunctionResult *R) {
+orc_rt_CWrapperFunctionResultEmpty(const orc_rt_CWrapperFunctionResult *R) {
   return R->Size == 0 && R->Data.ValuePtr == 0;
 }
 
 /**
  * Returns a pointer to the out-of-band error string for this
- * __orc_rt_CWrapperFunctionResult, or null if there is no error.
+ * orc_rt_CWrapperFunctionResult, or null if there is no error.
  *
- * The __orc_rt_CWrapperFunctionResult retains ownership of the error
+ * The orc_rt_CWrapperFunctionResult retains ownership of the error
  * string, so it should be copied if the caller wishes to preserve it.
  */
-static inline const char *__orc_rt_CWrapperFunctionResultGetOutOfBandError(
-    const __orc_rt_CWrapperFunctionResult *R) {
+static inline const char *orc_rt_CWrapperFunctionResultGetOutOfBandError(
+    const orc_rt_CWrapperFunctionResult *R) {
   return R->Size == 0 ? R->Data.ValuePtr : 0;
 }
 
diff --git a/linux-x86/lib64/clang/17/include/pmmintrin.h b/linux-x86/lib64/clang/17/include/pmmintrin.h
index ee660e9..203c0aa 100644
--- a/linux-x86/lib64/clang/17/include/pmmintrin.h
+++ b/linux-x86/lib64/clang/17/include/pmmintrin.h
@@ -253,9 +253,12 @@
 ///    the processor in the monitor event pending state. Data stored in the
 ///    monitored address range causes the processor to exit the pending state.
 ///
+/// The \c MONITOR instruction can be used in kernel mode, and in other modes
+/// if MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
+///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
+/// This intrinsic corresponds to the \c MONITOR instruction.
 ///
 /// \param __p
 ///    The memory range to be monitored. The size of the range is determined by
@@ -270,19 +273,22 @@
   __builtin_ia32_monitor(__p, __extensions, __hints);
 }
 
-/// Used with the MONITOR instruction to wait while the processor is in
+/// Used with the \c MONITOR instruction to wait while the processor is in
 ///    the monitor event pending state. Data stored in the monitored address
-///    range causes the processor to exit the pending state.
+///    range, or an interrupt, causes the processor to exit the pending state.
+///
+/// The \c MWAIT instruction can be used in kernel mode, and in other modes if
+/// MSR <c> C001_0015h[MonMwaitUserEn] </c> is set.
 ///
 /// \headerfile <x86intrin.h>
 ///
-/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
+/// This intrinsic corresponds to the \c MWAIT instruction.
 ///
 /// \param __extensions
-///    Optional extensions for the monitoring state, which may vary by
+///    Optional extensions for the monitoring state, which can vary by
 ///    processor.
 /// \param __hints
-///    Optional hints for the monitoring state, which may vary by processor.
+///    Optional hints for the monitoring state, which can vary by processor.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mwait(unsigned __extensions, unsigned __hints)
 {
diff --git a/linux-x86/lib64/clang/17/include/ppc_wrappers/emmintrin.h b/linux-x86/lib64/clang/17/include/ppc_wrappers/emmintrin.h
index 0814ea5..fc18ab9 100644
--- a/linux-x86/lib64/clang/17/include/ppc_wrappers/emmintrin.h
+++ b/linux-x86/lib64/clang/17/include/ppc_wrappers/emmintrin.h
@@ -46,6 +46,7 @@
 
 /* SSE2 */
 typedef __vector double __v2df;
+typedef __vector float __v4f;
 typedef __vector long long __v2di;
 typedef __vector unsigned long long __v2du;
 typedef __vector int __v4si;
@@ -951,7 +952,7 @@
     _mm_cvtpi32_pd(__m64 __A) {
   __v4si __temp;
   __v2di __tmp2;
-  __v2df __result;
+  __v4f __result;
 
   __temp = (__v4si)vec_splats(__A);
   __tmp2 = (__v2di)vec_unpackl(__temp);
diff --git a/linux-x86/lib64/clang/17/include/ppc_wrappers/smmintrin.h b/linux-x86/lib64/clang/17/include/ppc_wrappers/smmintrin.h
index 6fe6d2a..349b395 100644
--- a/linux-x86/lib64/clang/17/include/ppc_wrappers/smmintrin.h
+++ b/linux-x86/lib64/clang/17/include/ppc_wrappers/smmintrin.h
@@ -305,9 +305,9 @@
 extern __inline __m128i
     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
-  __v16qi __charmask = vec_splats((signed char)__imm8);
+  __v16qu __charmask = vec_splats((unsigned char)__imm8);
   __charmask = vec_gb(__charmask);
-  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
+  __v8hu __shortmask = (__v8hu)vec_unpackh((__v16qi)__charmask);
 #ifdef __BIG_ENDIAN__
   __shortmask = vec_reve(__shortmask);
 #endif
diff --git a/linux-x86/lib64/clang/17/include/profile/InstrProfData.inc b/linux-x86/lib64/clang/17/include/profile/InstrProfData.inc
index 05419bf..94261f4 100644
--- a/linux-x86/lib64/clang/17/include/profile/InstrProfData.inc
+++ b/linux-x86/lib64/clang/17/include/profile/InstrProfData.inc
@@ -650,7 +650,7 @@
 /* Raw profile format version (start from 1). */
 #define INSTR_PROF_RAW_VERSION 8
 /* Indexed profile format version (start from 1). */
-#define INSTR_PROF_INDEX_VERSION 9
+#define INSTR_PROF_INDEX_VERSION 10
 /* Coverage mapping format version (start from 0). */
 #define INSTR_PROF_COVMAP_VERSION 5
 
@@ -663,6 +663,7 @@
  * The 60th bit indicates single byte coverage instrumentation.
  * The 61st bit indicates function entry instrumentation only.
  * The 62nd bit indicates whether memory profile information is present.
+ * The 63rd bit indicates if this is a temporal profile.
  */
 #define VARIANT_MASKS_ALL 0xff00000000000000ULL
 #define GET_VERSION(V) ((V) & ~VARIANT_MASKS_ALL)
@@ -673,9 +674,11 @@
 #define VARIANT_MASK_BYTE_COVERAGE (0x1ULL << 60)
 #define VARIANT_MASK_FUNCTION_ENTRY_ONLY (0x1ULL << 61)
 #define VARIANT_MASK_MEMPROF (0x1ULL << 62)
+#define VARIANT_MASK_TEMPORAL_PROF (0x1ULL << 63)
 #define INSTR_PROF_RAW_VERSION_VAR __llvm_profile_raw_version
 #define INSTR_PROF_PROFILE_RUNTIME_VAR __llvm_profile_runtime
 #define INSTR_PROF_PROFILE_COUNTER_BIAS_VAR __llvm_profile_counter_bias
+#define INSTR_PROF_PROFILE_SET_TIMESTAMP __llvm_profile_set_timestamp
 
 /* The variable that holds the name of the profile data
  * specified via command line. */
diff --git a/linux-x86/lib64/clang/17/include/riscv_ntlh.h b/linux-x86/lib64/clang/17/include/riscv_ntlh.h
new file mode 100644
index 0000000..9ce1709
--- /dev/null
+++ b/linux-x86/lib64/clang/17/include/riscv_ntlh.h
@@ -0,0 +1,28 @@
+/*===---- riscv_ntlh.h - RISC-V NTLH intrinsics ----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __RISCV_NTLH_H
+#define __RISCV_NTLH_H
+
+#ifndef __riscv_zihintntl
+#error "NTLH intrinsics require the NTLH extension."
+#endif
+
+enum {
+  __RISCV_NTLH_INNERMOST_PRIVATE = 2,
+  __RISCV_NTLH_ALL_PRIVATE,
+  __RISCV_NTLH_INNERMOST_SHARED,
+  __RISCV_NTLH_ALL
+};
+
+#define __riscv_ntl_load(PTR, DOMAIN) __builtin_riscv_ntl_load((PTR), (DOMAIN))
+#define __riscv_ntl_store(PTR, VAL, DOMAIN)                                    \
+  __builtin_riscv_ntl_store((PTR), (VAL), (DOMAIN))
+
+#endif
diff --git a/linux-x86/lib64/clang/17/include/riscv_vector.h b/linux-x86/lib64/clang/17/include/riscv_vector.h
index 2a9598e..5c5480b 100644
--- a/linux-x86/lib64/clang/17/include/riscv_vector.h
+++ b/linux-x86/lib64/clang/17/include/riscv_vector.h
@@ -25,7 +25,7 @@
 #pragma clang riscv intrinsic vector
 
 
-#define vlenb() __builtin_rvv_vlenb()
+#define __riscv_vlenb() __builtin_rvv_vlenb()
 
 enum RVV_CSR {
   RVV_VSTART = 0,
@@ -35,7 +35,7 @@
 };
 
 static __inline__ __attribute__((__always_inline__, __nodebug__))
-unsigned long vread_csr(enum RVV_CSR __csr) {
+unsigned long __riscv_vread_csr(enum RVV_CSR __csr) {
   unsigned long __rv = 0;
   switch (__csr) {
     case RVV_VSTART:
@@ -55,7 +55,7 @@
 }
 
 static __inline__ __attribute__((__always_inline__, __nodebug__))
-void vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
+void __riscv_vwrite_csr(enum RVV_CSR __csr, unsigned long __value) {
   switch (__csr) {
     case RVV_VSTART:
       __asm__ __volatile__ ("csrw\tvstart, %z0" : : "rJ"(__value) : "memory");
@@ -72,62 +72,62 @@
   }
 }
 
-#define vsetvl_e8mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 6)
-#define vsetvl_e8mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 7)
-#define vsetvl_e8m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 0)
-#define vsetvl_e8m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 1)
-#define vsetvl_e8m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 2)
-#define vsetvl_e8m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 3)
+#define __riscv_vsetvl_e8mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 6)
+#define __riscv_vsetvl_e8mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 7)
+#define __riscv_vsetvl_e8m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 0)
+#define __riscv_vsetvl_e8m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 1)
+#define __riscv_vsetvl_e8m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 2)
+#define __riscv_vsetvl_e8m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 3)
 
-#define vsetvl_e16mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 7)
-#define vsetvl_e16m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 0)
-#define vsetvl_e16m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 1)
-#define vsetvl_e16m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 2)
-#define vsetvl_e16m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 3)
+#define __riscv_vsetvl_e16mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 7)
+#define __riscv_vsetvl_e16m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 0)
+#define __riscv_vsetvl_e16m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 1)
+#define __riscv_vsetvl_e16m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 2)
+#define __riscv_vsetvl_e16m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 3)
 
-#define vsetvl_e32m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 0)
-#define vsetvl_e32m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 1)
-#define vsetvl_e32m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 2)
-#define vsetvl_e32m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 3)
+#define __riscv_vsetvl_e32m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 0)
+#define __riscv_vsetvl_e32m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 1)
+#define __riscv_vsetvl_e32m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 2)
+#define __riscv_vsetvl_e32m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 3)
 
 #if __riscv_v_elen >= 64
-#define vsetvl_e8mf8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 5)
-#define vsetvl_e16mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 6)
-#define vsetvl_e32mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 7)
+#define __riscv_vsetvl_e8mf8(avl) __builtin_rvv_vsetvli((size_t)(avl), 0, 5)
+#define __riscv_vsetvl_e16mf4(avl) __builtin_rvv_vsetvli((size_t)(avl), 1, 6)
+#define __riscv_vsetvl_e32mf2(avl) __builtin_rvv_vsetvli((size_t)(avl), 2, 7)
 
-#define vsetvl_e64m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 0)
-#define vsetvl_e64m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 1)
-#define vsetvl_e64m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 2)
-#define vsetvl_e64m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 3)
+#define __riscv_vsetvl_e64m1(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 0)
+#define __riscv_vsetvl_e64m2(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 1)
+#define __riscv_vsetvl_e64m4(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 2)
+#define __riscv_vsetvl_e64m8(avl) __builtin_rvv_vsetvli((size_t)(avl), 3, 3)
 #endif
 
-#define vsetvlmax_e8mf4() __builtin_rvv_vsetvlimax(0, 6)
-#define vsetvlmax_e8mf2() __builtin_rvv_vsetvlimax(0, 7)
-#define vsetvlmax_e8m1() __builtin_rvv_vsetvlimax(0, 0)
-#define vsetvlmax_e8m2() __builtin_rvv_vsetvlimax(0, 1)
-#define vsetvlmax_e8m4() __builtin_rvv_vsetvlimax(0, 2)
-#define vsetvlmax_e8m8() __builtin_rvv_vsetvlimax(0, 3)
+#define __riscv_vsetvlmax_e8mf4() __builtin_rvv_vsetvlimax(0, 6)
+#define __riscv_vsetvlmax_e8mf2() __builtin_rvv_vsetvlimax(0, 7)
+#define __riscv_vsetvlmax_e8m1() __builtin_rvv_vsetvlimax(0, 0)
+#define __riscv_vsetvlmax_e8m2() __builtin_rvv_vsetvlimax(0, 1)
+#define __riscv_vsetvlmax_e8m4() __builtin_rvv_vsetvlimax(0, 2)
+#define __riscv_vsetvlmax_e8m8() __builtin_rvv_vsetvlimax(0, 3)
 
-#define vsetvlmax_e16mf2() __builtin_rvv_vsetvlimax(1, 7)
-#define vsetvlmax_e16m1() __builtin_rvv_vsetvlimax(1, 0)
-#define vsetvlmax_e16m2() __builtin_rvv_vsetvlimax(1, 1)
-#define vsetvlmax_e16m4() __builtin_rvv_vsetvlimax(1, 2)
-#define vsetvlmax_e16m8() __builtin_rvv_vsetvlimax(1, 3)
+#define __riscv_vsetvlmax_e16mf2() __builtin_rvv_vsetvlimax(1, 7)
+#define __riscv_vsetvlmax_e16m1() __builtin_rvv_vsetvlimax(1, 0)
+#define __riscv_vsetvlmax_e16m2() __builtin_rvv_vsetvlimax(1, 1)
+#define __riscv_vsetvlmax_e16m4() __builtin_rvv_vsetvlimax(1, 2)
+#define __riscv_vsetvlmax_e16m8() __builtin_rvv_vsetvlimax(1, 3)
 
-#define vsetvlmax_e32m1() __builtin_rvv_vsetvlimax(2, 0)
-#define vsetvlmax_e32m2() __builtin_rvv_vsetvlimax(2, 1)
-#define vsetvlmax_e32m4() __builtin_rvv_vsetvlimax(2, 2)
-#define vsetvlmax_e32m8() __builtin_rvv_vsetvlimax(2, 3)
+#define __riscv_vsetvlmax_e32m1() __builtin_rvv_vsetvlimax(2, 0)
+#define __riscv_vsetvlmax_e32m2() __builtin_rvv_vsetvlimax(2, 1)
+#define __riscv_vsetvlmax_e32m4() __builtin_rvv_vsetvlimax(2, 2)
+#define __riscv_vsetvlmax_e32m8() __builtin_rvv_vsetvlimax(2, 3)
 
 #if __riscv_v_elen >= 64
-#define vsetvlmax_e8mf8() __builtin_rvv_vsetvlimax(0, 5)
-#define vsetvlmax_e16mf4() __builtin_rvv_vsetvlimax(1, 6)
-#define vsetvlmax_e32mf2() __builtin_rvv_vsetvlimax(2, 7)
+#define __riscv_vsetvlmax_e8mf8() __builtin_rvv_vsetvlimax(0, 5)
+#define __riscv_vsetvlmax_e16mf4() __builtin_rvv_vsetvlimax(1, 6)
+#define __riscv_vsetvlmax_e32mf2() __builtin_rvv_vsetvlimax(2, 7)
 
-#define vsetvlmax_e64m1() __builtin_rvv_vsetvlimax(3, 0)
-#define vsetvlmax_e64m2() __builtin_rvv_vsetvlimax(3, 1)
-#define vsetvlmax_e64m4() __builtin_rvv_vsetvlimax(3, 2)
-#define vsetvlmax_e64m8() __builtin_rvv_vsetvlimax(3, 3)
+#define __riscv_vsetvlmax_e64m1() __builtin_rvv_vsetvlimax(3, 0)
+#define __riscv_vsetvlmax_e64m2() __builtin_rvv_vsetvlimax(3, 1)
+#define __riscv_vsetvlmax_e64m4() __builtin_rvv_vsetvlimax(3, 2)
+#define __riscv_vsetvlmax_e64m8() __builtin_rvv_vsetvlimax(3, 3)
 #endif
 
 typedef __rvv_bool64_t vbool64_t;
@@ -181,28 +181,21 @@
 typedef __rvv_uint64m4_t vuint64m4_t;
 typedef __rvv_int64m8_t vint64m8_t;
 typedef __rvv_uint64m8_t vuint64m8_t;
-#if defined(__riscv_zvfh)
 typedef __rvv_float16mf4_t vfloat16mf4_t;
 typedef __rvv_float16mf2_t vfloat16mf2_t;
 typedef __rvv_float16m1_t vfloat16m1_t;
 typedef __rvv_float16m2_t vfloat16m2_t;
 typedef __rvv_float16m4_t vfloat16m4_t;
 typedef __rvv_float16m8_t vfloat16m8_t;
-#endif
-#if (__riscv_v_elen_fp >= 32)
 typedef __rvv_float32mf2_t vfloat32mf2_t;
 typedef __rvv_float32m1_t vfloat32m1_t;
 typedef __rvv_float32m2_t vfloat32m2_t;
 typedef __rvv_float32m4_t vfloat32m4_t;
 typedef __rvv_float32m8_t vfloat32m8_t;
-#endif
-#if (__riscv_v_elen_fp >= 64)
 typedef __rvv_float64m1_t vfloat64m1_t;
 typedef __rvv_float64m2_t vfloat64m2_t;
 typedef __rvv_float64m4_t vfloat64m4_t;
 typedef __rvv_float64m8_t vfloat64m8_t;
-#endif
-
 #define __riscv_v_intrinsic_overloading 1
 
 #ifdef __cplusplus
diff --git a/linux-x86/lib64/clang/17/include/sanitizer/allocator_interface.h b/linux-x86/lib64/clang/17/include/sanitizer/allocator_interface.h
index 6226135..d0cfce7 100644
--- a/linux-x86/lib64/clang/17/include/sanitizer/allocator_interface.h
+++ b/linux-x86/lib64/clang/17/include/sanitizer/allocator_interface.h
@@ -26,6 +26,10 @@
      is not yet freed. */
   int __sanitizer_get_ownership(const volatile void *p);
 
+  /* If a pointer lies within an allocation, it will return the start address
+     of the allocation. Otherwise, it returns nullptr. */
+  const void *__sanitizer_get_allocated_begin(const void *p);
+
   /* Returns the number of bytes reserved for the pointer p.
      Requires (get_ownership(p) == true) or (p == 0). */
   size_t __sanitizer_get_allocated_size(const volatile void *p);
diff --git a/linux-x86/lib64/clang/17/include/sanitizer/common_interface_defs.h b/linux-x86/lib64/clang/17/include/sanitizer/common_interface_defs.h
index 2f415bd..983df7c 100644
--- a/linux-x86/lib64/clang/17/include/sanitizer/common_interface_defs.h
+++ b/linux-x86/lib64/clang/17/include/sanitizer/common_interface_defs.h
@@ -129,26 +129,23 @@
 /// state <c>mid == end</c>, so that should be the final state when the
 /// container is destroyed or when the container reallocates the storage.
 ///
-/// For ASan, <c><i>beg</i></c> should be 8-aligned and <c><i>end</i></c>
-/// should be either 8-aligned or it should point to the end of a separate
-/// heap-, stack-, or global-allocated buffer. So the following example will
-/// not work:
+/// For ASan, <c><i>beg</i></c> no longer needs to be 8-aligned,
+/// first and last granule may be shared with other objects
+/// and therefore the function can be used for any allocator.
+///
+/// The following example shows how to use the function:
 ///
 /// \code
-///   int64_t x[2]; // 16 bytes, 8-aligned
-///   char *beg = (char *)&x[0];
-///   char *end = beg + 12; // Not 8-aligned, not the end of the buffer
-/// \endcode
-///
-/// The following, however, will work:
-/// \code
-///   int32_t x[3]; // 12 bytes, but 8-aligned under ASan.
+///   int32_t x[3]; // 12 bytes
 ///   char *beg = (char*)&x[0];
-///   char *end = beg + 12; // Not 8-aligned, but is the end of the buffer
+///   char *end = beg + 12;
+///   __sanitizer_annotate_contiguous_container(beg, end, beg, end);
 /// \endcode
 ///
 /// \note  Use this function with caution and do not use for anything other
 /// than vector-like classes.
+/// \note  Unaligned <c><i>beg</i></c> or <c><i>end</i></c> may miss bugs in
+/// these granules.
 ///
 /// \param beg Beginning of memory region.
 /// \param end End of memory region.
diff --git a/linux-x86/lib64/clang/17/include/sanitizer/hwasan_interface.h b/linux-x86/lib64/clang/17/include/sanitizer/hwasan_interface.h
index 14035c0..ee742c7 100644
--- a/linux-x86/lib64/clang/17/include/sanitizer/hwasan_interface.h
+++ b/linux-x86/lib64/clang/17/include/sanitizer/hwasan_interface.h
@@ -1,4 +1,4 @@
-//===-- sanitizer/asan_interface.h ------------------------------*- C++ -*-===//
+//===-- sanitizer/hwasan_interface.h ----------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
diff --git a/linux-x86/lib64/clang/17/include/sanitizer/tsan_interface.h b/linux-x86/lib64/clang/17/include/sanitizer/tsan_interface.h
index 2782e61..58f2513 100644
--- a/linux-x86/lib64/clang/17/include/sanitizer/tsan_interface.h
+++ b/linux-x86/lib64/clang/17/include/sanitizer/tsan_interface.h
@@ -172,6 +172,12 @@
 // Release TSan internal memory in a best-effort manner.
 void __tsan_flush_memory();
 
+// User-provided default TSAN options.
+const char* __tsan_default_options(void);
+
+// User-provided default TSAN suppressions.
+const char* __tsan_default_suppressions(void);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/linux-x86/lib64/clang/17/include/sifive_vector.h b/linux-x86/lib64/clang/17/include/sifive_vector.h
new file mode 100644
index 0000000..42d7224
--- /dev/null
+++ b/linux-x86/lib64/clang/17/include/sifive_vector.h
@@ -0,0 +1,16 @@
+//===----- sifive_vector.h - SiFive Vector definitions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _SIFIVE_VECTOR_H_
+#define _SIFIVE_VECTOR_H_
+
+#include "riscv_vector.h"
+
+#pragma clang riscv intrinsic sifive_vector
+
+#endif //_SIFIVE_VECTOR_H_
diff --git a/linux-x86/lib64/clang/17/include/stdalign.h b/linux-x86/lib64/clang/17/include/stdalign.h
index 6ad25db..8ae6e65 100644
--- a/linux-x86/lib64/clang/17/include/stdalign.h
+++ b/linux-x86/lib64/clang/17/include/stdalign.h
@@ -10,6 +10,10 @@
 #ifndef __STDALIGN_H
 #define __STDALIGN_H
 
+/* FIXME: This is using the placeholder dates Clang produces for these macros
+   in C2x mode; switch to the correct values once they've been published. */
+#if defined(__cplusplus) ||                                                    \
+    (defined(__STDC_VERSION__) && __STDC_VERSION__ < 202000L)
 #ifndef __cplusplus
 #define alignas _Alignas
 #define alignof _Alignof
@@ -17,5 +21,6 @@
 
 #define __alignas_is_defined 1
 #define __alignof_is_defined 1
+#endif /* __STDC_VERSION__ */
 
 #endif /* __STDALIGN_H */
diff --git a/linux-x86/lib64/clang/17/include/stddef.h b/linux-x86/lib64/clang/17/include/stddef.h
index 4281517..539541f 100644
--- a/linux-x86/lib64/clang/17/include/stddef.h
+++ b/linux-x86/lib64/clang/17/include/stddef.h
@@ -103,6 +103,11 @@
 typedef typeof(nullptr) nullptr_t;
 #endif /* defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L */
 
+#if defined(__need_STDDEF_H_misc) && defined(__STDC_VERSION__) &&              \
+    __STDC_VERSION__ >= 202000L
+#define unreachable() __builtin_unreachable()
+#endif /* defined(__need_STDDEF_H_misc) && >= C23 */
+
 #if defined(__need_STDDEF_H_misc)
 #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) ||              \
     (defined(__cplusplus) && __cplusplus >= 201103L)
diff --git a/linux-x86/lib64/clang/17/include/wasm_simd128.h b/linux-x86/lib64/clang/17/include/wasm_simd128.h
index f93de12..a099ab5 100644
--- a/linux-x86/lib64/clang/17/include/wasm_simd128.h
+++ b/linux-x86/lib64/clang/17/include/wasm_simd128.h
@@ -961,17 +961,17 @@
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shl(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i8x16)__a << __b);
+  return (v128_t)((__i8x16)__a << (__b & 0x7));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i8x16)__a >> __b);
+  return (v128_t)((__i8x16)__a >> (__b & 0x7));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__u8x16)__a >> __b);
+  return (v128_t)((__u8x16)__a >> (__b & 0x7));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_add(v128_t __a,
@@ -1047,17 +1047,17 @@
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shl(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i16x8)__a << __b);
+  return (v128_t)((__i16x8)__a << (__b & 0xF));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i16x8)__a >> __b);
+  return (v128_t)((__i16x8)__a >> (__b & 0xF));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__u16x8)__a >> __b);
+  return (v128_t)((__u16x8)__a >> (__b & 0xF));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_add(v128_t __a,
@@ -1138,17 +1138,17 @@
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shl(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i32x4)__a << __b);
+  return (v128_t)((__i32x4)__a << (__b & 0x1F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i32x4)__a >> __b);
+  return (v128_t)((__i32x4)__a >> (__b & 0x1F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u32x4_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__u32x4)__a >> __b);
+  return (v128_t)((__u32x4)__a >> (__b & 0x1F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i32x4_add(v128_t __a,
@@ -1209,17 +1209,17 @@
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shl(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i64x2)__a << (int64_t)__b);
+  return (v128_t)((__i64x2)__a << ((int64_t)__b & 0x3F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__i64x2)__a >> (int64_t)__b);
+  return (v128_t)((__i64x2)__a >> ((int64_t)__b & 0x3F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u64x2_shr(v128_t __a,
                                                            uint32_t __b) {
-  return (v128_t)((__u64x2)__a >> (int64_t)__b);
+  return (v128_t)((__u64x2)__a >> ((int64_t)__b & 0x3F));
 }
 
 static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_add(v128_t __a,
diff --git a/linux-x86/lib64/clang/17/share/dfsan_abilist.txt b/linux-x86/lib64/clang/17/share/dfsan_abilist.txt
index e2f927d..c5ca68f 100644
--- a/linux-x86/lib64/clang/17/share/dfsan_abilist.txt
+++ b/linux-x86/lib64/clang/17/share/dfsan_abilist.txt
@@ -283,6 +283,7 @@
 fun:strpbrk=custom
 fun:strrchr=custom
 fun:strstr=custom
+fun:strsep=custom
 
 # Functions which take action based on global state, such as running a callback
 # set by a separate function.
diff --git a/linux-x86/lib64/libbase.so b/linux-x86/lib64/libbase.so
index b2d5e3b..a0d19eb 100755
--- a/linux-x86/lib64/libbase.so
+++ b/linux-x86/lib64/libbase.so
Binary files differ
diff --git a/linux-x86/lib64/libc++.so b/linux-x86/lib64/libc++.so
old mode 100755
new mode 100644
index 17e9f62..47c6350
--- a/linux-x86/lib64/libc++.so
+++ b/linux-x86/lib64/libc++.so
Binary files differ
diff --git a/linux-x86/lib64/libc++.so.1 b/linux-x86/lib64/libc++.so.1
deleted file mode 100644
index bdf8a9d..0000000
--- a/linux-x86/lib64/libc++.so.1
+++ /dev/null
Binary files differ
diff --git a/linux-x86/lib64/libclang-cpp.so b/linux-x86/lib64/libclang-cpp.so
new file mode 100644
index 0000000..d1924fa
--- /dev/null
+++ b/linux-x86/lib64/libclang-cpp.so
Binary files differ
diff --git a/linux-x86/lib64/libclang-cpp.so.17 b/linux-x86/lib64/libclang-cpp.so.17
deleted file mode 100644
index 7c9e386..0000000
--- a/linux-x86/lib64/libclang-cpp.so.17
+++ /dev/null
Binary files differ
diff --git a/linux-x86/lib64/liblog.so b/linux-x86/lib64/liblog.so
index 69b8240..cacd7fb 100755
--- a/linux-x86/lib64/liblog.so
+++ b/linux-x86/lib64/liblog.so
Binary files differ
diff --git a/linux-x86/lib64/libprotobuf-cpp-full.so b/linux-x86/lib64/libprotobuf-cpp-full.so
index 7d8599f..752ccce 100755
--- a/linux-x86/lib64/libprotobuf-cpp-full.so
+++ b/linux-x86/lib64/libprotobuf-cpp-full.so
Binary files differ
diff --git a/linux-x86/lib64/libz-host.so b/linux-x86/lib64/libz-host.so
index b845d04..4e46468 100755
--- a/linux-x86/lib64/libz-host.so
+++ b/linux-x86/lib64/libz-host.so
Binary files differ
diff --git a/linux-x86/lib64/libziparchive.so b/linux-x86/lib64/libziparchive.so
index a190823..cef52a1 100755
--- a/linux-x86/lib64/libziparchive.so
+++ b/linux-x86/lib64/libziparchive.so
Binary files differ
diff --git a/manifest.xml b/manifest.xml
index ba11df9..56eb0a0 100644
--- a/manifest.xml
+++ b/manifest.xml
@@ -3,35 +3,37 @@
 <manifest>
   <remote name="aosp" fetch="https://android.googlesource.com/" review="https://android.googlesource.com/" />
 
-  <default revision="master" remote="aosp" sync-j="4" />
+  <default revision="main" remote="aosp" sync-j="4" />
 
-  <project path="build/bazel" name="platform/build/bazel" groups="pdk" revision="d8a432f37fd4c3cf66a1437a24bab5af774b945f">
+  <project path="build/bazel" name="platform/build/bazel" groups="pdk" revision="0580228a0c4073a7dbff6d740466e2a8b54aab67">
     <linkfile dest="WORKSPACE" src="bazel.WORKSPACE" />
 
     <linkfile dest="BUILD" src="bazel.BUILD" />
 </project>
 
-  <project path="build/bazel_common_rules" name="platform/build/bazel_common_rules" groups="pdk" revision="46c9982f2b13f458f7b1efd3ba8ab478cc3ceac4" />
+  <project path="build/bazel_common_rules" name="platform/build/bazel_common_rules" groups="pdk" revision="ded6ea7ad5198a04053eda26fd59b20950e7100e" />
 
-  <project path="build/make" name="platform/build" revision="61ce7afb21a6e971a9bd4a9097fbb3f26d90ed85">
+  <project path="build/make" name="platform/build" revision="f49b9a33716ee0fa19acd3914a7f1a367118a164">
     <linkfile dest="build/tools" src="tools" />
 </project>
 
-  <project path="build/blueprint" name="platform/build/blueprint" revision="797fe25cb2f2cfcb712f8de314361d662c4b8c3c" />
+  <project path="build/blueprint" name="platform/build/blueprint" revision="a29ee638ea359b4ca9618cbffb19858b9d29c172" />
 
   <project path="build/kati" name="platform/build/kati" revision="27de420ef55e982c9ea29a00c33bf7d1243cea4d" />
 
-  <project path="build/soong" name="platform/build/soong" revision="03d510404c1bfd55b102a4e5d34e08f34342cb96">
+  <project path="build/soong" name="platform/build/soong" revision="06521c4f4661ded380d44b51d68fb2f20750dfad">
     <linkfile dest="Android.bp" src="root.bp" />
 
     <linkfile dest="bootstrap.bash" src="bootstrap.bash" />
 </project>
 
-  <project path="external/bazelbuild-rules_android" name="platform/external/bazelbuild-rules_android" groups="pdk" revision="f3de233800ea8627d9ff5c3da839d9a2a93bb4cd" />
+  <project path="external/bazelbuild-rules_android" name="platform/external/bazelbuild-rules_android" groups="pdk" revision="adc16dfd3a9b83c7ba15634b77232dc4bc6ccb38" />
 
   <project path="external/bazelbuild-rules_go" name="platform/external/bazelbuild-rules_go" groups="pdk" revision="711a453236752a1786e93fbde5929b92008fc7ff" />
 
-  <project path="external/bazelbuild-kotlin-rules" name="platform/external/bazelbuild-kotlin-rules" groups="pdk" revision="2d99ab7781561ae32e22653253c9712a1cdae0a8" />
+  <project path="external/bazelbuild-rules_java" name="platform/external/bazelbuild-rules_java" groups="pdk" revision="4ba159a9c43358f1e074edb4979ba3a5c5c590e4" />
+
+  <project path="external/bazelbuild-kotlin-rules" name="platform/external/bazelbuild-kotlin-rules" groups="pdk" revision="960043796bbda173c40b1381338117749ba7b589" />
 
   <project path="external/bazelbuild-rules_license" name="platform/external/bazelbuild-rules_license" groups="pdk" revision="eb146bbc492eb4ebea082d3cd0837105d94449ef" />
 
@@ -39,29 +41,29 @@
 
   <project path="external/golang-protobuf" name="platform/external/golang-protobuf" revision="bdb9197b4fd8ffab8d09fcd10a3097a2c5b945dd" />
 
-  <project path="prebuilts/bazel/common" name="platform/prebuilts/bazel/common" groups="pdk" clone-depth="1" revision="57d287ff9c14cb67453fedcff8df9d77dbe32b98" />
+  <project path="prebuilts/bazel/common" name="platform/prebuilts/bazel/common" groups="pdk" clone-depth="1" revision="bf9ab37b2bfc20a3531104ef69e64c528f621880" />
 
-  <project path="prebuilts/bazel/darwin-x86_64" name="platform/prebuilts/bazel/darwin-x86_64" groups="darwin,pdk" clone-depth="1" revision="469f26205d5d074dab6b401bb9afa86338e4b89b" />
+  <project path="prebuilts/bazel/darwin-x86_64" name="platform/prebuilts/bazel/darwin-x86_64" groups="darwin,pdk" clone-depth="1" revision="df534e346cb23382c2a1d0cde9effe62ba6107b3" />
 
-  <project path="prebuilts/bazel/linux-x86_64" name="platform/prebuilts/bazel/linux-x86_64" groups="linux,pdk" clone-depth="1" revision="38268b05e51c9863c9a40096dcb2c8f5206c4699" />
+  <project path="prebuilts/bazel/linux-x86_64" name="platform/prebuilts/bazel/linux-x86_64" groups="linux,pdk" clone-depth="1" revision="2f9e837049e7d5031a8117d225524e4ae0a3db67" />
 
-  <project path="prebuilts/build-tools" name="platform/prebuilts/build-tools" clone-depth="1" revision="86f778561ed3cc3c46ca5c7fd9f2c591d054cc70" />
+  <project path="prebuilts/build-tools" name="platform/prebuilts/build-tools" clone-depth="1" revision="26a146b347379a9391e49f645636674f58c39edf" />
 
-  <project path="prebuilts/clang-tools" name="platform/prebuilts/clang-tools" clone-depth="3" revision="95a2687c1dd50f9ea21a5d7d271a11e7f3efdfad" />
+  <project path="prebuilts/clang-tools" name="platform/prebuilts/clang-tools" clone-depth="3" revision="f08a96e4015aed7c0341b880ccd85500b1b44a26" />
 
-  <project path="prebuilts/clang/host/linux-x86" name="platform/prebuilts/clang/host/linux-x86" groups="linux" clone-depth="1" revision="1620eb4d7350d301ef59c63375739a326f4adfee" />
+  <project path="prebuilts/clang/host/linux-x86" name="platform/prebuilts/clang/host/linux-x86" groups="linux" clone-depth="1" revision="646296f496bc3b3898cec7f34047a682032a2a15" />
 
   <project path="prebuilts/gcc/linux-x86/host/x86_64-linux-glibc2.15-4.8" name="platform/prebuilts/gcc/linux-x86/host/x86_64-linux-glibc2.15-4.8" clone-depth="1" revision="e089f0d72820a43be332be964643b83a32e4b1a7" />
 
-  <project path="prebuilts/gcc/linux-x86/host/x86_64-linux-glibc2.17-4.8" name="platform/prebuilts/gcc/linux-x86/host/x86_64-linux-glibc2.17-4.8" clone-depth="1" revision="f6d40532273c8bd628b960ecc0970c7294c8d891" />
+  <project path="prebuilts/gcc/linux-x86/host/x86_64-linux-glibc2.17-4.8" name="platform/prebuilts/gcc/linux-x86/host/x86_64-linux-glibc2.17-4.8" clone-depth="1" revision="6c28ba8bdf3bbcc8353a37ec6910b09d1e175434" />
 
   <project path="prebuilts/go/linux-x86" name="platform/prebuilts/go/linux-x86" groups="linux" clone-depth="1" revision="b02123bfdb03d875974be3f47209efe6c5ff6b1c" />
 
   <project path="prebuilts/ninja/linux-x86" name="platform/prebuilts/ninja/linux-x86" groups="linux" clone-depth="1" revision="8a10824f74fe0e22af9bf314a837f5b70e2bb67f" />
 
-  <project path="prebuilts/rust" name="platform/prebuilts/rust" clone-depth="1" revision="a2c4fe19164924fa6e6316f9b005587a353b46bd" />
+  <project path="prebuilts/rust" name="platform/prebuilts/rust" clone-depth="1" revision="90869676c50498f068e856e9a9eb95e49ce500a1" />
 
-  <project path="prebuilts/clang/host/darwin-x86" name="platform/prebuilts/clang/host/darwin-x86" groups="darwin" clone-depth="1" revision="8f3c37a4d06bbcd1d795e4687aaeda02aa33180b" />
+  <project path="prebuilts/clang/host/darwin-x86" name="platform/prebuilts/clang/host/darwin-x86" groups="darwin" clone-depth="1" revision="da6db0cf1149f9011eaa0c54e9461ec7fa8d294b" />
 
   <project path="prebuilts/gcc/darwin-x86/host/headers" name="platform/prebuilts/gcc/darwin-x86/host/headers" groups="darwin" clone-depth="1" revision="4ac4f7cc41cf3c9e36fc3d6cf37fd1cfa9587a68" />
 
@@ -77,21 +79,21 @@
 
   <project path="prebuilts/jdk/jdk9" name="platform/prebuilts/jdk/jdk9" clone-depth="1" revision="1f0b937930e83b0f7470b9555ad289153072882f" />
 
-  <project path="prebuilts/misc" name="platform/prebuilts/misc" clone-depth="1" revision="8940dc5ee7033bdf17907de6a5875777bcf9bc32" />
+  <project path="prebuilts/misc" name="platform/prebuilts/misc" clone-depth="1" revision="cd0bc3e97057665dc04a3078b32af4d27ac90704" />
 
-  <project path="bionic" name="platform/bionic" revision="48d405ea4b06f7ebde5ac2a372c9ad90c7ac012e" />
+  <project path="bionic" name="platform/bionic" revision="f3bf97003c443606d4fb9d0757814ade285051ea" />
 
-  <project path="development" name="platform/development" revision="57a28a1076947e026441ca156c3e467ea89f9f6e" />
+  <project path="development" name="platform/development" revision="ad5a7859c8d8d2a4e6bdd379323f5e8f24aab94d" />
 
-  <project path="external/abseil-cpp" name="platform/external/abseil-cpp" revision="5170eb190bc570bb7050d934550bf3627a9ba616" />
+  <project path="external/abseil-cpp" name="platform/external/abseil-cpp" revision="54d5decc6c12df968ed9ad179c19c2c5aa290d40" />
 
-  <project path="external/boringssl" name="platform/external/boringssl" groups="pdk" revision="754dca2689b5c10990e8e023619e61a5b3c007c1" />
+  <project path="external/boringssl" name="platform/external/boringssl" groups="pdk" revision="5065296f344ac267a150ee04fe1b662fbe50870a" />
 
   <project path="external/clang" name="platform/external/clang" revision="101d502358fca757c7df2a94d551a0b07f84781e" />
 
   <project path="external/compiler-rt" name="platform/external/compiler-rt" revision="adebad9c8a6a383836d5fcb4a493ff1bd583b926" />
 
-  <project path="external/fmtlib" name="platform/external/fmtlib" revision="8eddae67de93782e91c43748dc1b79aecceca052" />
+  <project path="external/fmtlib" name="platform/external/fmtlib" revision="d0da765300c55c0884a01103ece3227c6b62455c" />
 
   <project path="external/gflags" name="platform/external/gflags" groups="pdk" revision="073396cf0a3d0dcaea54b771d1fda65b3e882259" />
 
@@ -115,9 +117,9 @@
 
   <project path="external/jsoncpp" name="platform/external/jsoncpp" revision="f62be7257b4d9291e7b63dcf848208c7c680e46a" />
 
-  <project path="external/kythe" name="platform/external/kythe" revision="80c14b0393968462aa668529f09b4f4e9307bb87" />
+  <project path="external/kythe" name="platform/external/kythe" revision="722ba5ca86bcf17a1ff135335ca61ddc64a6350e" />
 
-  <project path="external/libcxx" name="platform/external/libcxx" revision="ee86e3b18921712b5c0c61d01665079018ae2ead" />
+  <project path="external/libcxx" name="platform/external/libcxx" revision="d3d779a3ff888d72ee401bcfda78c10a9b8f1f28" />
 
   <project path="external/libcxxabi" name="platform/external/libcxxabi" revision="e40f13ac538b19cb067c81e3d6ce7314eac4cf8d" />
 
@@ -129,41 +131,43 @@
 
   <project path="external/llvm" name="platform/external/llvm" revision="1502bfa4286b170e331cda282f4bc8b25c2f71f7" />
 
-  <project path="external/protobuf" name="platform/external/protobuf" revision="b5b8b11fb16c7c72c1e4b26cd029953d2a5c37cd" />
+  <project path="external/protobuf" name="platform/external/protobuf" revision="58a2accc85ce86961849c3327352b538616c5038" />
 
   <project path="external/python/cpython2" name="platform/external/python/cpython2" revision="f4c6431b80426d19655ea61e1b5e1505c8b9bbe4" />
 
-  <project path="external/python/cpython3" name="platform/external/python/cpython3" revision="7725ed4b1c3951951cd9b0022d2a3b8599ff8f49" />
+  <project path="external/python/cpython3" name="platform/external/python/cpython3" revision="57d51f2bf6fb15c01f4f44bdbaf1d159222da016" />
 
   <project path="external/rapidjson" name="platform/external/rapidjson" revision="9fa2a3d9e356a1f42a6184dcf1e0508ddfa9dbfb" />
 
-  <project path="external/regex-re2" name="platform/external/regex-re2" revision="84e28962b2c2f357b5daccb460501b169193fafe" />
+  <project path="external/regex-re2" name="platform/external/regex-re2" revision="6b496379b475f223f3107a17050aa43a1b3f5182" />
 
   <project path="external/spdx-tools" name="platform/external/spdx-tools" revision="8cc30032636536a86a06b7c80938874363262e67" />
 
   <project path="external/starlark-go" name="platform/external/starlark-go" revision="46b7813df652cee050bf7d540313bc693fd2e3f3" />
 
-  <project path="external/zlib" name="platform/external/zlib" revision="dff4be8ee85e72532ced1f82d27ac3bf9f26c142" />
+  <project path="external/zlib" name="platform/external/zlib" revision="cfa5de0e204556eb3ab7c7fde29258329cd4ebb6" />
 
   <project path="external/zopfli" name="platform/external/zopfli" revision="15fdf31c61251f3e5aa3b188df2770eb153b9484" />
 
-  <project path="system/core" name="platform/system/core" revision="8790a71bc4afc32b5400a7ebe8e3ef4f91ee0355" />
+  <project path="system/core" name="platform/system/core" revision="b03378fe7113746641f480e198fb3bd3173aab5d" />
 
-  <project path="system/libbase" name="platform/system/libbase" revision="cf91a49ecab40a7d89d7ee20b54fbd182be36bfb" />
+  <project path="system/libbase" name="platform/system/libbase" revision="97f6717480bca00fdf47f82e5efad5095ab50da3" />
 
-  <project path="system/logging" name="platform/system/logging" revision="e68cbeea3406417f0c16eb5c2a588f1f68ffbf9e" />
+  <project path="system/logging" name="platform/system/logging" revision="effdaf7bb473385d5bd5a5861628eb7985f5b333" />
 
-  <project path="system/libziparchive" name="platform/system/libziparchive" revision="2711173f4d4712de52b3bbaa0186ec5d7caa61b9" />
+  <project path="system/libziparchive" name="platform/system/libziparchive" revision="529ed41ed60973e7a45b9190eea173ddc54e61f2" />
 
-  <project path="system/tools/aidl" name="platform/system/tools/aidl" revision="d8a838aec6662e409106ddc6ca128e1bd3b3b8c9" />
+  <project path="system/tools/aidl" name="platform/system/tools/aidl" revision="6807787e041a85e41c7139965488c7e13fafc382" />
 
   <project path="external/rust/crates/ahash" name="platform/external/rust/crates/ahash" revision="43f36985814d209debe83848c68ebbc5caf88f83" />
 
   <project path="external/rust/crates/aho-corasick" name="platform/external/rust/crates/aho-corasick" revision="14de7dbb8c9a319a079c4057cf4e8bf78367f58a" />
 
-  <project path="external/rust/crates/bindgen" name="platform/external/rust/crates/bindgen" revision="01485df57e02077408faf4423d3f13e2ed2afe7d" />
+  <project path="external/rust/crates/annotate-snippets" name="platform/external/rust/crates/annotate-snippets" revision="feb8fd5a46b48fc8ed6c1e5969de0dbfa3114b40" />
 
-  <project path="external/rust/crates/bindgen-cli" name="platform/external/rust/crates/bindgen-cli" revision="0a105542967a7abb5822017e9a2950e49f3927f6" />
+  <project path="external/rust/crates/bindgen" name="platform/external/rust/crates/bindgen" revision="e2a0f730e6b1f3682e9a0939fc048164bb4b82db" />
+
+  <project path="external/rust/crates/bindgen-cli" name="platform/external/rust/crates/bindgen-cli" revision="274153ebbb2187e459b0cb3b30e3a86fe49e06bf" />
 
   <project path="external/rust/crates/bitflags" name="platform/external/rust/crates/bitflags" revision="4cb5dac10a9ca8a0c9b78ea24f0f23e7972576e2" />
 
@@ -173,11 +177,11 @@
 
   <project path="external/rust/crates/clang-sys" name="platform/external/rust/crates/clang-sys" revision="22500bd136c9984324dcbc37ca878f68319238ca" />
 
-  <project path="external/rust/crates/clap" name="platform/external/rust/crates/clap" revision="e364e3505aef5e62a5df39a9aa487cc5513cf285" />
+  <project path="external/rust/crates/clap" name="platform/external/rust/crates/clap" revision="e2a105e6f04f09fa3bf42cc6991c3bba4c04302a" />
 
-  <project path="external/rust/crates/clap_derive" name="platform/external/rust/crates/clap_derive" revision="5e5c42af664a2d6ac1716312c56905374476f904" />
+  <project path="external/rust/crates/clap_derive" name="platform/external/rust/crates/clap_derive" revision="804ee00fef3cb4454305856df688215a8cda66ee" />
 
-  <project path="external/rust/crates/clap_lex" name="platform/external/rust/crates/clap_lex" revision="789922627d4d60f5847327c73cf3411722380abe" />
+  <project path="external/rust/crates/clap_lex" name="platform/external/rust/crates/clap_lex" revision="581d2377a6c6fa649e157f5f4561fc36897a0180" />
 
   <project path="external/rust/crates/either" name="platform/external/rust/crates/either" revision="030c450d3636dd3fce703e1a96ff2789189e31a0" />
 
@@ -189,37 +193,41 @@
 
   <project path="external/rust/crates/hashbrown" name="platform/external/rust/crates/hashbrown" revision="7eef8d8602197ced6f3c13fdb61b6d3e36be0852" />
 
-  <project path="external/rust/crates/heck" name="platform/external/rust/crates/heck" revision="835ee383e854173bb3da575725d80f37aa059e9d" />
+  <project path="external/rust/crates/heck" name="platform/external/rust/crates/heck" revision="ee29472cfd3d84dc11f437863cc0404fc80394c9" />
 
-  <project path="external/rust/crates/indexmap" name="platform/external/rust/crates/indexmap" revision="a4b6a1a4530c9d649d4105820e4b0593e883ef72" />
+  <project path="external/rust/crates/indexmap" name="platform/external/rust/crates/indexmap" revision="ed8232249106a5da60dcedbf2c9976984e9e3410" />
+
+  <project path="external/rust/crates/itertools" name="platform/external/rust/crates/itertools" revision="ff6a7187432661fa05ce1f140576e62694a1f3bc" />
 
   <project path="external/rust/crates/lazy_static" name="platform/external/rust/crates/lazy_static" revision="2b5db3e35d2fca59bdd32fa5bd50b4292a37029a" />
 
   <project path="external/rust/crates/lazycell" name="platform/external/rust/crates/lazycell" revision="cf08b65d0816bb4762a923c575495c01c94b7d88" />
 
-  <project path="external/rust/crates/libc" name="platform/external/rust/crates/libc" revision="0a0f8021d41b800cec8d9028aa3686081c1de64d" />
+  <project path="external/rust/crates/libc" name="platform/external/rust/crates/libc" revision="49258ba94dd0e0b9e87d7218bd1c1089e12f3aac" />
 
   <project path="external/rust/crates/libloading" name="platform/external/rust/crates/libloading" revision="e2ca7383f7d5410d8f56d9ab7f56598585329e65" />
 
-  <project path="external/rust/crates/log" name="platform/external/rust/crates/log" revision="1a36a6a8e10d14dd4c7a6a3d33ff7e7e654cc6e1" />
+  <project path="external/rust/crates/log" name="platform/external/rust/crates/log" revision="28e0c72e0380786ead30eaa3d19ca85899ae5893" />
 
   <project path="external/rust/crates/memchr" name="platform/external/rust/crates/memchr" revision="fe4070a3c5922eb647b7cf6d5b735eb8f379bf1b" />
 
-  <project path="external/rust/crates/minimal-lexical" name="platform/external/rust/crates/minimal-lexical" revision="5cf4ce630698f9be26365a0cd329a96a5e09ab82" />
+  <project path="external/rust/crates/minimal-lexical" name="platform/external/rust/crates/minimal-lexical" revision="bb67f0bbce15ec3c77fa3933b4f396c6ee19008d" />
 
   <project path="external/rust/crates/nom" name="platform/external/rust/crates/nom" revision="c71e40dd9fe8a03204b261e48e27c1550a2f1a9d" />
 
   <project path="external/rust/crates/once_cell" name="platform/external/rust/crates/once_cell" revision="4a9d8e8ee32e8ff0c1fea38c19cc50f33068910e" />
 
-  <project path="external/rust/crates/os_str_bytes" name="platform/external/rust/crates/os_str_bytes" revision="5dae56a156ac32f773bc76e174456583d7785ef6" />
+  <project path="external/rust/crates/os_str_bytes" name="platform/external/rust/crates/os_str_bytes" revision="6372ea42f7639212701d45323149c465b9f05901" />
 
   <project path="external/rust/crates/peeking_take_while" name="platform/external/rust/crates/peeking_take_while" revision="301d2543136b8fd8c431a9254cbe88af80c82ba8" />
 
+  <project path="external/rust/crates/prettyplease" name="platform/external/rust/crates/prettyplease" revision="d4594e85105d4ed3bac9c9b2bb147b8c3fa12f11" />
+
   <project path="external/rust/crates/proc-macro-error-attr" name="platform/external/rust/crates/proc-macro-error-attr" revision="1fdea0dd71970e9aa47a8f9f42e50b04f1f3b276" />
 
   <project path="external/rust/crates/proc-macro-error" name="platform/external/rust/crates/proc-macro-error" revision="d180d017350e241f42c991929bdfb0a674c7b3a7" />
 
-  <project path="external/rust/crates/proc-macro2" name="platform/external/rust/crates/proc-macro2" revision="0f3d74e8f41021bd82e05a51ebbe0f067fcdfe0a" />
+  <project path="external/rust/crates/proc-macro2" name="platform/external/rust/crates/proc-macro2" revision="5c471c4a5c3e80c810dadb19c5996e420426c3bc" />
 
   <project path="external/rust/crates/quote" name="platform/external/rust/crates/quote" revision="9791b3d2d985ed9aae113fc5112e9a607c6c5741" />
 
@@ -235,15 +243,17 @@
 
   <project path="external/rust/crates/textwrap" name="platform/external/rust/crates/textwrap" revision="d05b8953fc9aa7dac83cd85259138eeaaad858d9" />
 
-  <project path="external/rust/crates/unicode-ident" name="platform/external/rust/crates/unicode-ident" revision="77d0a1d8739833fcc94dac401e96cd22d18d317e" />
+  <project path="external/rust/crates/unicode-ident" name="platform/external/rust/crates/unicode-ident" revision="2d1f5f891da7f9645c864e54fa08a0aabc7ab65f" />
 
   <project path="external/rust/crates/unicode-segmentation" name="platform/external/rust/crates/unicode-segmentation" revision="eb9e371b88ecfa6105d901c5ff5c2447976b95b7" />
 
+  <project path="external/rust/crates/unicode-width" name="platform/external/rust/crates/unicode-width" revision="c6c1458a2a97909712874b5e52bfe9b9bffa3086" />
+
   <project path="external/rust/crates/unicode-xid" name="platform/external/rust/crates/unicode-xid" revision="7d2ba533b4898ff4641850e23d5b01b18be9f33a" />
 
   <project path="external/rust/crates/which" name="platform/external/rust/crates/which" revision="f8c7b779bcf43565718f27dbff59f07d9afa4bfc" />
 
-  <project path="dalvik" name="platform/dalvik" revision="b9bcad53a7418326fe716296e61012d921513d9d" />
+  <project path="dalvik" name="platform/dalvik" revision="e6ca0156450e010df7c1d9624c0eb97a9b2bc1e8" />
 
-  <project path="external/ninja" name="platform/external/ninja" revision="45d6a94d6de54494150299ff0e7137196f3f0b47" />
+  <project path="external/ninja" name="platform/external/ninja" revision="efbceece33817facf746ddc378f33ae4cc602788" />
 </manifest>