Snap for 6203617 from 38ff772f60dd2fba919dc8bebb9c349364c1a61a to rvc-release

Change-Id: I79715f28c30d605bc8ee52972a8ebe6d9245ff0c
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..73b2998
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,19 @@
+# Ninja files
+build.ninja
+
+# Build objects and artifacts
+deps/
+build/
+bin/
+lib/
+*.pyc
+*.pyo
+
+# System files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
diff --git a/Android.bp b/Android.bp
new file mode 100644
index 0000000..3b967ed
--- /dev/null
+++ b/Android.bp
@@ -0,0 +1,21 @@
+// Copyright (C) 2020 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+cc_library_headers {
+    name: "psimd_headers",
+    export_include_dirs: ["include"],
+    vendor_available: true,
+    sdk_version: "current",
+}
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..bd69c62
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,17 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR)
+
+INCLUDE(GNUInstallDirs)
+
+# ---[ Project
+PROJECT(psimd C)
+
+# ---[ psimd library
+IF(${CMAKE_VERSION} VERSION_LESS "3.0")
+  ADD_LIBRARY(psimd STATIC include/psimd.h)
+  SET_TARGET_PROPERTIES(psimd PROPERTIES LINKER_LANGUAGE C)
+ELSE()
+  ADD_LIBRARY(psimd INTERFACE)
+ENDIF()
+TARGET_INCLUDE_DIRECTORIES(psimd INTERFACE include)
+
+INSTALL(FILES include/psimd.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..0a117cb
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,11 @@
+The MIT License (MIT)
+
+Copyright (c) 2017 Facebook Inc.
+Copyright (c) 2014-2017 Georgia Institute of Technology
+Copyright 2019 Google LLC
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/METADATA b/METADATA
new file mode 100644
index 0000000..5b1c1a4
--- /dev/null
+++ b/METADATA
@@ -0,0 +1,17 @@
+name: "psimd"
+description:
+    "Portable 128-bit SIMD intrinsics"
+
+third_party {
+  url {
+    type: HOMEPAGE
+    value: "https://github.com/Maratyszcza/psimd"
+  }
+  url {
+    type: GIT
+    value: "https://github.com/Maratyszcza/psimd"
+  }
+  version: "10b4ffc6ea9e2e11668f86969586f88bc82aaefa"
+  last_upgrade_date { year: 2020 month: 2 day: 3 }
+  license_type: NOTICE
+}
diff --git a/MODULE_LICENSE_MIT b/MODULE_LICENSE_MIT
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/MODULE_LICENSE_MIT
diff --git a/NOTICE b/NOTICE
new file mode 120000
index 0000000..7a694c9
--- /dev/null
+++ b/NOTICE
@@ -0,0 +1 @@
+LICENSE
\ No newline at end of file
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..a2cc597
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,11 @@
+butlermichael@google.com
+dgross@google.com
+galarragas@google.com
+jeanluc@google.com
+levp@google.com
+maratek@google.com
+miaowang@google.com
+pszczepaniak@google.com
+slavash@google.com
+vddang@google.com
+xusongw@google.com
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..aef0db6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# P(ortable) SIMD
+Portable 128-bit SIMD intrinsics
diff --git a/configure.py b/configure.py
new file mode 100755
index 0000000..26b29bd
--- /dev/null
+++ b/configure.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python
+
+
+import confu
+parser = confu.standard_parser("FP16 configuration script")
+
+
+def main(args):
+    options = parser.parse_args(args)
+    build = confu.Build.from_options(options)
+
+    build.export_cpath("include", ["psimd.h"])
+
+    return build
+
+
+if __name__ == "__main__":
+    import sys
+    main(sys.argv[1:]).generate()
diff --git a/confu.yaml b/confu.yaml
new file mode 100644
index 0000000..822eee3
--- /dev/null
+++ b/confu.yaml
@@ -0,0 +1,3 @@
+name: psimd
+title: portable 128-bit SIMD intrinsics
+license: MIT
\ No newline at end of file
diff --git a/include/psimd.h b/include/psimd.h
new file mode 100644
index 0000000..1aa1014
--- /dev/null
+++ b/include/psimd.h
@@ -0,0 +1,1384 @@
+#pragma once
+#ifndef PSIMD_H
+#define PSIMD_H
+
+#if defined(__CUDA_ARCH__)
+	/* CUDA compiler */
+	#define PSIMD_INTRINSIC __forceinline__ __device__
+#elif defined(__OPENCL_VERSION__)
+	/* OpenCL compiler */
+	#define PSIMD_INTRINSIC inline static
+#elif defined(__INTEL_COMPILER)
+	/* Intel compiler, even on Windows */
+	#define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
+#elif defined(__GNUC__)
+	/* GCC-compatible compiler (gcc/clang/icc) */
+	#define PSIMD_INTRINSIC inline static __attribute__((__always_inline__))
+#elif defined(_MSC_VER)
+	/* MSVC-compatible compiler (cl/icl/clang-cl) */
+	#define PSIMD_INTRINSIC __forceinline static
+#elif defined(__cplusplus)
+	/* Generic C++ compiler */
+	#define PSIMD_INTRINSIC inline static
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+	/* Generic C99 compiler */
+	#define PSIMD_INTRINSIC inline static
+#else
+	/* Generic C compiler */
+	#define PSIMD_INTRINSIC static
+#endif
+
+#if defined(__GNUC__)
+	#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+		#include <arm_neon.h>
+	#endif
+
+	#if defined(__SSE2__)
+		#include <emmintrin.h>
+	#endif
+
+	#if defined(__SSE3__)
+		#include <pmmintrin.h>
+	#endif
+
+	#if defined(__SSSE3__)
+		#include <tmmintrin.h>
+	#endif
+
+	#if defined(__SSE4_1__)
+		#include <smmintrin.h>
+	#endif
+
+	#if defined(__SSE4_2__)
+		#include <nmmintrin.h>
+	#endif
+
+	#if defined(__AVX__)
+		#include <immintrin.h>
+	#endif
+#elif defined(_MSC_VER)
+	#include <intrin.h>
+#endif
+
+#if defined(__cplusplus)
+	#define PSIMD_CXX_SYNTAX
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+	#define PSIMD_C11_SYNTAX
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+	#define PSIMD_C99_SYNTAX
+#else
+	#define PSIMD_C89_SYNTAX
+#endif
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+	#include <cstddef>
+	#include <cstdint>
+#elif !defined(__OPENCL_VERSION__)
+	#include <stddef.h>
+	#include <stdint.h>
+#endif
+
+#if defined(__GNUC__)
+	#define PSIMD_HAVE_F64 0
+	#define PSIMD_HAVE_F32 1
+	#define PSIMD_HAVE_U8 1
+	#define PSIMD_HAVE_S8 1
+	#define PSIMD_HAVE_U16 1
+	#define PSIMD_HAVE_S16 1
+	#define PSIMD_HAVE_U32 1
+	#define PSIMD_HAVE_S32 1
+	#define PSIMD_HAVE_U64 0
+	#define PSIMD_HAVE_S64 0
+
+	typedef int8_t   psimd_s8  __attribute__((vector_size(16), aligned(1)));
+	typedef uint8_t  psimd_u8  __attribute__((vector_size(16), aligned(1)));
+	typedef int16_t  psimd_s16 __attribute__((vector_size(16), aligned(2)));
+	typedef uint16_t psimd_u16 __attribute__((vector_size(16), aligned(2)));
+	typedef int32_t  psimd_s32 __attribute__((vector_size(16), aligned(4)));
+	typedef uint32_t psimd_u32 __attribute__((vector_size(16), aligned(4)));
+	typedef float    psimd_f32 __attribute__((vector_size(16), aligned(4)));
+
+	typedef struct {
+		psimd_s8 lo;
+		psimd_s8 hi;
+	} psimd_s8x2;
+
+	typedef struct {
+		psimd_u8 lo;
+		psimd_u8 hi;
+	} psimd_u8x2;
+
+	typedef struct {
+		psimd_s16 lo;
+		psimd_s16 hi;
+	} psimd_s16x2;
+
+	typedef struct {
+		psimd_u16 lo;
+		psimd_u16 hi;
+	} psimd_u16x2;
+
+	typedef struct {
+		psimd_s32 lo;
+		psimd_s32 hi;
+	} psimd_s32x2;
+
+	typedef struct {
+		psimd_u32 lo;
+		psimd_u32 hi;
+	} psimd_u32x2;
+
+	typedef struct {
+		psimd_f32 lo;
+		psimd_f32 hi;
+	} psimd_f32x2;
+
+	/* Bit casts */
+	PSIMD_INTRINSIC psimd_u32x2 psimd_cast_s32x2_u32x2(psimd_s32x2 v) {
+		return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
+	}
+
+	PSIMD_INTRINSIC psimd_f32x2 psimd_cast_s32x2_f32x2(psimd_s32x2 v) {
+		return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
+	}
+
+	PSIMD_INTRINSIC psimd_s32x2 psimd_cast_u32x2_s32x2(psimd_u32x2 v) {
+		return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
+	}
+
+	PSIMD_INTRINSIC psimd_f32x2 psimd_cast_u32x2_f32x2(psimd_u32x2 v) {
+		return (psimd_f32x2) { .lo = (psimd_f32) v.lo, .hi = (psimd_f32) v.hi };
+	}
+
+	PSIMD_INTRINSIC psimd_s32x2 psimd_cast_f32x2_s32x2(psimd_f32x2 v) {
+		return (psimd_s32x2) { .lo = (psimd_s32) v.lo, .hi = (psimd_s32) v.hi };
+	}
+
+	PSIMD_INTRINSIC psimd_u32x2 psimd_cast_f32x2_u32x2(psimd_f32x2 v) {
+		return (psimd_u32x2) { .lo = (psimd_u32) v.lo, .hi = (psimd_u32) v.hi };
+	}
+
+	/* Swap */
+	PSIMD_INTRINSIC void psimd_swap_s8(psimd_s8 a[1], psimd_s8 b[1]) {
+		const psimd_s8 new_a = *b;
+		const psimd_s8 new_b = *a;
+		*a = new_a;
+		*b = new_b;
+	}
+
+	PSIMD_INTRINSIC void psimd_swap_u8(psimd_u8 a[1], psimd_u8 b[1]) {
+		const psimd_u8 new_a = *b;
+		const psimd_u8 new_b = *a;
+		*a = new_a;
+		*b = new_b;
+	}
+
+	PSIMD_INTRINSIC void psimd_swap_s16(psimd_s16 a[1], psimd_s16 b[1]) {
+		const psimd_s16 new_a = *b;
+		const psimd_s16 new_b = *a;
+		*a = new_a;
+		*b = new_b;
+	}
+
+	PSIMD_INTRINSIC void psimd_swap_u16(psimd_u16 a[1], psimd_u16 b[1]) {
+		const psimd_u16 new_a = *b;
+		const psimd_u16 new_b = *a;
+		*a = new_a;
+		*b = new_b;
+	}
+
+	PSIMD_INTRINSIC void psimd_swap_s32(psimd_s32 a[1], psimd_s32 b[1]) {
+		const psimd_s32 new_a = *b;
+		const psimd_s32 new_b = *a;
+		*a = new_a;
+		*b = new_b;
+	}
+
+	PSIMD_INTRINSIC void psimd_swap_u32(psimd_u32 a[1], psimd_u32 b[1]) {
+		const psimd_u32 new_a = *b;
+		const psimd_u32 new_b = *a;
+		*a = new_a;
+		*b = new_b;
+	}
+
+	PSIMD_INTRINSIC void psimd_swap_f32(psimd_f32 a[1], psimd_f32 b[1]) {
+		const psimd_f32 new_a = *b;
+		const psimd_f32 new_b = *a;
+		*a = new_a;
+		*b = new_b;
+	}
+
+	/* Zero-initialization */
+	PSIMD_INTRINSIC psimd_s8 psimd_zero_s8(void) {
+		return (psimd_s8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_u8 psimd_zero_u8(void) {
+		return (psimd_u8) { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_s16 psimd_zero_s16(void) {
+		return (psimd_s16) { 0, 0, 0, 0, 0, 0, 0, 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_u16 psimd_zero_u16(void) {
+		return (psimd_u16) { 0, 0, 0, 0, 0, 0, 0, 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_zero_s32(void) {
+		return (psimd_s32) { 0, 0, 0, 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_zero_u32(void) {
+		return (psimd_u32) { 0, 0, 0, 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_zero_f32(void) {
+		return (psimd_f32) { 0.0f, 0.0f, 0.0f, 0.0f };
+	}
+
+	/* Initialization to the same constant */
+	PSIMD_INTRINSIC psimd_s8 psimd_splat_s8(int8_t c) {
+		return (psimd_s8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
+	}
+
+	PSIMD_INTRINSIC psimd_u8 psimd_splat_u8(uint8_t c) {
+		return (psimd_u8) { c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c };
+	}
+
+	PSIMD_INTRINSIC psimd_s16 psimd_splat_s16(int16_t c) {
+		return (psimd_s16) { c, c, c, c, c, c, c, c };
+	}
+
+	PSIMD_INTRINSIC psimd_u16 psimd_splat_u16(uint16_t c) {
+		return (psimd_u16) { c, c, c, c, c, c, c, c };
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_splat_s32(int32_t c) {
+		return (psimd_s32) { c, c, c, c };
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_splat_u32(uint32_t c) {
+		return (psimd_u32) { c, c, c, c };
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_splat_f32(float c) {
+		return (psimd_f32) { c, c, c, c };
+	}
+
+	/* Load vector */
+	PSIMD_INTRINSIC psimd_s8 psimd_load_s8(const void* address) {
+		return *((const psimd_s8*) address);
+	}
+
+	PSIMD_INTRINSIC psimd_u8 psimd_load_u8(const void* address) {
+		return *((const psimd_u8*) address);
+	}
+
+	PSIMD_INTRINSIC psimd_s16 psimd_load_s16(const void* address) {
+		return *((const psimd_s16*) address);
+	}
+
+	PSIMD_INTRINSIC psimd_u16 psimd_load_u16(const void* address) {
+		return *((const psimd_u16*) address);
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_load_s32(const void* address) {
+		return *((const psimd_s32*) address);
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_load_u32(const void* address) {
+		return *((const psimd_u32*) address);
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load_f32(const void* address) {
+		return *((const psimd_f32*) address);
+	}
+
+	PSIMD_INTRINSIC psimd_s8 psimd_load_splat_s8(const void* address) {
+		return psimd_splat_s8(*((const int8_t*) address));
+	}
+
+	PSIMD_INTRINSIC psimd_u8 psimd_load_splat_u8(const void* address) {
+		return psimd_splat_u8(*((const uint8_t*) address));
+	}
+
+	PSIMD_INTRINSIC psimd_s16 psimd_load_splat_s16(const void* address) {
+		return psimd_splat_s16(*((const int16_t*) address));
+	}
+
+	PSIMD_INTRINSIC psimd_u16 psimd_load_splat_u16(const void* address) {
+		return psimd_splat_u16(*((const uint16_t*) address));
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_load_splat_s32(const void* address) {
+		return psimd_splat_s32(*((const int32_t*) address));
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_load_splat_u32(const void* address) {
+		return psimd_splat_u32(*((const uint32_t*) address));
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load_splat_f32(const void* address) {
+		return psimd_splat_f32(*((const float*) address));
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_load1_s32(const void* address) {
+		return (psimd_s32) { *((const int32_t*) address), 0, 0, 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_load1_u32(const void* address) {
+		return (psimd_u32) { *((const uint32_t*) address), 0, 0, 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load1_f32(const void* address) {
+		return (psimd_f32) { *((const float*) address), 0.0f, 0.0f, 0.0f };
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_load2_s32(const void* address) {
+		const int32_t* address_s32 = (const int32_t*) address;
+		return (psimd_s32) { address_s32[0], address_s32[1], 0, 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_load2_u32(const void* address) {
+		const uint32_t* address_u32 = (const uint32_t*) address;
+		return (psimd_u32) { address_u32[0], address_u32[1], 0, 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load2_f32(const void* address) {
+		const float* address_f32 = (const float*) address;
+		return (psimd_f32) { address_f32[0], address_f32[1], 0.0f, 0.0f };
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_load3_s32(const void* address) {
+		const int32_t* address_s32 = (const int32_t*) address;
+		return (psimd_s32) { address_s32[0], address_s32[1], address_s32[2], 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_load3_u32(const void* address) {
+		const uint32_t* address_u32 = (const uint32_t*) address;
+		return (psimd_u32) { address_u32[0], address_u32[1], address_u32[2], 0 };
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load3_f32(const void* address) {
+		const float* address_f32 = (const float*) address;
+		return (psimd_f32) { address_f32[0], address_f32[1], address_f32[2], 0.0f };
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_load4_s32(const void* address) {
+		return psimd_load_s32(address);
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_load4_u32(const void* address) {
+		return psimd_load_u32(address);
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load4_f32(const void* address) {
+		return psimd_load_f32(address);
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load_stride2_f32(const void* address) {
+		const psimd_f32 v0x1x = psimd_load_f32(address);
+		const psimd_f32 vx2x3 = psimd_load_f32((const float*) address + 3);
+		#if defined(__clang__)
+			return __builtin_shufflevector(v0x1x, vx2x3, 0, 2, 5, 7);
+		#else
+			return __builtin_shuffle(v0x1x, vx2x3, (psimd_s32) { 0, 2, 5, 7 });
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load1_stride2_f32(const void* address) {
+		return psimd_load_f32(address);
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load2_stride2_f32(const void* address) {
+		const float* address_f32 = (const float*) address;
+		return (psimd_f32) { address_f32[0], address_f32[2], 0.0f, 0.0f };
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load3_stride2_f32(const void* address) {
+		const psimd_f32 v0x1x = psimd_load_f32(address);
+		const psimd_f32 v2zzz = psimd_load1_f32((const float*) address + 2);
+		#if defined(__clang__)
+			return __builtin_shufflevector(v0x1x, v2zzz, 0, 2, 4, 6);
+		#else
+			return __builtin_shuffle(v0x1x, v2zzz, (psimd_s32) { 0, 2, 4, 6 });
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load4_stride2_f32(const void* address) {
+		return psimd_load_stride2_f32(address);
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load_stride_f32(const void* address, size_t stride) {
+		const float* address0_f32 = (const float*) address;
+		const float* address1_f32 = address0_f32 + stride;
+		const float* address2_f32 = address1_f32 + stride;
+		const float* address3_f32 = address2_f32 + stride;
+		return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, *address3_f32 };
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load1_stride_f32(const void* address, size_t stride) {
+		return psimd_load1_f32(address);
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load2_stride_f32(const void* address, size_t stride) {
+		const float* address_f32 = (const float*) address;
+		return (psimd_f32) { address_f32[0], address_f32[stride], 0.0f, 0.0f };
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load3_stride_f32(const void* address, size_t stride) {
+		const float* address0_f32 = (const float*) address;
+		const float* address1_f32 = address0_f32 + stride;
+		const float* address2_f32 = address1_f32 + stride;
+		return (psimd_f32) { *address0_f32, *address1_f32, *address2_f32, 0.0f };
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_load4_stride_f32(const void* address, size_t stride) {
+		return psimd_load_stride_f32(address, stride);
+	}
+
+	/* Store vector */
+	PSIMD_INTRINSIC void psimd_store_s8(void* address, psimd_s8 value) {
+		*((psimd_s8*) address) = value;
+	}
+
+	PSIMD_INTRINSIC void psimd_store_u8(void* address, psimd_u8 value) {
+		*((psimd_u8*) address) = value;
+	}
+
+	PSIMD_INTRINSIC void psimd_store_s16(void* address, psimd_s16 value) {
+		*((psimd_s16*) address) = value;
+	}
+
+	PSIMD_INTRINSIC void psimd_store_u16(void* address, psimd_u16 value) {
+		*((psimd_u16*) address) = value;
+	}
+
+	PSIMD_INTRINSIC void psimd_store_s32(void* address, psimd_s32 value) {
+		*((psimd_s32*) address) = value;
+	}
+
+	PSIMD_INTRINSIC void psimd_store_u32(void* address, psimd_u32 value) {
+		*((psimd_u32*) address) = value;
+	}
+
+	PSIMD_INTRINSIC void psimd_store_f32(void* address, psimd_f32 value) {
+		*((psimd_f32*) address) = value;
+	}
+
+	PSIMD_INTRINSIC void psimd_store1_s32(void* address, psimd_s32 value) {
+		*((int32_t*) address) = value[0];
+	}
+
+	PSIMD_INTRINSIC void psimd_store1_u32(void* address, psimd_u32 value) {
+		*((uint32_t*) address) = value[0];
+	}
+
+	PSIMD_INTRINSIC void psimd_store1_f32(void* address, psimd_f32 value) {
+		*((float*) address) = value[0];
+	}
+
+	PSIMD_INTRINSIC void psimd_store2_s32(void* address, psimd_s32 value) {
+		int32_t* address_s32 = (int32_t*) address;
+		address_s32[0] = value[0];
+		address_s32[1] = value[1];
+	}
+
+	PSIMD_INTRINSIC void psimd_store2_u32(void* address, psimd_u32 value) {
+		uint32_t* address_u32 = (uint32_t*) address;
+		address_u32[0] = value[0];
+		address_u32[1] = value[1];
+	}
+
+	PSIMD_INTRINSIC void psimd_store2_f32(void* address, psimd_f32 value) {
+		float* address_f32 = (float*) address;
+		address_f32[0] = value[0];
+		address_f32[1] = value[1];
+	}
+
+	PSIMD_INTRINSIC void psimd_store3_s32(void* address, psimd_s32 value) {
+		int32_t* address_s32 = (int32_t*) address;
+		address_s32[0] = value[0];
+		address_s32[1] = value[1];
+		address_s32[2] = value[2];
+	}
+
+	PSIMD_INTRINSIC void psimd_store3_u32(void* address, psimd_u32 value) {
+		uint32_t* address_u32 = (uint32_t*) address;
+		address_u32[0] = value[0];
+		address_u32[1] = value[1];
+		address_u32[2] = value[2];
+	}
+
+	PSIMD_INTRINSIC void psimd_store3_f32(void* address, psimd_f32 value) {
+		float* address_f32 = (float*) address;
+		address_f32[0] = value[0];
+		address_f32[1] = value[1];
+		address_f32[2] = value[2];
+	}
+
+	PSIMD_INTRINSIC void psimd_store4_s32(void* address, psimd_s32 value) {
+		psimd_store_s32(address, value);
+	}
+
+	PSIMD_INTRINSIC void psimd_store4_u32(void* address, psimd_u32 value) {
+		psimd_store_u32(address, value);
+	}
+
+	PSIMD_INTRINSIC void psimd_store4_f32(void* address, psimd_f32 value) {
+		psimd_store_f32(address, value);
+	}
+
+	PSIMD_INTRINSIC void psimd_store_stride_f32(void* address, size_t stride, psimd_f32 value) {
+		float* address0_f32 = (float*) address;
+		float* address1_f32 = address0_f32 + stride;
+		float* address2_f32 = address1_f32 + stride;
+		float* address3_f32 = address2_f32 + stride;
+		*address0_f32 = value[0];
+		*address1_f32 = value[1];
+		*address2_f32 = value[2];
+		*address3_f32 = value[3];
+	}
+
+	PSIMD_INTRINSIC void psimd_store1_stride_f32(void* address, size_t stride, psimd_f32 value) {
+		psimd_store1_f32(address, value);
+	}
+
+	PSIMD_INTRINSIC void psimd_store2_stride_f32(void* address, size_t stride, psimd_f32 value) {
+		float* address_f32 = (float*) address;
+		address_f32[0]      = value[0];
+		address_f32[stride] = value[1];
+	}
+
+	PSIMD_INTRINSIC void psimd_store3_stride_f32(void* address, size_t stride, psimd_f32 value) {
+		float* address0_f32 = (float*) address;
+		float* address1_f32 = address0_f32 + stride;
+		float* address2_f32 = address1_f32 + stride;
+		*address0_f32 = value[0];
+		*address1_f32 = value[1];
+		*address2_f32 = value[2];
+	}
+
+	/* Vector addition */
+	PSIMD_INTRINSIC psimd_s8 psimd_add_s8(psimd_s8 a, psimd_s8 b) {
+		return a + b;
+	}
+
+	PSIMD_INTRINSIC psimd_u8 psimd_add_u8(psimd_u8 a, psimd_u8 b) {
+		return a + b;
+	}
+
+	PSIMD_INTRINSIC psimd_s16 psimd_add_s16(psimd_s16 a, psimd_s16 b) {
+		return a + b;
+	}
+
+	PSIMD_INTRINSIC psimd_u16 psimd_add_u16(psimd_u16 a, psimd_u16 b) {
+		return a + b;
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_add_s32(psimd_s32 a, psimd_s32 b) {
+		return a + b;
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_add_u32(psimd_u32 a, psimd_u32 b) {
+		return a + b;
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_add_f32(psimd_f32 a, psimd_f32 b) {
+		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
+			return (psimd_f32) vaddq_f32((float32x4_t) a, (float32x4_t) b);
+		#else
+			return a + b;
+		#endif
+	}
+
+	/* Vector subtraction */
+	PSIMD_INTRINSIC psimd_s8 psimd_sub_s8(psimd_s8 a, psimd_s8 b) {
+		return a - b;
+	}
+
+	PSIMD_INTRINSIC psimd_u8 psimd_sub_u8(psimd_u8 a, psimd_u8 b) {
+		return a - b;
+	}
+
+	PSIMD_INTRINSIC psimd_s16 psimd_sub_s16(psimd_s16 a, psimd_s16 b) {
+		return a - b;
+	}
+
+	PSIMD_INTRINSIC psimd_u16 psimd_sub_u16(psimd_u16 a, psimd_u16 b) {
+		return a - b;
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_sub_s32(psimd_s32 a, psimd_s32 b) {
+		return a - b;
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_sub_u32(psimd_u32 a, psimd_u32 b) {
+		return a - b;
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_sub_f32(psimd_f32 a, psimd_f32 b) {
+		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
+			return (psimd_f32) vsubq_f32((float32x4_t) a, (float32x4_t) b);
+		#else
+			return a - b;
+		#endif
+	}
+
+	/* Vector multiplication */
+	PSIMD_INTRINSIC psimd_s8 psimd_mul_s8(psimd_s8 a, psimd_s8 b) {
+		return a * b;
+	}
+
+	PSIMD_INTRINSIC psimd_u8 psimd_mul_u8(psimd_u8 a, psimd_u8 b) {
+		return a * b;
+	}
+
+	PSIMD_INTRINSIC psimd_s16 psimd_mul_s16(psimd_s16 a, psimd_s16 b) {
+		return a * b;
+	}
+
+	PSIMD_INTRINSIC psimd_u16 psimd_mul_u16(psimd_u16 a, psimd_u16 b) {
+		return a * b;
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_mul_s32(psimd_s32 a, psimd_s32 b) {
+		return a * b;
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_mul_u32(psimd_u32 a, psimd_u32 b) {
+		return a * b;
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_mul_f32(psimd_f32 a, psimd_f32 b) {
+		#if defined(__ARM_ARCH_7A__) && defined(__ARM_NEON__) && !defined(__FAST_MATH__)
+			return (psimd_f32) vmulq_f32((float32x4_t) a, (float32x4_t) b);
+		#else
+			return a * b;
+		#endif
+	}
+
+	/* Quasi-Fused Multiply-Add */
+	PSIMD_INTRINSIC psimd_f32 psimd_qfma_f32(psimd_f32 a, psimd_f32 b, psimd_f32 c) {
+		#if defined(__aarch64__) || defined(__ARM_NEON__) && defined(__ARM_FEATURE_FMA)
+			return (psimd_f32) vfmaq_f32((float32x4_t) a, (float32x4_t) b, (float32x4_t) c);
+		#elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA__)
+			return (psimd_f32) _mm_fmadd_ps((__m128) c, (__m128) a, (__m128) b);
+		#elif (defined(__x86_64__) || defined(__i386__) || defined(__i686__)) && defined(__FMA4__)
+			return (psimd_f32) _mm_macc_ps((__m128) c, (__m128) a, (__m128) b);
+		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
+			return (psimd_f32) __builtin_wasm_qfma_f32x4(a, b, c);
+		#else
+			return a + b * c;
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_div_f32(psimd_f32 a, psimd_f32 b) {
+		return a / b;
+	}
+
+	/* Vector and */
+	PSIMD_INTRINSIC psimd_f32 psimd_andmask_f32(psimd_s32 mask, psimd_f32 v) {
+		return (psimd_f32) (mask & (psimd_s32) v);
+	}
+
+	/* Vector and-not */
+	PSIMD_INTRINSIC psimd_f32 psimd_andnotmask_f32(psimd_s32 mask, psimd_f32 v) {
+		return (psimd_f32) (~mask & (psimd_s32) v);
+	}
+
+	/* Vector blend */
+	PSIMD_INTRINSIC psimd_s8 psimd_blend_s8(psimd_s8 mask, psimd_s8 a, psimd_s8 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_s8) vbslq_s8((uint8x16_t) mask, (int8x16_t) a, (int8x16_t) b);
+		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
+			return (psimd_s8) __builtin_wasm_bitselect(a, b, mask);
+		#else
+			return (mask & a) | (~mask & b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_u8 psimd_blend_u8(psimd_s8 mask, psimd_u8 a, psimd_u8 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_u8) vbslq_u8((uint8x16_t) mask, (uint8x16_t) a, (uint8x16_t) b);
+		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
+			return (psimd_u8) __builtin_wasm_bitselect(a, b, mask);
+		#else
+			return (psimd_u8) ((mask & (psimd_s8) a) | (~mask & (psimd_s8) b));
+		#endif
+	}
+	
+	PSIMD_INTRINSIC psimd_s16 psimd_blend_s16(psimd_s16 mask, psimd_s16 a, psimd_s16 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_s16) vbslq_s16((uint16x8_t) mask, (int16x8_t) a, (int16x8_t) b);
+		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
+			return (psimd_s16) __builtin_wasm_bitselect(a, b, mask);
+		#else
+			return (mask & a) | (~mask & b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_u16 psimd_blend_u16(psimd_s16 mask, psimd_u16 a, psimd_u16 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_u16) vbslq_u16((uint16x8_t) mask, (uint16x8_t) a, (uint16x8_t) b);
+		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
+			return (psimd_u16) __builtin_wasm_bitselect(a, b, mask);
+		#else
+			return (psimd_u16) ((mask & (psimd_s16) a) | (~mask & (psimd_s16) b));
+		#endif
+	}
+	
+	PSIMD_INTRINSIC psimd_s32 psimd_blend_s32(psimd_s32 mask, psimd_s32 a, psimd_s32 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_s32) vbslq_s32((uint32x4_t) mask, (int32x4_t) a, (int32x4_t) b);
+		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
+			return (psimd_s32) __builtin_wasm_bitselect(a, b, mask);
+		#else
+			return (mask & a) | (~mask & b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_blend_u32(psimd_s32 mask, psimd_u32 a, psimd_u32 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_u32) vbslq_u32((uint32x4_t) mask, (uint32x4_t) a, (uint32x4_t) b);
+		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
+			return (psimd_u32) __builtin_wasm_bitselect(a, b, mask);
+		#else
+			return (psimd_u32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
+		#endif
+	}
+	
+	PSIMD_INTRINSIC psimd_f32 psimd_blend_f32(psimd_s32 mask, psimd_f32 a, psimd_f32 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_f32) vbslq_f32((uint32x4_t) mask, (float32x4_t) a, (float32x4_t) b);
+		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
+			return (psimd_f32) __builtin_wasm_bitselect(a, b, mask);
+		#else
+			return (psimd_f32) ((mask & (psimd_s32) a) | (~mask & (psimd_s32) b));
+		#endif
+	}
+
+	/* Vector blend on sign */
+	PSIMD_INTRINSIC psimd_s8 psimd_signblend_s8(psimd_s8 x, psimd_s8 a, psimd_s8 b) {
+		return psimd_blend_s8(x >> psimd_splat_s8(7), a, b);
+	}
+
+	PSIMD_INTRINSIC psimd_u8 psimd_signblend_u8(psimd_s8 x, psimd_u8 a, psimd_u8 b) {
+		return psimd_blend_u8((x >> psimd_splat_s8(7)), a, b);
+	}
+
+	PSIMD_INTRINSIC psimd_s16 psimd_signblend_s16(psimd_s16 x, psimd_s16 a, psimd_s16 b) {
+		return psimd_blend_s16(x >> psimd_splat_s16(15), a, b);
+	}
+
+	PSIMD_INTRINSIC psimd_u16 psimd_signblend_u16(psimd_s16 x, psimd_u16 a, psimd_u16 b) {
+		return psimd_blend_u16((x >> psimd_splat_s16(15)), a, b);
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_signblend_s32(psimd_s32 x, psimd_s32 a, psimd_s32 b) {
+		return psimd_blend_s32(x >> psimd_splat_s32(31), a, b);
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_signblend_u32(psimd_s32 x, psimd_u32 a, psimd_u32 b) {
+		return psimd_blend_u32((x >> psimd_splat_s32(31)), a, b);
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_signblend_f32(psimd_f32 x, psimd_f32 a, psimd_f32 b) {
+		const psimd_s32 mask = (psimd_s32) x >> psimd_splat_s32(31);
+		return psimd_blend_f32(mask, a, b);
+	}
+
+	/* Vector absolute value */
+	PSIMD_INTRINSIC psimd_f32 psimd_abs_f32(psimd_f32 v) {
+		const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
+		return (psimd_f32) ((psimd_s32) v & ~mask);
+	}
+
+	/* Vector negation */
+	PSIMD_INTRINSIC psimd_f32 psimd_neg_f32(psimd_f32 v) {
+		const psimd_s32 mask = (psimd_s32) psimd_splat_f32(-0.0f);
+		return (psimd_f32) ((psimd_s32) v ^ mask);
+	}
+
+	/* Vector maximum */
+	PSIMD_INTRINSIC psimd_s8 psimd_max_s8(psimd_s8 a, psimd_s8 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_s8) vmaxq_s8((int8x16_t) a, (int8x16_t) b);
+		#else
+			return psimd_blend_s8(a > b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_u8 psimd_max_u8(psimd_u8 a, psimd_u8 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_u8) vmaxq_u8((uint8x16_t) a, (uint8x16_t) b);
+		#else
+			return psimd_blend_u8(a > b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_s16 psimd_max_s16(psimd_s16 a, psimd_s16 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_s16) vmaxq_s16((int16x8_t) a, (int16x8_t) b);
+		#else
+			return psimd_blend_s16(a > b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_u16 psimd_max_u16(psimd_u16 a, psimd_u16 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_u16) vmaxq_u16((uint16x8_t) a, (uint16x8_t) b);
+		#else
+			return psimd_blend_u16(a > b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_max_s32(psimd_s32 a, psimd_s32 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_s32) vmaxq_s32((int32x4_t) a, (int32x4_t) b);
+		#else
+			return psimd_blend_s32(a > b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_max_u32(psimd_u32 a, psimd_u32 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_u32) vmaxq_u32((uint32x4_t) a, (uint32x4_t) b);
+		#else
+			return psimd_blend_u32(a > b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_max_f32(psimd_f32 a, psimd_f32 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_f32) vmaxq_f32((float32x4_t) a, (float32x4_t) b);
+		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
+			return __builtin_wasm_max_f32x4(a, b);
+		#else
+			return psimd_blend_f32(a > b, a, b);
+		#endif
+	}
+
+	/* Vector minimum */
+	PSIMD_INTRINSIC psimd_s8 psimd_min_s8(psimd_s8 a, psimd_s8 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_s8) vminq_s8((int8x16_t) a, (int8x16_t) b);
+		#else
+			return psimd_blend_s8(a < b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_u8 psimd_min_u8(psimd_u8 a, psimd_u8 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_u8) vminq_u8((uint8x16_t) a, (uint8x16_t) b);
+		#else
+			return psimd_blend_u8(a < b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_s16 psimd_min_s16(psimd_s16 a, psimd_s16 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_s16) vminq_s16((int16x8_t) a, (int16x8_t) b);
+		#else
+			return psimd_blend_s16(a < b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_u16 psimd_min_u16(psimd_u16 a, psimd_u16 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_u16) vminq_u16((uint16x8_t) a, (uint16x8_t) b);
+		#else
+			return psimd_blend_u16(a < b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_s32 psimd_min_s32(psimd_s32 a, psimd_s32 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_s32) vminq_s32((int32x4_t) a, (int32x4_t) b);
+		#else
+			return psimd_blend_s32(a < b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_u32 psimd_min_u32(psimd_u32 a, psimd_u32 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_u32) vminq_u32((uint32x4_t) a, (uint32x4_t) b);
+		#else
+			return psimd_blend_u32(a < b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_min_f32(psimd_f32 a, psimd_f32 b) {
+		#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_f32) vminq_f32((float32x4_t) a, (float32x4_t) b);
+		#elif defined(__wasm__) && defined(__wasm_simd128__) && defined(__clang__)
+			return __builtin_wasm_min_f32x4(a, b);
+		#else
+			return psimd_blend_f32(a < b, a, b);
+		#endif
+	}
+
+	PSIMD_INTRINSIC psimd_f32 psimd_cvt_s32_f32(psimd_s32 v) {
+		#if defined(__clang__)
+			return __builtin_convertvector(v, psimd_f32);
+		#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+			return (psimd_f32) vcvtq_f32_s32((int32x4_t) v);
+		#elif defined(__SSE2__)
+			return (psimd_f32) _mm_cvtepi32_ps((__m128i) v);
+		#else
+			return (psimd_f32) { (float) v[0], (float) v[1], (float) v[2], (float) v[3] };
+		#endif
+	}
+
+	/* Broadcast vector element */
+	#if defined(__clang__)
+		PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
+			return __builtin_shufflevector(v, v, 0, 0, 0, 0);
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
+			return __builtin_shufflevector(v, v, 1, 1, 1, 1);
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
+			return __builtin_shufflevector(v, v, 2, 2, 2, 2);
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
+			return __builtin_shufflevector(v, v, 3, 3, 3, 3);
+		}
+	#else
+		PSIMD_INTRINSIC psimd_f32 psimd_splat0_f32(psimd_f32 v) {
+			return __builtin_shuffle(v, (psimd_s32) { 0, 0, 0, 0 });
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_splat1_f32(psimd_f32 v) {
+			return __builtin_shuffle(v, (psimd_s32) { 1, 1, 1, 1 });
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_splat2_f32(psimd_f32 v) {
+			return __builtin_shuffle(v, (psimd_s32) { 2, 2, 2, 2 });
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_splat3_f32(psimd_f32 v) {
+			return __builtin_shuffle(v, (psimd_s32) { 3, 3, 3, 3 });
+		}
+	#endif
+
+	/* Reversal of vector elements */
+	#if defined(__clang__)
+		PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
+			return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+		}
+
+		PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
+			return __builtin_shufflevector(v, v, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+		}
+
+		PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
+			return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
+			return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
+			return __builtin_shufflevector(v, v, 3, 2, 1, 0);
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
+			return __builtin_shufflevector(v, v, 3, 2, 1, 0);
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
+			return __builtin_shufflevector(v, v, 3, 2, 1, 0);
+		}
+	#else
+		PSIMD_INTRINSIC psimd_s8 psimd_reverse_s8(psimd_s8 v) {
+			return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
+		}
+
+		PSIMD_INTRINSIC psimd_u8 psimd_reverse_u8(psimd_u8 v) {
+			return __builtin_shuffle(v, (psimd_s8) { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 });
+		}
+
+		PSIMD_INTRINSIC psimd_s16 psimd_reverse_s16(psimd_s16 v) {
+			return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_reverse_u16(psimd_u16 v) {
+			return __builtin_shuffle(v, (psimd_s16) { 7, 6, 5, 4, 3, 2, 1, 0 });
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_reverse_s32(psimd_s32 v) {
+			return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_reverse_u32(psimd_u32 v) {
+			return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_reverse_f32(psimd_f32 v) {
+			return __builtin_shuffle(v, (psimd_s32) { 3, 2, 1, 0 });
+		}
+	#endif
+
+	/* Interleaving of vector elements */
+	#if defined(__clang__)
+		PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
+		}
+
+		PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shufflevector(a, b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shufflevector(a, b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shufflevector(a, b, 0, 4+0, 1, 4+1);
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shufflevector(a, b, 2, 4+2, 3, 4+3);
+		}
+	#else
+		PSIMD_INTRINSIC psimd_s16 psimd_interleave_lo_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
+		}
+
+		PSIMD_INTRINSIC psimd_s16 psimd_interleave_hi_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_interleave_lo_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3 });
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_interleave_hi_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7 });
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_interleave_lo_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_interleave_hi_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_interleave_lo_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_interleave_hi_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_interleave_lo_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 0, 4+0, 1, 4+1 });
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_interleave_hi_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 2, 4+2, 3, 4+3 });
+		}
+	#endif
+
+	/* Concatenation of low/high vector elements */
+	#if defined(__clang__)
+		PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
+		}
+
+		PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3);
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shufflevector(a, b, 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7);
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shufflevector(a, b, 0, 1, 4+0, 4+1);
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shufflevector(a, b, 2, 3, 4+2, 4+3);
+		}
+	#else
+		PSIMD_INTRINSIC psimd_s16 psimd_concat_lo_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
+		}
+
+		PSIMD_INTRINSIC psimd_s16 psimd_concat_hi_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_concat_lo_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 0, 1, 2, 3, 8+0, 8+1, 8+2, 8+3 });
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_concat_hi_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 4, 5, 6, 7, 8+4, 8+5, 8+6, 8+7 });
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_concat_lo_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_concat_hi_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_concat_lo_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_concat_hi_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_concat_lo_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 0, 1, 4+0, 4+1 });
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_concat_hi_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 2, 3, 4+2, 4+3 });
+		}
+	#endif
+
+	/* Concatenation of even/odd vector elements */
+	#if defined(__clang__)
+		PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
+			return __builtin_shufflevector(a, b,
+				0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
+		}
+
+		PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
+			return __builtin_shufflevector(a, b,
+				1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
+		}
+
+		PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
+			return __builtin_shufflevector(a, b,
+				0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14);
+		}
+
+		PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
+			return __builtin_shufflevector(a, b,
+				1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15);
+		}
+
+		PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
+		}
+
+		PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shufflevector(a, b, 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6);
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shufflevector(a, b, 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7);
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shufflevector(a, b, 0, 2, 4+0, 4+2);
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shufflevector(a, b, 1, 3, 4+1, 4+3);
+		}
+	#else
+		PSIMD_INTRINSIC psimd_s8 psimd_concat_even_s8(psimd_s8 a, psimd_s8 b) {
+			return __builtin_shuffle(a, b,
+				(psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
+		}
+
+		PSIMD_INTRINSIC psimd_s8 psimd_concat_odd_s8(psimd_s8 a, psimd_s8 b) {
+			return __builtin_shuffle(a, b,
+				(psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
+		}
+
+		PSIMD_INTRINSIC psimd_u8 psimd_concat_even_u8(psimd_u8 a, psimd_u8 b) {
+			return __builtin_shuffle(a, b,
+				(psimd_s8) { 0, 2, 4, 6, 8, 10, 12, 14, 16+0, 16+2, 16+4, 16+6, 16+8, 16+10, 16+12, 16+14 });
+		}
+
+		PSIMD_INTRINSIC psimd_u8 psimd_concat_odd_u8(psimd_u8 a, psimd_u8 b) {
+			return __builtin_shuffle(a, b,
+				(psimd_s8) { 1, 3, 5, 7, 9, 11, 13, 15, 16+1, 16+3, 16+5, 16+7, 16+9, 16+11, 16+13, 16+15 });
+		}
+
+		PSIMD_INTRINSIC psimd_s16 psimd_concat_even_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
+		}
+
+		PSIMD_INTRINSIC psimd_s16 psimd_concat_odd_s16(psimd_s16 a, psimd_s16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_concat_even_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 0, 2, 4, 6, 8+0, 8+2, 8+4, 8+6 });
+		}
+
+		PSIMD_INTRINSIC psimd_u16 psimd_concat_odd_u16(psimd_u16 a, psimd_u16 b) {
+			return __builtin_shuffle(a, b, (psimd_s16) { 1, 3, 5, 7, 8+1, 8+3, 8+5, 8+7 });
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_concat_even_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
+		}
+
+		PSIMD_INTRINSIC psimd_s32 psimd_concat_odd_s32(psimd_s32 a, psimd_s32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_concat_even_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
+		}
+
+		PSIMD_INTRINSIC psimd_u32 psimd_concat_odd_u32(psimd_u32 a, psimd_u32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_concat_even_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 0, 2, 4+0, 4+2 });
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_concat_odd_f32(psimd_f32 a, psimd_f32 b) {
+			return __builtin_shuffle(a, b, (psimd_s32) { 1, 3, 4+1, 4+3 });
+		}
+	#endif
+
+	/* Vector reduce */
+	#if defined(__clang__)
+		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
+			const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, 0, 1);
+			return temp + __builtin_shufflevector(temp, temp, 1, 0, 3, 2);
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
+			const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
+			return psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
+			const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, 0, 1));
+			return psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, 0, 3, 2));
+		}
+
+		PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
+			const psimd_f32 temp = v + __builtin_shufflevector(v, v, 2, 3, -1, -1);
+			const psimd_f32 result = temp + __builtin_shufflevector(temp, temp, 1, -1, -1, -1);
+			return result[0];
+		}
+
+		PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
+			const psimd_f32 temp = psimd_max_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
+			const psimd_f32 result = psimd_max_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
+			return result[0];
+		}
+
+		PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
+			const psimd_f32 temp = psimd_min_f32(v, __builtin_shufflevector(v, v, 2, 3, -1, -1));
+			const psimd_f32 result = psimd_min_f32(temp, __builtin_shufflevector(temp, temp, 1, -1, -1, -1));
+			return result[0];
+		}
+	#else
+		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_sum_f32(psimd_f32 v) {
+			const psimd_f32 temp = v + __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 });
+			return temp + __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 });
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_max_f32(psimd_f32 v) {
+			const psimd_f32 temp = psimd_max_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
+			return psimd_max_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
+		}
+
+		PSIMD_INTRINSIC psimd_f32 psimd_allreduce_min_f32(psimd_f32 v) {
+			const psimd_f32 temp = psimd_min_f32(v, __builtin_shuffle(v, (psimd_s32) { 2, 3, 0, 1 }));
+			return psimd_min_f32(temp, __builtin_shuffle(temp, (psimd_s32) { 1, 0, 3, 2 }));
+		}
+
+		PSIMD_INTRINSIC float psimd_reduce_sum_f32(psimd_f32 v) {
+			const psimd_f32 result = psimd_allreduce_sum_f32(v);
+			return result[0];
+		}
+
+		PSIMD_INTRINSIC float psimd_reduce_max_f32(psimd_f32 v) {
+			const psimd_f32 result = psimd_allreduce_max_f32(v);
+			return result[0];
+		}
+
+		PSIMD_INTRINSIC float psimd_reduce_min_f32(psimd_f32 v) {
+			const psimd_f32 result = psimd_allreduce_min_f32(v);
+			return result[0];
+		}
+	#endif
+#endif
+
+#endif /* PSIMD_H */