[linux-x86] Refresh gcc-4.9 prebuilts for X86_64
gcc.4-9 source is from google/gcc-4_9 branch.

See prev commit message for build instructions

Change-Id: I310615253d4b359784a9d63622fddef47a71ea74
diff --git a/SOURCES b/SOURCES
index 3c437f9..e111615 100644
--- a/SOURCES
+++ b/SOURCES
@@ -6,16 +6,16 @@
 toolchain/isl.git                      b05d4572958c5d497da793f3317084bab90c3033 add isl-0.11.1.tar.bz2 needed by GCC 4.8 with graphite
 toolchain/ppl.git                      8ba1875b4c5341d902321761022a6d2a0b5b19a4 add ppl-1.0.tar.bz2
 toolchain/expat.git                    40172a0ae9d40a068f1e1a48ffcf6a1ccf765ed5 expat package for building gdb-7.3
-toolchain/binutils.git                 2a6558a8ecfb81d75215b4ec7dc61113e12cfd5f Merge "[2.24] Fix assert failure with --emit-relocs and .eh_frame sections by backporting the following patch from mainline:"
-toolchain/gcc.git                      8f2898c09773292b6ba523cee7e528a320ff1e02 Merge "[4.8, 4.9] Fix aarch64/arm_neon.h vqdmulh_n_s16"
-toolchain/gdb.git                      df2206c67d812619976f49319911ed12c8220b11 Fix broken x86_64 gdbserver build
+toolchain/binutils.git                 c8e1636493f7d40bb0ba37501e9a6f67cd062a31 [2.24] Cortex-A53 erratum 835769 linker workaround This is ported from upstream binutils 2.24 branch   commit: cde98f8566e14f52b896abc92c357cdd14717505
+toolchain/gcc.git                      ba21122c63b06a70b620c8990c517b87198ad05b [4.9] Restore recog state after finding pre-madd instruction. This patch backports svn r216853 from upsteam gcc-4.9 branch that fixed a bug introduced in fixing cortex a53 erratum 835769.
+toolchain/gdb.git                      24237bc8bc3001a82d6cd9685719c4679f721792 fix some build errors
 toolchain/python.git                   0d4194853e08d3244931523470331c00dfb94863 Fix python build inc_dirs[] and lib_dirs[] for linux/darwin
 toolchain/perl.git                     1121daca35c6c692602621eab28d4de19f0b347d Add -Dcc_as_ld to configure
-toolchain/mclinker.git                 5fca8b9c9c671d6c01f428c00ca131e65042a9fd Merge upstream mclinker 2.7
+toolchain/mclinker.git                 66bf13b826497336180182aef105716603921494 Merge branch 'master' of https://android.googlesource.com/toolchain/mclinker into aosp
 toolchain/yasm.git                     87c09baff80ca5bbe938392d8f320e621707f317 test commit
-toolchain/clang.git (release_34)       32d5296021ee140d3afa467ef54de7f9daf9c9c1 Backport clang svn@r207520
-toolchain/llvm.git (release_34)        4e3e3137a7d3c271f5190ac9ef8af7848e5f67e2 Fix missing change for expandVAArg.
-toolchain/compiler-rt.git (release_34) b065fccd8ab47b4de4610093fb98514fc5df2362 Alias __aeabi_fcmpun to __unordsf2.
-toolchain/clang.git (release_33)       260f965f1273cd0aa8a1270885519df8e6317062 [ndk] Fix diagnostics for C-style cast to function type.
-toolchain/llvm.git (release_33)        8609a3469a8126eb6fb99ff65906bcc20c272d95 [ndk] Fix createBranchWeights() assertions.
-toolchain/compiler-rt.git (release_33) c880feaaa8829681a025d29a33704c18e21e87e1 Misc fixes for compiler_rt
+toolchain/clang.git (release_35)       e8513044903a3766443ef6d5eab92e376f2eba32 Fix assertion failure on DeferredDeclsToEmit.
+toolchain/llvm.git (release_35)        6797a08398a6b54589c37e8d8f9e9a26b4fb5621 [ndk] Fix inappropriate debug info assertion.
+toolchain/compiler-rt.git (release_35) 58bfd9dc4d03012ed9e59dc21e6b2e098f9476f8 Implement __aeabi_idiv0 and __aeabi_uidiv0.
+toolchain/clang.git (release_34)       a284801985744bd9667b9d14f079b3a2e8b19130 [ndk] Define __USER_LABEL_PREFIX__ to empty string.
+toolchain/llvm.git (release_34)        b2631bf0dc9579ddfb4bc1afdf7998c12fa268af Merge "python: AC_PATH_PROG -> AC_PATH_PROGS and fix search order" into release_34
+toolchain/compiler-rt.git (release_34) 4f9b4718e8d836317c224955a1e87b7bb5252ae1 Update clear_cache to trunk@208591
diff --git a/bin/x86_64-linux-android-addr2line b/bin/x86_64-linux-android-addr2line
index a68613f..9b415bf 100755
--- a/bin/x86_64-linux-android-addr2line
+++ b/bin/x86_64-linux-android-addr2line
Binary files differ
diff --git a/bin/x86_64-linux-android-ar b/bin/x86_64-linux-android-ar
index a7224ad..783f065 100755
--- a/bin/x86_64-linux-android-ar
+++ b/bin/x86_64-linux-android-ar
Binary files differ
diff --git a/bin/x86_64-linux-android-as b/bin/x86_64-linux-android-as
index b71340b..a942719 100755
--- a/bin/x86_64-linux-android-as
+++ b/bin/x86_64-linux-android-as
Binary files differ
diff --git a/bin/x86_64-linux-android-c++filt b/bin/x86_64-linux-android-c++filt
index d7f5957..5a13216 100755
--- a/bin/x86_64-linux-android-c++filt
+++ b/bin/x86_64-linux-android-c++filt
Binary files differ
diff --git a/bin/x86_64-linux-android-cpp b/bin/x86_64-linux-android-cpp
index 2b5e18b..465f379 100755
--- a/bin/x86_64-linux-android-cpp
+++ b/bin/x86_64-linux-android-cpp
Binary files differ
diff --git a/bin/x86_64-linux-android-g++ b/bin/x86_64-linux-android-g++
index 41b1a5a..6d33342 100755
--- a/bin/x86_64-linux-android-g++
+++ b/bin/x86_64-linux-android-g++
Binary files differ
diff --git a/bin/x86_64-linux-android-gcc b/bin/x86_64-linux-android-gcc
index 40e37b8..0713d6e 100755
--- a/bin/x86_64-linux-android-gcc
+++ b/bin/x86_64-linux-android-gcc
Binary files differ
diff --git a/bin/x86_64-linux-android-gcov b/bin/x86_64-linux-android-gcov
index 659aeab..fa8c4e4 100755
--- a/bin/x86_64-linux-android-gcov
+++ b/bin/x86_64-linux-android-gcov
Binary files differ
diff --git a/bin/x86_64-linux-android-gcov-tool b/bin/x86_64-linux-android-gcov-tool
new file mode 100755
index 0000000..0905437
--- /dev/null
+++ b/bin/x86_64-linux-android-gcov-tool
Binary files differ
diff --git a/bin/x86_64-linux-android-gdb b/bin/x86_64-linux-android-gdb
index 267a5bd..a8713c1 100755
--- a/bin/x86_64-linux-android-gdb
+++ b/bin/x86_64-linux-android-gdb
Binary files differ
diff --git a/bin/x86_64-linux-android-gprof b/bin/x86_64-linux-android-gprof
index e5a1fc9..705bc4d 100755
--- a/bin/x86_64-linux-android-gprof
+++ b/bin/x86_64-linux-android-gprof
Binary files differ
diff --git a/bin/x86_64-linux-android-ld.bfd b/bin/x86_64-linux-android-ld.bfd
index e62c0da..c16c913 100755
--- a/bin/x86_64-linux-android-ld.bfd
+++ b/bin/x86_64-linux-android-ld.bfd
Binary files differ
diff --git a/bin/x86_64-linux-android-ld.mcld b/bin/x86_64-linux-android-ld.mcld
index 0de2251..61d362c 100755
--- a/bin/x86_64-linux-android-ld.mcld
+++ b/bin/x86_64-linux-android-ld.mcld
Binary files differ
diff --git a/bin/x86_64-linux-android-nm b/bin/x86_64-linux-android-nm
index e087950..936a398 100755
--- a/bin/x86_64-linux-android-nm
+++ b/bin/x86_64-linux-android-nm
Binary files differ
diff --git a/bin/x86_64-linux-android-objcopy b/bin/x86_64-linux-android-objcopy
index d3564fa..6282d52 100755
--- a/bin/x86_64-linux-android-objcopy
+++ b/bin/x86_64-linux-android-objcopy
Binary files differ
diff --git a/bin/x86_64-linux-android-objdump b/bin/x86_64-linux-android-objdump
index 03e3d8e..9924978 100755
--- a/bin/x86_64-linux-android-objdump
+++ b/bin/x86_64-linux-android-objdump
Binary files differ
diff --git a/bin/x86_64-linux-android-ranlib b/bin/x86_64-linux-android-ranlib
index 2776b20..68c5429 100755
--- a/bin/x86_64-linux-android-ranlib
+++ b/bin/x86_64-linux-android-ranlib
Binary files differ
diff --git a/bin/x86_64-linux-android-readelf b/bin/x86_64-linux-android-readelf
index 6cde6c5..a2ce89a 100755
--- a/bin/x86_64-linux-android-readelf
+++ b/bin/x86_64-linux-android-readelf
Binary files differ
diff --git a/bin/x86_64-linux-android-size b/bin/x86_64-linux-android-size
index fe1383d..ba2ee5b 100755
--- a/bin/x86_64-linux-android-size
+++ b/bin/x86_64-linux-android-size
Binary files differ
diff --git a/bin/x86_64-linux-android-strings b/bin/x86_64-linux-android-strings
index fe096d7..f9f3bcb 100755
--- a/bin/x86_64-linux-android-strings
+++ b/bin/x86_64-linux-android-strings
Binary files differ
diff --git a/bin/x86_64-linux-android-strip b/bin/x86_64-linux-android-strip
index 9640255..ffbcf4a 100755
--- a/bin/x86_64-linux-android-strip
+++ b/bin/x86_64-linux-android-strip
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/32/crtbegin.o b/lib/gcc/x86_64-linux-android/4.9/32/crtbegin.o
index a333bb8..77fe4f4 100644
--- a/lib/gcc/x86_64-linux-android/4.9/32/crtbegin.o
+++ b/lib/gcc/x86_64-linux-android/4.9/32/crtbegin.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/32/crtbeginS.o b/lib/gcc/x86_64-linux-android/4.9/32/crtbeginS.o
index a9ff81d..74f5b7a 100644
--- a/lib/gcc/x86_64-linux-android/4.9/32/crtbeginS.o
+++ b/lib/gcc/x86_64-linux-android/4.9/32/crtbeginS.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/32/crtbeginT.o b/lib/gcc/x86_64-linux-android/4.9/32/crtbeginT.o
index a333bb8..77fe4f4 100644
--- a/lib/gcc/x86_64-linux-android/4.9/32/crtbeginT.o
+++ b/lib/gcc/x86_64-linux-android/4.9/32/crtbeginT.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/32/crtend.o b/lib/gcc/x86_64-linux-android/4.9/32/crtend.o
index 687ae6f..0d8db8d 100644
--- a/lib/gcc/x86_64-linux-android/4.9/32/crtend.o
+++ b/lib/gcc/x86_64-linux-android/4.9/32/crtend.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/32/crtendS.o b/lib/gcc/x86_64-linux-android/4.9/32/crtendS.o
index 687ae6f..0d8db8d 100644
--- a/lib/gcc/x86_64-linux-android/4.9/32/crtendS.o
+++ b/lib/gcc/x86_64-linux-android/4.9/32/crtendS.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/32/crtfastmath.o b/lib/gcc/x86_64-linux-android/4.9/32/crtfastmath.o
index d59dc6e..6e6756e 100644
--- a/lib/gcc/x86_64-linux-android/4.9/32/crtfastmath.o
+++ b/lib/gcc/x86_64-linux-android/4.9/32/crtfastmath.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/32/crtprec32.o b/lib/gcc/x86_64-linux-android/4.9/32/crtprec32.o
index 0614852..b0d3cb0 100644
--- a/lib/gcc/x86_64-linux-android/4.9/32/crtprec32.o
+++ b/lib/gcc/x86_64-linux-android/4.9/32/crtprec32.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/32/crtprec64.o b/lib/gcc/x86_64-linux-android/4.9/32/crtprec64.o
index 42138bf..381c5e5 100644
--- a/lib/gcc/x86_64-linux-android/4.9/32/crtprec64.o
+++ b/lib/gcc/x86_64-linux-android/4.9/32/crtprec64.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/32/crtprec80.o b/lib/gcc/x86_64-linux-android/4.9/32/crtprec80.o
index a541e6f..b869221 100644
--- a/lib/gcc/x86_64-linux-android/4.9/32/crtprec80.o
+++ b/lib/gcc/x86_64-linux-android/4.9/32/crtprec80.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/32/libgcc.a b/lib/gcc/x86_64-linux-android/4.9/32/libgcc.a
index a6011ca..0650efd 100644
--- a/lib/gcc/x86_64-linux-android/4.9/32/libgcc.a
+++ b/lib/gcc/x86_64-linux-android/4.9/32/libgcc.a
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/32/libgcov.a b/lib/gcc/x86_64-linux-android/4.9/32/libgcov.a
index b72d700..6a1f2c3 100644
--- a/lib/gcc/x86_64-linux-android/4.9/32/libgcov.a
+++ b/lib/gcc/x86_64-linux-android/4.9/32/libgcov.a
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/crtbegin.o b/lib/gcc/x86_64-linux-android/4.9/crtbegin.o
index 2597da2..cc9da22 100644
--- a/lib/gcc/x86_64-linux-android/4.9/crtbegin.o
+++ b/lib/gcc/x86_64-linux-android/4.9/crtbegin.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/crtbeginS.o b/lib/gcc/x86_64-linux-android/4.9/crtbeginS.o
index 7ea0491..db87d0a 100644
--- a/lib/gcc/x86_64-linux-android/4.9/crtbeginS.o
+++ b/lib/gcc/x86_64-linux-android/4.9/crtbeginS.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/crtbeginT.o b/lib/gcc/x86_64-linux-android/4.9/crtbeginT.o
index 2597da2..cc9da22 100644
--- a/lib/gcc/x86_64-linux-android/4.9/crtbeginT.o
+++ b/lib/gcc/x86_64-linux-android/4.9/crtbeginT.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/crtend.o b/lib/gcc/x86_64-linux-android/4.9/crtend.o
index 88e5daf..451e2ec 100644
--- a/lib/gcc/x86_64-linux-android/4.9/crtend.o
+++ b/lib/gcc/x86_64-linux-android/4.9/crtend.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/crtendS.o b/lib/gcc/x86_64-linux-android/4.9/crtendS.o
index 88e5daf..451e2ec 100644
--- a/lib/gcc/x86_64-linux-android/4.9/crtendS.o
+++ b/lib/gcc/x86_64-linux-android/4.9/crtendS.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/crtfastmath.o b/lib/gcc/x86_64-linux-android/4.9/crtfastmath.o
index e5c1d09..7586a98 100644
--- a/lib/gcc/x86_64-linux-android/4.9/crtfastmath.o
+++ b/lib/gcc/x86_64-linux-android/4.9/crtfastmath.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/crtprec32.o b/lib/gcc/x86_64-linux-android/4.9/crtprec32.o
index 16ebe52..fd3de32 100644
--- a/lib/gcc/x86_64-linux-android/4.9/crtprec32.o
+++ b/lib/gcc/x86_64-linux-android/4.9/crtprec32.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/crtprec64.o b/lib/gcc/x86_64-linux-android/4.9/crtprec64.o
index 1724eb7..ef505ab 100644
--- a/lib/gcc/x86_64-linux-android/4.9/crtprec64.o
+++ b/lib/gcc/x86_64-linux-android/4.9/crtprec64.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/crtprec80.o b/lib/gcc/x86_64-linux-android/4.9/crtprec80.o
index 89b83dc..8dba5f5 100644
--- a/lib/gcc/x86_64-linux-android/4.9/crtprec80.o
+++ b/lib/gcc/x86_64-linux-android/4.9/crtprec80.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/gcov-src/gcov-io.c b/lib/gcc/x86_64-linux-android/4.9/gcov-src/gcov-io.c
new file mode 100644
index 0000000..f226cbf
--- /dev/null
+++ b/lib/gcc/x86_64-linux-android/4.9/gcov-src/gcov-io.c
@@ -0,0 +1,1088 @@
+/* File format for coverage information
+   Copyright (C) 1996-2014 Free Software Foundation, Inc.
+   Contributed by Bob Manson <manson@cygnus.com>.
+   Completely remangled by Nathan Sidwell <nathan@codesourcery.com>.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+/* Routines declared in gcov-io.h.  This file should be #included by
+   another source file, after having #included gcov-io.h.  */
+
+#if !IN_GCOV
+static void gcov_write_block (unsigned);
+static gcov_unsigned_t *gcov_write_words (unsigned);
+#endif
+static const gcov_unsigned_t *gcov_read_words (unsigned);
+#if !IN_LIBGCOV
+static void gcov_allocate (unsigned);
+#endif
+
+/* Optimum number of gcov_unsigned_t's read from or written to disk.  */
+#define GCOV_BLOCK_SIZE (1 << 10)
+
+GCOV_LINKAGE struct gcov_var
+{
+  FILE *file;
+  gcov_position_t start;	/* Position of first byte of block */
+  unsigned offset;		/* Read/write position within the block.  */
+  unsigned length;		/* Read limit in the block.  */
+  unsigned overread;		/* Number of words overread.  */
+  int error;			/* < 0 overflow, > 0 disk error.  */
+  int mode;	                /* < 0 writing, > 0 reading */
+#if IN_LIBGCOV
+  /* Holds one block plus 4 bytes, thus all coverage reads & writes
+     fit within this buffer and we always can transfer GCOV_BLOCK_SIZE
+     to and from the disk. libgcov never backtracks and only writes 4
+     or 8 byte objects.  */
+  gcov_unsigned_t buffer[GCOV_BLOCK_SIZE + 1];
+#else
+  int endian;			/* Swap endianness.  */
+  /* Holds a variable length block, as the compiler can write
+     strings and needs to backtrack.  */
+  size_t alloc;
+  gcov_unsigned_t *buffer;
+#endif
+} gcov_var;
+
+/* Save the current position in the gcov file.  */
+/* We need to expose this function when compiling for gcov-tool.  */
+#ifndef IN_GCOV_TOOL
+static inline
+#endif
+gcov_position_t
+gcov_position (void)
+{
+  return gcov_var.start + gcov_var.offset;
+}
+
+/* Return nonzero if the error flag is set.  */
+/* We need to expose this function when compiling for gcov-tool.  */
+#ifndef IN_GCOV_TOOL
+static inline
+#endif
+int
+gcov_is_error (void)
+{
+  return gcov_var.file ? gcov_var.error : 1;
+}
+
+#if IN_LIBGCOV
+/* Move to beginning of file and initialize for writing.  */
+GCOV_LINKAGE inline void
+gcov_rewrite (void)
+{
+  gcc_assert (gcov_var.mode > 0); 
+  gcov_var.mode = -1; 
+  gcov_var.start = 0;
+  gcov_var.offset = 0;
+  fseek (gcov_var.file, 0L, SEEK_SET);
+}
+#endif
+
+static inline gcov_unsigned_t from_file (gcov_unsigned_t value)
+{
+#if !IN_LIBGCOV
+  if (gcov_var.endian)
+    {
+      value = (value >> 16) | (value << 16);
+      value = ((value & 0xff00ff) << 8) | ((value >> 8) & 0xff00ff);
+    }
+#endif
+  return value;
+}
+
+/* Open a gcov file. NAME is the name of the file to open and MODE
+   indicates whether a new file should be created, or an existing file
+   opened. If MODE is >= 0 an existing file will be opened, if
+   possible, and if MODE is <= 0, a new file will be created. Use
+   MODE=0 to attempt to reopen an existing file and then fall back on
+   creating a new one.  If MODE < 0, the file will be opened in
+   read-only mode.  Otherwise it will be opened for modification.
+   Return zero on failure, >0 on opening an existing file and <0 on
+   creating a new one.  */
+
+GCOV_LINKAGE int
+#if IN_LIBGCOV
+gcov_open (const char *name)
+#else
+gcov_open (const char *name, int mode)
+#endif
+{
+#if IN_LIBGCOV
+  const int mode = 0;
+#endif
+#if GCOV_LOCKED
+  struct flock s_flock;
+  int fd;
+
+  s_flock.l_whence = SEEK_SET;
+  s_flock.l_start = 0;
+  s_flock.l_len = 0; /* Until EOF.  */
+  s_flock.l_pid = getpid ();
+#endif
+
+  gcc_assert (!gcov_var.file);
+  gcov_var.start = 0;
+  gcov_var.offset = gcov_var.length = 0;
+  gcov_var.overread = -1u;
+  gcov_var.error = 0;
+#if !IN_LIBGCOV
+  gcov_var.endian = 0;
+#endif
+#if GCOV_LOCKED
+  if (mode > 0)
+    {
+      /* Read-only mode - acquire a read-lock.  */
+      s_flock.l_type = F_RDLCK;
+      /* pass mode (ignored) for compatibility */
+      fd = open (name, O_RDONLY, S_IRUSR | S_IWUSR);
+    }
+  else if (mode < 0)
+     {
+       /* Write mode - acquire a write-lock.  */
+       s_flock.l_type = F_WRLCK;
+      fd = open (name, O_RDWR | O_CREAT | O_TRUNC, 0666);
+    }
+  else /* mode == 0 */
+    {
+      /* Read-Write mode - acquire a write-lock.  */
+      s_flock.l_type = F_WRLCK;
+      fd = open (name, O_RDWR | O_CREAT, 0666);
+    }
+  if (fd < 0)
+    return 0;
+
+  while (fcntl (fd, F_SETLKW, &s_flock) && errno == EINTR)
+    continue;
+
+  gcov_var.file = fdopen (fd, (mode > 0) ? "rb" : "r+b");
+
+  if (!gcov_var.file)
+    {
+      close (fd);
+      return 0;
+    }
+
+  if (mode > 0)
+    gcov_var.mode = 1;
+  else if (mode == 0)
+    {
+      struct stat st;
+
+      if (fstat (fd, &st) < 0)
+	{
+	  fclose (gcov_var.file);
+	  gcov_var.file = 0;
+	  return 0;
+	}
+      if (st.st_size != 0)
+	gcov_var.mode = 1;
+      else
+	gcov_var.mode = mode * 2 + 1;
+    }
+  else
+    gcov_var.mode = mode * 2 + 1;
+#else
+  if (mode >= 0)
+    gcov_var.file = fopen (name, (mode > 0) ? "rb" : "r+b");
+
+  if (gcov_var.file)
+    gcov_var.mode = 1;
+  else if (mode <= 0)
+    {
+      gcov_var.file = fopen (name, "w+b");
+      if (gcov_var.file)
+	gcov_var.mode = mode * 2 + 1;
+    }
+  if (!gcov_var.file)
+    return 0;
+#endif
+
+  setbuf (gcov_var.file, (char *)0);
+
+  return 1;
+}
+
+/* Close the current gcov file. Flushes data to disk. Returns nonzero
+   on failure or error flag set.  */
+
+GCOV_LINKAGE int
+gcov_close (void)
+{
+  if (gcov_var.file)
+    {
+#if !IN_GCOV
+      if (gcov_var.offset && gcov_var.mode < 0)
+	gcov_write_block (gcov_var.offset);
+#endif
+      fclose (gcov_var.file);
+      gcov_var.file = 0;
+      gcov_var.length = 0;
+    }
+#if !IN_LIBGCOV
+  free (gcov_var.buffer);
+  gcov_var.alloc = 0;
+  gcov_var.buffer = 0;
+#endif
+  gcov_var.mode = 0;
+  return gcov_var.error;
+}
+
+#if !IN_LIBGCOV
+/* Check if MAGIC is EXPECTED. Use it to determine endianness of the
+   file. Returns +1 for same endian, -1 for other endian and zero for
+   not EXPECTED.  */
+
+GCOV_LINKAGE int
+gcov_magic (gcov_unsigned_t magic, gcov_unsigned_t expected)
+{
+  if (magic == expected)
+    return 1;
+  magic = (magic >> 16) | (magic << 16);
+  magic = ((magic & 0xff00ff) << 8) | ((magic >> 8) & 0xff00ff);
+  if (magic == expected)
+    {
+      gcov_var.endian = 1;
+      return -1;
+    }
+  return 0;
+}
+#endif
+
+#if !IN_LIBGCOV
+static void
+gcov_allocate (unsigned length)
+{
+  size_t new_size = gcov_var.alloc;
+
+  if (!new_size)
+    new_size = GCOV_BLOCK_SIZE;
+  new_size += length;
+  new_size *= 2;
+
+  gcov_var.alloc = new_size;
+  gcov_var.buffer = XRESIZEVAR (gcov_unsigned_t, gcov_var.buffer, new_size << 2);
+}
+#endif
+
+#if !IN_GCOV
+/* Write out the current block, if needs be.  */
+
+static void
+gcov_write_block (unsigned size)
+{
+  if (fwrite (gcov_var.buffer, size << 2, 1, gcov_var.file) != 1)
+    gcov_var.error = 1;
+  gcov_var.start += size;
+  gcov_var.offset -= size;
+}
+
+/* Allocate space to write BYTES bytes to the gcov file. Return a
+   pointer to those bytes, or NULL on failure.  */
+
+static gcov_unsigned_t *
+gcov_write_words (unsigned words)
+{
+  gcov_unsigned_t *result;
+
+  gcc_assert (gcov_var.mode < 0);
+#if IN_LIBGCOV
+  if (gcov_var.offset >= GCOV_BLOCK_SIZE)
+    {
+      gcov_write_block (GCOV_BLOCK_SIZE);
+      if (gcov_var.offset)
+	{
+	  gcc_assert (gcov_var.offset == 1);
+	  memcpy (gcov_var.buffer, gcov_var.buffer + GCOV_BLOCK_SIZE, 4);
+	}
+    }
+#else
+  if (gcov_var.offset + words > gcov_var.alloc)
+    gcov_allocate (gcov_var.offset + words);
+#endif
+  result = &gcov_var.buffer[gcov_var.offset];
+  gcov_var.offset += words;
+
+  return result;
+}
+
+/* Write unsigned VALUE to coverage file.  Sets error flag
+   appropriately.  */
+
+GCOV_LINKAGE void
+gcov_write_unsigned (gcov_unsigned_t value)
+{
+  gcov_unsigned_t *buffer = gcov_write_words (1);
+
+  buffer[0] = value;
+}
+
+/* Write counter VALUE to coverage file.  Sets error flag
+   appropriately.  */
+
+#if IN_LIBGCOV
+GCOV_LINKAGE void
+gcov_write_counter (gcov_type value)
+{
+  gcov_unsigned_t *buffer = gcov_write_words (2);
+
+  buffer[0] = (gcov_unsigned_t) value;
+  if (sizeof (value) > sizeof (gcov_unsigned_t))
+    buffer[1] = (gcov_unsigned_t) (value >> 32);
+  else
+    buffer[1] = 0;
+}
+#endif /* IN_LIBGCOV */
+
+#if !IN_LIBGCOV
+/* Write STRING to coverage file.  Sets error flag on file
+   error, overflow flag on overflow */
+
+GCOV_LINKAGE void
+gcov_write_string (const char *string)
+{
+  unsigned length = 0;
+  unsigned alloc = 0;
+  gcov_unsigned_t *buffer;
+
+  if (string)
+    {
+      length = strlen (string);
+      alloc = (length + 4) >> 2;
+    }
+
+  buffer = gcov_write_words (1 + alloc);
+
+  buffer[0] = alloc;
+  buffer[alloc] = 0;
+  memcpy (&buffer[1], string, length);
+}
+#endif
+
+#if !IN_LIBGCOV
+/* Write a tag TAG and reserve space for the record length. Return a
+   value to be used for gcov_write_length.  */
+
+GCOV_LINKAGE gcov_position_t
+gcov_write_tag (gcov_unsigned_t tag)
+{
+  gcov_position_t result = gcov_var.start + gcov_var.offset;
+  gcov_unsigned_t *buffer = gcov_write_words (2);
+
+  buffer[0] = tag;
+  buffer[1] = 0;
+
+  return result;
+}
+
+/* Write a record length using POSITION, which was returned by
+   gcov_write_tag.  The current file position is the end of the
+   record, and is restored before returning.  Returns nonzero on
+   overflow.  */
+
+GCOV_LINKAGE void
+gcov_write_length (gcov_position_t position)
+{
+  unsigned offset;
+  gcov_unsigned_t length;
+  gcov_unsigned_t *buffer;
+
+  gcc_assert (gcov_var.mode < 0);
+  gcc_assert (position + 2 <= gcov_var.start + gcov_var.offset);
+  gcc_assert (position >= gcov_var.start);
+  offset = position - gcov_var.start;
+  length = gcov_var.offset - offset - 2;
+  buffer = (gcov_unsigned_t *) &gcov_var.buffer[offset];
+  buffer[1] = length;
+  if (gcov_var.offset >= GCOV_BLOCK_SIZE)
+    gcov_write_block (gcov_var.offset);
+}
+
+#else /* IN_LIBGCOV */
+
+/* Write a tag TAG and length LENGTH.  */
+
+GCOV_LINKAGE void
+gcov_write_tag_length (gcov_unsigned_t tag, gcov_unsigned_t length)
+{
+  gcov_unsigned_t *buffer = gcov_write_words (2);
+
+  buffer[0] = tag;
+  buffer[1] = length;
+}
+
+/* Write a summary structure to the gcov file.  Return nonzero on
+   overflow.  */
+
+GCOV_LINKAGE void
+gcov_write_summary (gcov_unsigned_t tag, const struct gcov_summary *summary)
+{
+  unsigned ix, h_ix, bv_ix, h_cnt = 0;
+  const struct gcov_ctr_summary *csum;
+  unsigned histo_bitvector[GCOV_HISTOGRAM_BITVECTOR_SIZE];
+
+  /* Count number of non-zero histogram entries, and fill in a bit vector
+     of non-zero indices. The histogram is only currently computed for arc
+     counters.  */
+  for (bv_ix = 0; bv_ix < GCOV_HISTOGRAM_BITVECTOR_SIZE; bv_ix++)
+    histo_bitvector[bv_ix] = 0;
+  csum = &summary->ctrs[GCOV_COUNTER_ARCS];
+  for (h_ix = 0; h_ix < GCOV_HISTOGRAM_SIZE; h_ix++)
+    {
+      if (csum->histogram[h_ix].num_counters > 0)
+        {
+          histo_bitvector[h_ix / 32] |= 1 << (h_ix % 32);
+          h_cnt++;
+        }
+    }
+  gcov_write_tag_length (tag, GCOV_TAG_SUMMARY_LENGTH (h_cnt));
+  gcov_write_unsigned (summary->checksum);
+  for (csum = summary->ctrs, ix = GCOV_COUNTERS_SUMMABLE; ix--; csum++)
+    {
+      gcov_write_unsigned (csum->num);
+      gcov_write_unsigned (csum->runs);
+      gcov_write_counter (csum->sum_all);
+      gcov_write_counter (csum->run_max);
+      gcov_write_counter (csum->sum_max);
+      if (ix != GCOV_COUNTER_ARCS)
+        {
+          for (bv_ix = 0; bv_ix < GCOV_HISTOGRAM_BITVECTOR_SIZE; bv_ix++)
+            gcov_write_unsigned (0);
+          continue;
+        }
+      for (bv_ix = 0; bv_ix < GCOV_HISTOGRAM_BITVECTOR_SIZE; bv_ix++)
+        gcov_write_unsigned (histo_bitvector[bv_ix]);
+      for (h_ix = 0; h_ix < GCOV_HISTOGRAM_SIZE; h_ix++)
+        {
+          if (!csum->histogram[h_ix].num_counters)
+            continue;
+          gcov_write_unsigned (csum->histogram[h_ix].num_counters);
+          gcov_write_counter (csum->histogram[h_ix].min_value);
+          gcov_write_counter (csum->histogram[h_ix].cum_value);
+        }
+    }
+}
+#endif /* IN_LIBGCOV */
+
+#endif /*!IN_GCOV */
+
+/* Return a pointer to read BYTES bytes from the gcov file. Returns
+   NULL on failure (read past EOF).  */
+
+static const gcov_unsigned_t *
+gcov_read_words (unsigned words)
+{
+  const gcov_unsigned_t *result;
+  unsigned excess = gcov_var.length - gcov_var.offset;
+
+  gcc_assert (gcov_var.mode > 0);
+  if (excess < words)
+    {
+      gcov_var.start += gcov_var.offset;
+#if IN_LIBGCOV
+      if (excess)
+	{
+	  gcc_assert (excess == 1);
+	  memcpy (gcov_var.buffer, gcov_var.buffer + gcov_var.offset, 4);
+	}
+#else
+      memmove (gcov_var.buffer, gcov_var.buffer + gcov_var.offset, excess * 4);
+#endif
+      gcov_var.offset = 0;
+      gcov_var.length = excess;
+#if IN_LIBGCOV
+      gcc_assert (!gcov_var.length || gcov_var.length == 1);
+      excess = GCOV_BLOCK_SIZE;
+#else
+      if (gcov_var.length + words > gcov_var.alloc)
+	gcov_allocate (gcov_var.length + words);
+      excess = gcov_var.alloc - gcov_var.length;
+#endif
+      excess = fread (gcov_var.buffer + gcov_var.length,
+		      1, excess << 2, gcov_var.file) >> 2;
+      gcov_var.length += excess;
+      if (gcov_var.length < words)
+	{
+	  gcov_var.overread += words - gcov_var.length;
+	  gcov_var.length = 0;
+	  return 0;
+	}
+    }
+  result = &gcov_var.buffer[gcov_var.offset];
+  gcov_var.offset += words;
+  return result;
+}
+
+/* Read unsigned value from a coverage file. Sets error flag on file
+   error, overflow flag on overflow */
+
+GCOV_LINKAGE gcov_unsigned_t
+gcov_read_unsigned (void)
+{
+  gcov_unsigned_t value;
+  const gcov_unsigned_t *buffer = gcov_read_words (1);
+
+  if (!buffer)
+    return 0;
+  value = from_file (buffer[0]);
+  return value;
+}
+
+/* Read counter value from a coverage file. Sets error flag on file
+   error, overflow flag on overflow */
+
+GCOV_LINKAGE gcov_type
+gcov_read_counter (void)
+{
+  gcov_type value;
+  const gcov_unsigned_t *buffer = gcov_read_words (2);
+
+  if (!buffer)
+    return 0;
+  value = from_file (buffer[0]);
+  if (sizeof (value) > sizeof (gcov_unsigned_t))
+    value |= ((gcov_type) from_file (buffer[1])) << 32;
+  else if (buffer[1])
+    gcov_var.error = -1;
+
+  return value;
+}
+
+/* We need to expose the below function when compiling for gcov-tool.  */
+
+#if !IN_LIBGCOV || defined (IN_GCOV_TOOL)
+/* Read string from coverage file. Returns a pointer to a static
+   buffer, or NULL on empty string. You must copy the string before
+   calling another gcov function.  */
+
+GCOV_LINKAGE const char *
+gcov_read_string (void)
+{
+  unsigned length = gcov_read_unsigned ();
+
+  if (!length)
+    return 0;
+
+  return (const char *) gcov_read_words (length);
+}
+#endif
+
+GCOV_LINKAGE void
+gcov_read_summary (struct gcov_summary *summary)
+{
+  unsigned ix, h_ix, bv_ix, h_cnt = 0;
+  struct gcov_ctr_summary *csum;
+  unsigned histo_bitvector[GCOV_HISTOGRAM_BITVECTOR_SIZE];
+  unsigned cur_bitvector;
+
+  summary->checksum = gcov_read_unsigned ();
+  for (csum = summary->ctrs, ix = GCOV_COUNTERS_SUMMABLE; ix--; csum++)
+    {
+      csum->num = gcov_read_unsigned ();
+      csum->runs = gcov_read_unsigned ();
+      csum->sum_all = gcov_read_counter ();
+      csum->run_max = gcov_read_counter ();
+      csum->sum_max = gcov_read_counter ();
+      memset (csum->histogram, 0,
+              sizeof (gcov_bucket_type) * GCOV_HISTOGRAM_SIZE);
+      for (bv_ix = 0; bv_ix < GCOV_HISTOGRAM_BITVECTOR_SIZE; bv_ix++)
+        {
+          histo_bitvector[bv_ix] = gcov_read_unsigned ();
+#if IN_LIBGCOV
+          /* When building libgcov we don't include system.h, which includes
+             hwint.h (where popcount_hwi is declared). However, libgcov.a
+             is built by the bootstrapped compiler and therefore the builtins
+             are always available.  */
+          h_cnt += __builtin_popcount (histo_bitvector[bv_ix]);
+#else
+          h_cnt += popcount_hwi (histo_bitvector[bv_ix]);
+#endif
+        }
+      bv_ix = 0;
+      h_ix = 0;
+      cur_bitvector = 0;
+      while (h_cnt--)
+        {
+          /* Find the index corresponding to the next entry we will read in.
+             First find the next non-zero bitvector and re-initialize
+             the histogram index accordingly, then right shift and increment
+             the index until we find a set bit.  */
+          while (!cur_bitvector)
+            {
+              h_ix = bv_ix * 32;
+              gcc_assert (bv_ix < GCOV_HISTOGRAM_BITVECTOR_SIZE);
+              cur_bitvector = histo_bitvector[bv_ix++];
+            }
+          while (!(cur_bitvector & 0x1))
+            {
+              h_ix++;
+              cur_bitvector >>= 1;
+            }
+          gcc_assert (h_ix < GCOV_HISTOGRAM_SIZE);
+
+          csum->histogram[h_ix].num_counters = gcov_read_unsigned ();
+          csum->histogram[h_ix].min_value = gcov_read_counter ();
+          csum->histogram[h_ix].cum_value = gcov_read_counter ();
+          /* Shift off the index we are done with and increment to the
+             corresponding next histogram entry.  */
+          cur_bitvector >>= 1;
+          h_ix++;
+        }
+    }
+}
+
+#if (!IN_LIBGCOV && IN_GCOV != 1) || defined (IN_GCOV_TOOL)
+/* Read LEN words (unsigned type) and construct MOD_INFO.  */
+
+GCOV_LINKAGE void
+gcov_read_module_info (struct gcov_module_info *mod_info,
+                       gcov_unsigned_t len)
+{
+  gcov_unsigned_t src_filename_len, filename_len, i, j, num_strings;
+  mod_info->ident = gcov_read_unsigned ();
+  mod_info->is_primary = gcov_read_unsigned ();
+  mod_info->flags = gcov_read_unsigned ();
+  mod_info->lang = gcov_read_unsigned ();
+  mod_info->ggc_memory = gcov_read_unsigned ();
+  mod_info->num_quote_paths = gcov_read_unsigned ();
+  mod_info->num_bracket_paths = gcov_read_unsigned ();
+  mod_info->num_system_paths = gcov_read_unsigned ();
+  mod_info->num_cpp_defines = gcov_read_unsigned ();
+  mod_info->num_cpp_includes = gcov_read_unsigned ();
+  mod_info->num_cl_args = gcov_read_unsigned ();
+  len -= 11;
+
+  filename_len = gcov_read_unsigned ();
+  mod_info->da_filename = (char *) xmalloc (filename_len *
+                                            sizeof (gcov_unsigned_t));
+  for (i = 0; i < filename_len; i++)
+    ((gcov_unsigned_t *) mod_info->da_filename)[i] = gcov_read_unsigned ();
+  len -= (filename_len + 1);
+
+  src_filename_len = gcov_read_unsigned ();
+  mod_info->source_filename = (char *) xmalloc (src_filename_len *
+						sizeof (gcov_unsigned_t));
+  for (i = 0; i < src_filename_len; i++)
+    ((gcov_unsigned_t *) mod_info->source_filename)[i] = gcov_read_unsigned ();
+  len -= (src_filename_len + 1);
+
+  num_strings = mod_info->num_quote_paths + mod_info->num_bracket_paths
+    + mod_info->num_system_paths
+    + mod_info->num_cpp_defines + mod_info->num_cpp_includes
+    + mod_info->num_cl_args;
+  for (j = 0; j < num_strings; j++)
+   {
+     gcov_unsigned_t string_len = gcov_read_unsigned ();
+     mod_info->string_array[j] =
+       (char *) xmalloc (string_len * sizeof (gcov_unsigned_t));
+     for (i = 0; i < string_len; i++)
+       ((gcov_unsigned_t *) mod_info->string_array[j])[i] =
+	 gcov_read_unsigned ();
+     len -= (string_len + 1);
+   }
+  gcc_assert (!len);
+}
+#endif
+
+/* We need to expose the below function when compiling for gcov-tool.  */
+
+#if !IN_LIBGCOV || defined (IN_GCOV_TOOL)
+/* Reset to a known position.  BASE should have been obtained from
+   gcov_position, LENGTH should be a record length.  */
+
+GCOV_LINKAGE void
+gcov_sync (gcov_position_t base, gcov_unsigned_t length)
+{
+  gcc_assert (gcov_var.mode > 0);
+  base += length;
+  if (base - gcov_var.start <= gcov_var.length)
+    gcov_var.offset = base - gcov_var.start;
+  else
+    {
+      gcov_var.offset = gcov_var.length = 0;
+      fseek (gcov_var.file, base << 2, SEEK_SET);
+      gcov_var.start = ftell (gcov_var.file) >> 2;
+    }
+}
+#endif
+
+#if IN_LIBGCOV
+/* Move to a given position in a gcov file.  */
+
+GCOV_LINKAGE void
+gcov_seek (gcov_position_t base)
+{
+  gcc_assert (gcov_var.mode < 0);
+  if (gcov_var.offset)
+    gcov_write_block (gcov_var.offset);
+  fseek (gcov_var.file, base << 2, SEEK_SET);
+  gcov_var.start = ftell (gcov_var.file) >> 2;
+}
+
+/* Truncate the gcov file at the current position.  */
+
+GCOV_LINKAGE void
+gcov_truncate (void)
+{
+  long offs;
+  int filenum;
+  gcc_assert (gcov_var.mode < 0);
+  if (gcov_var.offset)
+    gcov_write_block (gcov_var.offset);
+  offs = ftell (gcov_var.file);
+  filenum = fileno (gcov_var.file);
+  if (offs == -1 || filenum == -1 || ftruncate (filenum, offs))
+    gcov_var.error = 1;
+}
+#endif
+
+#if IN_GCOV > 0
+/* Return the modification time of the current gcov file.  */
+
+GCOV_LINKAGE time_t
+gcov_time (void)
+{
+  struct stat status;
+
+  if (fstat (fileno (gcov_var.file), &status))
+    return 0;
+  else
+    return status.st_mtime;
+}
+#endif /* IN_GCOV */
+
+#if !IN_GCOV
+/* Determine the index into histogram for VALUE. */
+
+#if IN_LIBGCOV
+static unsigned
+#else
+GCOV_LINKAGE unsigned
+#endif
+gcov_histo_index (gcov_type value)
+{
+  gcov_type_unsigned v = (gcov_type_unsigned)value;
+  unsigned r = 0;
+  unsigned prev2bits = 0;
+
+  /* Find index into log2 scale histogram, where each of the log2
+     sized buckets is divided into 4 linear sub-buckets for better
+     focus in the higher buckets.  */
+
+  /* Find the place of the most-significant bit set.  */
+  if (v > 0)
+    {
+#if IN_LIBGCOV
+      /* When building libgcov we don't include system.h, which includes
+         hwint.h (where floor_log2 is declared). However, libgcov.a
+         is built by the bootstrapped compiler and therefore the builtins
+         are always available.  */
+      r = sizeof (long long) * __CHAR_BIT__ - 1 - __builtin_clzll (v);
+#else
+      /* We use floor_log2 from hwint.c, which takes a HOST_WIDE_INT
+         that is either 32 or 64 bits, and gcov_type_unsigned may be 64 bits.
+         Need to check for the case where gcov_type_unsigned is 64 bits
+         and HOST_WIDE_INT is 32 bits and handle it specially.  */
+#if HOST_BITS_PER_WIDEST_INT == HOST_BITS_PER_WIDE_INT
+      r = floor_log2 (v);
+#elif HOST_BITS_PER_WIDEST_INT == 2 * HOST_BITS_PER_WIDE_INT
+      HOST_WIDE_INT hwi_v = v >> HOST_BITS_PER_WIDE_INT;
+      if (hwi_v)
+        r = floor_log2 (hwi_v) + HOST_BITS_PER_WIDE_INT;
+      else
+        r = floor_log2 ((HOST_WIDE_INT)v);
+#else
+      gcc_unreachable ();
+#endif
+#endif
+    }
+
+  /* If at most the 2 least significant bits are set (value is
+     0 - 3) then that value is our index into the lowest set of
+     four buckets.  */
+  if (r < 2)
+    return (unsigned)value;
+
+  gcc_assert (r < 64);
+
+  /* Find the two next most significant bits to determine which
+     of the four linear sub-buckets to select.  */
+  prev2bits = (v >> (r - 2)) & 0x3;
+  /* Finally, compose the final bucket index from the log2 index and
+     the next 2 bits. The minimum r value at this point is 2 since we
+     returned above if r was 2 or more, so the minimum bucket at this
+     point is 4.  */
+  return (r - 1) * 4 + prev2bits;
+}
+
+/* Merge SRC_HISTO into TGT_HISTO. The counters are assumed to be in
+   the same relative order in both histograms, and are matched up
+   and merged in reverse order. Each counter is assigned an equal portion of
+   its entry's original cumulative counter value when computing the
+   new merged cum_value.  */
+
+static void gcov_histogram_merge (gcov_bucket_type *tgt_histo,
+                                  gcov_bucket_type *src_histo)
+{
+  int src_i, tgt_i, tmp_i = 0;
+  unsigned src_num, tgt_num, merge_num;
+  gcov_type src_cum, tgt_cum, merge_src_cum, merge_tgt_cum, merge_cum;
+  gcov_type merge_min;
+  gcov_bucket_type tmp_histo[GCOV_HISTOGRAM_SIZE];
+  int src_done = 0;
+
+  memset (tmp_histo, 0, sizeof (gcov_bucket_type) * GCOV_HISTOGRAM_SIZE);
+
+  /* Assume that the counters are in the same relative order in both
+     histograms. Walk the histograms from largest to smallest entry,
+     matching up and combining counters in order.  */
+  src_num = 0;
+  src_cum = 0;
+  src_i = GCOV_HISTOGRAM_SIZE - 1;
+  for (tgt_i = GCOV_HISTOGRAM_SIZE - 1; tgt_i >= 0 && !src_done; tgt_i--)
+    {
+      tgt_num = tgt_histo[tgt_i].num_counters;
+      tgt_cum = tgt_histo[tgt_i].cum_value;
+      /* Keep going until all of the target histogram's counters at this
+         position have been matched and merged with counters from the
+         source histogram.  */
+      while (tgt_num > 0 && !src_done)
+        {
+          /* If this is either the first time through this loop or we just
+             exhausted the previous non-zero source histogram entry, look
+             for the next non-zero source histogram entry.  */
+          if (!src_num)
+            {
+              /* Locate the next non-zero entry.  */
+              while (src_i >= 0 && !src_histo[src_i].num_counters)
+                src_i--;
+              /* If source histogram has fewer counters, then just copy over the
+                 remaining target counters and quit.  */
+              if (src_i < 0)
+                {
+                  tmp_histo[tgt_i].num_counters += tgt_num;
+                  tmp_histo[tgt_i].cum_value += tgt_cum;
+                  if (!tmp_histo[tgt_i].min_value ||
+                      tgt_histo[tgt_i].min_value < tmp_histo[tgt_i].min_value)
+                    tmp_histo[tgt_i].min_value = tgt_histo[tgt_i].min_value;
+                  while (--tgt_i >= 0)
+                    {
+                      tmp_histo[tgt_i].num_counters
+                          += tgt_histo[tgt_i].num_counters;
+                      tmp_histo[tgt_i].cum_value += tgt_histo[tgt_i].cum_value;
+                      if (!tmp_histo[tgt_i].min_value ||
+                          tgt_histo[tgt_i].min_value
+                          < tmp_histo[tgt_i].min_value)
+                        tmp_histo[tgt_i].min_value = tgt_histo[tgt_i].min_value;
+                    }
+
+                  src_done = 1;
+                  break;
+                }
+
+              src_num = src_histo[src_i].num_counters;
+              src_cum = src_histo[src_i].cum_value;
+            }
+
+          /* The number of counters to merge on this pass is the minimum
+             of the remaining counters from the current target and source
+             histogram entries.  */
+          merge_num = tgt_num;
+          if (src_num < merge_num)
+            merge_num = src_num;
+
+          /* The merged min_value is the sum of the min_values from target
+             and source.  */
+          merge_min = tgt_histo[tgt_i].min_value + src_histo[src_i].min_value;
+
+          /* Compute the portion of source and target entries' cum_value
+             that will be apportioned to the counters being merged.
+             The total remaining cum_value from each entry is divided
+             equally among the counters from that histogram entry if we
+             are not merging all of them.  */
+          merge_src_cum = src_cum;
+          if (merge_num < src_num)
+            merge_src_cum = merge_num * src_cum / src_num;
+          merge_tgt_cum = tgt_cum;
+          if (merge_num < tgt_num)
+            merge_tgt_cum = merge_num * tgt_cum / tgt_num;
+          /* The merged cum_value is the sum of the source and target
+             components.  */
+          merge_cum = merge_src_cum + merge_tgt_cum;
+
+          /* Update the remaining number of counters and cum_value left
+             to be merged from this source and target entry.  */
+          src_cum -= merge_src_cum;
+          tgt_cum -= merge_tgt_cum;
+          src_num -= merge_num;
+          tgt_num -= merge_num;
+
+          /* The merged counters get placed in the new merged histogram
+             at the entry for the merged min_value.  */
+          tmp_i = gcov_histo_index (merge_min);
+          gcc_assert (tmp_i < GCOV_HISTOGRAM_SIZE);
+          tmp_histo[tmp_i].num_counters += merge_num;
+          tmp_histo[tmp_i].cum_value += merge_cum;
+          if (!tmp_histo[tmp_i].min_value ||
+              merge_min < tmp_histo[tmp_i].min_value)
+            tmp_histo[tmp_i].min_value = merge_min;
+
+          /* Ensure the search for the next non-zero src_histo entry starts
+             at the next smallest histogram bucket.  */
+          if (!src_num)
+            src_i--;
+        }
+    }
+
+  gcc_assert (tgt_i < 0);
+
+  /* In the case where there were more counters in the source histogram,
+     accumulate the remaining unmerged cumulative counter values. Add
+     those to the smallest non-zero target histogram entry. Otherwise,
+     the total cumulative counter values in the histogram will be smaller
+     than the sum_all stored in the summary, which will complicate
+     computing the working set information from the histogram later on.  */
+  if (src_num)
+    src_i--;
+  while (src_i >= 0)
+    {
+      src_cum += src_histo[src_i].cum_value;
+      src_i--;
+    }
+  /* At this point, tmp_i should be the smallest non-zero entry in the
+     tmp_histo.  */
+  gcc_assert (tmp_i >= 0 && tmp_i < GCOV_HISTOGRAM_SIZE
+	      && tmp_histo[tmp_i].num_counters > 0);
+  tmp_histo[tmp_i].cum_value += src_cum;
+
+  /* Finally, copy the merged histogram into tgt_histo.  */
+  memcpy (tgt_histo, tmp_histo,
+	  sizeof (gcov_bucket_type) * GCOV_HISTOGRAM_SIZE);
+}
+#endif /* !IN_GCOV */
+
+/* This is used by gcov-dump (IN_GCOV == -1) and in the compiler
+   (!IN_GCOV && !IN_LIBGCOV).  */
+#if IN_GCOV <= 0 && !IN_LIBGCOV
+/* Compute the working set information from the counter histogram in
+   the profile summary. This is an array of information corresponding to a
+   range of percentages of the total execution count (sum_all), and includes
+   the number of counters required to cover that working set percentage and
+   the minimum counter value in that working set.  */
+
+GCOV_LINKAGE void
+compute_working_sets (const struct gcov_ctr_summary *summary,
+                      gcov_working_set_t *gcov_working_sets)
+{
+  gcov_type working_set_cum_values[NUM_GCOV_WORKING_SETS];
+  gcov_type ws_cum_hotness_incr;
+  gcov_type cum, tmp_cum;
+  const gcov_bucket_type *histo_bucket;
+  unsigned ws_ix, c_num, count;
+  int h_ix;
+
+  /* Compute the amount of sum_all that the cumulative hotness grows
+     by in each successive working set entry, which depends on the
+     number of working set entries.  */
+  ws_cum_hotness_incr = summary->sum_all / NUM_GCOV_WORKING_SETS;
+
+  /* Next fill in an array of the cumulative hotness values corresponding
+     to each working set summary entry we are going to compute below.
+     Skip 0% statistics, which can be extrapolated from the
+     rest of the summary data.  */
+  cum = ws_cum_hotness_incr;
+  for (ws_ix = 0; ws_ix < NUM_GCOV_WORKING_SETS;
+       ws_ix++, cum += ws_cum_hotness_incr)
+    working_set_cum_values[ws_ix] = cum;
+  /* The last summary entry is reserved for (roughly) 99.9% of the
+     working set. Divide by 1024 so it becomes a shift, which gives
+     almost exactly 99.9%.  */
+  working_set_cum_values[NUM_GCOV_WORKING_SETS-1]
+      = summary->sum_all - summary->sum_all/1024;
+
+  /* Next, walk through the histogram in decending order of hotness
+     and compute the statistics for the working set summary array.
+     As histogram entries are accumulated, we check to see which
+     working set entries have had their expected cum_value reached
+     and fill them in, walking the working set entries in increasing
+     size of cum_value.  */
+  ws_ix = 0; /* The current entry into the working set array.  */
+  cum = 0; /* The current accumulated counter sum.  */
+  count = 0; /* The current accumulated count of block counters.  */
+  for (h_ix = GCOV_HISTOGRAM_SIZE - 1;
+       h_ix >= 0 && ws_ix < NUM_GCOV_WORKING_SETS; h_ix--)
+    {
+      histo_bucket = &summary->histogram[h_ix];
+
+      /* If we haven't reached the required cumulative counter value for
+         the current working set percentage, simply accumulate this histogram
+         entry into the running sums and continue to the next histogram
+         entry.  */
+      if (cum + histo_bucket->cum_value < working_set_cum_values[ws_ix])
+        {
+          cum += histo_bucket->cum_value;
+          count += histo_bucket->num_counters;
+          continue;
+        }
+
+      /* If adding the current histogram entry's cumulative counter value
+         causes us to exceed the current working set size, then estimate
+         how many of this histogram entry's counter values are required to
+         reach the working set size, and fill in working set entries
+         as we reach their expected cumulative value.  */
+      for (c_num = 0, tmp_cum = cum;
+           c_num < histo_bucket->num_counters && ws_ix < NUM_GCOV_WORKING_SETS;
+           c_num++)
+        {
+          count++;
+          /* If we haven't reached the last histogram entry counter, add
+             in the minimum value again. This will underestimate the
+             cumulative sum so far, because many of the counter values in this
+             entry may have been larger than the minimum. We could add in the
+             average value every time, but that would require an expensive
+             divide operation.  */
+          if (c_num + 1 < histo_bucket->num_counters)
+            tmp_cum += histo_bucket->min_value;
+          /* If we have reached the last histogram entry counter, then add
+             in the entire cumulative value.  */
+          else
+            tmp_cum = cum + histo_bucket->cum_value;
+
+	  /* Next walk through successive working set entries and fill in
+	     the statistics for any whose size we have reached by accumulating
+	     this histogram counter.  */
+	  while (ws_ix < NUM_GCOV_WORKING_SETS
+		 && tmp_cum >= working_set_cum_values[ws_ix])
+            {
+              gcov_working_sets[ws_ix].num_counters = count;
+              gcov_working_sets[ws_ix].min_counter
+                  = histo_bucket->min_value;
+              ws_ix++;
+            }
+        }
+      /* Finally, update the running cumulative value since we were
+         using a temporary above.  */
+      cum += histo_bucket->cum_value;
+    }
+  gcc_assert (ws_ix == NUM_GCOV_WORKING_SETS);
+}
+#endif /* IN_GCOV <= 0 && !IN_LIBGCOV */
diff --git a/lib/gcc/x86_64-linux-android/4.9/gcov-src/gcov-io.h b/lib/gcc/x86_64-linux-android/4.9/gcov-src/gcov-io.h
new file mode 100644
index 0000000..50ffa55
--- /dev/null
+++ b/lib/gcc/x86_64-linux-android/4.9/gcov-src/gcov-io.h
@@ -0,0 +1,489 @@
+/* File format for coverage information
+   Copyright (C) 1996-2014 Free Software Foundation, Inc.
+   Contributed by Bob Manson <manson@cygnus.com>.
+   Completely remangled by Nathan Sidwell <nathan@codesourcery.com>.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+
+/* Coverage information is held in two files.  A notes file, which is
+   generated by the compiler, and a data file, which is generated by
+   the program under test.  Both files use a similar structure.  We do
+   not attempt to make these files backwards compatible with previous
+   versions, as you only need coverage information when developing a
+   program.  We do hold version information, so that mismatches can be
+   detected, and we use a format that allows tools to skip information
+   they do not understand or are not interested in.
+
+   Numbers are recorded in the 32 bit unsigned binary form of the
+   endianness of the machine generating the file. 64 bit numbers are
+   stored as two 32 bit numbers, the low part first.  Strings are
+   padded with 1 to 4 NUL bytes, to bring the length up to a multiple
+   of 4. The number of 4 bytes is stored, followed by the padded
+   string. Zero length and NULL strings are simply stored as a length
+   of zero (they have no trailing NUL or padding).
+
+   	int32:  byte3 byte2 byte1 byte0 | byte0 byte1 byte2 byte3
+	int64:  int32:low int32:high
+	string: int32:0 | int32:length char* char:0 padding
+	padding: | char:0 | char:0 char:0 | char:0 char:0 char:0
+	item: int32 | int64 | string
+
+   The basic format of the files is
+
+   	file : int32:magic int32:version int32:stamp record*
+
+   The magic ident is different for the notes and the data files.  The
+   magic ident is used to determine the endianness of the file, when
+   reading.  The version is the same for both files and is derived
+   from gcc's version number. The stamp value is used to synchronize
+   note and data files and to synchronize merging within a data
+   file. It need not be an absolute time stamp, merely a ticker that
+   increments fast enough and cycles slow enough to distinguish
+   different compile/run/compile cycles.
+
+   Although the ident and version are formally 32 bit numbers, they
+   are derived from 4 character ASCII strings.  The version number
+   consists of the single character major version number, a two
+   character minor version number (leading zero for versions less than
+   10), and a single character indicating the status of the release.
+   That will be 'e' experimental, 'p' prerelease and 'r' for release.
+   Because, by good fortune, these are in alphabetical order, string
+   collating can be used to compare version strings.  Be aware that
+   the 'e' designation will (naturally) be unstable and might be
+   incompatible with itself.  For gcc 3.4 experimental, it would be
+   '304e' (0x33303465).  When the major version reaches 10, the
+   letters A-Z will be used.  Assuming minor increments releases every
+   6 months, we have to make a major increment every 50 years.
+   Assuming major increments releases every 5 years, we're ok for the
+   next 155 years -- good enough for me.
+
+   A record has a tag, length and variable amount of data.
+
+   	record: header data
+	header: int32:tag int32:length
+	data: item*
+
+   Records are not nested, but there is a record hierarchy.  Tag
+   numbers reflect this hierarchy.  Tags are unique across note and
+   data files.  Some record types have a varying amount of data.  The
+   LENGTH is the number of 4bytes that follow and is usually used to
+   determine how much data.  The tag value is split into 4 8-bit
+   fields, one for each of four possible levels.  The most significant
+   is allocated first.  Unused levels are zero.  Active levels are
+   odd-valued, so that the LSB of the level is one.  A sub-level
+   incorporates the values of its superlevels.  This formatting allows
+   you to determine the tag hierarchy, without understanding the tags
+   themselves, and is similar to the standard section numbering used
+   in technical documents.  Level values [1..3f] are used for common
+   tags, values [41..9f] for the notes file and [a1..ff] for the data
+   file.
+
+   The notes file contains the following records
+   	note: unit function-graph*
+	unit: header int32:checksum string:source
+	function-graph: announce_function basic_blocks {arcs | lines}*
+	announce_function: header int32:ident
+		int32:lineno_checksum int32:cfg_checksum
+		string:name string:source int32:lineno
+	basic_block: header int32:flags*
+	arcs: header int32:block_no arc*
+	arc:  int32:dest_block int32:flags
+        lines: header int32:block_no line*
+               int32:0 string:NULL
+	line:  int32:line_no | int32:0 string:filename
+
+   The BASIC_BLOCK record holds per-bb flags.  The number of blocks
+   can be inferred from its data length.  There is one ARCS record per
+   basic block.  The number of arcs from a bb is implicit from the
+   data length.  It enumerates the destination bb and per-arc flags.
+   There is one LINES record per basic block, it enumerates the source
+   lines which belong to that basic block.  Source file names are
+   introduced by a line number of 0, following lines are from the new
+   source file.  The initial source file for the function is NULL, but
+   the current source file should be remembered from one LINES record
+   to the next.  The end of a block is indicated by an empty filename
+   - this does not reset the current source file.  Note there is no
+   ordering of the ARCS and LINES records: they may be in any order,
+   interleaved in any manner.  The current filename follows the order
+   the LINES records are stored in the file, *not* the ordering of the
+   blocks they are for.
+
+   The data file contains the following records.
+        data: {unit summary:object summary:program* function-data*}*
+	unit: header int32:checksum
+        function-data:	announce_function present counts
+	announce_function: header int32:ident
+		int32:lineno_checksum int32:cfg_checksum
+	present: header int32:present
+	counts: header int64:count*
+	summary: int32:checksum {count-summary}GCOV_COUNTERS_SUMMABLE
+	count-summary:	int32:num int32:runs int64:sum
+			int64:max int64:sum_max histogram
+        histogram: {int32:bitvector}8 histogram-buckets*
+        histogram-buckets: int32:num int64:min int64:sum
+
+   The ANNOUNCE_FUNCTION record is the same as that in the note file,
+   but without the source location.  The COUNTS gives the
+   counter values for instrumented features.  The about the whole
+   program.  The checksum is used for whole program summaries, and
+   disambiguates different programs which include the same
+   instrumented object file.  There may be several program summaries,
+   each with a unique checksum.  The object summary's checksum is
+   zero.  Note that the data file might contain information from
+   several runs concatenated, or the data might be merged.
+
+   This file is included by both the compiler, gcov tools and the
+   runtime support library libgcov. IN_LIBGCOV and IN_GCOV are used to
+   distinguish which case is which.  If IN_LIBGCOV is nonzero,
+   libgcov is being built. If IN_GCOV is nonzero, the gcov tools are
+   being built. Otherwise the compiler is being built. IN_GCOV may be
+   positive or negative. If positive, we are compiling a tool that
+   requires additional functions (see the code for knowledge of what
+   those functions are).  */
+
+#ifndef GCC_GCOV_IO_H
+#define GCC_GCOV_IO_H
+
+#ifndef IN_LIBGCOV
+/* About the host */
+
+typedef unsigned gcov_unsigned_t;
+typedef unsigned gcov_position_t;
+
+#if LONG_LONG_TYPE_SIZE > 32
+#define GCOV_TYPE_ATOMIC_FETCH_ADD_FN __atomic_fetch_add_8
+#define GCOV_TYPE_ATOMIC_FETCH_ADD BUILT_IN_ATOMIC_FETCH_ADD_8
+#else
+#define GCOV_TYPE_ATOMIC_FETCH_ADD_FN __atomic_fetch_add_4
+#define GCOV_TYPE_ATOMIC_FETCH_ADD BUILT_IN_ATOMIC_FETCH_ADD_4
+#endif
+#define PROFILE_GEN_EDGE_ATOMIC (flag_profile_gen_atomic == 1 || \
+                                 flag_profile_gen_atomic == 3)
+#define PROFILE_GEN_VALUE_ATOMIC (flag_profile_gen_atomic == 2 || \
+                                  flag_profile_gen_atomic == 3)
+
+/* gcov_type is typedef'd elsewhere for the compiler */
+#if IN_GCOV
+#define GCOV_LINKAGE static
+typedef HOST_WIDEST_INT gcov_type;
+typedef unsigned HOST_WIDEST_INT gcov_type_unsigned;
+#if IN_GCOV > 0
+#include <sys/types.h>
+#endif
+
+#define FUNC_ID_WIDTH HOST_BITS_PER_WIDE_INT/2
+#define FUNC_ID_MASK ((1L << FUNC_ID_WIDTH) - 1)
+#define EXTRACT_MODULE_ID_FROM_GLOBAL_ID(gid) (unsigned)(((gid) >> FUNC_ID_WIDTH) & FUNC_ID_MASK)
+#define EXTRACT_FUNC_ID_FROM_GLOBAL_ID(gid) (unsigned)((gid) & FUNC_ID_MASK)
+#define FUNC_GLOBAL_ID(m,f) ((((HOST_WIDE_INT) (m)) << FUNC_ID_WIDTH) | (f)
+
+#else /*!IN_GCOV */
+#define GCOV_TYPE_SIZE (LONG_LONG_TYPE_SIZE > 32 ? 64 : 32)
+#endif
+
+#if defined (HOST_HAS_F_SETLKW)
+#define GCOV_LOCKED 1
+#else
+#define GCOV_LOCKED 0
+#endif
+
+#define ATTRIBUTE_HIDDEN
+
+#endif /* !IN_LIBGOCV */
+
+#ifndef GCOV_LINKAGE
+#define GCOV_LINKAGE extern
+#endif
+
+/* File suffixes.  */
+#define GCOV_DATA_SUFFIX ".gcda"
+#define GCOV_NOTE_SUFFIX ".gcno"
+
+/* File magic. Must not be palindromes.  */
+#define GCOV_DATA_MAGIC ((gcov_unsigned_t)0x67636461) /* "gcda" */
+#define GCOV_NOTE_MAGIC ((gcov_unsigned_t)0x67636e6f) /* "gcno" */
+
+/* gcov-iov.h is automatically generated by the makefile from
+   version.c, it looks like
+   	#define GCOV_VERSION ((gcov_unsigned_t)0x89abcdef)
+*/
+#include "gcov-iov.h"
+
+/* Convert a magic or version number to a 4 character string.  */
+#define GCOV_UNSIGNED2STRING(ARRAY,VALUE)	\
+  ((ARRAY)[0] = (char)((VALUE) >> 24),		\
+   (ARRAY)[1] = (char)((VALUE) >> 16),		\
+   (ARRAY)[2] = (char)((VALUE) >> 8),		\
+   (ARRAY)[3] = (char)((VALUE) >> 0))
+
+/* The record tags.  Values [1..3f] are for tags which may be in either
+   file.  Values [41..9f] for those in the note file and [a1..ff] for
+   the data file.  The tag value zero is used as an explicit end of
+   file marker -- it is not required to be present.  */
+
+#define GCOV_TAG_FUNCTION	 ((gcov_unsigned_t)0x01000000)
+#define GCOV_TAG_FUNCTION_LENGTH (3)
+#define GCOV_TAG_BLOCKS		 ((gcov_unsigned_t)0x01410000)
+#define GCOV_TAG_BLOCKS_LENGTH(NUM) (NUM)
+#define GCOV_TAG_BLOCKS_NUM(LENGTH) (LENGTH)
+#define GCOV_TAG_ARCS		 ((gcov_unsigned_t)0x01430000)
+#define GCOV_TAG_ARCS_LENGTH(NUM)  (1 + (NUM) * 2)
+#define GCOV_TAG_ARCS_NUM(LENGTH)  (((LENGTH) - 1) / 2)
+#define GCOV_TAG_LINES		 ((gcov_unsigned_t)0x01450000)
+#define GCOV_TAG_COUNTER_BASE 	 ((gcov_unsigned_t)0x01a10000)
+#define GCOV_TAG_COUNTER_LENGTH(NUM) ((NUM) * 2)
+#define GCOV_TAG_COUNTER_NUM(LENGTH) ((LENGTH) / 2)
+#define GCOV_TAG_OBJECT_SUMMARY  ((gcov_unsigned_t)0xa1000000) /* Obsolete */
+#define GCOV_TAG_PROGRAM_SUMMARY ((gcov_unsigned_t)0xa3000000)
+#define GCOV_TAG_SUMMARY_LENGTH(NUM)  \
+        (1 + GCOV_COUNTERS_SUMMABLE * (10 + 3 * 2) + (NUM) * 5)
+#define GCOV_TAG_MODULE_INFO ((gcov_unsigned_t)0xab000000)
+#define GCOV_TAG_AFDO_FILE_NAMES ((gcov_unsigned_t)0xaa000000)
+#define GCOV_TAG_AFDO_FUNCTION ((gcov_unsigned_t)0xac000000)
+#define GCOV_TAG_AFDO_MODULE_GROUPING ((gcov_unsigned_t)0xae000000)
+#define GCOV_TAG_AFDO_WORKING_SET ((gcov_unsigned_t)0xaf000000)
+
+/* Counters that are collected.  */
+#define DEF_GCOV_COUNTER(COUNTER, NAME, MERGE_FN) COUNTER,
+enum {
+#include "gcov-counter.def"
+GCOV_COUNTERS
+};
+#undef DEF_GCOV_COUNTER
+
+/* Counters which can be summaried.  */
+#define GCOV_COUNTERS_SUMMABLE (GCOV_COUNTER_ARCS + 1)
+
+/* The first of counters used for value profiling.  They must form a
+   consecutive interval and their order must match the order of
+   HIST_TYPEs in value-prof.h.  */
+#define GCOV_FIRST_VALUE_COUNTER GCOV_COUNTERS_SUMMABLE
+
+/* The last of counters used for value profiling.  */
+#define GCOV_LAST_VALUE_COUNTER (GCOV_COUNTERS - 2)
+
+/* Number of counters used for value profiling.  */
+#define GCOV_N_VALUE_COUNTERS \
+  (GCOV_LAST_VALUE_COUNTER - GCOV_FIRST_VALUE_COUNTER + 1)
+
+#define GCOV_ICALL_TOPN_VAL  2   /* Track two hottest callees */
+#define GCOV_ICALL_TOPN_NCOUNTS  9 /* The number of counter entries per icall callsite */
+
+/* Convert a counter index to a tag.  */
+#define GCOV_TAG_FOR_COUNTER(COUNT)				\
+	(GCOV_TAG_COUNTER_BASE + ((gcov_unsigned_t)(COUNT) << 17))
+/* Convert a tag to a counter.  */
+#define GCOV_COUNTER_FOR_TAG(TAG)					\
+	((unsigned)(((TAG) - GCOV_TAG_COUNTER_BASE) >> 17))
+/* Check whether a tag is a counter tag.  */
+#define GCOV_TAG_IS_COUNTER(TAG)				\
+	(!((TAG) & 0xFFFF) && GCOV_COUNTER_FOR_TAG (TAG) < GCOV_COUNTERS)
+
+/* The tag level mask has 1's in the position of the inner levels, &
+   the lsb of the current level, and zero on the current and outer
+   levels.  */
+#define GCOV_TAG_MASK(TAG) (((TAG) - 1) ^ (TAG))
+
+/* Return nonzero if SUB is an immediate subtag of TAG.  */
+#define GCOV_TAG_IS_SUBTAG(TAG,SUB)				\
+	(GCOV_TAG_MASK (TAG) >> 8 == GCOV_TAG_MASK (SUB) 	\
+	 && !(((SUB) ^ (TAG)) & ~GCOV_TAG_MASK (TAG)))
+
+/* Return nonzero if SUB is at a sublevel to TAG.  */
+#define GCOV_TAG_IS_SUBLEVEL(TAG,SUB)				\
+     	(GCOV_TAG_MASK (TAG) > GCOV_TAG_MASK (SUB))
+
+/* Basic block flags.  */
+#define GCOV_BLOCK_UNEXPECTED	(1 << 1)
+
+/* Arc flags.  */
+#define GCOV_ARC_ON_TREE 	(1 << 0)
+#define GCOV_ARC_FAKE		(1 << 1)
+#define GCOV_ARC_FALLTHROUGH	(1 << 2)
+
+/* Structured records.  */
+
+/* Structure used for each bucket of the log2 histogram of counter values.  */
+typedef struct
+{
+  /* Number of counters whose profile count falls within the bucket.  */
+  gcov_unsigned_t num_counters;
+  /* Smallest profile count included in this bucket.  */
+  gcov_type min_value;
+  /* Cumulative value of the profile counts in this bucket.  */
+  gcov_type cum_value;
+} gcov_bucket_type;
+
+/* For a log2 scale histogram with each range split into 4
+   linear sub-ranges, there will be at most 64 (max gcov_type bit size) - 1 log2
+   ranges since the lowest 2 log2 values share the lowest 4 linear
+   sub-range (values 0 - 3).  This is 252 total entries (63*4).  */
+
+#define GCOV_HISTOGRAM_SIZE 252
+
+/* How many unsigned ints are required to hold a bit vector of non-zero
+   histogram entries when the histogram is written to the gcov file.
+   This is essentially a ceiling divide by 32 bits.  */
+#define GCOV_HISTOGRAM_BITVECTOR_SIZE (GCOV_HISTOGRAM_SIZE + 31) / 32
+
+/* Cumulative counter data.  */
+struct gcov_ctr_summary
+{
+  gcov_unsigned_t num;		/* number of counters.  */
+  gcov_unsigned_t runs;		/* number of program runs */
+  gcov_type sum_all;		/* sum of all counters accumulated.  */
+  gcov_type run_max;		/* maximum value on a single run.  */
+  gcov_type sum_max;    	/* sum of individual run max values.  */
+  gcov_bucket_type histogram[GCOV_HISTOGRAM_SIZE]; /* histogram of
+                                                      counter values.  */
+};
+
+/* Object & program summary record.  */
+struct gcov_summary
+{
+  gcov_unsigned_t checksum;	/* checksum of program */
+  struct gcov_ctr_summary ctrs[GCOV_COUNTERS_SUMMABLE];
+};
+
+#define GCOV_MODULE_UNKNOWN_LANG  0
+#define GCOV_MODULE_C_LANG    1
+#define GCOV_MODULE_CPP_LANG  2
+#define GCOV_MODULE_FORT_LANG 3
+
+#define GCOV_MODULE_ASM_STMTS (1 << 16)
+#define GCOV_MODULE_LANG_MASK 0xffff
+
+/* Source module info. The data structure is used in
+   both runtime and profile-use phase. Make sure to allocate
+   enough space for the variable length member.  */
+struct gcov_module_info
+{
+  gcov_unsigned_t ident;
+  gcov_unsigned_t is_primary; /* this is overloaded to mean two things:
+                                 (1) means FDO/LIPO in instrumented binary.
+                                 (2) means IS_PRIMARY in persistent file or
+                                     memory copy used in profile-use.  */
+  gcov_unsigned_t flags;      /* bit 0: is_exported,
+                                 bit 1: need to include all the auxiliary 
+                                 modules in use compilation.  */
+  gcov_unsigned_t lang; /* lower 16 bits encode the language, and the upper
+                           16 bits enocde other attributes, such as whether
+                           any assembler is present in the source, etc.  */
+  gcov_unsigned_t ggc_memory; /* memory needed for parsing in kb  */
+  char *da_filename;
+  char *source_filename;
+  gcov_unsigned_t num_quote_paths;
+  gcov_unsigned_t num_bracket_paths;
+  gcov_unsigned_t num_system_paths;
+  gcov_unsigned_t num_cpp_defines;
+  gcov_unsigned_t num_cpp_includes;
+  gcov_unsigned_t num_cl_args;
+  char *string_array[1];
+};
+
+extern struct gcov_module_info **module_infos;
+extern unsigned primary_module_id;
+#define SET_MODULE_INCLUDE_ALL_AUX(modu) ((modu->flags |= 0x2))
+#define MODULE_INCLUDE_ALL_AUX_FLAG(modu) ((modu->flags & 0x2))
+#define SET_MODULE_EXPORTED(modu) ((modu->flags |= 0x1))
+#define MODULE_EXPORTED_FLAG(modu) ((modu->flags & 0x1))
+#define PRIMARY_MODULE_EXPORTED                                         \
+  (MODULE_EXPORTED_FLAG (module_infos[0])                               \
+   && !((module_infos[0]->lang & GCOV_MODULE_ASM_STMTS)                 \
+        && flag_ripa_disallow_asm_modules))
+
+#if !defined(inhibit_libc)
+
+/* Functions for reading and writing gcov files. In libgcov you can
+   open the file for reading then writing. Elsewhere you can open the
+   file either for reading or for writing. When reading a file you may
+   use the gcov_read_* functions, gcov_sync, gcov_position, &
+   gcov_error. When writing a file you may use the gcov_write
+   functions, gcov_seek & gcov_error. When a file is to be rewritten
+   you use the functions for reading, then gcov_rewrite then the
+   functions for writing.  Your file may become corrupted if you break
+   these invariants.  */
+
+#if !IN_LIBGCOV
+GCOV_LINKAGE int gcov_open (const char */*name*/, int /*direction*/);
+GCOV_LINKAGE int gcov_magic (gcov_unsigned_t, gcov_unsigned_t);
+#endif
+
+/* Available everywhere.  */
+GCOV_LINKAGE int gcov_close (void) ATTRIBUTE_HIDDEN;
+GCOV_LINKAGE gcov_unsigned_t gcov_read_unsigned (void) ATTRIBUTE_HIDDEN;
+GCOV_LINKAGE gcov_type gcov_read_counter (void) ATTRIBUTE_HIDDEN;
+GCOV_LINKAGE void gcov_read_summary (struct gcov_summary *) ATTRIBUTE_HIDDEN;
+GCOV_LINKAGE const char *gcov_read_string (void);
+GCOV_LINKAGE void gcov_sync (gcov_position_t /*base*/,
+			     gcov_unsigned_t /*length */);
+
+
+#if !IN_LIBGCOV && IN_GCOV != 1
+GCOV_LINKAGE void gcov_read_module_info (struct gcov_module_info *mod_info,
+					 gcov_unsigned_t len) ATTRIBUTE_HIDDEN;
+#endif
+
+#if !IN_GCOV
+/* Available outside gcov */
+GCOV_LINKAGE void gcov_write_unsigned (gcov_unsigned_t) ATTRIBUTE_HIDDEN;
+#endif
+
+#if !IN_GCOV && !IN_LIBGCOV
+/* Available only in compiler */
+GCOV_LINKAGE unsigned gcov_histo_index (gcov_type value);
+GCOV_LINKAGE void gcov_write_string (const char *);
+GCOV_LINKAGE gcov_position_t gcov_write_tag (gcov_unsigned_t);
+GCOV_LINKAGE void gcov_write_length (gcov_position_t /*position*/);
+#endif
+
+#if IN_GCOV <= 0 && !IN_LIBGCOV
+/* Available in gcov-dump and the compiler.  */
+
+/* Number of data points in the working set summary array. Using 128
+   provides information for at least every 1% increment of the total
+   profile size. The last entry is hardwired to 99.9% of the total.  */
+#define NUM_GCOV_WORKING_SETS 128
+
+/* Working set size statistics for a given percentage of the entire
+   profile (sum_all from the counter summary).  */
+typedef struct gcov_working_set_info
+{
+  /* Number of hot counters included in this working set.  */
+  unsigned num_counters;
+  /* Smallest counter included in this working set.  */
+  gcov_type min_counter;
+} gcov_working_set_t;
+
+GCOV_LINKAGE void compute_working_sets (const struct gcov_ctr_summary *summary,
+                                        gcov_working_set_t *gcov_working_sets);
+#endif
+
+#if IN_GCOV > 0
+/* Available in gcov */
+GCOV_LINKAGE time_t gcov_time (void);
+#endif
+
+#endif /* !inhibit_libc  */
+
+#endif /* GCC_GCOV_IO_H */
diff --git a/lib/gcc/x86_64-linux-android/4.9/gcov-src/gcov-iov.h b/lib/gcc/x86_64-linux-android/4.9/gcov-src/gcov-iov.h
new file mode 100644
index 0000000..d6e33b7
--- /dev/null
+++ b/lib/gcc/x86_64-linux-android/4.9/gcov-src/gcov-iov.h
@@ -0,0 +1,4 @@
+/* Generated automatically by the program `build/gcov-iov'
+   from `4.9 (4 9) and prerelease (*)'.  */
+
+#define GCOV_VERSION ((gcov_unsigned_t)0x3430392a)  /* 409* */
diff --git a/lib/gcc/x86_64-linux-android/4.9/gcov-src/libgcov-driver.c b/lib/gcc/x86_64-linux-android/4.9/gcov-src/libgcov-driver.c
new file mode 100644
index 0000000..dc8cf36
--- /dev/null
+++ b/lib/gcc/x86_64-linux-android/4.9/gcov-src/libgcov-driver.c
@@ -0,0 +1,1193 @@
+/* Routines required for instrumenting a program.  */
+/* Compile this one with gcc.  */
+/* Copyright (C) 1989-2014 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include "libgcov.h"
+
+#if defined(inhibit_libc)
+/* If libc and its header files are not available, provide dummy functions.  */
+
+#if defined(L_gcov)
+void __gcov_init (struct gcov_info *p __attribute__ ((unused))) {}
+#endif
+
+#else /* inhibit_libc */
+
+#include <string.h>
+#if GCOV_LOCKED
+#include <fcntl.h>
+#include <errno.h>
+#include <sys/stat.h>
+#endif
+
+#ifdef L_gcov
+#include "gcov-io.c"
+
+#ifndef IN_GCOV_TOOL
+extern gcov_unsigned_t __gcov_sampling_period;
+extern gcov_unsigned_t __gcov_has_sampling;
+static int gcov_sampling_period_initialized = 0;
+#endif
+
+/* Unique identifier assigned to each module (object file).  */
+static gcov_unsigned_t gcov_cur_module_id = 0;
+
+
+/* Dynamic call graph build and form module groups.  */
+int __gcov_compute_module_groups (void) ATTRIBUTE_HIDDEN;
+void __gcov_finalize_dyn_callgraph (void) ATTRIBUTE_HIDDEN;
+
+/* The following functions can be called from outside of this file.  */
+extern void gcov_clear (void) ATTRIBUTE_HIDDEN;
+extern void gcov_exit (void) ATTRIBUTE_HIDDEN;
+extern void set_gcov_dump_complete (void) ATTRIBUTE_HIDDEN;
+extern void reset_gcov_dump_complete (void) ATTRIBUTE_HIDDEN;
+extern int get_gcov_dump_complete (void) ATTRIBUTE_HIDDEN;
+extern void set_gcov_list (struct gcov_info *) ATTRIBUTE_HIDDEN;
+__attribute__((weak)) void __coverage_callback (gcov_type, int); 
+
+#ifndef IN_GCOV_TOOL
+/* Create a strong reference to these symbols so that they are
+   unconditionally pulled into the instrumented binary, even when
+   the only reference is a weak reference. This is necessary because
+   we are using weak references to enable references from code that
+   may not be linked with libgcov. These are the only symbols that
+   should be accessed via link references from application code!
+
+   A subtlety of the linker is that it will only resolve weak references
+   defined within archive libraries when there is a strong reference to
+   something else defined within the same object file. Since these functions
+   are defined within their own object files, they would not automatically
+   get resolved. Since there are symbols within the main L_gcov
+   section that are strongly referenced during -fprofile-generate and
+   -ftest-coverage builds, these dummy symbols will always need to be
+   resolved.  */
+void (*__gcov_dummy_ref1)(void) = &__gcov_reset;
+void (*__gcov_dummy_ref2)(void) = &__gcov_dump;
+extern char *__gcov_get_profile_prefix (void);
+char *(*__gcov_dummy_ref3)(void) = &__gcov_get_profile_prefix;
+extern void __gcov_set_sampling_period (unsigned int period);
+char *(*__gcov_dummy_ref4)(void) = &__gcov_set_sampling_period;
+extern unsigned int __gcov_sampling_enabled (void);
+char *(*__gcov_dummy_ref5)(void) = &__gcov_sampling_enabled;
+extern void __gcov_flush (void);
+char *(*__gcov_dummy_ref6)(void) = &__gcov_flush;
+extern unsigned int __gcov_profiling_for_test_coverage (void);
+char *(*__gcov_dummy_ref7)(void) = &__gcov_profiling_for_test_coverage;
+#endif
+
+/* Default callback function for profile instrumentation callback.  */
+__attribute__((weak)) void
+__coverage_callback (gcov_type funcdef_no __attribute__ ((unused)),
+                     int edge_no __attribute__ ((unused)))
+{
+   /* nothing */
+}
+
+struct gcov_fn_buffer
+{
+  struct gcov_fn_buffer *next;
+  unsigned fn_ix;
+  struct gcov_fn_info info;
+  /* note gcov_fn_info ends in a trailing array.  */
+};
+
+struct gcov_summary_buffer
+{
+  struct gcov_summary_buffer *next;
+  struct gcov_summary summary;
+};
+
+/* Chain of per-object gcov structures.  */
+extern struct gcov_info *__gcov_list;
+
+/* Set the head of gcov_list.  */
+void
+set_gcov_list (struct gcov_info *head)
+{
+  __gcov_list = head;
+}
+
+/* Size of the longest file name. */
+/* We need to expose this static variable when compiling for gcov-tool.  */
+#ifndef IN_GCOV_TOOL
+static
+#endif
+size_t gcov_max_filename = 0;
+
+/* Flag when the profile has already been dumped via __gcov_dump().  */
+static int gcov_dump_complete;
+
+/* A global function that get the vaule of gcov_dump_complete.  */
+
+int
+get_gcov_dump_complete (void)
+{
+  return gcov_dump_complete;
+}
+
+/* A global functino that set the vaule of gcov_dump_complete. Will
+   be used in __gcov_dump() in libgcov-interface.c.  */
+
+void
+set_gcov_dump_complete (void)
+{
+  gcov_dump_complete = 1;
+}
+
+/* A global functino that set the vaule of gcov_dump_complete. Will
+   be used in __gcov_reset() in libgcov-interface.c.  */
+
+void
+reset_gcov_dump_complete (void)
+{
+  gcov_dump_complete = 0;
+}
+
+/* A utility function for outputing errors.  */
+static int gcov_error (const char *, ...);
+
+static struct gcov_fn_buffer *
+free_fn_data (const struct gcov_info *gi_ptr, struct gcov_fn_buffer *buffer,
+              unsigned limit)
+{
+  struct gcov_fn_buffer *next;
+  unsigned ix, n_ctr = 0;
+
+  if (!buffer)
+    return 0;
+  next = buffer->next;
+
+  for (ix = 0; ix != limit; ix++)
+    if (gi_ptr->merge[ix])
+      free (buffer->info.ctrs[n_ctr++].values);
+  free (buffer);
+  return next;
+}
+
+static struct gcov_fn_buffer **
+buffer_fn_data (const char *filename, const struct gcov_info *gi_ptr,
+                struct gcov_fn_buffer **end_ptr, unsigned fn_ix)
+{
+  unsigned n_ctrs = 0, ix = 0;
+  struct gcov_fn_buffer *fn_buffer;
+  unsigned len;
+
+  for (ix = GCOV_COUNTERS; ix--;)
+    if (gi_ptr->merge[ix])
+      n_ctrs++;
+
+  len = sizeof (*fn_buffer) + sizeof (fn_buffer->info.ctrs[0]) * n_ctrs;
+  fn_buffer = (struct gcov_fn_buffer *) xmalloc (len);
+
+  if (!fn_buffer)
+    goto fail;
+
+  fn_buffer->next = 0;
+  fn_buffer->fn_ix = fn_ix;
+  fn_buffer->info.ident = gcov_read_unsigned ();
+  fn_buffer->info.lineno_checksum = gcov_read_unsigned ();
+  fn_buffer->info.cfg_checksum = gcov_read_unsigned ();
+
+  for (n_ctrs = ix = 0; ix != GCOV_COUNTERS; ix++)
+    {
+      gcov_unsigned_t length;
+      gcov_type *values;
+
+      if (!gi_ptr->merge[ix])
+        continue;
+
+      if (gcov_read_unsigned () != GCOV_TAG_FOR_COUNTER (ix))
+        {
+          len = 0;
+          goto fail;
+        }
+
+      length = GCOV_TAG_COUNTER_NUM (gcov_read_unsigned ());
+      len = length * sizeof (gcov_type);
+      values = (gcov_type *) xmalloc (len);
+      if (!values)
+        goto fail;
+
+      fn_buffer->info.ctrs[n_ctrs].num = length;
+      fn_buffer->info.ctrs[n_ctrs].values = values;
+
+      while (length--)
+        *values++ = gcov_read_counter ();
+      n_ctrs++;
+    }
+
+  *end_ptr = fn_buffer;
+  return &fn_buffer->next;
+
+fail:
+  gcov_error ("profiling:%s:Function %u %s %u \n", filename, fn_ix,
+              len ? "cannot allocate" : "counter mismatch", len ? len : ix);
+
+  return (struct gcov_fn_buffer **)free_fn_data (gi_ptr, fn_buffer, ix);
+}
+
+/* Determine whether a counter is active.  */
+
+static inline int
+gcov_counter_active (const struct gcov_info *info, unsigned int type)
+{
+  return (info->merge[type] != 0);
+}
+
+/* Add an unsigned value to the current crc */
+
+static gcov_unsigned_t
+crc32_unsigned (gcov_unsigned_t crc32, gcov_unsigned_t value)
+{
+  unsigned ix;
+
+  for (ix = 32; ix--; value <<= 1)
+    {
+      unsigned feedback;
+
+      feedback = (value ^ crc32) & 0x80000000 ? 0x04c11db7 : 0;
+      crc32 <<= 1;
+      crc32 ^= feedback;
+    }
+
+  return crc32;
+}
+
+/* Check if VERSION of the info block PTR matches libgcov one.
+   Return 1 on success, or zero in case of versions mismatch.
+   If FILENAME is not NULL, its value used for reporting purposes
+   instead of value from the info block.  */
+
+static int
+gcov_version (struct gcov_info *ptr, gcov_unsigned_t version,
+              const char *filename)
+{
+  if (version != GCOV_VERSION)
+    {
+      char v[4], e[4];
+
+      GCOV_UNSIGNED2STRING (v, version);
+      GCOV_UNSIGNED2STRING (e, GCOV_VERSION);
+
+      if (filename)
+        gcov_error ("profiling:%s:Version mismatch - expected %.4s got %.4s\n",
+                   filename? filename : ptr->filename, e, v);
+      else
+        gcov_error ("profiling:Version mismatch - expected %.4s got %.4s\n", e, v);
+
+      return 0;
+    }
+  return 1;
+}
+
+/* Insert counter VALUE into HISTOGRAM.  */
+
+static void
+gcov_histogram_insert(gcov_bucket_type *histogram, gcov_type value)
+{
+  unsigned i;
+
+  i = gcov_histo_index(value);
+  histogram[i].num_counters++;
+  histogram[i].cum_value += value;
+  if (value < histogram[i].min_value)
+    histogram[i].min_value = value;
+}
+
+/* Computes a histogram of the arc counters to place in the summary SUM.  */
+
+static void
+gcov_compute_histogram (struct gcov_summary *sum)
+{
+  struct gcov_info *gi_ptr;
+  const struct gcov_fn_info *gfi_ptr;
+  const struct gcov_ctr_info *ci_ptr;
+  struct gcov_ctr_summary *cs_ptr;
+  unsigned t_ix, f_ix, ctr_info_ix, ix;
+  int h_ix;
+
+  /* This currently only applies to arc counters.  */
+  t_ix = GCOV_COUNTER_ARCS;
+
+  /* First check if there are any counts recorded for this counter.  */
+  cs_ptr = &(sum->ctrs[t_ix]);
+  if (!cs_ptr->num)
+    return;
+
+  for (h_ix = 0; h_ix < GCOV_HISTOGRAM_SIZE; h_ix++)
+    {
+      cs_ptr->histogram[h_ix].num_counters = 0;
+      cs_ptr->histogram[h_ix].min_value = cs_ptr->run_max;
+      cs_ptr->histogram[h_ix].cum_value = 0;
+    }
+
+  /* Walk through all the per-object structures and record each of
+     the count values in histogram.  */
+  for (gi_ptr = __gcov_list; gi_ptr; gi_ptr = gi_ptr->next)
+    {
+      if (!gi_ptr->merge[t_ix])
+        continue;
+
+      /* Find the appropriate index into the gcov_ctr_info array
+         for the counter we are currently working on based on the
+         existence of the merge function pointer for this object.  */
+      for (ix = 0, ctr_info_ix = 0; ix < t_ix; ix++)
+        {
+          if (gi_ptr->merge[ix])
+            ctr_info_ix++;
+        }
+      for (f_ix = 0; f_ix != gi_ptr->n_functions; f_ix++)
+        {
+          gfi_ptr = gi_ptr->functions[f_ix];
+
+          if (!gfi_ptr || gfi_ptr->key != gi_ptr)
+            continue;
+
+          ci_ptr = &gfi_ptr->ctrs[ctr_info_ix];
+          for (ix = 0; ix < ci_ptr->num; ix++)
+            gcov_histogram_insert (cs_ptr->histogram, ci_ptr->values[ix]);
+        }
+    }
+}
+
+/* gcda filename.  */
+static char *gi_filename;
+/* buffer for the fn_data from another program.  */
+static struct gcov_fn_buffer *fn_buffer;
+/* buffer for summary from other programs to be written out. */
+static struct gcov_summary_buffer *sum_buffer;
+/* If application calls fork or exec multiple times, we end up storing
+   profile repeadely.  We should not account this as multiple runs or
+   functions executed once may mistakely become cold.  */
+static int run_accounted = 0;
+
+/* This funtions computes the program level summary and the histo-gram.
+   It computes and returns CRC32 and stored summary in THIS_PRG.  */
+
+static gcov_unsigned_t
+gcov_exit_compute_summary (struct gcov_summary *this_prg)
+{
+  struct gcov_info *gi_ptr;
+  const struct gcov_fn_info *gfi_ptr;
+  struct gcov_ctr_summary *cs_ptr;
+  const struct gcov_ctr_info *ci_ptr;
+  int f_ix;
+  unsigned t_ix;
+  gcov_unsigned_t c_num;
+  gcov_unsigned_t crc32 = 0;
+
+  /* Find the totals for this execution.  */
+  memset (this_prg, 0, sizeof (*this_prg));
+  for (gi_ptr = __gcov_list; gi_ptr; gi_ptr = gi_ptr->next)
+    {
+      crc32 = crc32_unsigned (crc32, gi_ptr->stamp);
+      crc32 = crc32_unsigned (crc32, gi_ptr->n_functions);
+
+      for (f_ix = 0; (unsigned)f_ix != gi_ptr->n_functions; f_ix++)
+        {
+          gfi_ptr = gi_ptr->functions[f_ix];
+
+          if (gfi_ptr && gfi_ptr->key != gi_ptr)
+            gfi_ptr = 0;
+
+          crc32 = crc32_unsigned (crc32, gfi_ptr ? gfi_ptr->cfg_checksum : 0);
+          crc32 = crc32_unsigned (crc32,
+                                  gfi_ptr ? gfi_ptr->lineno_checksum : 0);
+          if (!gfi_ptr)
+            continue;
+
+          ci_ptr = gfi_ptr->ctrs;
+          for (t_ix = 0; t_ix != GCOV_COUNTERS_SUMMABLE; t_ix++)
+            {
+              if (!gi_ptr->merge[t_ix])
+                continue;
+
+              cs_ptr = &(this_prg->ctrs[t_ix]);
+              cs_ptr->num += ci_ptr->num;
+              crc32 = crc32_unsigned (crc32, ci_ptr->num);
+
+              for (c_num = 0; c_num < ci_ptr->num; c_num++)
+                {
+                  cs_ptr->sum_all += ci_ptr->values[c_num];
+                  if (cs_ptr->run_max < ci_ptr->values[c_num])
+                    cs_ptr->run_max = ci_ptr->values[c_num];
+                }
+              ci_ptr++;
+            }
+        }
+    }
+  gcov_compute_histogram (this_prg);
+  return crc32;
+}
+
+/* A struct that bundles all the related information about the
+   gcda filename.  */
+struct gcov_filename_aux{
+  char *gi_filename_up;
+  int gcov_prefix_strip;
+  size_t prefix_length;
+};
+
+/* Including system dependent components. */
+#include "libgcov-driver-system.c"
+
+/* Scan through the current open gcda file corresponding to GI_PTR
+   to locate the end position of the last summary, returned in
+   SUMMARY_END_POS_P.  Return 0 on success, -1 on error.  */
+static int
+gcov_scan_summary_end (struct gcov_info *gi_ptr,
+                       gcov_position_t *summary_end_pos_p)
+{
+  gcov_unsigned_t tag, version, stamp;
+  tag = gcov_read_unsigned ();
+  if (tag != GCOV_DATA_MAGIC)
+    {
+      gcov_error ("profiling:%s:Not a gcov data file\n", gi_filename);
+      return -1;
+    }
+
+  version = gcov_read_unsigned ();
+  if (!gcov_version (gi_ptr, version, gi_filename))
+    return -1;
+
+  stamp = gcov_read_unsigned ();
+  if (stamp != gi_ptr->stamp)
+    /* Read from a different compilation.  Overwrite the file.  */
+    return -1;
+
+  /* Look for program summary.  */
+  while (1)
+    {
+      struct gcov_summary tmp;
+
+      *summary_end_pos_p = gcov_position ();
+      tag = gcov_read_unsigned ();
+      if (tag != GCOV_TAG_PROGRAM_SUMMARY)
+        break;
+
+      gcov_read_unsigned ();
+      gcov_read_summary (&tmp);
+      if (gcov_is_error ())
+        return -1;
+    }
+
+  return 0;
+}
+
+/* This function merges counters in GI_PTR to an existing gcda file.
+   Return 0 on success.
+   Return -1 on error. In this case, caller will goto read_fatal.  */
+
+static int
+gcov_exit_merge_gcda (struct gcov_info *gi_ptr,
+                      struct gcov_summary *prg_p,
+                      struct gcov_summary *this_prg,
+                      gcov_position_t *summary_pos_p,
+                      gcov_position_t *eof_pos_p,
+		      gcov_unsigned_t crc32)
+{
+  gcov_unsigned_t tag, length;
+  unsigned t_ix;
+  int f_ix;
+  int error = 0;
+  struct gcov_fn_buffer **fn_tail = &fn_buffer;
+  struct gcov_summary_buffer **sum_tail = &sum_buffer;
+
+  length = gcov_read_unsigned ();
+  if (!gcov_version (gi_ptr, length, gi_filename))
+    return -1;
+
+  length = gcov_read_unsigned ();
+  if (length != gi_ptr->stamp)
+    /* Read from a different compilation. Overwrite the file.  */
+    return 0;
+
+  /* Look for program summary.  */
+  for (f_ix = 0;;)
+    {
+      struct gcov_summary tmp;
+
+      *eof_pos_p = gcov_position ();
+      tag = gcov_read_unsigned ();
+      if (tag != GCOV_TAG_PROGRAM_SUMMARY)
+        break;
+
+      f_ix--;
+      length = gcov_read_unsigned ();
+      gcov_read_summary (&tmp);
+      if ((error = gcov_is_error ()))
+        goto read_error;
+      if (*summary_pos_p)
+        {
+          /* Save all summaries after the one that will be
+             merged into below. These will need to be rewritten
+             as histogram merging may change the number of non-zero
+             histogram entries that will be emitted, and thus the
+             size of the merged summary.  */
+          (*sum_tail) = (struct gcov_summary_buffer *)
+              xmalloc (sizeof(struct gcov_summary_buffer));
+          (*sum_tail)->summary = tmp;
+          (*sum_tail)->next = 0;
+          sum_tail = &((*sum_tail)->next);
+          goto next_summary;
+        }
+      if (tmp.checksum != crc32)
+        goto next_summary;
+
+      for (t_ix = 0; t_ix != GCOV_COUNTERS_SUMMABLE; t_ix++)
+        if (tmp.ctrs[t_ix].num != this_prg->ctrs[t_ix].num)
+          goto next_summary;
+      *prg_p = tmp;
+      *summary_pos_p = *eof_pos_p;
+
+    next_summary:;
+    }
+
+  /* Merge execution counts for each function.  */
+  for (f_ix = 0; (unsigned)f_ix != gi_ptr->n_functions;
+       f_ix++, tag = gcov_read_unsigned ())
+    {
+      const struct gcov_ctr_info *ci_ptr;
+      const struct gcov_fn_info *gfi_ptr = gi_ptr->functions[f_ix];
+
+      if (tag != GCOV_TAG_FUNCTION)
+        goto read_mismatch;
+
+      length = gcov_read_unsigned ();
+      if (!length)
+        /* This function did not appear in the other program.
+           We have nothing to merge.  */
+        continue;
+
+      if (length != GCOV_TAG_FUNCTION_LENGTH)
+        goto read_mismatch;
+
+      if (!gfi_ptr || gfi_ptr->key != gi_ptr)
+        {
+          /* This function appears in the other program.  We
+             need to buffer the information in order to write
+             it back out -- we'll be inserting data before
+             this point, so cannot simply keep the data in the
+             file.  */
+          fn_tail = buffer_fn_data (gi_filename,
+                                    gi_ptr, fn_tail, f_ix);
+          if (!fn_tail)
+            goto read_mismatch;
+          continue;
+        }
+
+      length = gcov_read_unsigned ();
+      if (length != gfi_ptr->ident)
+        goto read_mismatch;
+
+      length = gcov_read_unsigned ();
+      if (length != gfi_ptr->lineno_checksum)
+        goto read_mismatch;
+
+      length = gcov_read_unsigned ();
+      if (length != gfi_ptr->cfg_checksum)
+        goto read_mismatch;
+
+      ci_ptr = gfi_ptr->ctrs;
+      for (t_ix = 0; t_ix < GCOV_COUNTERS; t_ix++)
+        {
+          gcov_merge_fn merge = gi_ptr->merge[t_ix];
+
+          if (!merge)
+            continue;
+
+          tag = gcov_read_unsigned ();
+          length = gcov_read_unsigned ();
+          if (tag != GCOV_TAG_FOR_COUNTER (t_ix)
+              || length != GCOV_TAG_COUNTER_LENGTH (ci_ptr->num))
+            goto read_mismatch;
+          (*merge) (ci_ptr->values, ci_ptr->num);
+          ci_ptr++;
+        }
+      if ((error = gcov_is_error ()))
+        goto read_error;
+    }
+
+  if (tag && tag != GCOV_TAG_MODULE_INFO)
+    {
+    read_mismatch:;
+      gcov_error ("profiling:%s:Merge mismatch for %s %u\n",
+                  gi_filename, f_ix >= 0 ? "function" : "summary",
+                  f_ix < 0 ? -1 - f_ix : f_ix);
+      return -1;
+    }
+  return 0;
+
+read_error:
+  gcov_error ("profiling:%s:%s merging\n", gi_filename,
+              error < 0 ? "Overflow": "Error");
+  return -1;
+}
+
+/* Write counters in GI_PTR to a gcda file starting from its current
+   location.  */
+
+static void
+gcov_write_func_counters (struct gcov_info *gi_ptr)
+{
+  unsigned f_ix;
+
+  /* Write execution counts for each function.  */
+  for (f_ix = 0; f_ix != gi_ptr->n_functions; f_ix++)
+    {
+      unsigned buffered = 0;
+      const struct gcov_fn_info *gfi_ptr;
+      const struct gcov_ctr_info *ci_ptr;
+      gcov_unsigned_t length;
+      unsigned t_ix;
+
+      if (fn_buffer && fn_buffer->fn_ix == f_ix)
+        {
+          /* Buffered data from another program.  */
+          buffered = 1;
+          gfi_ptr = &fn_buffer->info;
+          length = GCOV_TAG_FUNCTION_LENGTH;
+        }
+      else
+        {
+          gfi_ptr = gi_ptr->functions[f_ix];
+          if (gfi_ptr && gfi_ptr->key == gi_ptr)
+            length = GCOV_TAG_FUNCTION_LENGTH;
+          else
+                length = 0;
+        }
+
+      gcov_write_tag_length (GCOV_TAG_FUNCTION, length);
+      if (!length)
+        continue;
+
+      gcov_write_unsigned (gfi_ptr->ident);
+      gcov_write_unsigned (gfi_ptr->lineno_checksum);
+      gcov_write_unsigned (gfi_ptr->cfg_checksum);
+
+      ci_ptr = gfi_ptr->ctrs;
+      for (t_ix = 0; t_ix < GCOV_COUNTERS; t_ix++)
+        {
+          gcov_unsigned_t n_counts;
+          gcov_type *c_ptr;
+
+          if (!gi_ptr->merge[t_ix])
+            continue;
+
+          n_counts = ci_ptr->num;
+          gcov_write_tag_length (GCOV_TAG_FOR_COUNTER (t_ix),
+                                 GCOV_TAG_COUNTER_LENGTH (n_counts));
+          c_ptr = ci_ptr->values;
+          while (n_counts--)
+            gcov_write_counter (*c_ptr++);
+          ci_ptr++;
+        }
+      if (buffered)
+        fn_buffer = free_fn_data (gi_ptr, fn_buffer, GCOV_COUNTERS);
+    }
+
+  gi_ptr->eof_pos = gcov_position ();
+  gcov_write_unsigned (0);
+}
+
+/* Write counters in GI_PTR and the summary in PRG to a gcda file.  In
+   the case of appending to an existing file, SUMMARY_POS will be non-zero.
+   We will write the file starting from SUMMAY_POS.  */
+
+static void
+gcov_exit_write_gcda (struct gcov_info *gi_ptr,
+                      const struct gcov_summary *prg_p,
+                      const gcov_position_t eof_pos,
+                      const gcov_position_t summary_pos)
+
+{
+  struct gcov_summary_buffer *next_sum_buffer;
+
+  /* Write out the data.  */
+  if (!eof_pos)
+    {
+      gcov_write_tag_length (GCOV_DATA_MAGIC, GCOV_VERSION);
+      gcov_write_unsigned (gi_ptr->stamp);
+    }
+
+  if (summary_pos)
+     gcov_seek (summary_pos);
+  gcc_assert (!summary_pos || summary_pos == gcov_position ());
+
+  /* Generate whole program statistics.  */
+  gcov_write_summary (GCOV_TAG_PROGRAM_SUMMARY, prg_p);
+
+  /* Rewrite all the summaries that were after the summary we merged
+     into.  This is necessary as the merged summary may have a different
+     size due to the number of non-zero histogram entries changing after
+     merging.  */
+
+  while (sum_buffer)
+    {
+      gcov_write_summary (GCOV_TAG_PROGRAM_SUMMARY, &sum_buffer->summary);
+      next_sum_buffer = sum_buffer->next;
+      free (sum_buffer);
+      sum_buffer = next_sum_buffer;
+    }
+
+  /* Write the counters.  */
+  gcov_write_func_counters (gi_ptr);
+}
+
+/* Helper function for merging summary.
+   Return -1 on error. Return 0 on success.  */
+
+static int
+gcov_exit_merge_summary (const struct gcov_info *gi_ptr, struct gcov_summary *prg,
+                         struct gcov_summary *this_prg, gcov_unsigned_t crc32,
+			 struct gcov_summary *all_prg __attribute__ ((unused)))
+{
+  struct gcov_ctr_summary *cs_prg, *cs_tprg;
+  unsigned t_ix;
+#if !GCOV_LOCKED 
+  /* summary for all instances of program.  */ 
+  struct gcov_ctr_summary *cs_all;
+#endif 
+
+  /* Merge the summaries.  */
+  for (t_ix = 0; t_ix < GCOV_COUNTERS_SUMMABLE; t_ix++)
+    {
+      cs_prg = &(prg->ctrs[t_ix]);
+      cs_tprg = &(this_prg->ctrs[t_ix]);
+
+      if (gi_ptr->merge[t_ix])
+        {
+	  int first = !cs_prg->runs;
+
+	  if (!run_accounted)
+	    cs_prg->runs++;
+          if (first)
+            cs_prg->num = cs_tprg->num;
+          cs_prg->sum_all += cs_tprg->sum_all;
+          if (cs_prg->run_max < cs_tprg->run_max)
+            cs_prg->run_max = cs_tprg->run_max;
+          cs_prg->sum_max += cs_tprg->run_max;
+          if (first)
+            memcpy (cs_prg->histogram, cs_tprg->histogram,
+                   sizeof (gcov_bucket_type) * GCOV_HISTOGRAM_SIZE);
+          else
+            gcov_histogram_merge (cs_prg->histogram, cs_tprg->histogram);
+        }
+      else if (cs_prg->runs)
+        {
+          gcov_error ("profiling:%s:Merge mismatch for summary.\n",
+                      gi_filename);
+          return -1;
+        }
+#if !GCOV_LOCKED
+      cs_all = &all_prg->ctrs[t_ix];
+      if (!cs_all->runs && cs_prg->runs)
+        {
+          cs_all->num = cs_prg->num;
+          cs_all->runs = cs_prg->runs;
+          cs_all->sum_all = cs_prg->sum_all;
+          cs_all->run_max = cs_prg->run_max;
+          cs_all->sum_max = cs_prg->sum_max;
+        }
+      else if (!all_prg->checksum
+               /* Don't compare the histograms, which may have slight
+                  variations depending on the order they were updated
+                  due to the truncating integer divides used in the
+                  merge.  */
+               && (cs_all->num != cs_prg->num
+                   || cs_all->runs != cs_prg->runs
+                   || cs_all->sum_all != cs_prg->sum_all
+                   || cs_all->run_max != cs_prg->run_max
+                   || cs_all->sum_max != cs_prg->sum_max))
+             {
+               gcov_error ("profiling:%s:Data file mismatch - some "
+                           "data files may have been concurrently "
+                           "updated without locking support\n", gi_filename);
+               all_prg->checksum = ~0u;
+             }
+#endif
+    }
+  
+  prg->checksum = crc32;
+
+  return 0;
+}
+
+/* Sort N entries in VALUE_ARRAY in descending order.
+   Each entry in VALUE_ARRAY has two values. The sorting
+   is based on the second value.  */
+
+GCOV_LINKAGE  void
+gcov_sort_n_vals (gcov_type *value_array, int n)
+{
+  int j, k;
+  for (j = 2; j < n; j += 2)
+    {
+      gcov_type cur_ent[2];
+      cur_ent[0] = value_array[j];
+      cur_ent[1] = value_array[j + 1];
+      k = j - 2;
+      while (k >= 0 && value_array[k + 1] < cur_ent[1])
+        {
+          value_array[k + 2] = value_array[k];
+          value_array[k + 3] = value_array[k+1];
+          k -= 2;
+        }
+      value_array[k + 2] = cur_ent[0];
+      value_array[k + 3] = cur_ent[1];
+    }
+}
+
+/* Sort the profile counters for all indirect call sites. Counters
+   for each call site are allocated in array COUNTERS.  */
+
+static void
+gcov_sort_icall_topn_counter (const struct gcov_ctr_info *counters)
+{
+  int i;
+  gcov_type *values;
+  int n = counters->num;
+  gcc_assert (!(n % GCOV_ICALL_TOPN_NCOUNTS));
+
+  values = counters->values;
+
+  for (i = 0; i < n; i += GCOV_ICALL_TOPN_NCOUNTS)
+    {
+      gcov_type *value_array = &values[i + 1];
+      gcov_sort_n_vals (value_array, GCOV_ICALL_TOPN_NCOUNTS - 1);
+    }
+}
+
+static void
+gcov_sort_topn_counter_arrays (const struct gcov_info *gi_ptr)
+{
+  unsigned int i;
+  int f_ix;
+  const struct gcov_fn_info *gfi_ptr;
+  const struct gcov_ctr_info *ci_ptr;
+
+  for (f_ix = 0; (unsigned)f_ix != gi_ptr->n_functions; f_ix++)
+    {
+      gfi_ptr = gi_ptr->functions[f_ix];
+      ci_ptr = gfi_ptr->ctrs;
+      for (i = 0; i < GCOV_COUNTERS; i++)
+        {
+          if (!gcov_counter_active (gi_ptr, i))
+            continue;
+          if (i == GCOV_COUNTER_ICALL_TOPNV)
+            {
+              gcov_sort_icall_topn_counter (ci_ptr);
+              break;
+            }
+          ci_ptr++;
+        }
+     }
+}
+
+/* Dump the coverage counts for one gcov_info object. We merge with existing
+   counts when possible, to avoid growing the .da files ad infinitum. We use
+   this program's checksum to make sure we only accumulate whole program
+   statistics to the correct summary. An object file might be embedded
+   in two separate programs, and we must keep the two program
+   summaries separate.  */
+
+static void
+gcov_exit_dump_gcov (struct gcov_info *gi_ptr, struct gcov_filename_aux *gf,
+		     gcov_unsigned_t crc32, struct gcov_summary *all_prg,
+                     struct gcov_summary *this_prg)
+{
+  struct gcov_summary prg; /* summary for this object over all program.  */
+  int error;
+  gcov_unsigned_t tag;
+  gcov_position_t summary_pos = 0;
+  gcov_position_t eof_pos = 0;
+
+  fn_buffer = 0;
+  sum_buffer = 0;
+
+  gcov_sort_topn_counter_arrays (gi_ptr);
+
+  error = gcov_exit_open_gcda_file (gi_ptr, gf);
+  if (error == -1)
+    return;
+
+  tag = gcov_read_unsigned ();
+  if (tag)
+    {
+      /* Merge data from file.  */
+      if (tag != GCOV_DATA_MAGIC)
+        {
+          gcov_error ("profiling:%s:Not a gcov data file\n", gi_filename);
+          goto read_fatal;
+        }
+      error = gcov_exit_merge_gcda (gi_ptr, &prg, this_prg, &summary_pos, &eof_pos,
+				    crc32);
+      if (error == -1)
+        goto read_fatal;
+    }
+
+  gcov_rewrite ();
+
+  if (!summary_pos)
+    {
+      memset (&prg, 0, sizeof (prg));
+      summary_pos = eof_pos;
+    }
+
+  error = gcov_exit_merge_summary (gi_ptr, &prg, this_prg, crc32, all_prg);
+  if (error == -1)
+    goto read_fatal;
+
+  gcov_exit_write_gcda (gi_ptr, &prg, eof_pos, summary_pos);
+  /* fall through */
+
+read_fatal:;
+  while (fn_buffer)
+    fn_buffer = free_fn_data (gi_ptr, fn_buffer, GCOV_COUNTERS);
+
+  if ((error = gcov_close ()))
+    gcov_error (error  < 0 ?
+                "profiling:%s:Overflow writing\n" :
+                "profiling:%s:Error writing\n",
+                gi_filename);
+}
+
+/* Write imported files (auxiliary modules) for primary module GI_PTR
+   into file GI_FILENAME.  */
+
+static void
+gcov_write_import_file (char *gi_filename, struct gcov_info *gi_ptr)
+{
+  char  *gi_imports_filename;
+  const char *gcov_suffix;
+  FILE *imports_file;
+  size_t prefix_length, suffix_length;
+
+  gcov_suffix = getenv ("GCOV_IMPORTS_SUFFIX");
+  if (!gcov_suffix || !strlen (gcov_suffix))
+    gcov_suffix = ".imports";
+  suffix_length = strlen (gcov_suffix);
+  prefix_length = strlen (gi_filename);
+  gi_imports_filename = (char *) alloca (prefix_length + suffix_length + 1);
+  memset (gi_imports_filename, 0, prefix_length + suffix_length + 1);
+  memcpy (gi_imports_filename, gi_filename, prefix_length);
+  memcpy (gi_imports_filename + prefix_length, gcov_suffix, suffix_length);
+  imports_file = fopen (gi_imports_filename, "w");
+  if (imports_file)
+    {
+      const struct dyn_imp_mod **imp_mods;
+      unsigned i, imp_len;
+      imp_mods = gcov_get_sorted_import_module_array (gi_ptr, &imp_len);
+      if (imp_mods)
+        {
+          for (i = 0; i < imp_len; i++)
+            {
+              fprintf (imports_file, "%s\n",
+                       imp_mods[i]->imp_mod->mod_info->source_filename);
+              fprintf (imports_file, "%s%s\n",
+                       imp_mods[i]->imp_mod->mod_info->da_filename, GCOV_DATA_SUFFIX);
+            }
+          free (imp_mods);
+        }
+      fclose (imports_file);
+    }
+}
+
+static void
+gcov_dump_module_info (struct gcov_filename_aux *gf)
+{
+  struct gcov_info *gi_ptr;
+
+  /* Compute the module groups and record whether there were any
+     counter fixups applied that require rewriting the counters.  */
+  int changed = __gcov_compute_module_groups ();
+
+  /* Now write out module group info.  */
+  for (gi_ptr = __gcov_list; gi_ptr; gi_ptr = gi_ptr->next)
+    {
+      int error; 
+
+      if (gcov_exit_open_gcda_file (gi_ptr, gf) == -1)
+	continue;
+
+      if (changed)
+        {
+          /* Scan file to find the end of the summary section, which is
+             where we will start re-writing the counters.  */
+          gcov_position_t summary_end_pos;
+          if (gcov_scan_summary_end (gi_ptr, &summary_end_pos) == -1)
+            gcov_error ("profiling:%s:Error scanning summaries\n",
+                        gi_filename);
+          else
+            {
+              gcov_position_t eof_pos = gi_ptr->eof_pos;
+              gcov_rewrite ();
+              gcov_seek (summary_end_pos);
+              gcov_write_func_counters (gi_ptr);
+              gcc_assert (eof_pos == gi_ptr->eof_pos);
+            }
+        }
+      else
+        gcov_rewrite ();
+
+      /* Overwrite the zero word at the of the file.  */
+      gcov_seek (gi_ptr->eof_pos);
+
+      gcov_write_module_infos (gi_ptr);
+      /* Write the end marker  */
+      gcov_write_unsigned (0);
+      gcov_truncate (); 
+      
+      if ((error = gcov_close ()))
+        gcov_error (error  < 0 ?  "profiling:%s:Overflow writing\n" :
+                                  "profiling:%s:Error writing\n",
+                                  gi_filename);
+      gcov_write_import_file (gi_filename, gi_ptr);
+    }
+  __gcov_finalize_dyn_callgraph ();
+}
+
+/* Dump all the coverage counts for the program. It first computes program
+   summary and then traverses gcov_list list and dumps the gcov_info
+   objects one by one.  */
+
+void
+gcov_exit (void)
+{
+  struct gcov_info *gi_ptr;
+  struct gcov_filename_aux gf;
+  gcov_unsigned_t crc32;
+  int dump_module_info = 0;
+  struct gcov_summary all_prg;
+  struct gcov_summary this_prg;
+
+  /* Prevent the counters from being dumped a second time on exit when the
+     application already wrote out the profile using __gcov_dump().  */
+  if (gcov_dump_complete)
+    return;
+
+  crc32 = gcov_exit_compute_summary (&this_prg);
+
+  allocate_filename_struct (&gf);
+#if !GCOV_LOCKED
+  memset (&all_prg, 0, sizeof (all_prg));
+#endif
+
+  /* Now merge each file.  */
+  for (gi_ptr = __gcov_list; gi_ptr; gi_ptr = gi_ptr->next)
+    {
+      gcov_exit_dump_gcov (gi_ptr, &gf, crc32, &all_prg, &this_prg);
+
+      /* The IS_PRIMARY field is overloaded to indicate if this module
+       is FDO/LIPO.  */
+      dump_module_info |= gi_ptr->mod_info->is_primary;
+    }
+  run_accounted = 1;
+
+  if (dump_module_info)
+    gcov_dump_module_info (&gf);
+
+  if (gi_filename)
+    free (gi_filename);
+}
+
+/* Reset all counters to zero.  */
+
+void
+gcov_clear (void)
+{
+  const struct gcov_info *gi_ptr;
+
+  for (gi_ptr = __gcov_list; gi_ptr; gi_ptr = gi_ptr->next)
+    {
+      unsigned f_ix;
+
+      for (f_ix = 0; f_ix < gi_ptr->n_functions; f_ix++)
+        {
+          unsigned t_ix;
+          const struct gcov_fn_info *gfi_ptr = gi_ptr->functions[f_ix];
+
+          if (!gfi_ptr || gfi_ptr->key != gi_ptr)
+            continue;
+          const struct gcov_ctr_info *ci_ptr = gfi_ptr->ctrs;
+          for (t_ix = 0; t_ix != GCOV_COUNTERS; t_ix++)
+            {
+              if (!gi_ptr->merge[t_ix])
+                continue;
+
+              memset (ci_ptr->values, 0, sizeof (gcov_type) * ci_ptr->num);
+              ci_ptr++;
+            }
+        }
+    }
+}
+
+/* Add a new object file onto the bb chain.  Invoked automatically
+  when running an object file's global ctors.  */
+
+void
+__gcov_init (struct gcov_info *info)
+{
+#ifndef IN_GCOV_TOOL
+   if (!gcov_sampling_period_initialized)
+    {
+      const char* env_value_str = getenv ("GCOV_SAMPLING_PERIOD");
+      if (env_value_str)
+        {
+          int env_value_int = atoi(env_value_str);
+          if (env_value_int >= 1)
+            __gcov_sampling_period = env_value_int;
+        }
+      gcov_sampling_period_initialized = 1;
+    }
+#endif
+
+  if (!info->version || !info->n_functions)
+    return;
+  if (gcov_version (info, info->version, 0))
+    {
+      size_t filename_length = strlen(info->filename);
+
+      /* Refresh the longest file name information */
+      if (filename_length > gcov_max_filename)
+        gcov_max_filename = filename_length;
+
+      /* Assign the module ID (starting at 1).  */
+      info->mod_info->ident = (++gcov_cur_module_id);
+      gcc_assert (EXTRACT_MODULE_ID_FROM_GLOBAL_ID (GEN_FUNC_GLOBAL_ID (
+                                                       info->mod_info->ident, 0))
+                  == info->mod_info->ident);
+
+      if (!__gcov_list)
+        atexit (gcov_exit);
+
+      info->next = __gcov_list;
+      __gcov_list = info;
+    }
+  info->version = 0;
+}
+
+#endif /* L_gcov */
+#endif /* inhibit_libc */
diff --git a/lib/gcc/x86_64-linux-android/4.9/include-fixed/stdio.h b/lib/gcc/x86_64-linux-android/4.9/include-fixed/stdio.h
deleted file mode 100644
index 1089c96..0000000
--- a/lib/gcc/x86_64-linux-android/4.9/include-fixed/stdio.h
+++ /dev/null
@@ -1,465 +0,0 @@
-/*  DO NOT EDIT THIS FILE.
-
-    It has been auto-edited by fixincludes from:
-
-	"/tmp/ndk-andrewhsieh/build/toolchain/prefix/sysroot/usr/include/stdio.h"
-
-    This had to be done to correct non-standard usages in the
-    original, manufacturer supplied header file.  */
-
-/*	$OpenBSD: stdio.h,v 1.35 2006/01/13 18:10:09 miod Exp $	*/
-/*	$NetBSD: stdio.h,v 1.18 1996/04/25 18:29:21 jtc Exp $	*/
-
-/*-
- * Copyright (c) 1990 The Regents of the University of California.
- * All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Chris Torek.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)stdio.h	5.17 (Berkeley) 6/3/91
- */
-
-#ifndef	_STDIO_H_
-#define	_STDIO_H_
-
-#include <sys/cdefs.h>
-#include <sys/types.h>
-
-#include <stdarg.h>
-#include <stddef.h>
-
-#define __need_NULL
-#include <stddef.h>
-
-#define	_FSTDIO			/* Define for new stdio with functions. */
-
-typedef off_t fpos_t;		/* stdio file position type */
-
-/*
- * NB: to fit things in six character monocase externals, the stdio
- * code uses the prefix `__s' for stdio objects, typically followed
- * by a three-character attempt at a mnemonic.
- */
-
-/* stdio buffers */
-struct __sbuf {
-	unsigned char *_base;
-	int	_size;
-};
-
-/*
- * stdio state variables.
- *
- * The following always hold:
- *
- *	if (_flags&(__SLBF|__SWR)) == (__SLBF|__SWR),
- *		_lbfsize is -_bf._size, else _lbfsize is 0
- *	if _flags&__SRD, _w is 0
- *	if _flags&__SWR, _r is 0
- *
- * This ensures that the getc and putc macros (or inline functions) never
- * try to write or read from a file that is in `read' or `write' mode.
- * (Moreover, they can, and do, automatically switch from read mode to
- * write mode, and back, on "r+" and "w+" files.)
- *
- * _lbfsize is used only to make the inline line-buffered output stream
- * code as compact as possible.
- *
- * _ub, _up, and _ur are used when ungetc() pushes back more characters
- * than fit in the current _bf, or when ungetc() pushes back a character
- * that does not match the previous one in _bf.  When this happens,
- * _ub._base becomes non-nil (i.e., a stream has ungetc() data iff
- * _ub._base!=NULL) and _up and _ur save the current values of _p and _r.
- *
- * NOTE: if you change this structure, you also need to update the
- * std() initializer in findfp.c.
- */
-typedef	struct __sFILE {
-	unsigned char *_p;	/* current position in (some) buffer */
-	int	_r;		/* read space left for getc() */
-	int	_w;		/* write space left for putc() */
-	short	_flags;		/* flags, below; this FILE is free if 0 */
-	short	_file;		/* fileno, if Unix descriptor, else -1 */
-	struct	__sbuf _bf;	/* the buffer (at least 1 byte, if !NULL) */
-	int	_lbfsize;	/* 0 or -_bf._size, for inline putc */
-
-	/* operations */
-	void	*_cookie;	/* cookie passed to io functions */
-	int	(*_close)(void *);
-	int	(*_read)(void *, char *, int);
-	fpos_t	(*_seek)(void *, fpos_t, int);
-	int	(*_write)(void *, const char *, int);
-
-	/* extension data, to avoid further ABI breakage */
-	struct	__sbuf _ext;
-	/* data for long sequences of ungetc() */
-	unsigned char *_up;	/* saved _p when _p is doing ungetc data */
-	int	_ur;		/* saved _r when _r is counting ungetc data */
-
-	/* tricks to meet minimum requirements even when malloc() fails */
-	unsigned char _ubuf[3];	/* guarantee an ungetc() buffer */
-	unsigned char _nbuf[1];	/* guarantee a getc() buffer */
-
-	/* separate buffer for fgetln() when line crosses buffer boundary */
-	struct	__sbuf _lb;	/* buffer for fgetln() */
-
-	/* Unix stdio files get aligned to block boundaries on fseek() */
-	int	_blksize;	/* stat.st_blksize (may be != _bf._size) */
-	fpos_t	_offset;	/* current lseek offset */
-} FILE;
-
-__BEGIN_DECLS
-extern FILE __sF[];
-__END_DECLS
-
-#define	__SLBF	0x0001		/* line buffered */
-#define	__SNBF	0x0002		/* unbuffered */
-#define	__SRD	0x0004		/* OK to read */
-#define	__SWR	0x0008		/* OK to write */
-	/* RD and WR are never simultaneously asserted */
-#define	__SRW	0x0010		/* open for reading & writing */
-#define	__SEOF	0x0020		/* found EOF */
-#define	__SERR	0x0040		/* found error */
-#define	__SMBF	0x0080		/* _buf is from malloc */
-#define	__SAPP	0x0100		/* fdopen()ed in append mode */
-#define	__SSTR	0x0200		/* this is an sprintf/snprintf string */
-#define	__SOPT	0x0400		/* do fseek() optimization */
-#define	__SNPT	0x0800		/* do not do fseek() optimization */
-#define	__SOFF	0x1000		/* set iff _offset is in fact correct */
-#define	__SMOD	0x2000		/* true => fgetln modified _p text */
-#define	__SALC	0x4000		/* allocate string space dynamically */
-#define	__SIGN	0x8000		/* ignore this file in _fwalk */
-
-/*
- * The following three definitions are for ANSI C, which took them
- * from System V, which brilliantly took internal interface macros and
- * made them official arguments to setvbuf(), without renaming them.
- * Hence, these ugly _IOxxx names are *supposed* to appear in user code.
- *
- * Although numbered as their counterparts above, the implementation
- * does not rely on this.
- */
-#define	_IOFBF	0		/* setvbuf should set fully buffered */
-#define	_IOLBF	1		/* setvbuf should set line buffered */
-#define	_IONBF	2		/* setvbuf should set unbuffered */
-
-#define	BUFSIZ	1024		/* size of buffer used by setbuf */
-#define	EOF	(-1)
-
-/*
- * FOPEN_MAX is a minimum maximum, and is the number of streams that
- * stdio can provide without attempting to allocate further resources
- * (which could fail).  Do not use this for anything.
- */
-
-#define	FOPEN_MAX	20	/* must be <= OPEN_MAX <sys/syslimits.h> */
-#define	FILENAME_MAX	1024	/* must be <= PATH_MAX <sys/syslimits.h> */
-
-/* System V/ANSI C; this is the wrong way to do this, do *not* use these. */
-#if __BSD_VISIBLE || __XPG_VISIBLE
-#define	P_tmpdir	"/tmp/"
-#endif
-#define	L_tmpnam	1024	/* XXX must be == PATH_MAX */
-#define	TMP_MAX		308915776
-
-/* Always ensure that these are consistent with <fcntl.h> and <unistd.h>! */
-#ifndef SEEK_SET
-#define	SEEK_SET	0	/* set file offset to offset */
-#endif
-#ifndef SEEK_CUR
-#define	SEEK_CUR	1	/* set file offset to current plus offset */
-#endif
-#ifndef SEEK_END
-#define	SEEK_END	2	/* set file offset to EOF plus offset */
-#endif
-
-#define	stdin	(&__sF[0])
-#define	stdout	(&__sF[1])
-#define	stderr	(&__sF[2])
-
-/*
- * Functions defined in ANSI C standard.
- */
-__BEGIN_DECLS
-void	 clearerr(FILE *);
-int	 fclose(FILE *);
-int	 feof(FILE *);
-int	 ferror(FILE *);
-int	 fflush(FILE *);
-int	 fgetc(FILE *);
-char	*fgets(char * __restrict, int, FILE * __restrict);
-FILE	*fopen(const char * __restrict , const char * __restrict);
-int	 fprintf(FILE * __restrict , const char * __restrict, ...)
-		__printflike(2, 3);
-int	 fputc(int, FILE *);
-int	 fputs(const char * __restrict, FILE * __restrict);
-size_t	 fread(void * __restrict, size_t, size_t, FILE * __restrict);
-FILE	*freopen(const char * __restrict, const char * __restrict,
-	    FILE * __restrict);
-int	 fscanf(FILE * __restrict, const char * __restrict, ...)
-		__scanflike(2, 3);
-int	 fseek(FILE *, long, int);
-long	 ftell(FILE *);
-size_t	 fwrite(const void * __restrict, size_t, size_t, FILE * __restrict);
-int	 getc(FILE *);
-int	 getchar(void);
-ssize_t	 getdelim(char ** __restrict, size_t * __restrict, int,
-	    FILE * __restrict);
-ssize_t	 getline(char ** __restrict, size_t * __restrict, FILE * __restrict);
-
-#if __BSD_VISIBLE && !defined(__SYS_ERRLIST)
-#define __SYS_ERRLIST
-extern int sys_nerr;			/* perror(3) external variables */
-extern char *sys_errlist[];
-#endif
-
-void	 perror(const char *);
-int	 printf(const char * __restrict, ...)
-		__printflike(1, 2);
-int	 putc(int, FILE *);
-int	 putchar(int);
-int	 puts(const char *);
-int	 remove(const char *);
-void	 rewind(FILE *);
-int	 scanf(const char * __restrict, ...)
-		__scanflike(1, 2);
-void	 setbuf(FILE * __restrict, char * __restrict);
-int	 setvbuf(FILE * __restrict, char * __restrict, int, size_t);
-int	 sscanf(const char * __restrict, const char * __restrict, ...)
-		__scanflike(2, 3);
-FILE	*tmpfile(void);
-int	 ungetc(int, FILE *);
-int	 vfprintf(FILE * __restrict, const char * __restrict, __gnuc_va_list)
-		__printflike(2, 0);
-int	 vprintf(const char * __restrict, __gnuc_va_list)
-		__printflike(1, 0);
-
-int dprintf(int, const char * __restrict, ...) __printflike(2, 3);
-int vdprintf(int, const char * __restrict, __gnuc_va_list) __printflike(2, 0);
-
-#ifndef __AUDIT__
-char* gets(char*) __warnattr("gets is very unsafe; consider using fgets");
-int sprintf(char* __restrict, const char* __restrict, ...)
-    __printflike(2, 3); //__warnattr("sprintf is often misused; please use snprintf");
-char* tmpnam(char*) __warnattr("tmpnam possibly used unsafely; consider using mkstemp");
-int vsprintf(char* __restrict, const char* __restrict, __gnuc_va_list)
-    __printflike(2, 0); //__warnattr("vsprintf is often misused; please use vsnprintf");
-#if __XPG_VISIBLE
-char* tempnam(const char*, const char*)
-    __warnattr("tempnam possibly used unsafely; consider using mkstemp");
-#endif
-#endif
-
-extern int rename(const char*, const char*);
-extern int renameat(int, const char*, int, const char*);
-
-int	 fgetpos(FILE * __restrict, fpos_t * __restrict);
-int	 fsetpos(FILE *, const fpos_t *);
-
-int	 fseeko(FILE *, off_t, int);
-off_t	 ftello(FILE *);
-
-#if __ISO_C_VISIBLE >= 1999 || __BSD_VISIBLE
-int	 snprintf(char * __restrict, size_t, const char * __restrict, ...)
-		__printflike(3, 4);
-int	 vfscanf(FILE * __restrict, const char * __restrict, __gnuc_va_list)
-		__scanflike(2, 0);
-int	 vscanf(const char *, __gnuc_va_list)
-		__scanflike(1, 0);
-int	 vsnprintf(char * __restrict, size_t, const char * __restrict, __gnuc_va_list)
-		__printflike(3, 0);
-int	 vsscanf(const char * __restrict, const char * __restrict, __gnuc_va_list)
-		__scanflike(2, 0);
-#endif /* __ISO_C_VISIBLE >= 1999 || __BSD_VISIBLE */
-
-__END_DECLS
-
-
-/*
- * Functions defined in POSIX 1003.1.
- */
-#if __BSD_VISIBLE || __POSIX_VISIBLE || __XPG_VISIBLE
-#define	L_ctermid	1024	/* size for ctermid(); PATH_MAX */
-#define L_cuserid	9	/* size for cuserid(); UT_NAMESIZE + 1 */
-
-__BEGIN_DECLS
-#if 0 /* MISSING FROM BIONIC */
-char	*ctermid(char *);
-char	*cuserid(char *);
-#endif /* MISSING */
-FILE	*fdopen(int, const char *);
-int	 fileno(FILE *);
-
-#if (__POSIX_VISIBLE >= 199209)
-int	 pclose(FILE *);
-FILE	*popen(const char *, const char *);
-#endif
-
-#if __POSIX_VISIBLE >= 199506
-void	 flockfile(FILE *);
-int	 ftrylockfile(FILE *);
-void	 funlockfile(FILE *);
-
-/*
- * These are normally used through macros as defined below, but POSIX
- * requires functions as well.
- */
-int	 getc_unlocked(FILE *);
-int	 getchar_unlocked(void);
-int	 putc_unlocked(int, FILE *);
-int	 putchar_unlocked(int);
-#endif /* __POSIX_VISIBLE >= 199506 */
-
-__END_DECLS
-
-#endif /* __BSD_VISIBLE || __POSIX_VISIBLE || __XPG_VISIBLE */
-
-/*
- * Routines that are purely local.
- */
-#if __BSD_VISIBLE
-__BEGIN_DECLS
-int	 asprintf(char ** __restrict, const char * __restrict, ...)
-		__printflike(2, 3);
-char	*fgetln(FILE * __restrict, size_t * __restrict);
-int	 fpurge(FILE *);
-int	 getw(FILE *);
-int	 putw(int, FILE *);
-void	 setbuffer(FILE *, char *, int);
-int	 setlinebuf(FILE *);
-int	 vasprintf(char ** __restrict, const char * __restrict,
-    __gnuc_va_list)
-		__printflike(2, 0);
-__END_DECLS
-
-/*
- * Stdio function-access interface.
- */
-__BEGIN_DECLS
-FILE	*funopen(const void *,
-		int (*)(void *, char *, int),
-		int (*)(void *, const char *, int),
-		fpos_t (*)(void *, fpos_t, int),
-		int (*)(void *));
-__END_DECLS
-#define	fropen(cookie, fn) funopen(cookie, fn, 0, 0, 0)
-#define	fwopen(cookie, fn) funopen(cookie, 0, fn, 0, 0)
-#endif /* __BSD_VISIBLE */
-
-#if defined(__BIONIC_FORTIFY)
-
-__BEGIN_DECLS
-
-__BIONIC_FORTIFY_INLINE
-__printflike(3, 0)
-int vsnprintf(char *dest, size_t size, const char *format, __va_list ap)
-{
-    return __builtin___vsnprintf_chk(dest, size, 0, __bos(dest), format, ap);
-}
-
-__BIONIC_FORTIFY_INLINE
-__printflike(2, 0)
-int vsprintf(char *dest, const char *format, __va_list ap)
-{
-    return __builtin___vsprintf_chk(dest, 0, __bos(dest), format, ap);
-}
-
-#if defined(__clang__)
-  #if !defined(snprintf)
-    #define __wrap_snprintf(dest, size, ...) __builtin___snprintf_chk(dest, size, 0, __bos(dest), __VA_ARGS__)
-    #define snprintf(...) __wrap_snprintf(__VA_ARGS__)
-  #endif
-#else
-__BIONIC_FORTIFY_INLINE
-__printflike(3, 4)
-int snprintf(char *dest, size_t size, const char *format, ...)
-{
-    return __builtin___snprintf_chk(dest, size, 0,
-        __bos(dest), format, __builtin_va_arg_pack());
-}
-#endif
-
-#if defined(__clang__)
-  #if !defined(sprintf)
-    #define __wrap_sprintf(dest, ...) __builtin___sprintf_chk(dest, 0, __bos(dest), __VA_ARGS__)
-    #define sprintf(...) __wrap_sprintf(__VA_ARGS__)
-  #endif
-#else
-__BIONIC_FORTIFY_INLINE
-__printflike(2, 3)
-int sprintf(char *dest, const char *format, ...)
-{
-    return __builtin___sprintf_chk(dest, 0,
-        __bos(dest), format, __builtin_va_arg_pack());
-}
-#endif
-
-extern char* __fgets_chk(char*, int, FILE*, size_t);
-extern char* __fgets_real(char*, int, FILE*) __asm__(__USER_LABEL_PREFIX__ "fgets");
-__errordecl(__fgets_too_big_error, "fgets called with size bigger than buffer");
-__errordecl(__fgets_too_small_error, "fgets called with size less than zero");
-
-#if !defined(__clang__)
-
-__BIONIC_FORTIFY_INLINE
-char *fgets(char* dest, int size, FILE* stream) {
-    size_t bos = __bos(dest);
-
-    // Compiler can prove, at compile time, that the passed in size
-    // is always negative. Force a compiler error.
-    if (__builtin_constant_p(size) && (size < 0)) {
-        __fgets_too_small_error();
-    }
-
-    // Compiler doesn't know destination size. Don't call __fgets_chk
-    if (bos == __BIONIC_FORTIFY_UNKNOWN_SIZE) {
-        return __fgets_real(dest, size, stream);
-    }
-
-    // Compiler can prove, at compile time, that the passed in size
-    // is always <= the actual object size. Don't call __fgets_chk
-    if (__builtin_constant_p(size) && (size <= (int) bos)) {
-        return __fgets_real(dest, size, stream);
-    }
-
-    // Compiler can prove, at compile time, that the passed in size
-    // is always > the actual object size. Force a compiler error.
-    if (__builtin_constant_p(size) && (size > (int) bos)) {
-        __fgets_too_big_error();
-    }
-
-    return __fgets_chk(dest, size, stream, bos);
-}
-
-#endif /* !defined(__clang__) */
-
-__END_DECLS
-
-#endif /* defined(__BIONIC_FORTIFY) */
-
-#endif /* _STDIO_H_ */
diff --git a/lib/gcc/x86_64-linux-android/4.9/include/arm_neon.h b/lib/gcc/x86_64-linux-android/4.9/include/arm_neon.h
index 873b7d6..10944f3 100644
--- a/lib/gcc/x86_64-linux-android/4.9/include/arm_neon.h
+++ b/lib/gcc/x86_64-linux-android/4.9/include/arm_neon.h
@@ -30,59 +30,62 @@
 //*****************************************************************************************
 
 //!!!!!!!  To use this file in your project that uses ARM NEON intinsics just keep arm_neon.h included and complile it as usual.
-//!!!!!!!  Please pay attention at #define USE_SSSE3 and USE_SSE4 below - you need to define them for newest Intel platforms for
-//!!!!!!!  greater performance. It can be done by -mssse3 or -msse4.2 (which also implies -mssse3) compiler switch.
+//!!!!!!!  Please pay attention at USE_SSE4 below - you need to define it for newest Intel platforms for
+//!!!!!!!  greater performance. It can be done by -msse4.2 compiler switch.
 
 #ifndef NEON2SSE_H
 #define NEON2SSE_H
 
 #ifndef USE_SSE4
-    #if defined(__SSE4_2__)
-        #define USE_SSE4
-        #define USE_SSSE3
-    #endif
+#if defined(__SSE4_2__)
+    #define USE_SSE4
 #endif
-
-#ifndef USE_SSSE3
-    #if defined(__SSSE3__)
-        #define USE_SSSE3
-    #endif
 #endif
 
 #include <xmmintrin.h>     //SSE
 #include <emmintrin.h>     //SSE2
 #include <pmmintrin.h>     //SSE3
-
-#ifdef USE_SSSE3
-    #include <tmmintrin.h>     //SSSE3
-#else
-# warning "Some functions require SSSE3 or higher."
+#include <tmmintrin.h>     //SSSE3
+#ifdef USE_SSE4
+#include <smmintrin.h> //SSE4.1
+#include <nmmintrin.h> //SSE4.2
 #endif
 
-#ifdef USE_SSE4
-    #include <smmintrin.h>     //SSE4.1
-    #include <nmmintrin.h>     //SSE4.2
+
+//***************  functions and data attributes, compiler dependent  *********************************
+//***********************************************************************************
+#ifdef __GNUC__
+#define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#define _NEON2SSE_ALIGN_16  __attribute__((aligned(16)))
+#define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+#if _GCC_VERSION <  40500
+    #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated)) function
+#else
+    #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated(explanation))) function
+#endif
+#if defined(__x86_64__)
+    #define _NEON2SSE_64BIT  __x86_64__
+#endif
+#else
+#define _NEON2SSE_ALIGN_16  __declspec(align(16))
+#define _NEON2SSE_INLINE __inline
+#if defined(_MSC_VER)|| defined (__INTEL_COMPILER)
+    #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
+#if defined(_M_X64)
+        #define _NEON2SSE_64BIT  _M_X64
+#endif
+#else
+    #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
+#endif
+#endif
+
+#if defined  (_NEON2SSE_64BIT) && defined (USE_SSE4)
+    #define _NEON2SSE_64BIT_SSE4
 #endif
 
 /*********************************************************************************************************************/
 //    data types conversion
 /*********************************************************************************************************************/
-
-typedef __m128 float32x4_t;
-
-typedef __m128 float16x8_t;         //not supported by IA, for compartibility
-
-typedef __m128i int8x16_t;
-typedef __m128i int16x8_t;
-typedef __m128i int32x4_t;
-typedef __m128i int64x2_t;
-typedef __m128i uint8x16_t;
-typedef __m128i uint16x8_t;
-typedef __m128i uint32x4_t;
-typedef __m128i uint64x2_t;
-typedef __m128i poly8x16_t;
-typedef __m128i poly16x8_t;
-
 #if defined(_MSC_VER) && (_MSC_VER < 1300)
     typedef signed char int8_t;
     typedef unsigned char uint8_t;
@@ -100,18 +103,59 @@
     typedef signed __int32 int32_t;
     typedef unsigned __int32 uint32_t;
 
-typedef signed long long int64_t;
-typedef unsigned long long uint64_t;
+    typedef signed long long int64_t;
+    typedef unsigned long long uint64_t;
 #else
-    #include <stdint.h>
-    #include <limits.h>
+#include <stdint.h>
+#include <limits.h>
 #endif
+
+typedef union   __m64_128 {
+    uint64_t m64_u64[1];
+    float m64_f32[2];
+    int8_t m64_i8[8];
+    int16_t m64_i16[4];
+    int32_t m64_i32[2];
+    int64_t m64_i64[1];
+    uint8_t m64_u8[8];
+    uint16_t m64_u16[4];
+    uint32_t m64_u32[2];
+} __m64_128;
+
+typedef __m64_128 int8x8_t;
+typedef __m64_128 uint8x8_t;
+typedef __m64_128 int16x4_t;
+typedef __m64_128 uint16x4_t;
+typedef __m64_128 int32x2_t;
+typedef __m64_128 uint32x2_t;
+typedef __m64_128 int64x1_t;
+typedef __m64_128 uint64x1_t;
+typedef __m64_128 poly8x8_t;
+typedef __m64_128 poly16x4_t;
+
+typedef __m64_128 float32x2_t;
+typedef __m128 float32x4_t;
+
+typedef __m128 float16x4_t; //not supported by IA, for compatibility
+typedef __m128 float16x8_t; //not supported by IA, for compatibility
+
+typedef __m128i int8x16_t;
+typedef __m128i int16x8_t;
+typedef __m128i int32x4_t;
+typedef __m128i int64x2_t;
+typedef __m128i uint8x16_t;
+typedef __m128i uint16x8_t;
+typedef __m128i uint32x4_t;
+typedef __m128i uint64x2_t;
+typedef __m128i poly8x16_t;
+typedef __m128i poly16x8_t;
+
 #if defined(_MSC_VER)
-#define SINT_MIN     (-2147483647 - 1)    /* min signed int value */
-#define SINT_MAX       2147483647         /* max signed int value */
+    #define SINT_MIN     (-2147483647 - 1) /* min signed int value */
+    #define SINT_MAX       2147483647 /* max signed int value */
 #else
-#define SINT_MIN     INT_MIN              /* min signed int value */
-#define SINT_MAX     INT_MAX              /* max signed int value */
+    #define SINT_MIN     INT_MIN /* min signed int value */
+    #define SINT_MAX     INT_MAX /* max signed int value */
 #endif
 
 typedef   float float32_t;
@@ -122,10 +166,9 @@
 typedef  uint8_t poly8_t;
 typedef  uint16_t poly16_t;
 
+
 //MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type  as functions arguments resulting in
 //error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned.  To avoid it we need the special trick for functions that use these types
-
-//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!!
 struct int8x16x2_t {
     int8x16_t val[2];
 };
@@ -138,16 +181,29 @@
 struct int64x2x2_t {
     int64x2_t val[2];
 };
+//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!!
+struct int8x8x2_t {
+    int8x8_t val[2];
+};
+struct int16x4x2_t {
+    int16x4_t val[2];
+};
+struct int32x2x2_t {
+    int32x2_t val[2];
+};
+struct int64x1x2_t {
+    int64x1_t val[2];
+};
 
-typedef struct int8x16x2_t int8x16x2_t;         //for C compilers to make them happy
-typedef struct int16x8x2_t int16x8x2_t;         //for C compilers to make them happy
-typedef struct int32x4x2_t int32x4x2_t;         //for C compilers to make them happy
-typedef struct int64x2x2_t int64x2x2_t;         //for C compilers to make them happy
-//to avoid pointers conversion
-typedef  int8x16x2_t int8x8x2_t;
-typedef  int16x8x2_t int16x4x2_t;
-typedef  int32x4x2_t int32x2x2_t;
-typedef  int64x2x2_t int64x1x2_t;
+typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
+typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
+typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
+typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
+
+typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
+typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
+typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
+typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
 
 /* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */
 typedef struct int8x16x2_t uint8x16x2_t;
@@ -157,12 +213,12 @@
 typedef struct int8x16x2_t poly8x16x2_t;
 typedef struct int16x8x2_t poly16x8x2_t;
 
-typedef  int8x8x2_t uint8x8x2_t;
-typedef  int16x4x2_t uint16x4x2_t;
-typedef  int32x2x2_t uint32x2x2_t;
-typedef  int64x1x2_t uint64x1x2_t;
-typedef  int8x8x2_t poly8x8x2_t;
-typedef  int16x4x2_t poly16x4x2_t;
+typedef struct int8x8x2_t uint8x8x2_t;
+typedef struct int16x4x2_t uint16x4x2_t;
+typedef struct int32x2x2_t uint32x2x2_t;
+typedef struct int64x1x2_t uint64x1x2_t;
+typedef struct int8x8x2_t poly8x8x2_t;
+typedef struct int16x4x2_t poly16x4x2_t;
 
 //float
 struct float32x4x2_t {
@@ -171,9 +227,13 @@
 struct float16x8x2_t {
     float16x8_t val[2];
 };
-typedef struct float32x4x2_t float32x4x2_t;         //for C compilers to make them happy
-typedef struct float16x8x2_t float16x8x2_t;         //for C compilers to make them happy
-typedef  float32x4x2_t float32x2x2_t;
+struct float32x2x2_t {
+    float32x2_t val[2];
+};
+
+typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
+typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
+typedef struct  float32x2x2_t float32x2x2_t; //for C compilers to make them happy
 typedef  float16x8x2_t float16x4x2_t;
 
 //4
@@ -190,22 +250,36 @@
     int64x2_t val[4];
 };
 
-typedef struct int8x16x4_t int8x16x4_t;         //for C compilers to make them happy
-typedef struct int16x8x4_t int16x8x4_t;         //for C compilers to make them happy
-typedef struct int32x4x4_t int32x4x4_t;         //for C compilers to make them happy
-typedef struct int64x2x4_t int64x2x4_t;         //for C compilers to make them happy
-typedef  int8x16x4_t int8x8x4_t;
-typedef  int16x8x4_t int16x4x4_t;
-typedef  int32x4x4_t int32x2x4_t;
-typedef  int64x2x4_t int64x1x4_t;
+struct int8x8x4_t {
+    int8x8_t val[4];
+};
+struct int16x4x4_t {
+    int16x4_t val[4];
+};
+struct int32x2x4_t {
+    int32x2_t val[4];
+};
+struct int64x1x4_t {
+    int64x1_t val[4];
+};
+
+typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
+typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
+typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
+typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
+
+typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
+typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
+typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
+typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
 
 /* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
-typedef int8x8x4_t uint8x8x4_t;
-typedef int16x4x4_t uint16x4x4_t;
-typedef int32x2x4_t uint32x2x4_t;
-typedef int64x1x4_t uint64x1x4_t;
-typedef uint8x8x4_t poly8x8x4_t;
-typedef uint16x4x4_t poly16x4x4_t;
+typedef struct int8x8x4_t uint8x8x4_t;
+typedef struct int16x4x4_t uint16x4x4_t;
+typedef struct int32x2x4_t uint32x2x4_t;
+typedef struct int64x1x4_t uint64x1x4_t;
+typedef struct int8x8x4_t poly8x8x4_t;
+typedef struct int16x4x4_t poly16x4x4_t;
 
 typedef struct int8x16x4_t uint8x16x4_t;
 typedef struct int16x8x4_t uint16x8x4_t;
@@ -220,10 +294,13 @@
 struct float16x8x4_t {
     float16x8_t val[4];
 };
+struct float32x2x4_t {
+    float32x2_t val[4];
+};
 
-typedef struct float32x4x4_t float32x4x4_t;         //for C compilers to make them happy
-typedef struct float16x8x4_t float16x8x4_t;         //for C compilers to make them happy
-typedef  float32x4x4_t float32x2x4_t;
+typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
+typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
+typedef struct  float32x2x4_t float32x2x4_t; //for C compilers to make them happy
 typedef  float16x8x4_t float16x4x4_t;
 
 //3
@@ -240,14 +317,28 @@
     int8x16_t val[3];
 };
 
-typedef struct int16x8x3_t int16x8x3_t;         //for C compilers to make them happy
-typedef struct int32x4x3_t int32x4x3_t;         //for C compilers to make them happy
-typedef struct int64x2x3_t int64x2x3_t;         //for C compilers to make them happy
-typedef struct int8x16x3_t int8x16x3_t;         //for C compilers to make them happy
-typedef  int16x8x3_t int16x4x3_t;
-typedef  int32x4x3_t int32x2x3_t;
-typedef  int64x2x3_t int64x1x3_t;
-typedef  int8x16x3_t int8x8x3_t;
+struct int16x4x3_t {
+    int16x4_t val[3];
+};
+struct int32x2x3_t {
+    int32x2_t val[3];
+};
+struct int64x1x3_t {
+    int64x1_t val[3];
+};
+struct int8x8x3_t {
+    int8x8_t val[3];
+};
+typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
+typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
+typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
+typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
+
+typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
+typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
+typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
+typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
+
 
 /* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
 typedef struct int8x16x3_t uint8x16x3_t;
@@ -256,33 +347,55 @@
 typedef struct int64x2x3_t uint64x2x3_t;
 typedef struct int8x16x3_t poly8x16x3_t;
 typedef struct int16x8x3_t poly16x8x3_t;
-typedef int8x8x3_t uint8x8x3_t;
-typedef int16x4x3_t uint16x4x3_t;
-typedef int32x2x3_t uint32x2x3_t;
-typedef int64x1x3_t uint64x1x3_t;
-typedef int8x8x3_t poly8x8x3_t;
-typedef int16x4x3_t poly16x4x3_t;
+typedef struct  int8x8x3_t uint8x8x3_t;
+typedef struct  int16x4x3_t uint16x4x3_t;
+typedef struct  int32x2x3_t uint32x2x3_t;
+typedef struct  int64x1x3_t uint64x1x3_t;
+typedef struct  int8x8x3_t poly8x8x3_t;
+typedef struct  int16x4x3_t poly16x4x3_t;
 
 //float
 struct float32x4x3_t {
     float32x4_t val[3];
 };
+struct float32x2x3_t {
+    float32x2_t val[3];
+};
 struct float16x8x3_t {
     float16x8_t val[3];
 };
 
-typedef struct float32x4x3_t float32x4x3_t;         //for C compilers to make them happy
-typedef struct float16x8x3_t float16x8x3_t;         //for C compilers to make them happy
-typedef  float32x4x3_t float32x2x3_t;
+typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
+typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
+typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
 typedef  float16x8x3_t float16x4x3_t;
 
+
 //****************************************************************************
 //****** Porting auxiliary macros ********************************************
-#define _M128i(a) (*(__m128i*)&(a))
-#define _M128d(a) (*(__m128d*)&(a))
-#define _M128(a) (*(__m128*)&(a))
+
+//** floating point related macros **
+#define _M128i(a) _mm_castps_si128(a)
+#define _M128(a) _mm_castsi128_ps(a)
+//here the most performance effective implementation is compiler and 32/64 bits build dependent
+#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER  >= 1500) )
+
+        #define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
+        #define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
+        #define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
+#else
+   //for 32bit gcc and Microsoft compilers builds
+    #define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
+    #define _M64(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), inp)
+    #define _M64f(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
+#endif
+#define _pM128(a) _mm_castsi128_ps(_pM128i(a))
+
+#define return64(a)  _M64(res64,a); return res64;
+#define return64f(a)  _M64f(res64,a); return res64;
+
 #define _Ui64(a) (*(uint64_t*)&(a))
-#define _UNSIGNED_T(a) u##a
+#define _UNSIGNED_T(a) u ## a
 
 #define _SIGNBIT64 ((uint64_t)1 << 63)
 #define _SWAP_HI_LOW32  (2 | (3 << 2) | (0 << 4) | (1 << 6))
@@ -291,1145 +404,1854 @@
 #define  _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
 #define  _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
 
-//***************  functions attributes  ********************************************
-//***********************************************************************************
-#ifdef __GNUC__
-    #define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-    #define _NEON2SSE_ALIGN_16  __attribute__((aligned(16)))
-    #define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-    #if _GCC_VERSION <  40500
-        #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated)) function
-    #else
-        #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated(explanation))) function
-    #endif
-#elif defined(_MSC_VER)|| defined (__INTEL_COMPILER)
-    #define _NEON2SSE_ALIGN_16  __declspec(align(16))
-    #define _NEON2SSE_INLINE __inline
-    #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
-#else
-    #define _NEON2SSE_ALIGN_16  __declspec(align(16))
-    #define _NEON2SSE_INLINE inline
-    #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
-#endif
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 #define __constrange(min,max)  const
 #define __transfersize(size)
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
+
 //*************************************************************************
 //*************************************************************************
 //*********  Functions declarations as declared in original arm_neon.h *****
 //*************************************************************************
 //Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
-
-int8x16_t vaddq_s8(int8x16_t a, int8x16_t b);         // VADD.I8 q0,q0,q0
-int16x8_t vaddq_s16(int16x8_t a, int16x8_t b);         // VADD.I16 q0,q0,q0
-int32x4_t vaddq_s32(int32x4_t a, int32x4_t b);         // VADD.I32 q0,q0,q0
-int64x2_t vaddq_s64(int64x2_t a, int64x2_t b);         // VADD.I64 q0,q0,q0
-float32x4_t vaddq_f32(float32x4_t a, float32x4_t b);         // VADD.F32 q0,q0,q0
-uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b);         // VADD.I8 q0,q0,q0
-uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b);         // VADD.I16 q0,q0,q0
-uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b);         // VADD.I32 q0,q0,q0
-uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b);         // VADD.I64 q0,q0,q0
+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
+int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
+uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
+uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
+uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
+uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
 //Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
-
+int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
+int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
+int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
 //Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i]
-
+int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
+int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
+int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
 //Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
-
-int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b);         // VHADD.S8 q0,q0,q0
-int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b);         // VHADD.S16 q0,q0,q0
-int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b);         // VHADD.S32 q0,q0,q0
-uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b);         // VHADD.U8 q0,q0,q0
-uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b);         // VHADD.U16 q0,q0,q0
-uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b);         // VHADD.U32 q0,q0,q0
+int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
+int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
+int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
+uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
+uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
+uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
 //Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
-
-int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b);         // VRHADD.S8 q0,q0,q0
-int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b);         // VRHADD.S16 q0,q0,q0
-int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b);         // VRHADD.S32 q0,q0,q0
-uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b);         // VRHADD.U8 q0,q0,q0
-uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b);         // VRHADD.U16 q0,q0,q0
-uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b);         // VRHADD.U32 q0,q0,q0
+int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
+int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
+int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
+uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
+uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
 //Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
-
-int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b);         // VQADD.S8 q0,q0,q0
-int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b);         // VQADD.S16 q0,q0,q0
-int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b);         // VQADD.S32 q0,q0,q0
-int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b);         // VQADD.S64 q0,q0,q0
-uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b);         // VQADD.U8 q0,q0,q0
-uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b);         // VQADD.U16 q0,q0,q0
-uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b);         // VQADD.U32 q0,q0,q0
-uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b);         // VQADD.U64 q0,q0,q0
+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
+int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
+int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
+uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
 //Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i]
-
+int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
+int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
+int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
+uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
 //Vector rounding add high half: vraddhn
-
+int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
+int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
+int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
+uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
 //Multiplication
 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
-
-int8x16_t vmulq_s8(int8x16_t a, int8x16_t b);         // VMUL.I8 q0,q0,q0
-int16x8_t vmulq_s16(int16x8_t a, int16x8_t b);         // VMUL.I16 q0,q0,q0
-int32x4_t vmulq_s32(int32x4_t a, int32x4_t b);         // VMUL.I32 q0,q0,q0
-float32x4_t vmulq_f32(float32x4_t a, float32x4_t b);         // VMUL.F32 q0,q0,q0
-uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b);         // VMUL.I8 q0,q0,q0
-uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b);         // VMUL.I16 q0,q0,q0
-uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b);         // VMUL.I32 q0,q0,q0
-poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b);         // VMUL.P8 q0,q0,q0
+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
+int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
+int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
+uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
+//multiply lane
+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
+int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
 //Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
-
-int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLA.I8 q0,q0,q0
-int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLA.I16 q0,q0,q0
-int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLA.I32 q0,q0,q0
-float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLA.F32 q0,q0,q0
-uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLA.I8 q0,q0,q0
-uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLA.I16 q0,q0,q0
-uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLA.I32 q0,q0,q0
+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
+int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
+uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
+uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
 //Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
-
+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
 //Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
-
-int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLS.I8 q0,q0,q0
-int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLS.I16 q0,q0,q0
-int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLS.I32 q0,q0,q0
-float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLS.F32 q0,q0,q0
-uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLS.I8 q0,q0,q0
-uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLS.I16 q0,q0,q0
-uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLS.I32 q0,q0,q0
+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
+int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
+uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
+uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
 //Vector multiply subtract long
-
+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
 //Vector saturating doubling multiply high
-
-int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b);         // VQDMULH.S16 q0,q0,q0
-int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b);         // VQDMULH.S32 q0,q0,q0
+int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
 //Vector saturating rounding doubling multiply high
-
-int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b);         // VQRDMULH.S16 q0,q0,q0
-int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b);         // VQRDMULH.S32 q0,q0,q0
+int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
 //Vector saturating doubling multiply accumulate long
-
+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
 //Vector saturating doubling multiply subtract long
-
+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
 //Vector long multiply
-
+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
 //Vector saturating doubling long multiply
-
+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
 //Subtraction
 //Vector subtract
-
-int8x16_t vsubq_s8(int8x16_t a, int8x16_t b);         // VSUB.I8 q0,q0,q0
-int16x8_t vsubq_s16(int16x8_t a, int16x8_t b);         // VSUB.I16 q0,q0,q0
-int32x4_t vsubq_s32(int32x4_t a, int32x4_t b);         // VSUB.I32 q0,q0,q0
-int64x2_t vsubq_s64(int64x2_t a, int64x2_t b);         // VSUB.I64 q0,q0,q0
-float32x4_t vsubq_f32(float32x4_t a, float32x4_t b);         // VSUB.F32 q0,q0,q0
-uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b);         // VSUB.I8 q0,q0,q0
-uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b);         // VSUB.I16 q0,q0,q0
-uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b);         // VSUB.I32 q0,q0,q0
-uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b);         // VSUB.I64 q0,q0,q0
+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
+int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
+uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
 //Vector long subtract: vsub -> Vr[i]:=Va[i]+Vb[i]
-
+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
 //Vector wide subtract: vsub -> Vr[i]:=Va[i]+Vb[i]
-
+int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
 //Vector saturating subtract
-
-int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b);         // VQSUB.S8 q0,q0,q0
-int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b);         // VQSUB.S16 q0,q0,q0
-int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b);         // VQSUB.S32 q0,q0,q0
-int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b);         // VQSUB.S64 q0,q0,q0
-uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b);         // VQSUB.U8 q0,q0,q0
-uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b);         // VQSUB.U16 q0,q0,q0
-uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b);         // VQSUB.U32 q0,q0,q0
-uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b);         // VQSUB.U64 q0,q0,q0
+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
+int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
+uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
+uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
 //Vector halving subtract
-
-int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b);         // VHSUB.S8 q0,q0,q0
-int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b);         // VHSUB.S16 q0,q0,q0
-int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b);         // VHSUB.S32 q0,q0,q0
-uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b);         // VHSUB.U8 q0,q0,q0
-uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b);         // VHSUB.U16 q0,q0,q0
-uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b);         // VHSUB.U32 q0,q0,q0
+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
+int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
+int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
+uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
+uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
+uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
 //Vector subtract high half
-
+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
 //Vector rounding subtract high half
-
+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
 //Comparison
 //Vector compare equal
-
-uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b);         // VCEQ.I8 q0, q0, q0
-uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b);         // VCEQ.I16 q0, q0, q0
-uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b);         // VCEQ.I32 q0, q0, q0
-uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b);         // VCEQ.F32 q0, q0, q0
-uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b);         // VCEQ.I8 q0, q0, q0
-uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b);         // VCEQ.I16 q0, q0, q0
-uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b);         // VCEQ.I32 q0, q0, q0
-uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b);         // VCEQ.I8 q0, q0, q0
+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
+uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
 //Vector compare greater-than or equal
-
-uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
-uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
-uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
-uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
-uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
-uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.U16 q0, q0, q0
-uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
+uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
+uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
+uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
+uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
+uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
 //Vector compare less-than or equal
-
-uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
-uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
-uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
-uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
-uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
-uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.U16 q0, q0, q0
-uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
+uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
+uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
+uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
+uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
+uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
 //Vector compare greater-than
-
-uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
-uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
-uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
-uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
-uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
-uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.U16 q0, q0, q0
-uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
+uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
 //Vector compare less-than
-
-uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
-uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
-uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
-uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
-uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
-uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.U16 q0, q0, q0
-uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
+uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
 //Vector compare absolute greater-than or equal
-
-uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
 //Vector compare absolute less-than or equal
-
-uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
 //Vector compare absolute greater-than
-
-uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
 //Vector compare absolute less-than
-
-uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
 //Vector test bits
-
-uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b);         // VTST.8 q0, q0, q0
-uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b);         // VTST.16 q0, q0, q0
-uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b);         // VTST.32 q0, q0, q0
-uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b);         // VTST.8 q0, q0, q0
-uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b);         // VTST.16 q0, q0, q0
-uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b);         // VTST.32 q0, q0, q0
-uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b);         // VTST.8 q0, q0, q0
+uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
+uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
+uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
+uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
+uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
+uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
 //Absolute difference
 //Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
-
-int8x16_t vabdq_s8(int8x16_t a, int8x16_t b);         // VABD.S8 q0,q0,q0
-int16x8_t vabdq_s16(int16x8_t a, int16x8_t b);         // VABD.S16 q0,q0,q0
-int32x4_t vabdq_s32(int32x4_t a, int32x4_t b);         // VABD.S32 q0,q0,q0
-uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b);         // VABD.U8 q0,q0,q0
-uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b);         // VABD.U16 q0,q0,q0
-uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b);         // VABD.U32 q0,q0,q0
-float32x4_t vabdq_f32(float32x4_t a, float32x4_t b);         // VABD.F32 q0,q0,q0
+int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
+int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
+int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
+uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
+uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
+uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
 //Absolute difference - long
-
+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
 //Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
-
-int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VABA.S8 q0,q0,q0
-int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VABA.S16 q0,q0,q0
-int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VABA.S32 q0,q0,q0
-uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VABA.U8 q0,q0,q0
-uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VABA.U16 q0,q0,q0
-uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VABA.U32 q0,q0,q0
+int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
+int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
+int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
+uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
+uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
+uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
 //Absolute difference and accumulate - long
-
+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
+int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
 //Max/Min
 //vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
-
-int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b);         // VMAX.S8 q0,q0,q0
-int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b);         // VMAX.S16 q0,q0,q0
-int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b);         // VMAX.S32 q0,q0,q0
-uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b);         // VMAX.U8 q0,q0,q0
-uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b);         // VMAX.U16 q0,q0,q0
-uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b);         // VMAX.U32 q0,q0,q0
-float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b);         // VMAX.F32 q0,q0,q0
+int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
+int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
+uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
 //vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
-
-int8x16_t vminq_s8(int8x16_t a, int8x16_t b);         // VMIN.S8 q0,q0,q0
-int16x8_t vminq_s16(int16x8_t a, int16x8_t b);         // VMIN.S16 q0,q0,q0
-int32x4_t vminq_s32(int32x4_t a, int32x4_t b);         // VMIN.S32 q0,q0,q0
-uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b);         // VMIN.U8 q0,q0,q0
-uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b);         // VMIN.U16 q0,q0,q0
-uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b);         // VMIN.U32 q0,q0,q0
-float32x4_t vminq_f32(float32x4_t a, float32x4_t b);         // VMIN.F32 q0,q0,q0
+int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
+int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
+uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
+int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
+int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
+int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
 //Pairwise addition
 //Pairwise add
-
+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
+int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
+int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
+float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
 //Long pairwise add
-
-int16x8_t vpaddlq_s8(int8x16_t a);         // VPADDL.S8 q0,q0
-int32x4_t vpaddlq_s16(int16x8_t a);         // VPADDL.S16 q0,q0
-int64x2_t vpaddlq_s32(int32x4_t a);         // VPADDL.S32 q0,q0
-uint16x8_t vpaddlq_u8(uint8x16_t a);         // VPADDL.U8 q0,q0
-uint32x4_t vpaddlq_u16(uint16x8_t a);         // VPADDL.U16 q0,q0
-uint64x2_t vpaddlq_u32(uint32x4_t a);         // VPADDL.U32 q0,q0
+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
 //Long pairwise add and accumulate
-
-int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b);         // VPADAL.S8 q0,q0
-int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b);         // VPADAL.S16 q0,q0
-int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b);         // VPADAL.S32 q0,q0
-uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b);         // VPADAL.U8 q0,q0
-uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b);         // VPADAL.U16 q0,q0
-uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b);         // VPADAL.U32 q0,q0
+int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
+int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
+uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
+uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
 //Folding maximum vpmax -> takes maximum of adjacent pairs
-
+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
 //Folding minimum vpmin -> takes minimum of adjacent pairs
-
+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
 //Reciprocal/Sqrt
-
-float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b);         // VRECPS.F32 q0, q0, q0
-
-float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b);         // VRSQRTS.F32 q0, q0, q0
+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
 //Shifts by signed variable
 //Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
-
-int8x16_t vshlq_s8(int8x16_t a, int8x16_t b);         // VSHL.S8 q0,q0,q0
-int16x8_t vshlq_s16(int16x8_t a, int16x8_t b);         // VSHL.S16 q0,q0,q0
-int32x4_t vshlq_s32(int32x4_t a, int32x4_t b);         // VSHL.S32 q0,q0,q0
-int64x2_t vshlq_s64(int64x2_t a, int64x2_t b);         // VSHL.S64 q0,q0,q0
-uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b);         // VSHL.U8 q0,q0,q0
-uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b);         // VSHL.U16 q0,q0,q0
-uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b);         // VSHL.U32 q0,q0,q0
-uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b);         // VSHL.U64 q0,q0,q0
+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
 //Vector saturating shift left: (negative values shift right)
-
-int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b);         // VQSHL.S8 q0,q0,q0
-int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b);         // VQSHL.S16 q0,q0,q0
-int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b);         // VQSHL.S32 q0,q0,q0
-int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b);         // VQSHL.S64 q0,q0,q0
-uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b);         // VQSHL.U8 q0,q0,q0
-uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b);         // VQSHL.U16 q0,q0,q0
-uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b);         // VQSHL.U32 q0,q0,q0
-uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b);         // VQSHL.U64 q0,q0,q0
+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
 //Vector rounding shift left: (negative values shift right)
-
-int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b);         // VRSHL.S8 q0,q0,q0
-int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b);         // VRSHL.S16 q0,q0,q0
-int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b);         // VRSHL.S32 q0,q0,q0
-int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b);         // VRSHL.S64 q0,q0,q0
-uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b);         // VRSHL.U8 q0,q0,q0
-uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b);         // VRSHL.U16 q0,q0,q0
-uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b);         // VRSHL.U32 q0,q0,q0
-uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b);         // VRSHL.U64 q0,q0,q0
+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
 //Vector saturating rounding shift left: (negative values shift right)
-
-int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b);         // VQRSHL.S8 q0,q0,q0
-int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b);         // VQRSHL.S16 q0,q0,q0
-int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b);         // VQRSHL.S32 q0,q0,q0
-int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b);         // VQRSHL.S64 q0,q0,q0
-uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b);         // VQRSHL.U8 q0,q0,q0
-uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b);         // VQRSHL.U16 q0,q0,q0
-uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b);         // VQRSHL.U32 q0,q0,q0
-uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b);         // VQRSHL.U64 q0,q0,q0
+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
 //Shifts by a constant
 //Vector shift right by constant
-
-int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VSHR.S8 q0,q0,#8
-int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VSHR.S16 q0,q0,#16
-int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VSHR.S32 q0,q0,#32
-int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VSHR.S64 q0,q0,#64
-uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VSHR.U8 q0,q0,#8
-uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VSHR.U16 q0,q0,#16
-uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VSHR.U32 q0,q0,#32
-uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VSHR.U64 q0,q0,#64
+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
+int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
+int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
+uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
+uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
+uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
 //Vector shift left by constant
-
-int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
-int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
-int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
-int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
-uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
-uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
-uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
-uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
 //Vector rounding shift right by constant
-
-int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VRSHR.S8 q0,q0,#8
-int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VRSHR.S16 q0,q0,#16
-int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VRSHR.S32 q0,q0,#32
-int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VRSHR.S64 q0,q0,#64
-uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VRSHR.U8 q0,q0,#8
-uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VRSHR.U16 q0,q0,#16
-uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VRSHR.U32 q0,q0,#32
-uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VRSHR.U64 q0,q0,#64
+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
+int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
+int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
+uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
+uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
 //Vector shift right by constant and accumulate
-
-int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRA.S8 q0,q0,#8
-int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRA.S16 q0,q0,#16
-int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRA.S32 q0,q0,#32
-int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRA.S64 q0,q0,#64
-uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRA.U8 q0,q0,#8
-uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRA.U16 q0,q0,#16
-uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRA.U32 q0,q0,#32
-uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRA.U64 q0,q0,#64
+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
+uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
 //Vector rounding shift right by constant and accumulate
-
-int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VRSRA.S8 q0,q0,#8
-int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VRSRA.S16 q0,q0,#16
-int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VRSRA.S32 q0,q0,#32
-int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VRSRA.S64 q0,q0,#64
-uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VRSRA.U8 q0,q0,#8
-uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VRSRA.U16 q0,q0,#16
-uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VRSRA.U32 q0,q0,#32
-uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VRSRA.U64 q0,q0,#64
+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
 //Vector saturating shift left by constant
-
-int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHL.S8 q0,q0,#0
-int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHL.S16 q0,q0,#0
-int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHL.S32 q0,q0,#0
-int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHL.S64 q0,q0,#0
-uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VQSHL.U8 q0,q0,#0
-uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VQSHL.U16 q0,q0,#0
-uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VQSHL.U32 q0,q0,#0
-uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VQSHL.U64 q0,q0,#0
+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
+int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
+uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
 //Vector signed->unsigned saturating shift left by constant
-
-uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHLU.S8 q0,q0,#0
-uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHLU.S16 q0,q0,#0
-uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHLU.S32 q0,q0,#0
-uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHLU.S64 q0,q0,#0
+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
+uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
 //Vector narrowing shift right by constant
-
+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
 //Vector signed->unsigned narrowing saturating shift right by constant
-
+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
 //Vector signed->unsigned rounding narrowing saturating shift right by constant
-
+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
 //Vector narrowing saturating shift right by constant
-
+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
+uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
 //Vector rounding narrowing shift right by constant
-
+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
 //Vector rounding narrowing saturating shift right by constant
-
+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
+int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
 //Vector widening shift left by constant
-
+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
 //Shifts with insert
 //Vector shift right and insert
-
-int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
-int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
-int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
-int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
-uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
-uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
-uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
-uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
-poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
-poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
+int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
 //Vector shift left and insert
-
-int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
-int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
-int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
-int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
-uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
-uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
-uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
-uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
-poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
-poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
+int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
 //Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
 //Load a single vector from memory
-uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
-uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
-uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
-uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
-int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
-int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
-int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
-int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
-float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr);         // VLD1.16 {d0, d1}, [r0]
-float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
-poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
-poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
-
+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
+poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
 //Load a single lane from memory
-uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane);         //VLD1.8 {d0[0]}, [r0]
-uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
-uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
-uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane);         // VLD1.64 {d0}, [r0]
-int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane);         //VLD1.8 {d0[0]}, [r0]
-int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane);         //VLD1.16 {d0[0]}, [r0]
-int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane);         //VLD1.32 {d0[0]}, [r0]
-float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
-int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane);         //VLD1.64 {d0}, [r0]
-poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane);         //VLD1.8 {d0[0]}, [r0]
-poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
-
+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
+float16x4_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
 //Load all lanes of vector with same value from memory
-uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
-uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
-uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
-uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr);         // VLD1.64 {d0}, [r0]
-int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
-int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
-int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
-int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr);         // VLD1.64 {d0}, [r0]
-float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr);         // VLD1.16 {d0[]}, [r0]
-float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
-poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
-poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
-
+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
 //Store a single vector or lane. Stores all lanes or a single lane of a vector.
 //Store a single vector into memory
-void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val);         // VST1.8 {d0, d1}, [r0]
-void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val);         // VST1.16 {d0, d1}, [r0]
-void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val);         // VST1.32 {d0, d1}, [r0]
-void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val);         // VST1.64 {d0, d1}, [r0]
-void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val);         // VST1.8 {d0, d1}, [r0]
-void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val);         // VST1.16 {d0, d1}, [r0]
-void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val);         // VST1.32 {d0, d1}, [r0]
-void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val);         // VST1.64 {d0, d1}, [r0]
-void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val);         // VST1.16 {d0, d1}, [r0]
-void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val);         // VST1.32 {d0, d1}, [r0]
-void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val);         // VST1.8 {d0, d1}, [r0]
-void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val);         // VST1.16 {d0, d1}, [r0]
-
+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
 //Store a lane of a vector into memory
 //Loads of an N-element structure
 //Load N-element structure from memory
-uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
-uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
-uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
-int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
-int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
-int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
-float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr);         // VLD2.16 {d0, d2}, [r0]
-float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
-poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
-poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
-uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
-uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
-uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
-uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
-int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
-int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
-int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
-int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
 //float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
-float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
-poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
-poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
-uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
-uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
-uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
-int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
-int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
-int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
-float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
-float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
-poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
-poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
-uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
-uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
-uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
-uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
-int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
-int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
-int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
-int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
-float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
-float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
-poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
-poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
-uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
-uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
-uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
-int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
-int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
-int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
-float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
-float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
-poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
-poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
-uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
-uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
-uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
-uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
-int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
-int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
-int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
-int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
-float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
-float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
-poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
-poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
 //Load all lanes of N-element structure with same value from memory
-uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
-uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
-uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
-uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
-int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
-int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
-int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
-int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
 //float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
-float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
-poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
-poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
-uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
-uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
-uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
-uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
-int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
-int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
-int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
-int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
-float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
-float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
-poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
-poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
-uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
-int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
-float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
 //Load a single lane of N-element structure from memory
 //the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
-uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
-uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane);         // VLD2.32 {d0[0], d2[0]}, [r0]
-int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
-int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane);         // VLD2.32 {d0[0], d2[0]}, [r0]
-float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
-float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane);         // VLD2.32 {d0[0], d2[0]}, [r0]
-poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
-uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane);         //VLD2.8 {d0[0], d1[0]}, [r0]
-uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
-uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
-int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane);         //VLD2.8 {d0[0], d1[0]}, [r0]
-int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane);         //VLD2.16 {d0[0], d1[0]}, [r0]
-int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane);         //VLD2.32 {d0[0], d1[0]}, [r0]
+uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
+poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
+uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
 //float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
-float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
-poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane);         //VLD2.8 {d0[0], d1[0]}, [r0]
-poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
-uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane);         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane);         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane);         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane);         //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane);         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane);         //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane);         //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane);         //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane);         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane);         //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane);         //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane);         // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane);         // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane);         //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane);         //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane);         //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);         // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane);         // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);         //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane);         // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
 //Store N-element structure to memory
-void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val);         // VST2.8 {d0, d2}, [r0]
-void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
-void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val);         // VST2.32 {d0, d2}, [r0]
-void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);         // VST2.8 {d0, d2}, [r0]
-void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
-void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);         // VST2.32 {d0, d2}, [r0]
-void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
-void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val);         // VST2.32 {d0, d2}, [r0]
-void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);         // VST2.8 {d0, d2}, [r0]
-void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
-void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val);         // VST2.8 {d0, d1}, [r0]
-void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val);         // VST2.16 {d0, d1}, [r0]
-void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
-void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);         // VST1.64 {d0, d1}, [r0]
-void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val);         // VST2.8 {d0, d1}, [r0]
-void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val);         // VST2.16 {d0, d1}, [r0]
-void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
-void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val);         // VST1.64 {d0, d1}, [r0]
+void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
+void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
+void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
+void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
+void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
+void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
+void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
+void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
+void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
+void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
+void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
 //void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
-void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
-void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val);         // VST2.8 {d0, d1}, [r0]
-void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val);         // VST2.16 {d0, d1}, [r0]
-void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val);         // VST3.8 {d0, d2, d4}, [r0]
-void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
-void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val);         // VST3.32 {d0, d2, d4}, [r0]
-void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);         // VST3.8 {d0, d2, d4}, [r0]
-void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
-void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);         // VST3.32 {d0, d2, d4}, [r0]
-void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
-void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val);         // VST3.32 {d0, d2, d4}, [r0]
-void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);         // VST3.8 {d0, d2, d4}, [r0]
-void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
-void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val);         // VST3.8 {d0, d1, d2}, [r0]
-void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
-void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val);         // VST3.32 {d0, d1, d2}, [r0]
-void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val);         // VST1.64 {d0, d1, d2}, [r0]
-void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val);         // VST3.8 {d0, d1, d2}, [r0]
-void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
-void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val);         // VST3.32 {d0, d1, d2}, [r0]
-void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val);         // VST1.64 {d0, d1, d2}, [r0]
-void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
-void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val);         // VST3.32 {d0, d1, d2}, [r0]
-void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);         // VST3.8 {d0, d1, d2}, [r0]
-void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
-void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val);         // VST4.8 {d0, d2, d4, d6}, [r0]
-void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
-void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val);         // VST4.32 {d0, d2, d4, d6}, [r0]
-void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);         // VST4.8 {d0, d2, d4, d6}, [r0]
-void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
-void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);         // VST4.32 {d0, d2, d4, d6}, [r0]
-void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
-void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val);         // VST4.32 {d0, d2, d4, d6}, [r0]
-void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);         // VST4.8 {d0, d2, d4, d6}, [r0]
-void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
-void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val);         // VST4.8 {d0, d1, d2, d3}, [r0]
-void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
-void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val);         // VST4.32 {d0, d1, d2, d3}, [r0]
-void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val);         // VST1.64 {d0, d1, d2, d3}, [r0]
-void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val);         // VST4.8 {d0, d1, d2, d3}, [r0]
-void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
-void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val);         // VST4.32 {d0, d1, d2, d3}, [r0]
-void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);         // VST1.64 {d0, d1, d2, d3}, [r0]
-void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
-void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val);         // VST4.32 {d0, d1, d2, d3}, [r0]
-void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val);         // VST4.8 {d0, d1, d2, d3}, [r0]
-void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
+void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
+void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
+void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
+void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
+void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
+void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
+void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
+void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
+void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
+void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
+void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
+void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
+void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
+void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
+void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
+void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
+void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
+void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
+void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
+void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
+void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
+void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
+void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
+void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
 //Store a single lane of N-element structure to memory
-void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
-void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane);         // VST2.32{d0[0], d2[0]}, [r0]
-void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
-void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);         // VST2.32{d0[0], d2[0]}, [r0]
-void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
-void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane);         //VST2.32 {d0[0], d2[0]}, [r0]
-void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
-void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane);         // VST2.8{d0[0], d1[0]}, [r0]
-void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
-void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32{d0[0], d1[0]}, [r0]
-void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);         // VST2.8 {d0[0],d1[0]}, [r0]
-void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
-void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32{d0[0], d1[0]}, [r0]
-void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
-void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32{d0[0], d1[0]}, [r0]
-void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane);         // VST2.8{d0[0], d1[0]}, [r0]
-void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
-void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane);         // VST3.32{d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);         // VST3.32{d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane);         //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
-void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
-void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane);         // VST3.8{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane);         // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);         // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
-void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane);         // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);         // VST3.32{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);         // VST3.8{d0[0], d1[0], d2[0]}, [r0]
-void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
-void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane);         // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);         // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane);         //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
-void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane);         // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane);         // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane);         // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane);         // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane);         // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);         // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
-void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
+void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
+void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
+void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
 //Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
-
-uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
-uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.U16 r0, d0[0]
-uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
-int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane);         // VMOV.S8 r0, d0[0]
-int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane);         // VMOV.S16 r0, d0[0]
-int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
-poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
-poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.U16 r0, d0[0]
-float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
-
-int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
-uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
 //Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
-
-uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
-uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
-uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
-int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
-int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
-int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
-poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
-poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
-float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
-
-int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
-uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
 //Initialize a vector from a literal bit pattern.
-
+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
 //Set all lanes to same value
 //Load all lanes of vector to the same literal value
-
-uint8x16_t vdupq_n_u8(uint8_t value);         // VDUP.8 q0,r0
-uint16x8_t vdupq_n_u16(uint16_t value);         // VDUP.16 q0,r0
-uint32x4_t vdupq_n_u32(uint32_t value);         // VDUP.32 q0,r0
-int8x16_t vdupq_n_s8(int8_t value);         // VDUP.8 q0,r0
-int16x8_t vdupq_n_s16(int16_t value);         // VDUP.16 q0,r0
-int32x4_t vdupq_n_s32(int32_t value);         // VDUP.32 q0,r0
-poly8x16_t vdupq_n_p8(poly8_t value);         // VDUP.8 q0,r0
-poly16x8_t vdupq_n_p16(poly16_t value);         // VDUP.16 q0,r0
-float32x4_t vdupq_n_f32(float32_t value);         // VDUP.32 q0,r0
-
-int64x2_t vdupq_n_s64(int64_t value);         // VMOV d0,r0,r0
-uint64x2_t vdupq_n_u64(uint64_t value);         // VMOV d0,r0,r0
-
-uint8x16_t vmovq_n_u8(uint8_t value);         // VDUP.8 q0,r0
-uint16x8_t vmovq_n_u16(uint16_t value);         // VDUP.16 q0,r0
-uint32x4_t vmovq_n_u32(uint32_t value);         // VDUP.32 q0,r0
-int8x16_t vmovq_n_s8(int8_t value);         // VDUP.8 q0,r0
-int16x8_t vmovq_n_s16(int16_t value);         // VDUP.16 q0,r0
-int32x4_t vmovq_n_s32(int32_t value);         // VDUP.32 q0,r0
-poly8x16_t vmovq_n_p8(poly8_t value);         // VDUP.8 q0,r0
-poly16x8_t vmovq_n_p16(poly16_t value);         // VDUP.16 q0,r0
-float32x4_t vmovq_n_f32(float32_t value);         // VDUP.32 q0,r0
-
-int64x2_t vmovq_n_s64(int64_t value);         // VMOV d0,r0,r0
-uint64x2_t vmovq_n_u64(uint64_t value);         // VMOV d0,r0,r0
+uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
+uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
+uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
+int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
+int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
+int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
+uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
+uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
+uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
+int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
+int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
+int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
+int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
+uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
+int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
 //Load all lanes of the vector to the value of a lane of a vector
-
+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
 //Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector.
-
+int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
+int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
+int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
+int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
+uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
+uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
+uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
+poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
+poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
 //Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors
-
+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
 //Converting vectors. These intrinsics are used to convert vectors.
 //Convert from float
-
-int32x4_t vcvtq_s32_f32(float32x4_t a);         // VCVT.S32.F32 q0, q0
-uint32x4_t vcvtq_u32_f32(float32x4_t a);         // VCVT.U32.F32 q0, q0
-
-int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.S32.F32 q0, q0, #32
-uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.U32.F32 q0, q0, #32
+int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
+int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
 //Convert to float
-
-float32x4_t vcvtq_f32_s32(int32x4_t a);         // VCVT.F32.S32 q0, q0
-float32x4_t vcvtq_f32_u32(uint32x4_t a);         // VCVT.F32.U32 q0, q0
-
-float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b);         // VCVT.F32.S32 q0, q0, #32
-float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b);         // VCVT.F32.U32 q0, q0, #32
+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
+float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
 //Convert between floats
-
+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
 //Vector narrow integer
-
+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
 //Vector long move
-
+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
 //Vector saturating narrow integer
-
+int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
+int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
+uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
+uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
+uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
 //Vector saturating narrow integer signed->unsigned
-
+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
 //Table look up
-
+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+int8x8_t vtbl2_s8_ptr(int8x8x2_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+poly8x8_t vtbl2_p8_ptr(poly8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+uint8x8_t vtbl3_u8_ptr(uint8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+int8x8_t vtbl3_s8_ptr(int8x8x3_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+uint8x8_t vtbl4_u8_ptr(uint8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+int8x8_t vtbl4_s8_ptr(int8x8x4_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
 //Extended table look up intrinsics
-
+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+int8x8_t vtbx2_s8_ptr(int8x8_t a, int8x8x2_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+poly8x8_t vtbx2_p8_ptr(poly8x8_t a, poly8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
 //Operations with a scalar value
 //Vector multiply accumulate with scalar
-
+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
+uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
+uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
 //Vector widening multiply accumulate with scalar
-
+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
 //Vector widening saturating doubling multiply accumulate with scalar
-
+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
 //Vector multiply subtract with scalar
-
+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
 //Vector widening multiply subtract with scalar
-
+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
 //Vector widening saturating doubling multiply subtract with scalar
-
+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
 //Vector multiply by scalar
-
-int16x8_t vmulq_n_s16(int16x8_t a, int16_t b);         // VMUL.I16 q0,q0,d0[0]
-int32x4_t vmulq_n_s32(int32x4_t a, int32_t b);         // VMUL.I32 q0,q0,d0[0]
-float32x4_t vmulq_n_f32(float32x4_t a, float32_t b);         // VMUL.F32 q0,q0,d0[0]
-uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b);         // VMUL.I16 q0,q0,d0[0]
-uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b);         // VMUL.I32 q0,q0,d0[0]
+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
 //Vector long multiply with scalar
-
+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
 //Vector long multiply by scalar
-
+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
 //Vector saturating doubling long multiply with scalar
-
+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
 //Vector saturating doubling long multiply by scalar
-
+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
 //Vector saturating doubling multiply high with scalar
-
-int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2);         // VQDMULH.S16 q0,q0,d0[0]
-int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2);         // VQDMULH.S32 q0,q0,d0[0]
+int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
+int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
 //Vector saturating doubling multiply high by scalar
-
+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
 //Vector saturating rounding doubling multiply high with scalar
-
-int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2);         // VQRDMULH.S16 q0,q0,d0[0]
-int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2);         // VQRDMULH.S32 q0,q0,d0[0]
+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
 //Vector rounding saturating doubling multiply high by scalar
-
+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
 //Vector multiply accumulate with scalar
-
-int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLA.I16 q0, q0, d0[0]
-int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLA.I32 q0, q0, d0[0]
-uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLA.I16 q0, q0, d0[0]
-uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLA.I32 q0, q0, d0[0]
-float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLA.F32 q0, q0, d0[0]
+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
+uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
+uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
 //Vector widening multiply accumulate with scalar
-
+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
 //Vector widening saturating doubling multiply accumulate with scalar
-
+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
 //Vector multiply subtract with scalar
-
-int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLS.I16 q0, q0, d0[0]
-int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLS.I32 q0, q0, d0[0]
-uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLS.I16 q0, q0, d0[0]
-uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLS.I32 q0, q0, d0[0]
-float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLS.F32 q0, q0, d0[0]
+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
 //Vector widening multiply subtract with scalar
-
+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
+uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
 //Vector widening saturating doubling multiply subtract with scalar
-
+int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
 //Vector extract
-
-int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
-uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
-poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
-int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
-uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
-poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
-int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
-uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
-int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
-uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
+float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) float c); // VEXT.32 q0,q0,q0,#0
 //Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
-
-int8x16_t vrev64q_s8(int8x16_t vec);         // VREV64.8 q0,q0
-int16x8_t vrev64q_s16(int16x8_t vec);         // VREV64.16 q0,q0
-int32x4_t vrev64q_s32(int32x4_t vec);         // VREV64.32 q0,q0
-uint8x16_t vrev64q_u8(uint8x16_t vec);         // VREV64.8 q0,q0
-uint16x8_t vrev64q_u16(uint16x8_t vec);         // VREV64.16 q0,q0
-uint32x4_t vrev64q_u32(uint32x4_t vec);         // VREV64.32 q0,q0
-poly8x16_t vrev64q_p8(poly8x16_t vec);         // VREV64.8 q0,q0
-poly16x8_t vrev64q_p16(poly16x8_t vec);         // VREV64.16 q0,q0
-float32x4_t vrev64q_f32(float32x4_t vec);         // VREV64.32 q0,q0
-
-int8x16_t vrev32q_s8(int8x16_t vec);         // VREV32.8 q0,q0
-int16x8_t vrev32q_s16(int16x8_t vec);         // VREV32.16 q0,q0
-uint8x16_t vrev32q_u8(uint8x16_t vec);         // VREV32.8 q0,q0
-uint16x8_t vrev32q_u16(uint16x8_t vec);         // VREV32.16 q0,q0
-poly8x16_t vrev32q_p8(poly8x16_t vec);         // VREV32.8 q0,q0
-
-int8x16_t vrev16q_s8(int8x16_t vec);         // VREV16.8 q0,q0
-uint8x16_t vrev16q_u8(uint8x16_t vec);         // VREV16.8 q0,q0
-poly8x16_t vrev16q_p8(poly8x16_t vec);         // VREV16.8 q0,q0
+int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
+int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
+int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
+uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
+uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
+uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
+poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
+poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
+float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
+int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
+int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
+int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
+uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
+uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
+uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
+poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
+poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
+float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
+int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
+int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
+uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
+uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
+poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
+poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
+int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
+int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
+uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
+uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
+poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
+poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
+int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
+uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
+poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
+int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
+uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
+poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
 //Other single operand arithmetic
 //Absolute: Vd[i] = |Va[i]|
-
-int8x16_t vabsq_s8(int8x16_t a);         // VABS.S8 q0,q0
-int16x8_t vabsq_s16(int16x8_t a);         // VABS.S16 q0,q0
-int32x4_t vabsq_s32(int32x4_t a);         // VABS.S32 q0,q0
-float32x4_t vabsq_f32(float32x4_t a);         // VABS.F32 q0,q0
+int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
+int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
+int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
+float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
+int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
+int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
+int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
+float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
 //Saturating absolute: Vd[i] = sat(|Va[i]|)
-
-int8x16_t vqabsq_s8(int8x16_t a);         // VQABS.S8 q0,q0
-int16x8_t vqabsq_s16(int16x8_t a);         // VQABS.S16 q0,q0
-int32x4_t vqabsq_s32(int32x4_t a);         // VQABS.S32 q0,q0
+int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
+int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
+int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
+int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
+int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
+int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
 //Negate: Vd[i] = - Va[i]
-
-int8x16_t vnegq_s8(int8x16_t a);         // VNE//q0,q0
-int16x8_t vnegq_s16(int16x8_t a);         // VNE//q0,q0
-int32x4_t vnegq_s32(int32x4_t a);         // VNE//q0,q0
-float32x4_t vnegq_f32(float32x4_t a);         // VNE//q0,q0
+int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0
+int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0
+int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0
+float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0
+int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0
+int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0
+int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0
+float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0
 //Saturating Negate: sat(Vd[i] = - Va[i])
-
-int8x16_t vqnegq_s8(int8x16_t a);         // VQNE//q0,q0
-int16x8_t vqnegq_s16(int16x8_t a);         // VQNE//q0,q0
-int32x4_t vqnegq_s32(int32x4_t a);         // VQNE//q0,q0
+int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0
+int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0
+int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0
+int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0
+int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0
+int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0
 //Count leading sign bits
-
-int8x16_t vclsq_s8(int8x16_t a);         // VCLS.S8 q0,q0
-int16x8_t vclsq_s16(int16x8_t a);         // VCLS.S16 q0,q0
-int32x4_t vclsq_s32(int32x4_t a);         // VCLS.S32 q0,q0
+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
 //Count leading zeros
-
-int8x16_t vclzq_s8(int8x16_t a);         // VCLZ.I8 q0,q0
-int16x8_t vclzq_s16(int16x8_t a);         // VCLZ.I16 q0,q0
-int32x4_t vclzq_s32(int32x4_t a);         // VCLZ.I32 q0,q0
-uint8x16_t vclzq_u8(uint8x16_t a);         // VCLZ.I8 q0,q0
-uint16x8_t vclzq_u16(uint16x8_t a);         // VCLZ.I16 q0,q0
-uint32x4_t vclzq_u32(uint32x4_t a);         // VCLZ.I32 q0,q0
+int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
+int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
+int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
+uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
+uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
+uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
+int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
+int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
+int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
 //Count number of set bits
-
-uint8x16_t vcntq_u8(uint8x16_t a);         // VCNT.8 q0,q0
-int8x16_t vcntq_s8(int8x16_t a);         // VCNT.8 q0,q0
-poly8x16_t vcntq_p8(poly8x16_t a);         // VCNT.8 q0,q0
+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
 //Reciprocal estimate
-
-float32x4_t vrecpeq_f32(float32x4_t a);         // VRECPE.F32 q0,q0
-uint32x4_t vrecpeq_u32(uint32x4_t a);         // VRECPE.U32 q0,q0
+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
 //Reciprocal square root estimate
-
-float32x4_t vrsqrteq_f32(float32x4_t a);         // VRSQRTE.F32 q0,q0
-uint32x4_t vrsqrteq_u32(uint32x4_t a);         // VRSQRTE.U32 q0,q0
+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
 //Logical operations
 //Bitwise not
-
-int8x16_t vmvnq_s8(int8x16_t a);         // VMVN q0,q0
-int16x8_t vmvnq_s16(int16x8_t a);         // VMVN q0,q0
-int32x4_t vmvnq_s32(int32x4_t a);         // VMVN q0,q0
-uint8x16_t vmvnq_u8(uint8x16_t a);         // VMVN q0,q0
-uint16x8_t vmvnq_u16(uint16x8_t a);         // VMVN q0,q0
-uint32x4_t vmvnq_u32(uint32x4_t a);         // VMVN q0,q0
-poly8x16_t vmvnq_p8(poly8x16_t a);         // VMVN q0,q0
+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
 //Bitwise and
-
-int8x16_t vandq_s8(int8x16_t a, int8x16_t b);         // VAND q0,q0,q0
-int16x8_t vandq_s16(int16x8_t a, int16x8_t b);         // VAND q0,q0,q0
-int32x4_t vandq_s32(int32x4_t a, int32x4_t b);         // VAND q0,q0,q0
-int64x2_t vandq_s64(int64x2_t a, int64x2_t b);         // VAND q0,q0,q0
-uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b);         // VAND q0,q0,q0
-uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b);         // VAND q0,q0,q0
-uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b);         // VAND q0,q0,q0
-uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b);         // VAND q0,q0,q0
+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
+int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
+uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
+int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
+int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
+int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
+int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
 //Bitwise or
-
-int8x16_t vorrq_s8(int8x16_t a, int8x16_t b);         // VORR q0,q0,q0
-int16x8_t vorrq_s16(int16x8_t a, int16x8_t b);         // VORR q0,q0,q0
-int32x4_t vorrq_s32(int32x4_t a, int32x4_t b);         // VORR q0,q0,q0
-int64x2_t vorrq_s64(int64x2_t a, int64x2_t b);         // VORR q0,q0,q0
-uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b);         // VORR q0,q0,q0
-uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b);         // VORR q0,q0,q0
-uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b);         // VORR q0,q0,q0
-uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b);         // VORR q0,q0,q0
+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
+int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
+uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
 //Bitwise exclusive or (EOR or XOR)
-
-int8x16_t veorq_s8(int8x16_t a, int8x16_t b);         // VEOR q0,q0,q0
-int16x8_t veorq_s16(int16x8_t a, int16x8_t b);         // VEOR q0,q0,q0
-int32x4_t veorq_s32(int32x4_t a, int32x4_t b);         // VEOR q0,q0,q0
-int64x2_t veorq_s64(int64x2_t a, int64x2_t b);         // VEOR q0,q0,q0
-uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b);         // VEOR q0,q0,q0
-uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b);         // VEOR q0,q0,q0
-uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b);         // VEOR q0,q0,q0
-uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b);         // VEOR q0,q0,q0
+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
+int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
+uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
+int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
+int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
+int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
+int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
 //Bit Clear
-
-int8x16_t vbicq_s8(int8x16_t a, int8x16_t b);         // VBIC q0,q0,q0
-int16x8_t vbicq_s16(int16x8_t a, int16x8_t b);         // VBIC q0,q0,q0
-int32x4_t vbicq_s32(int32x4_t a, int32x4_t b);         // VBIC q0,q0,q0
-int64x2_t vbicq_s64(int64x2_t a, int64x2_t b);         // VBIC q0,q0,q0
-uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b);         // VBIC q0,q0,q0
-uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b);         // VBIC q0,q0,q0
-uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b);         // VBIC q0,q0,q0
-uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b);         // VBIC q0,q0,q0
+int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
+int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
+int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
+int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
+uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
+uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
+uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
+uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
 //Bitwise OR complement
-
-int8x16_t vornq_s8(int8x16_t a, int8x16_t b);         // VORN q0,q0,q0
-int16x8_t vornq_s16(int16x8_t a, int16x8_t b);         // VORN q0,q0,q0
-int32x4_t vornq_s32(int32x4_t a, int32x4_t b);         // VORN q0,q0,q0
-int64x2_t vornq_s64(int64x2_t a, int64x2_t b);         // VORN q0,q0,q0
-uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b);         // VORN q0,q0,q0
-uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b);         // VORN q0,q0,q0
-uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b);         // VORN q0,q0,q0
-uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b);         // VORN q0,q0,q0
+int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
+int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
+int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
+uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
+uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
+uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
 //Bitwise Select
-
-int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c);         // VBSL q0,q0,q0
-int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c);         // VBSL q0,q0,q0
-int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c);         // VBSL q0,q0,q0
-int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c);         // VBSL q0,q0,q0
-uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VBSL q0,q0,q0
-uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VBSL q0,q0,q0
-uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VBSL q0,q0,q0
-uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c);         // VBSL q0,q0,q0
-float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c);         // VBSL q0,q0,q0
-poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c);         // VBSL q0,q0,q0
-poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c);         // VBSL q0,q0,q0
+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
+uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
+uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
+uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
 //Transposition operations
 //Transpose elements
-
-int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b);         // VTRN.8 q0,q0
-int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b);         // VTRN.16 q0,q0
-int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b);         // VTRN.32 q0,q0
-uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b);         // VTRN.8 q0,q0
-uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b);         // VTRN.16 q0,q0
-uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b);         // VTRN.32 q0,q0
-float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b);         // VTRN.32 q0,q0
-poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b);         // VTRN.8 q0,q0
-poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b);         // VTRN.16 q0,q0
+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
+int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
 //Interleave elements
-
-int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b);         // VZIP.8 q0,q0
-int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b);         // VZIP.16 q0,q0
-int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b);         // VZIP.32 q0,q0
-uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b);         // VZIP.8 q0,q0
-uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b);         // VZIP.16 q0,q0
-uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b);         // VZIP.32 q0,q0
-float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b);         // VZIP.32 q0,q0
-poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b);         // VZIP.8 q0,q0
-poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b);         // VZIP.16 q0,q0
+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
 //De-Interleave elements
+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
 
-int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b);         // VUZP.8 q0,q0
-int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b);         // VUZP.16 q0,q0
-int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b);         // VUZP.32 q0,q0
-uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b);         // VUZP.8 q0,q0
-uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b);         // VUZP.16 q0,q0
-uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b);         // VUZP.32 q0,q0
-float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b);         // VUZP.32 q0,q0
-poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b);         // VUZP.8 q0,q0
-poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b);         // VUZP.16 q0,q0
 
 //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 // the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics. While for release build it is not a must,
 //for debug build we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal
 //
-#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined __INTEL_COMPILER )&& defined NDEBUG     //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers.
+#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined (__INTEL_COMPILER) )&& defined NDEBUG     //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers.
 
-    #if defined(USE_SSSE3)
-        #define _MM_ALIGNR_EPI8 _mm_alignr_epi8
-    #endif
+    #define _MM_ALIGNR_EPI8 _mm_alignr_epi8
 
     #define _MM_EXTRACT_EPI16  _mm_extract_epi16
     #define _MM_INSERT_EPI16 _mm_insert_epi16
-    #ifdef USE_SSE4
+#ifdef USE_SSE4
         #define _MM_EXTRACT_EPI8  _mm_extract_epi8
         #define _MM_EXTRACT_EPI32  _mm_extract_epi32
         #define _MM_EXTRACT_PS  _mm_extract_ps
@@ -1437,11 +2259,11 @@
         #define _MM_INSERT_EPI8  _mm_insert_epi8
         #define _MM_INSERT_EPI32 _mm_insert_epi32
         #define _MM_INSERT_PS    _mm_insert_ps
-    #ifdef  _M_X64
+#ifdef  _NEON2SSE_64BIT
             #define _MM_INSERT_EPI64 _mm_insert_epi64
             #define _MM_EXTRACT_EPI64 _mm_extract_epi64
-    #endif
-    #endif     //SSE4
+#endif
+#endif //SSE4
 #else
     #define _NEON2SSE_COMMA ,
     #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
@@ -1490,12 +2312,10 @@
         default:     return NAME(vec p,case0); \
         }
 
-    #if defined(USE_SSSE3)
     _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
     {
         _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
     }
-    #endif
 
     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
     {
@@ -1507,7 +2327,7 @@
         _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
     }
 
-    #ifdef USE_SSE4
+#ifdef USE_SSE4
         _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
         {
             _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
@@ -1532,11 +2352,12 @@
         {
             _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
         }
-    #ifdef  _M_X64
+
+#ifdef  _NEON2SSE_64BIT
+            //the special case of functions available only for SSE4 and 64-bit build.
             _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
             {
-                switch(LANE)
-                {
+                switch(LANE) {
                 case 0:
                     return _mm_insert_epi64(vec,  p, 0);
                 case 1:
@@ -1551,13 +2372,14 @@
                 if (LANE ==0) return _mm_extract_epi64(val, 0);
                 else return _mm_extract_epi64(val, 1);
             }
-    #endif
+#endif
+
         _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
         {
             _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
         }
 
-    #endif     //USE_SSE4
+#endif //USE_SSE4
 
 #endif     //#ifdef NDEBUG
 
@@ -1589,6 +2411,8 @@
 
     #define _MM_MULLO_EPI32 _mm_mullo_epi32
     #define _MM_MUL_EPI32  _mm_mul_epi32
+
+    #define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
 #else     //no SSE4 !!!!!!
     _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
     {
@@ -1657,8 +2481,8 @@
         __m128i vec_masked, p_masked;
         pvec[LANE] = p;
         mask[LANE] = 0x0;
-        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec);         //ready for p
-        p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec);         //ready for vec
+        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+        p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
         return _mm_or_si128(vec_masked, p_masked);
     }
 
@@ -1669,8 +2493,8 @@
         __m128i vec_masked, p_masked;
         pvec[LANE] = (int8_t)p;
         mask[LANE] = 0x0;
-        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec);         //ready for p
-        p_masked = _mm_andnot_si128  (*(__m128i*)mask,*(__m128i*)pvec);         //ready for vec
+        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+        p_masked = _mm_andnot_si128  (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
         return _mm_or_si128(vec_masked, p_masked);
     }
 
@@ -1678,9 +2502,9 @@
     {
         _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
         __m128 tmp, vec_masked, p_masked;
-        mask[LANE >> 4] = 0x0;         //here the LANE is not actural lane, need to deal with it
-        vec_masked = _mm_and_ps (*(__m128*)mask,vec);         //ready for p
-        p_masked = _mm_andnot_ps (*(__m128*)mask, p);         //ready for vec
+        mask[LANE >> 4] = 0x0; //here the LANE is not actural lane, need to deal with it
+        vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
+        p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
         tmp = _mm_or_ps(vec_masked, p_masked);
         return tmp;
     }
@@ -1706,11 +2530,11 @@
     _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
     {
         __m128i c8000, b_s, a_s, cmp;
-        c8000 = _mm_cmpeq_epi16 (a,a);         //0xffff
-        c8000 = _mm_slli_epi16 (c8000, 15);         //0x8000
+        c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
+        c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
         b_s = _mm_sub_epi16 (b, c8000);
         a_s = _mm_sub_epi16 (a, c8000);
-        cmp = _mm_cmpgt_epi16 (a_s, b_s);         //no unsigned comparison, need to go to signed
+        cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
         a_s = _mm_and_si128 (cmp,a);
         b_s = _mm_andnot_si128 (cmp,b);
         return _mm_or_si128(a_s, b_s);
@@ -1719,11 +2543,11 @@
     _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
     {
         __m128i c80000000, b_s, a_s, cmp;
-        c80000000 = _mm_cmpeq_epi32 (a,a);         //0xffffffff
-        c80000000 = _mm_slli_epi32 (c80000000, 31);         //0x80000000
+        c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
+        c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
         b_s = _mm_sub_epi32 (b, c80000000);
         a_s = _mm_sub_epi32 (a, c80000000);
-        cmp = _mm_cmpgt_epi32 (a_s, b_s);         //no unsigned comparison, need to go to signed
+        cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
         a_s = _mm_and_si128 (cmp,a);
         b_s = _mm_andnot_si128 (cmp,b);
         return _mm_or_si128(a_s, b_s);
@@ -1750,11 +2574,11 @@
     _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
     {
         __m128i c8000, b_s, a_s, cmp;
-        c8000 = _mm_cmpeq_epi16 (a,a);         //0xffff
-        c8000 = _mm_slli_epi16 (c8000, 15);         //0x8000
+        c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
+        c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
         b_s = _mm_sub_epi16 (b, c8000);
         a_s = _mm_sub_epi16 (a, c8000);
-        cmp = _mm_cmpgt_epi16 (b_s, a_s);         //no unsigned comparison, need to go to signed
+        cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
         a_s = _mm_and_si128 (cmp,a);
         b_s = _mm_andnot_si128 (cmp,b);
         return _mm_or_si128(a_s, b_s);
@@ -1763,25 +2587,25 @@
     _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
     {
         __m128i c80000000, b_s, a_s, cmp;
-        c80000000 = _mm_cmpeq_epi32 (a,a);         //0xffffffff
-        c80000000 = _mm_slli_epi32 (c80000000, 31);         //0x80000000
+        c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
+        c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
         b_s = _mm_sub_epi32 (b, c80000000);
         a_s = _mm_sub_epi32 (a, c80000000);
-        cmp = _mm_cmpgt_epi32 (b_s, a_s);         //no unsigned comparison, need to go to signed
+        cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
         a_s = _mm_and_si128 (cmp,a);
         b_s = _mm_andnot_si128 (cmp,b);
         return _mm_or_si128(a_s, b_s);
     }
 
-    _NEON2SSE_INLINE __m128i  _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask)         //this is NOT exact implementation of _mm_blendv_epi8  !!!!! - please see below
-    {         //it assumes mask is either 0xff or 0  always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters.
+    _NEON2SSE_INLINE __m128i  _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8  !!!!! - please see below
+    {
+        //it assumes mask is either 0xff or 0  always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters.
         __m128i a_masked, b_masked;
-        b_masked = _mm_and_si128 (mask,b);         //use b if mask 0xff
+        b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
         a_masked = _mm_andnot_si128 (mask,a);
         return _mm_or_si128(a_masked, b_masked);
     }
 
-    #if defined(USE_SSSE3)
     _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
     {
         _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9,  12,13,  2,3, 6,7,10,11,14,15};
@@ -1789,29 +2613,27 @@
         zero = _mm_setzero_si128();
         a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
         b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
-        res = _mm_unpacklo_epi64(a16, b16);         //result without saturation
-        reshi = _mm_unpackhi_epi64(a16, b16);         //hi part of result used for saturation
-        cmp = _mm_cmpgt_epi16(zero, reshi);         //if cmp<0 the result should be zero
-        res = _mm_andnot_si128(cmp,res);         //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
-        cmp = _mm_cmpgt_epi16(reshi,zero);         //if cmp positive
-        return _mm_or_si128(res, cmp);         //if cmp positive we are out of 16bits need to saturaate to 0xffff
+        res = _mm_unpacklo_epi64(a16, b16); //result without saturation
+        reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
+        cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
+        res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
+        cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
+        return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff
     }
-    #endif
 
-    #if defined(USE_SSSE3)
     _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
     {
         _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9,  12,13,  2,3, 6,7,10,11,14,15};
         __m128i a16, res, reshi,cmp, zero;
         zero = _mm_setzero_si128();
         a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
-        reshi = _mm_unpackhi_epi64(a16, a16);         //hi part of result used for saturation
-        cmp = _mm_cmpgt_epi16(zero, reshi);         //if cmp<0 the result should be zero
-        res = _mm_andnot_si128(cmp, a16);         //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
-        cmp = _mm_cmpgt_epi16(reshi,zero);         //if cmp positive
-        return _mm_or_si128(res, cmp);         //if cmp positive we are out of 16bits need to saturaate to 0xffff
+        reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
+        cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
+        res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
+        cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive
+        return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff
     }
-    #endif
+
 
     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
     {
@@ -1827,61 +2649,71 @@
         return _mm_load_si128((__m128i*)res);
     }
 
-    #if defined(USE_SSSE3)
     _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
     {
         __m128i sign, zero,  mul_us, a_neg, b_neg, mul_us_neg;
         sign = _mm_xor_si128 (a, b);
-        sign =  _mm_srai_epi32 (sign, 31);         //promote sign bit to all fields, all fff if negative and all 0 if positive
+        sign =  _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all fff if negative and all 0 if positive
         zero = _mm_setzero_si128();
-        a_neg = _mm_abs_epi32 (a);         //negate a and b
-        b_neg = _mm_abs_epi32 (b);         //negate a and b
-        mul_us = _mm_mul_epu32 (a_neg, b_neg);         //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
+        a_neg = _mm_abs_epi32 (a); //negate a and b
+        b_neg = _mm_abs_epi32 (b); //negate a and b
+        mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
         mul_us_neg = _mm_sub_epi64(zero, mul_us);
         mul_us_neg = _mm_and_si128(sign, mul_us_neg);
         mul_us = _mm_andnot_si128(sign, mul_us);
         return _mm_or_si128 (mul_us, mul_us_neg);
     }
-    #endif
+
+    _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
+    {
+        __m128i res;
+        res = _mm_cmpeq_epi32 (a, b);
+        return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data
+    }
 #endif     //SSE4
 
-#ifndef _MM_INSERT_EPI64     //special case of SSE4 and  _M_X64
-    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
-    {
-        _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
-        _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff,0xffffffffffffffff};
-        __m128i vec_masked, p_masked;
-        pvec[LANE] = p;
-        mask[LANE] = 0x0;
-        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec);         //ready for p
-        p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec);         //ready for vec
-        return _mm_or_si128(vec_masked, p_masked);
-    }
-#endif
-#ifndef _MM_EXTRACT_EPI64     //special case of SSE4 and  _M_X64
-    _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
-    {
-        _NEON2SSE_ALIGN_16 int64_t tmp[2];
-        _mm_store_si128((__m128i*)tmp, val);
-        return tmp[LANE];
-    }
+//the special case of functions working only for 32 bits, no SSE4
+_NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
+{
+    _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
+    _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
+    __m128i vec_masked, p_masked;
+    pvec[LANE] = p;
+    mask[LANE] = 0x0;
+    vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
+    p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
+    return _mm_or_si128(vec_masked, p_masked);
+}
+
+_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
+{
+    _NEON2SSE_ALIGN_16 int64_t tmp[2];
+    _mm_store_si128((__m128i*)tmp, val);
+    return tmp[LANE];
+}
+
+#ifndef _NEON2SSE_64BIT_SSE4
+    #define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
+    #define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
 #endif
 
-int32x4_t  vqd_s32(int32x4_t a);         //Doubling saturation for signed ints
+int32x4_t  vqd_s32(int32x4_t a); //Doubling saturation for signed ints
 _NEON2SSE_INLINE int32x4_t  vqd_s32(int32x4_t a)
-{         //Overflow happens only if a and sum have the opposite signs
+{
+    //Overflow happens only if a and sum have the opposite signs
     __m128i c7fffffff, res, res_sat, res_xor_a;
     c7fffffff = _mm_set1_epi32(0x7fffffff);
-    res = _mm_slli_epi32 (a, 1);         // res = a*2
+    res = _mm_slli_epi32 (a, 1); // res = a*2
     res_sat = _mm_srli_epi32(a, 31);
     res_sat = _mm_add_epi32(res_sat, c7fffffff);
     res_xor_a = _mm_xor_si128(res, a);
-    res_xor_a = _mm_srai_epi32(res_xor_a,31);         //propagate the sigh bit, all ffff if <0 all ones otherwise
+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
     res_sat = _mm_and_si128(res_xor_a, res_sat);
     res = _mm_andnot_si128(res_xor_a, res);
     return _mm_or_si128(res, res_sat);
 }
 
+
 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 //*************************************************************************
 //*************************************************************************
@@ -1894,6 +2726,7 @@
 #ifdef ARM
 #define vector_addq_s32 _mm_add_epi32
 #else //if we have IA
+#define vector_addq_s32 vadd_s32
 #endif
 
 ********************************************************************************************
@@ -1904,54 +2737,264 @@
 2.  x86 implementation using more than one x86 intrinsics. In this case it is shaped as inlined C function with return statement
 3.  the reference to the NEON function returning the same result and implemented in x86 as above. In this case it is shaped as matching NEON function definition
 4.  for about 5% of functions due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance
-the serial implementation is provided along with the corresponding compiler warnin//these functions are on your app critical path
+the serial implementation is provided along with the corresponding compiler warning. If these functions are on your app critical path
 - please consider such functions removal from your code.
 */
 
 //***********************************************************************
 //************************      Vector add   *****************************
 //***********************************************************************
+int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
+}
 
-int8x16_t   vaddq_s8(int8x16_t a, int8x16_t b);         // VADD.I8 q0,q0,q0
+
+int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSE_INLINE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res64;
+    res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
+    return res64;
+}
+
+
+float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
+{
+    __m128 res;
+    __m64_128 res64;
+    res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
+    _M64f(res64, res);
+    return res64;
+}
+
+uint8x8_t  vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
+#define vadd_u8 vadd_s8
+
+uint16x4_t  vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
+#define vadd_u16 vadd_s16
+
+uint32x2_t  vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
+#define vadd_u32 vadd_s32
+
+uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b); // VADD.I64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b)
+{
+    uint64x1_t res64;
+    res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
+    return res64;
+}
+
+
+int8x16_t   vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
 #define vaddq_s8 _mm_add_epi8
 
-int16x8_t   vaddq_s16(int16x8_t a, int16x8_t b);         // VADD.I16 q0,q0,q0
+int16x8_t   vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
 #define vaddq_s16 _mm_add_epi16
 
-int32x4_t   vaddq_s32(int32x4_t a, int32x4_t b);         // VADD.I32 q0,q0,q0
+int32x4_t   vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
 #define vaddq_s32 _mm_add_epi32
 
-int64x2_t   vaddq_s64(int64x2_t a, int64x2_t b);         // VADD.I64 q0,q0,q0
+int64x2_t   vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
 #define vaddq_s64 _mm_add_epi64
 
-float32x4_t vaddq_f32(float32x4_t a, float32x4_t b);         // VADD.F32 q0,q0,q0
+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
 #define vaddq_f32 _mm_add_ps
 
-uint8x16_t   vaddq_u8(uint8x16_t a, uint8x16_t b);         // VADD.I8 q0,q0,q0
+uint8x16_t   vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
 #define vaddq_u8 _mm_add_epi8
 
-uint16x8_t   vaddq_u16(uint16x8_t a, uint16x8_t b);         // VADD.I16 q0,q0,q0
+uint16x8_t   vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
 #define vaddq_u16 _mm_add_epi16
 
-uint32x4_t   vaddq_u32(uint32x4_t a, uint32x4_t b);         // VADD.I32 q0,q0,q0
+uint32x4_t   vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
 #define vaddq_u32 _mm_add_epi32
 
-uint64x2_t   vaddq_u64(uint64x2_t a, uint64x2_t b);         // VADD.I64 q0,q0,q0
+uint64x2_t   vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
 #define vaddq_u64 _mm_add_epi64
 
 //**************************** Vector long add *****************************:
 //***********************************************************************
 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_add_epi16 (a16, b16);
+}
+
+int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi32 (a32, b32);
+}
+
+int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
+{
+    //may be not optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 ( a64, b64);
+}
+
+uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi16 (a16, b16);
+}
+
+uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi32 (a32, b32);
+}
+
+uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
+{
+    //may be not optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 (a64, b64);
+}
 
 //***************   Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
 //*************** *********************************************************************
+int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
+_NEON2SSE_INLINE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_add_epi16 (a, b16);
+}
+
+int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
+_NEON2SSE_INLINE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
+{
+    __m128i b32;
+    b32 =  _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
+    return _mm_add_epi32 (a, b32);
+}
+
+int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
+_NEON2SSE_INLINE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 (a, b64);
+}
+
+uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
+_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi16 (a, b16);
+}
+
+uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0
+_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0
+{
+    __m128i b32;
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi32 (a, b32);
+}
+
+uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
+_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_add_epi64 (a, b64);
+}
 
 //******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ,  result truncated *******************************
 //*************************************************************************************************************************
+int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b); // VHADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vhaddq_u8(_pM128i(a), _pM128i(b)));
+}
 
-int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b);         // VHADD.S8 q0,q0,q0
+
+int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b); // VHADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b); // VHADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b); // VHADD.w d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b); // VHADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b); // VHADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
 _NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
-{         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
     __m128i tmp1, tmp2;
     tmp1 = _mm_and_si128(a,b);
     tmp2 = _mm_xor_si128(a,b);
@@ -1959,9 +3002,10 @@
     return _mm_add_epi8(tmp1,tmp2);
 }
 
-int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b);         // VHADD.S1 6 q0,q0,q0
+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S1 6 q0,q0,q0
 _NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
-{         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
     __m128i tmp1, tmp2;
     tmp1 = _mm_and_si128(a,b);
     tmp2 = _mm_xor_si128(a,b);
@@ -1969,9 +3013,10 @@
     return _mm_add_epi16(tmp1,tmp2);
 }
 
-int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b);         // VHADD.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b)         // VHADD.S32 q0,q0,q0
-{         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
     __m128i tmp1, tmp2;
     tmp1 = _mm_and_si128(a,b);
     tmp2 = _mm_xor_si128(a,b);
@@ -1979,31 +3024,32 @@
     return _mm_add_epi32(tmp1,tmp2);
 }
 
-uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b);         // VHADD.U8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b)         // VHADD.U8 q0,q0,q0
+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
 {
     __m128i c1, sum, res;
     c1 = _mm_set1_epi8(1);
-    sum = _mm_avg_epu8(a, b);         //result is rounded, need to compensate it
-    res = _mm_xor_si128(a, b);         //for rounding compensation
-    res = _mm_and_si128(res,c1);         //for rounding compensation
-    return _mm_sub_epi8 (sum, res);         //actual rounding compensation
+    sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
+    res = _mm_xor_si128(a, b); //for rounding compensation
+    res = _mm_and_si128(res,c1); //for rounding compensation
+    return _mm_sub_epi8 (sum, res); //actual rounding compensation
 }
 
-uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b);         // VHADD.s16 q0,q0,q0
-_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b)         // VHADD.s16 q0,q0,q0
+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0
 {
     __m128i sum, res;
-    sum = _mm_avg_epu16(a, b);         //result is rounded, need to compensate it
-    res = _mm_xor_si128(a, b);         //for rounding compensation
-    res = _mm_slli_epi16 (res,15);         //shift left  then back right to
-    res = _mm_srli_epi16 (res,15);         //get 1 or zero
-    return _mm_sub_epi16 (sum, res);         //actual rounding compensation
+    sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
+    res = _mm_xor_si128(a, b); //for rounding compensation
+    res = _mm_slli_epi16 (res,15); //shift left  then back right to
+    res = _mm_srli_epi16 (res,15); //get 1 or zero
+    return _mm_sub_epi16 (sum, res); //actual rounding compensation
 }
 
-uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b);         // VHADD.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b)         // VHADD.U32 q0,q0,q0
-{         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
+{
+    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
     __m128i tmp1, tmp2;
     tmp1 = _mm_and_si128(a,b);
     tmp2 = _mm_xor_si128(a,b);
@@ -2013,77 +3059,202 @@
 
 //************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1   ***************************
 //*****************************************************************************************************************************
+int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b); // VRHADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
+}
 
-//SSE, result rounding!!!
 
-//SSE, result rounding!!!
+int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b); // VRHADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
+}
 
-int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b);         // VRHADD.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b)         // VRHADD.S8 q0,q0,q0
-{         //no signed average in x86 SIMD, go to unsigned
+
+int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b); // VRHADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
+}
+
+
+uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
+}
+
+
+uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b); // VRHADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
+{
+    //no signed average in x86 SIMD, go to unsigned
     __m128i c128, au, bu, sum;
-    c128 = _mm_set1_epi8(128);
-    au = _mm_add_epi8(a, c128);
-    bu = _mm_add_epi8(b, c128);
+    c128 = _mm_set1_epi8(0x80); //-128
+    au = _mm_sub_epi8(a, c128); //add 128
+    bu = _mm_sub_epi8(b, c128); //add 128
     sum = _mm_avg_epu8(au, bu);
-    return _mm_sub_epi8 (sum, c128);
+    return _mm_add_epi8 (sum, c128); //sub 128
 }
 
-int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b);         // VRHADD.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b)         // VRHADD.S16 q0,q0,q0
-{         //no signed average in x86 SIMD, go to unsigned
+int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
+{
+    //no signed average in x86 SIMD, go to unsigned
     __m128i cx8000, au, bu, sum;
-    cx8000 = _mm_set1_epi16(0x8000);
-    au = _mm_add_epi16(a, cx8000);
-    bu = _mm_add_epi16(b, cx8000);
+    cx8000 = _mm_set1_epi16(0x8000); // - 32768
+    au = _mm_sub_epi16(a, cx8000); //add 32768
+    bu = _mm_sub_epi16(b, cx8000); //add 32768
     sum = _mm_avg_epu16(au, bu);
-    return _mm_sub_epi16 (sum, cx8000);
+    return _mm_add_epi16 (sum, cx8000); //sub 32768
 }
 
-int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b);         // VRHADD.S32 q0,q0,q0
+int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
 _NEON2SSE_INLINE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b)
-{         //need to avoid overflow
+{
+    //need to avoid overflow
     __m128i a2, b2, res, sum;
-    a2 = _mm_srai_epi32(a,1);         //a2=a/2;
-    b2 = _mm_srai_epi32(b,1);         // b2=b/2;
-    res = _mm_or_si128(a,b);         //for rounding
-    res = _mm_slli_epi32 (res,31);         //shift left  then back right to
-    res = _mm_srli_epi32 (res,31);         //get 1 or zero
+    a2 = _mm_srai_epi32(a,1); //a2=a/2;
+    b2 = _mm_srai_epi32(b,1); // b2=b/2;
+    res = _mm_or_si128(a,b); //for rounding
+    res = _mm_slli_epi32 (res,31); //shift left  then back right to
+    res = _mm_srli_epi32 (res,31); //get 1 or zero
     sum = _mm_add_epi32(a2,b2);
     return _mm_add_epi32(sum,res);
 }
 
-uint8x16_t   vrhaddq_u8(uint8x16_t a, uint8x16_t b);         // VRHADD.U8 q0,q0,q0
-#define vrhaddq_u8 _mm_avg_epu8         //SSE2, results rounded
+uint8x16_t   vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
+#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
 
-uint16x8_t   vrhaddq_u16(uint16x8_t a, uint16x8_t b);         // VRHADD.s16 q0,q0,q0
-#define vrhaddq_u16 _mm_avg_epu16         //SSE2, results rounded
+uint16x8_t   vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0
+#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
 
-uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b);         // VRHADD.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b)         // VRHADD.U32 q0,q0,q0
-{         //need to avoid overflow
+
+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
+{
+    //need to avoid overflow
     __m128i a2, b2, res, sum;
-    a2 = _mm_srli_epi32(a,1);         //a2=a/2;
-    b2 = _mm_srli_epi32(b,1);         // b2=b/2;
-    res = _mm_or_si128(a,b);         //for rounding
-    res = _mm_slli_epi32 (res,31);         //shift left  then back right to
-    res = _mm_srli_epi32 (res,31);         //get 1 or zero
+    a2 = _mm_srli_epi32(a,1); //a2=a/2;
+    b2 = _mm_srli_epi32(b,1); // b2=b/2;
+    res = _mm_or_si128(a,b); //for rounding
+    res = _mm_slli_epi32 (res,31); //shift left  then back right to
+    res = _mm_srli_epi32 (res,31); //get 1 or zero
     sum = _mm_add_epi32(a2,b2);
     return _mm_add_epi32(sum,res);
 }
 
 //****************** VQADD: Vector saturating add ************************
 //************************************************************************
+int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
+}
 
-int8x16_t   vqaddq_s8(int8x16_t a, int8x16_t b);         // VQADD.S8 q0,q0,q0
+
+int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b); // VQADD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+int64x1_t  vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int64x1_t res;
+    uint64_t a64, b64;
+    a64 = a.m64_u64[0];
+    b64 = b.m64_u64[0];
+    res.m64_u64[0] = a64 + b64;
+    a64 = (a64 >> 63) + (~_SIGNBIT64);
+    if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
+        res.m64_u64[0] = a64;
+    }
+    return res;
+}
+
+uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b); // VQADD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    _NEON2SSE_ALIGN_16 uint64_t a64, b64;
+    uint64x1_t res;
+    a64 = a.m64_u64[0];
+    b64 = b.m64_u64[0];
+    res.m64_u64[0] = a64 + b64;
+    if (res.m64_u64[0] < a64) {
+        res.m64_u64[0] = ~(uint64_t)0;
+    }
+    return res;
+}
+
+int8x16_t   vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
 #define vqaddq_s8 _mm_adds_epi8
 
-int16x8_t   vqaddq_s16(int16x8_t a, int16x8_t b);         // VQADD.S16 q0,q0,q0
+int16x8_t   vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
 #define vqaddq_s16 _mm_adds_epi16
 
-int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b);         // VQADD.S32 q0,q0,q0
+int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
 _NEON2SSE_INLINE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b)
-{         //no corresponding x86 SIMD soulution, special tricks are necessary. Overflow happens only if a and b have the same sign and sum has the opposite sign
+{
+    //no corresponding x86 SIMD soulution, special tricks are necessary. Overflow happens only if a and b have the same sign and sum has the opposite sign
     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
     c7fffffff = _mm_set1_epi32(0x7fffffff);
     res = _mm_add_epi32(a, b);
@@ -2092,13 +3263,13 @@
     res_xor_a = _mm_xor_si128(res, a);
     b_xor_a_ = _mm_xor_si128(b, a);
     res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
-    res_xor_a = _mm_srai_epi32(res_xor_a,31);         //propagate the sigh bit, all ffff if <0 all ones otherwise
+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
     res_sat = _mm_and_si128(res_xor_a, res_sat);
     res = _mm_andnot_si128(res_xor_a, res);
     return _mm_or_si128(res, res_sat);
 }
 
-int64x2_t  vqaddq_s64(int64x2_t a, int64x2_t b);         // VQADD.S64 q0,q0,q0
+int64x2_t  vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
 {
     _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
@@ -2119,13 +3290,13 @@
     return _mm_load_si128((__m128i*)res);
 }
 
-uint8x16_t   vqaddq_u8(uint8x16_t a, uint8x16_t b);         // VQADD.U8 q0,q0,q0
+uint8x16_t   vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
 #define vqaddq_u8 _mm_adds_epu8
 
-uint16x8_t   vqaddq_u16(uint16x8_t a, uint16x8_t b);         // VQADD.s16 q0,q0,q0
+uint16x8_t   vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0
 #define vqaddq_u16 _mm_adds_epu16
 
-uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b);         // VQADD.U32 q0,q0,q0
+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
 _NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
 {
     __m128i c80000000, cmp, subsum, suba, sum;
@@ -2133,11 +3304,11 @@
     sum = _mm_add_epi32 (a, b);
     subsum = _mm_sub_epi32 (sum, c80000000);
     suba = _mm_sub_epi32 (a, c80000000);
-    cmp = _mm_cmpgt_epi32 ( suba, subsum);         //no unsigned comparison, need to go to signed
-    return _mm_or_si128 (sum, cmp);         //saturation
+    cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
+    return _mm_or_si128 (sum, cmp); //saturation
 }
 
-uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b);         // VQADD.U64 q0,q0,q0
+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
 #ifdef USE_SSE4
     _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
     {
@@ -2146,8 +3317,8 @@
         sum = _mm_add_epi64 (a, b);
         subsum = _mm_sub_epi64 (sum, c80000000);
         suba = _mm_sub_epi64 (a, c80000000);
-        cmp = _mm_cmpgt_epi64 ( suba, subsum);         //no unsigned comparison, need to go to signed, SSE4.2!!!
-        return _mm_or_si128 (sum, cmp);         //saturation
+        cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
+        return _mm_or_si128 (sum, cmp); //saturation
     }
 #else
     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
@@ -2163,11 +3334,142 @@
     }
 #endif
 
+
 //******************* Vector add high half (truncated)  ******************
 //************************************************************************
+int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sum;
+    sum = _mm_add_epi16 (a, b);
+    sum = _mm_srai_epi16 (sum, 8);
+    sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
+    return64(sum);
+}
+
+int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
+{
+    int16x4_t res64;
+    __m128i sum;
+    sum = _mm_add_epi32 (a, b);
+    sum = _mm_srai_epi32(sum, 16);
+    sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
+    return64(sum);
+}
+
+int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b)
+{
+    int32x2_t res64;
+    __m128i sum;
+    sum = _mm_add_epi64 (a, b);
+    sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (0 << 4) | (2 << 6));
+    return64(sum);
+}
+
+uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sum;
+    sum = _mm_add_epi16 (a, b);
+    sum = _mm_srli_epi16 (sum, 8);
+    sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
+    return64(sum);
+}
+
+uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
+{
+    uint16x4_t res64;
+    __m128i sum;
+    sum = _mm_add_epi32 (a, b);
+    sum = _mm_srli_epi32 (sum, 16);
+    sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
+    return64(sum);
+}
+
+uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
+#define vaddhn_u64 vaddhn_s64
 
 //*********** Vector rounding add high half: vraddhn_<type> ******************.
 //***************************************************************************
+int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
+    sum = _mm_srai_epi16 (sum, 8); //get high half
+    sum = _mm_add_epi16 (sum, mask1); //actual rounding
+    sum = _mm_packs_epi16 (sum, sum);
+    return64(sum);
+}
+
+int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
+{
+    //SIMD may be not optimal, serial may be faster
+    int16x4_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
+    sum = _mm_srai_epi32 (sum, 16); //get high half
+    sum = _mm_add_epi32 (sum, mask1); //actual rounding
+    sum = _mm_packs_epi32 (sum, sum);
+    return64(sum);
+}
+
+int32x2_t  vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
+{
+    //SIMD may be not optimal, serial may be faster
+    int32x2_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi64 (a, b);
+    mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,32); //get  31-th bit 1 or zero
+    sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
+    sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (1 << 4) | (3 << 6));
+    return64(sum);
+}
+
+uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
+    sum = _mm_srai_epi16 (sum, 8); //get high half
+    sum = _mm_add_epi16 (sum, mask1); //actual rounding
+    sum = _mm_packus_epi16 (sum, sum);
+    return64(sum);
+}
+
+uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
+{
+    //SIMD may be not optimal, serial may be faster
+    uint16x4_t res64;
+    __m128i sum, mask1;
+    sum = _mm_add_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
+    sum = _mm_srai_epi32 (sum, 16); //get high half
+    sum = _mm_add_epi32 (sum, mask1); //actual rounding
+    sum = _MM_PACKUS1_EPI32 (sum);
+    return64(sum);
+}
+
+uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
+#define vraddhn_u64 vraddhn_s64
 
 //**********************************************************************************
 //*********             Multiplication            *************************************
@@ -2175,79 +3477,163 @@
 
 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
 //As we don't go to wider result functions are equal to "multiply low" in x86
+int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
+{
+    // no 8 bit simd multiply, need to go to 16 bits in SSE
+    int8x8_t res64;
+    __m128i a128, b128, res;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
+    b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+    res = _mm_mullo_epi16 (a128, b128);
+    res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
+    return64(res);
+}
 
-#if defined(USE_SSSE3)
-int8x16_t vmulq_s8(int8x16_t a, int8x16_t b);         // VMUL.I8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b)         // VMUL.I8 q0,q0,q0
-{         // no 8 bit simd multiply, need to go to 16 bits
-      //solution may be not optimal
+int16x4_t vmul_s16(int16x4_t a,  int16x4_t b); // VMUL.I16 d0,d0,d0
+#define vmul_s16 vmul_u16
+
+int32x2_t vmul_s32(int32x2_t a,  int32x2_t b); // VMUL.I32 d0,d0,d0
+#define vmul_s32 vmul_u32
+
+float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
+{
+    float32x4_t tmp;
+    __m64_128 res64;
+    tmp =  _mm_mul_ps(_pM128(a),_pM128(b));
+    _M64f(res64, tmp); //use low 64 bits
+    return res64;
+}
+
+uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
+{
+    // no 8 bit simd multiply, need to go to 16 bits in SSE
+    uint8x8_t res64;
+    __m128i mask, a128, b128, res;
+    mask = _mm_set1_epi16(0xff);
+    a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
+    b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
+    res = _mm_mullo_epi16 (a128, b128);
+    res = _mm_and_si128(res, mask); //to avoid saturation
+    res = _mm_packus_epi16 (res,res); //use only low 64 bits
+    return64(res);
+}
+
+uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint32x2_t res;
+    res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
+    res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
+    return res;
+}
+
+poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
+_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
+{
+    //may be optimized
+    poly8x8_t res64;
+    __m128i a64, b64, c1, res, tmp, bmasked;
+    int i;
+    a64 = _pM128i(a);
+    b64 = _pM128i(b);
+    c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
+    c1 = vshrq_n_u8(c1,7); //0x1
+    bmasked = _mm_and_si128(b64, c1); //0x1
+    res = vmulq_u8(a64, bmasked);
+    for(i = 1; i<8; i++) {
+        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+        bmasked = _mm_and_si128(b64, c1); //0x1
+        tmp = vmulq_u8(a64, bmasked);
+        res = _mm_xor_si128(res, tmp);
+    }
+    return64 (res);
+}
+
+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
+{
+    // no 8 bit simd multiply, need to go to 16 bits
+    //solution may be not optimal
     __m128i a16, b16, r16_1, r16_2;
     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-    a16 = _MM_CVTEPI8_EPI16 (a);         // SSE 4.1
-    b16 = _MM_CVTEPI8_EPI16 (b);         // SSE 4.1
+    a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
     r16_1 = _mm_mullo_epi16 (a16, b16);
     //swap hi and low part of a and b to process the remaining data
     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-    a16 = _MM_CVTEPI8_EPI16 (a16);         // SSE 4.1
-    b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1  __m128i r16_2
+    a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1  __m128i r16_2
 
     r16_2 = _mm_mullo_epi16 (a16, b16);
-    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd);         //return to 8 bit
-    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd);         //return to 8 bit
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
 
     return _mm_unpacklo_epi64(r16_1,  r16_2);
 }
-#endif
 
-int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b);         // VMUL.I16 q0,q0,q0
+int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
 #define vmulq_s16 _mm_mullo_epi16
 
-int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b);         // VMUL.I32 q0,q0,q0
-#define vmulq_s32 _MM_MULLO_EPI32         //SSE4.1
+int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
+#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
 
-float32x4_t vmulq_f32(float32x4_t a, float32x4_t b);         // VMUL.F32 q0,q0,q0
+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
 #define vmulq_f32 _mm_mul_ps
 
-uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b);         // VMUL.I8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b)         // VMUL.I8 q0,q0,q0
-{         // no 8 bit simd multiply, need to go to 16 bits
-      //solution may be not optimal
+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
+{
+    // no 8 bit simd multiply, need to go to 16 bits
+    //solution may be not optimal
     __m128i maskff, a16, b16, r16_1, r16_2;
     maskff = _mm_set1_epi16(0xff);
-    a16 = _MM_CVTEPU8_EPI16 (a);         // SSE 4.1
-    b16 = _MM_CVTEPU8_EPI16 (b);         // SSE 4.1
+    a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
     r16_1 = _mm_mullo_epi16 (a16, b16);
-    r16_1 = _mm_and_si128(r16_1, maskff);         //to avoid saturation
+    r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
     //swap hi and low part of a and b to process the remaining data
     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
-    a16 = _MM_CVTEPI8_EPI16 (a16);         // SSE 4.1
-    b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1
+    a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
 
     r16_2 = _mm_mullo_epi16 (a16, b16);
-    r16_2 = _mm_and_si128(r16_2, maskff);         //to avoid saturation
+    r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
     return _mm_packus_epi16 (r16_1,  r16_2);
 }
 
-uint16x8_t   vmulq_u16(uint16x8_t a, uint16x8_t b);         // VMUL.I16 q0,q0,q0
+uint16x8_t   vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
 #define vmulq_u16 _mm_mullo_epi16
 
-uint32x4_t   vmulq_u32(uint32x4_t a, uint32x4_t b);         // VMUL.I32 q0,q0,q0
-#define vmulq_u32 _MM_MULLO_EPI32         //SSE4.1
+uint32x4_t   vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
+#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
 
-poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b);         // VMUL.P8 q0,q0,q0
+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
 _NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
-{         //may be optimized
+{
+    //may be optimized
     __m128i c1, res, tmp, bmasked;
     int i;
-    c1 = _mm_cmpeq_epi8 (a,a);         //all ones 0xff....
-    c1 = vshrq_n_u8(c1,7);         //0x1
-    bmasked = _mm_and_si128(b, c1);         //0x1
+    c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
+    c1 = vshrq_n_u8(c1,7); //0x1
+    bmasked = _mm_and_si128(b, c1); //0x1
     res = vmulq_u8(a, bmasked);
     for(i = 1; i<8; i++) {
-        c1 = _mm_slli_epi16(c1,1);         //shift mask left by 1, 16 bit shift is OK here
-        bmasked = _mm_and_si128(b, c1);         //0x1
+        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+        bmasked = _mm_and_si128(b, c1); //0x1
         tmp = vmulq_u8(a, bmasked);
         res = _mm_xor_si128(res, tmp);
     }
@@ -2256,111 +3642,404 @@
 
 //************************* Vector long multiply ***********************************
 //****************************************************************************
+int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
+{
+    //no 8 bit simd multiply, need to go to 16 bits
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
+    return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
+}
+
+int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
+{
+    #ifdef USE_SSE4
+        __m128i a16, b16;
+        a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
+        b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
+        return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
+    #else
+        __m128i low, hi, a128,b128;
+        a128 = _pM128i(a);
+        b128 = _pM128i(b);
+        low =  _mm_mullo_epi16(a128,b128);
+        hi =   _mm_mulhi_epi16(a128,b128);
+        return _mm_unpacklo_epi16(low,hi);
+    #endif
+}
+
+int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
+{
+    __m128i ab, ba, a128, b128;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
+    return _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+}
+
+uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
+{
+    //no 8 bit simd multiply, need to go to 16 bits
+    __m128i a16, b16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
+    return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
+}
+
+uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0
+{
+    #ifdef USE_SSE4
+        __m128i a16, b16;
+        a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
+        b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
+        return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
+    #else
+        __m128i a128,b128,low, hi;
+        a128 = _pM128i(a);
+        b128 = _pM128i(b);
+        low =  _mm_mullo_epi16(a128,b128);
+        hi =   _mm_mulhi_epu16(a128,b128);
+        return _mm_unpacklo_epi16(low,hi);
+    #endif
+}
+
+uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
+{
+    ///may be not optimal compared with serial implementation
+    __m128i ab, ba, a128, b128;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
+    return _mm_mul_epu32 (ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+}
+
+poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
+_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
+{
+    //may be optimized
+    __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
+    int i;
+    a128 = _pM128i(a);
+    b128 = _pM128i(b);
+    c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
+    c1 = vshrq_n_u8(c1,7); //0x1
+    bmasked = _mm_and_si128(b128, c1); //0x1
+
+    a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
+    bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
+    res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
+    for(i = 1; i<8; i++) {
+        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
+        bmasked = _mm_and_si128(b128, c1); //0x1
+        bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
+        tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
+        res = _mm_xor_si128(res, tmp);
+    }
+    return res;
+}
 
 //****************Vector saturating doubling long multiply **************************
 //*****************************************************************
+int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
+{
+    //the serial soulution may be faster due to saturation
+    __m128i res;
+    res = vmull_s16(a, b);
+    return vqd_s32(res);
+}
+
+int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //the serial soulution may be faster due to saturation
+    __m128i res;
+    res = vmull_s32(a,b);
+    return vqaddq_s64(res,res); //slow serial function!!!!
+}
 
 //********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  ************************
 //******************************************************************************************
+int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
+{
+    // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
+    int8x8_t res64;
+    __m128i b128, c128, res;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+    c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
+    res = _mm_mullo_epi16 (c128, b128);
+    res  =  _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
+    res  = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
+    return64(res);
+}
 
-#if defined(USE_SSSE3)
-int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLA.I8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c)         // VMLA.I8 q0,q0,q0
-{         //solution may be not optimal
-      // no 8 bit simd multiply, need to go to 16 bits
+int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
+{
+    int16x4_t res64;
+    return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+
+int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
+{
+    int32x2_t res64;
+    __m128i res;
+    res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
+    res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
+    return64(res);
+}
+
+float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
+{
+    //fma is coming soon, but right now:
+    __m128 res;
+    __m64_128 res64;
+    res = _mm_mul_ps (_pM128(c), _pM128(b));
+    res = _mm_add_ps (_pM128(a), res);
+    _M64f(res64, res);
+    return res64;
+}
+
+uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
+{
+    // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
+    uint8x8_t res64;
+    __m128i mask, b128, c128, res;
+    mask = _mm_set1_epi16(0xff);
+    b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
+    c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
+    res = _mm_mullo_epi16 (c128, b128);
+    res = _mm_and_si128(res, mask); //to avoid saturation
+    res = _mm_packus_epi16 (res, res);
+    res =  _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
+    return64(res);
+}
+
+uint16x4_t vmla_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
+#define vmla_u16 vmla_s16
+
+uint32x2_t vmla_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
+#define vmla_u32 vmla_s32
+
+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
+{
+    //solution may be not optimal
+    // no 8 bit simd multiply, need to go to 16 bits
     __m128i b16, c16, r16_1, a_2,r16_2;
     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-    b16 = _MM_CVTEPI8_EPI16 (b);         // SSE 4.1
-    c16 = _MM_CVTEPI8_EPI16 (c);         // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
     r16_1 = _mm_mullo_epi16 (b16, c16);
-    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);         //return to 8 bits
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
     r16_1 = _mm_add_epi8 (r16_1, a);
     //swap hi and low part of a, b and c to process the remaining data
     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-    b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1
-    c16 = _MM_CVTEPI8_EPI16 (c16);         // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
 
     r16_2 = _mm_mullo_epi16 (b16, c16);
     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
     r16_2 = _mm_add_epi8(r16_2, a_2);
     return _mm_unpacklo_epi64(r16_1,r16_2);
 }
-#endif
 
-int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLA.I16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c)         // VMLA.I16 q0,q0,q0
+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
 {
     __m128i res;
     res = _mm_mullo_epi16 (c, b);
     return _mm_add_epi16 (res, a);
 }
 
-int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLA.I32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c)         // VMLA.I32 q0,q0,q0
+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
 {
     __m128i res;
-    res = _MM_MULLO_EPI32 (c,  b);         //SSE4.1
+    res = _MM_MULLO_EPI32 (c,  b); //SSE4.1
     return _mm_add_epi32 (res, a);
 }
 
-float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLA.F32 q0,q0,q0
-_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)         // VMLA.F32 q0,q0,q0
-{         //fma is coming soon, but right now:
+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
+{
+    //fma is coming soon, but right now:
     __m128 res;
     res = _mm_mul_ps (c, b);
     return _mm_add_ps (a, res);
 }
 
-#if defined(USE_SSSE3)
-uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLA.I8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)         // VMLA.I8 q0,q0,q0
-{         //solution may be not optimal
-      // no 8 bit simd multiply, need to go to 16 bits
+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
+{
+    //solution may be not optimal
+    // no 8 bit simd multiply, need to go to 16 bits
     __m128i b16, c16, r16_1, a_2, r16_2;
     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-    b16 = _MM_CVTEPU8_EPI16 (b);         // SSE 4.1
-    c16 = _MM_CVTEPU8_EPI16 (c);         // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
     r16_1 = _mm_mullo_epi16 (b16, c16);
-    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);         //return to 8 bits
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
     r16_1 = _mm_add_epi8 (r16_1, a);
     //swap hi and low part of a, b and c to process the remaining data
     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-    b16 = _MM_CVTEPU8_EPI16 (b16);         // SSE 4.1
-    c16 = _MM_CVTEPU8_EPI16 (c16);         // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
 
     r16_2 = _mm_mullo_epi16 (b16, c16);
     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
     r16_2 = _mm_add_epi8(r16_2, a_2);
     return _mm_unpacklo_epi64(r16_1,r16_2);
 }
-#endif
 
-uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLA.I16 q0,q0,q0
+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
 #define vmlaq_u16 vmlaq_s16
 
-uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLA.I32 q0,q0,q0
+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
 #define vmlaq_u32 vmlaq_s32
 
 //**********************  Vector widening multiply accumulate (long multiply accumulate):
 //                          vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  **************
 //********************************************************************************************
+int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
+{
+    int16x8_t res;
+    res = vmull_s8(b, c);
+    return _mm_add_epi16 (res, a);
+}
+
+int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int32x4_t res;
+    res = vmull_s16(b,  c);
+    return _mm_add_epi32 (res, a);
+}
+
+int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int64x2_t res;
+    res = vmull_s32( b, c);
+    return _mm_add_epi64 (res, a);
+}
+
+uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
+{
+    uint16x8_t res;
+    res = vmull_u8(b, c);
+    return _mm_add_epi16 (res, a);
+}
+
+uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    uint32x4_t res;
+    res = vmull_u16(b, c);
+    return _mm_add_epi32 (res, a);
+}
+
+uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int64x2_t res;
+    res = vmull_u32( b,c);
+    return _mm_add_epi64 (res, a);
+}
 
 //******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
 //********************************************************************************************
+int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
+{
+    // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
+    int8x8_t res64;
+    __m128i res;
+    res64 = vmul_s8(b,c);
+    res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
+    return64(res);
+}
 
-#if defined(USE_SSSE3)
-int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLS.I8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c)         // VMLS.I8 q0,q0,q0
-{         //solution may be not optimal
-      // no 8 bit simd multiply, need to go to 16 bits
+int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
+{
+    int16x4_t res64;
+    return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
+
+
+int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
+{
+    int32x2_t res64;
+    __m128i res;
+    res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
+    res =  _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
+    return64(res);
+}
+
+float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
+{
+    __m128 res;
+    __m64_128 res64;
+    res = _mm_mul_ps (_pM128(c), _pM128(b));
+    res = _mm_sub_ps (_pM128(a), res);
+    _M64f(res64, res);
+    return res64;
+}
+
+uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
+{
+    // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
+    uint8x8_t res64;
+    __m128i res;
+    res64 = vmul_u8(b,c);
+    res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
+    return64(res);
+}
+
+uint16x4_t vmls_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
+#define vmls_u16 vmls_s16
+
+uint32x2_t vmls_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
+#define vmls_u32 vmls_s32
+
+
+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
+{
+    //solution may be not optimal
+    // no 8 bit simd multiply, need to go to 16 bits
     __m128i b16, c16, r16_1, a_2, r16_2;
     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-    b16 = _MM_CVTEPI8_EPI16 (b);         // SSE 4.1
-    c16 = _MM_CVTEPI8_EPI16 (c);         // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
     r16_1 = _mm_mullo_epi16 (b16, c16);
     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
     r16_1 = _mm_sub_epi8 (a, r16_1);
@@ -2368,205 +4047,573 @@
     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-    b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1
-    c16 = _MM_CVTEPI8_EPI16 (c16);         // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
 
     r16_2 = _mm_mullo_epi16 (b16, c16);
     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
     r16_2 = _mm_sub_epi8 (a_2, r16_2);
     return _mm_unpacklo_epi64(r16_1,r16_2);
 }
-#endif
 
-int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLS.I16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c)         // VMLS.I16 q0,q0,q0
+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
 {
     __m128i res;
     res = _mm_mullo_epi16 (c, b);
     return _mm_sub_epi16 (a, res);
 }
 
-int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLS.I32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c)         // VMLS.I32 q0,q0,q0
+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
 {
     __m128i res;
-    res = _MM_MULLO_EPI32 (c, b);         //SSE4.1
+    res = _MM_MULLO_EPI32 (c, b); //SSE4.1
     return _mm_sub_epi32 (a, res);
 }
 
-float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLS.F32 q0,q0,q0
-_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c)         // VMLS.F32 q0,q0,q0
+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
 {
     __m128 res;
     res = _mm_mul_ps (c, b);
     return _mm_sub_ps (a, res);
 }
 
-#if defined(USE_SSSE3)
-uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLS.I8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)         // VMLS.I8 q0,q0,q0
-{         //solution may be not optimal
-      // no 8 bit simd multiply, need to go to 16 bits
+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
+{
+    //solution may be not optimal
+    // no 8 bit simd multiply, need to go to 16 bits
     __m128i b16, c16, r16_1, a_2, r16_2;
     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
-    b16 = _MM_CVTEPU8_EPI16 (b);         // SSE 4.1
-    c16 = _MM_CVTEPU8_EPI16 (c);         // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
     r16_1 = _mm_mullo_epi16 (b16, c16);
-    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);         //return to 8 bits
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
     r16_1 = _mm_sub_epi8 (a, r16_1);
     //swap hi and low part of a, b and c to process the remaining data
     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
-    b16 = _MM_CVTEPU8_EPI16 (b16);         // SSE 4.1
-    c16 = _MM_CVTEPU8_EPI16 (c16);         // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
 
     r16_2 = _mm_mullo_epi16 (b16, c16);
     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
     r16_2 = _mm_sub_epi8(a_2, r16_2);
     return _mm_unpacklo_epi64(r16_1,r16_2);
 }
-#endif
 
-uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLS.I16 q0,q0,q0
+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
 #define vmlsq_u16 vmlsq_s16
 
-uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLS.I32 q0,q0,q0
+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
 #define vmlsq_u32 vmlsq_s32
 
 //******************** Vector multiply subtract long (widening multiply subtract) ************************************
 //*************************************************************************************************************
+int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
+{
+    int16x8_t res;
+    res = vmull_s8(b, c);
+    return _mm_sub_epi16 (a, res);
+}
+
+int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int32x4_t res;
+    res = vmull_s16(b,  c);
+    return _mm_sub_epi32 (a, res);
+}
+
+int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int64x2_t res;
+    res = vmull_s32( b,c);
+    return _mm_sub_epi64 (a, res);
+}
+
+uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
+{
+    uint16x8_t res;
+    res = vmull_u8(b, c);
+    return _mm_sub_epi16 (a, res);
+}
+
+uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    uint32x4_t res;
+    res = vmull_u16(b, c);
+    return _mm_sub_epi32 (a, res);
+}
+
+uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
+{
+    //may be not optimal compared with serial implementation
+    int64x2_t res;
+    res = vmull_u32( b,c);
+    return _mm_sub_epi64 (a, res);
+}
 
 //******  Vector saturating doubling multiply high **********************
 //*************************************************************************
-//For some ARM implementations if the multiply high result is all 0xffffffff then it is not doubled. We do the same here
-
-int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b);         // VQDMULH.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b)         // VQDMULH.S16 q0,q0,q0
+int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b); // VQDMULH.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
 {
-    __m128i res_sat, cffff, mask, res;
-    res = _mm_mulhi_epi16 (a, b);
-    cffff = _mm_cmpeq_epi16(res,res);         //0xffff
-    mask = _mm_cmpeq_epi16(res, cffff);         //if ffff need to saturate
-    res_sat = _mm_adds_epi16(res, res);         //res *= 2 and saturate
-    return _mm_or_si128(mask, res_sat);
+    int16x4_t res;
+    int32_t a32, b32, i;
+    for (i = 0; i<4; i++) {
+        a32 = (int32_t) a.m64_i16[i];
+        b32 = (int32_t) b.m64_i16[i];
+        a32 = (a32 * b32) >> 15;
+        res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
+    }
+    return res;
 }
 
-#if defined(USE_SSSE3)
-int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b);         // VQDMULH.S32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-{         // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
-    __m128i ab, ba, res_sat, cffffffff, mask, mul, mul1;
-    ab = _mm_unpacklo_epi32 (a, b);         //a0, b0, a1,b1
-    ba = _mm_unpacklo_epi32 (b, a);         //b0, a0, b1,a1
-    mul = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
-    ab = _mm_unpackhi_epi32 (a, b);         //a2, b2, a3,b3
-    ba = _mm_unpackhi_epi32 (b, a);         //b2, a2, b3,a3
-    mul1 = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
-    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
-    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
-    mul = _mm_unpacklo_epi64(mul, mul1);
-    cffffffff = _mm_cmpeq_epi32(mul,mul);         //0xffffffff
-    mask = _mm_cmpeq_epi32(mul, cffffffff);         //if ffffffff need to saturate
-    res_sat = vqd_s32(mul);
-    return _mm_or_si128(mask, res_sat);
+int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
+{
+    //may be not optimal compared with a serial solution
+    int32x2_t res64;
+    __m128i mask;
+    _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    int64x2_t mul;
+    mul = vmull_s32(a,b);
+    mul = _mm_slli_epi64(mul,1); //double the result
+    //at this point start treating 2 64-bit numbers as 4 32-bit
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+    return64(mul);
 }
-#endif
+
+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
+{
+    __m128i res, res_lo, mask;
+    _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
+    res = _mm_mulhi_epi16 (a, b);
+    res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
+    res_lo = _mm_mullo_epi16 (a, b);
+    res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
+    res = _mm_add_epi16(res, res_lo); //combine results
+    mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
+    return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
+}
+
+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
+    __m128i ab, ba, mask, mul, mul1;
+    _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
+    mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    mul = _mm_slli_epi64(mul,1); //double the result
+    ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
+    ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
+    mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    mul1 = _mm_slli_epi64(mul1,1); //double the result
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
+    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
+    mul = _mm_unpacklo_epi64(mul, mul1);
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+}
 
 //********* Vector saturating rounding doubling multiply high ****************
 //****************************************************************************
 //If use _mm_mulhrs_xx functions  the result may differ from NEON one a little  due to different rounding rules and order
-
-#if defined(USE_SSSE3)
-int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b);         // VQRDMULH.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b)         // VQRDMULH.S16 q0,q0,q0
+int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b); // VQRDMULH.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b)
 {
-    __m128i res_sat, cffff, mask, res;
-    res = _mm_mulhrs_epi16 (a, b);
-    cffff = _mm_cmpeq_epi16(res,res);         //0xffff
-    mask = _mm_cmpeq_epi16(res, cffff);         //if ffff need to saturate
-    res_sat = _mm_adds_epi16(res, res);         //res *= 2 and saturate
-    return _mm_or_si128(mask, res_sat);
+    int16x4_t res64;
+    return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
 }
-#endif
 
-#if defined(USE_SSSE3)
-int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b);         // VQRDMULH.S32 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
-{         // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
-    __m128i ab, ba, res_sat, cffffffff, mask, mul, mul1, mask1;
-    ab = _mm_unpacklo_epi32 (a, b);         //a0, b0, a1,b1
-    ba = _mm_unpacklo_epi32 (b, a);         //b0, a0, b1,a1
-    mul = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
-    ab = _mm_unpackhi_epi32 (a, b);         //a2, b2, a3,b3
-    ba = _mm_unpackhi_epi32 (b, a);         //b2, a2, b3,a3
-    mul1 = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
-    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
-    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
-    mul = _mm_unpacklo_epi64(mul, mul1);
-    cffffffff = _mm_cmpeq_epi32(mul,mul);         //0xffffffff
-    mask1 = _mm_slli_epi32(mul, 17);         //shift left then back right to
-    mask1 = _mm_srli_epi32(mul,31);         //get  15-th bit 1 or zero
-    mul = _mm_add_epi32 (mul, mask1);         //actual rounding
-    mask = _mm_cmpeq_epi32(mul, cffffffff);         //if ffffffff need to saturate
-    res_sat = vqd_s32(mul);
-    return _mm_or_si128(mask, res_sat);
+int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //may be not optimal compared with a serial solution
+    int32x2_t res64;
+    _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    __m128i res_sat, mask, mask1;
+    int64x2_t mul;
+    mul = vmull_s32(a,b);
+    res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
+    mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
+    mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
+    //at this point start treating 2 64-bit numbers as 4 32-bit
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+    return64(mul);
 }
-#endif
+
+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
+{
+    __m128i mask, res;
+    _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
+    res = _mm_mulhrs_epi16 (a, b);
+    mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
+    return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
+}
+
+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
+    __m128i ab, ba,  mask, mul, mul1, mask1;
+    _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
+    mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
+    mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
+    mul = _mm_add_epi32 (mul, mask1); //actual rounding
+
+    ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
+    ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
+    mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
+    mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
+    mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
+    //at this point start treating 2 64-bit numbers as 4 32-bit
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
+    mul = _mm_unpacklo_epi64(mul, mul1);
+    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
+    return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
+}
 
 //*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
 //*************************************************************************************************************************
+int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
+{
+    //not optimal SIMD soulution, serial may be faster
+    __m128i res32;
+    res32 = vmull_s16(b,  c);
+    res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1);
+    return vqaddq_s32(res32, a); //saturation
+}
+
+int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res64;
+    res64 = vmull_s32(b,c);
+    res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1);
+    return vqaddq_s64(res64, a); //saturation
+}
 
 //************************************************************************************
 //******************  Vector subtract ***********************************************
 //************************************************************************************
+int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
+}
 
-int8x16_t   vsubq_s8(int8x16_t a, int8x16_t b);         // VSUB.I8 q0,q0,q0
+
+int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+int64x1_t vsub_s64(int64x1_t a,  int64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res64;
+    res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
+    return res64;
+}
+
+
+float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
+{
+    float32x2_t res;
+    res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
+    res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
+    return res;
+}
+
+uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
+#define vsub_u8 vsub_s8
+
+uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
+#define vsub_u16 vsub_s16
+
+uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
+#define vsub_u32 vsub_s32
+
+
+uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b); // VSUB.I64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b)
+{
+    int64x1_t res64;
+    res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
+    return res64;
+}
+
+
+int8x16_t   vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
 #define vsubq_s8 _mm_sub_epi8
 
-int16x8_t   vsubq_s16(int16x8_t a, int16x8_t b);         // VSUB.I16 q0,q0,q0
+int16x8_t   vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
 #define vsubq_s16 _mm_sub_epi16
 
-int32x4_t   vsubq_s32(int32x4_t a, int32x4_t b);         // VSUB.I32 q0,q0,q0
+int32x4_t   vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
 #define vsubq_s32 _mm_sub_epi32
 
-int64x2_t   vsubq_s64(int64x2_t a, int64x2_t b);         // VSUB.I64 q0,q0,q0
+int64x2_t   vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
 #define vsubq_s64 _mm_sub_epi64
 
-float32x4_t vsubq_f32(float32x4_t a, float32x4_t b);         // VSUB.F32 q0,q0,q0
+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
 #define vsubq_f32 _mm_sub_ps
 
-uint8x16_t   vsubq_u8(uint8x16_t a, uint8x16_t b);         // VSUB.I8 q0,q0,q0
+uint8x16_t   vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
 #define vsubq_u8 _mm_sub_epi8
 
-uint16x8_t   vsubq_u16(uint16x8_t a, uint16x8_t b);         // VSUB.I16 q0,q0,q0
+uint16x8_t   vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
 #define vsubq_u16 _mm_sub_epi16
 
-uint32x4_t   vsubq_u32(uint32x4_t a, uint32x4_t b);         // VSUB.I32 q0,q0,q0
+uint32x4_t   vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
 #define vsubq_u32 _mm_sub_epi32
 
-uint64x2_t   vsubq_u64(uint64x2_t a, uint64x2_t b);         // VSUB.I64 q0,q0,q0
+uint64x2_t   vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
 #define vsubq_u64 _mm_sub_epi64
 
 //***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
 //***********************************************************************************
 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a16, b16);
+}
+
+int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a32, b32);
+}
+
+int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
+_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
+{
+    //may be not optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi64 (a64, b64);
+}
+
+uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a16, b16);
+}
+
+uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a32, b32);
+}
+
+uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
+_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
+{
+    //may be not optimal
+    __m128i a64, b64;
+    a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi64 (a64, b64);
+}
 
 //***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
 //*****************************************************************************************************
+int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
+_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a, b16);
+}
+
+int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
+_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
+{
+    __m128i b32;
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a, b32);
+}
+
+int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
+_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_sub_epi64 (a, b64);
+}
+
+uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
+_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
+{
+    __m128i b16;
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi16 (a, b16);
+}
+
+uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
+_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
+{
+    __m128i b32;
+    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
+    return _mm_sub_epi32 (a, b32);
+}
+
+uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
+_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
+{
+    __m128i b64;
+    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
+    return _mm_sub_epi64 (a, b64);
+}
 
 //************************Vector saturating subtract *********************************
 //*************************************************************************************
+int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
+}
 
-int8x16_t   vqsubq_s8(int8x16_t a, int8x16_t b);         // VQSUB.S8 q0,q0,q0
+
+int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b); // VQSUB.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution
+{
+    uint64x1_t res;
+    uint64_t a64,b64;
+    a64 = a.m64_u64[0];
+    b64 = b.m64_u64[0];
+    res.m64_u64[0] = a64 - b64;
+
+    a64 =  (a64 >> 63) + (~_SIGNBIT64);
+    if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
+        res.m64_u64[0] = a64;
+    }
+    return res;
+}
+
+uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b); // VQSUB.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint64x1_t res;
+    uint64_t a64, b64;
+    a64 = _Ui64(a);
+    b64 = _Ui64(b);
+    if (a64 > b64) {
+        res.m64_u64[0] = a64 - b64;
+    } else {
+        res.m64_u64[0] = 0;
+    }
+    return res;
+}
+
+int8x16_t   vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
 #define vqsubq_s8 _mm_subs_epi8
 
-int16x8_t   vqsubq_s16(int16x8_t a, int16x8_t b);         // VQSUB.S16 q0,q0,q0
+int16x8_t   vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
 #define vqsubq_s16 _mm_subs_epi16
 
-int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b);         // VQSUB.S32 q0,q0,q0
+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
 _NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
-{         //no corresponding x86 SIMD soulution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has opposite sign to a
+{
+    //no corresponding x86 SIMD soulution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has opposite sign to a
     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
     c7fffffff = _mm_set1_epi32(0x7fffffff);
     res = _mm_sub_epi32(a, b);
@@ -2575,14 +4622,14 @@
     res_xor_a = _mm_xor_si128(res, a);
     b_xor_a = _mm_xor_si128(b, a);
     res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
-    res_xor_a = _mm_srai_epi32(res_xor_a,31);         //propagate the sigh bit, all ffff if <0 all ones otherwise
+    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise
     res_sat = _mm_and_si128(res_xor_a, res_sat);
     res = _mm_andnot_si128(res_xor_a, res);
     return _mm_or_si128(res, res_sat);
 }
 
-int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b);         // VQSUB.S64 q0,q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)         //no optimal SIMD soulution
+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution
 {
     _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
     _NEON2SSE_ALIGN_16 uint64_t res[2];
@@ -2599,23 +4646,23 @@
     return _mm_load_si128((__m128i*)res);
 }
 
-uint8x16_t   vqsubq_u8(uint8x16_t a, uint8x16_t b);         // VQSUB.U8 q0,q0,q0
+uint8x16_t   vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
 #define vqsubq_u8 _mm_subs_epu8
 
-uint16x8_t   vqsubq_u16(uint16x8_t a, uint16x8_t b);         // VQSUB.s16 q0,q0,q0
+uint16x8_t   vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
 #define vqsubq_u16 _mm_subs_epu16
 
-uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b);         // VQSUB.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b)         // VQSUB.U32 q0,q0,q0
+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
 {
     __m128i min, mask, sub;
-    min = _MM_MIN_EPU32(a, b);         //SSE4.1
+    min = _MM_MIN_EPU32(a, b); //SSE4.1
     mask = _mm_cmpeq_epi32 (min,  b);
     sub = _mm_sub_epi32 (a, b);
     return _mm_and_si128 ( sub, mask);
 }
 
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL);         // VQSUB.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
 #ifdef USE_SSE4
     _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
     {
@@ -2624,8 +4671,8 @@
         sub  = _mm_sub_epi64 (a, b);
         suba = _mm_sub_epi64 (a, c80000000);
         subb = _mm_sub_epi64 (b, c80000000);
-        cmp = _mm_cmpgt_epi64 ( suba, subb);         //no unsigned comparison, need to go to signed, SSE4.2!!!
-        return _mm_and_si128 (sub, cmp);         //saturation
+        cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
+        return _mm_and_si128 (sub, cmp); //saturation
     }
 #else
     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
@@ -2641,10 +4688,62 @@
 
 //**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1  ******************************************************
 //****************************************************************
+int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
+{
+    //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit,
+    int8x8_t res64;
+    __m128i r16;
+    int8x8_t r;
+    r = vsub_s8 (a, b);
+    r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
+    r16 = _mm_srai_epi16 (r16, 1); //SSE2
+    r16 =  _mm_packs_epi16 (r16,r16); //use low 64 bits
+    return64(r16);
+}
 
-int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b);         // VHSUB.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b)         // VHSUB.S8 q0,q0,q0
-{         // //need to deal with the possibility of internal overflow
+int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b); // VHSUB.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+
+int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b); // VHSUB.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b); // VHSUB.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
+}
+
+uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b); // VHSUB.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
+}
+
+uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b); // VHSUB.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
+}
+
+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
+{
+    // //need to deal with the possibility of internal overflow
     __m128i c128, au,bu;
     c128 = _mm_set1_epi8 (128);
     au = _mm_add_epi8( a, c128);
@@ -2652,9 +4751,10 @@
     return vhsubq_u8(au,bu);
 }
 
-int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b);         // VHSUB.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b)         // VHSUB.S16 q0,q0,q0
-{         //need to deal with the possibility of internal overflow
+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
+{
+    //need to deal with the possibility of internal overflow
     __m128i c8000, au,bu;
     c8000 = _mm_set1_epi16(0x8000);
     au = _mm_add_epi16( a, c8000);
@@ -2662,9 +4762,10 @@
     return vhsubq_u16(au,bu);
 }
 
-int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b);         // VHSUB.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b)         // VHSUB.S32 q0,q0,q0
-{//need to deal with the possibility of internal overflow
+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
+{
+    //need to deal with the possibility of internal overflow
     __m128i a2, b2,r, b_1;
     a2 = _mm_srai_epi32 (a,1);
     b2 = _mm_srai_epi32 (b,1);
@@ -2675,25 +4776,26 @@
     return _mm_sub_epi32(r,b_1);
 }
 
-uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b);         // VHSUB.U8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b)         // VHSUB.U8 q0,q0,q0
+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
 {
     __m128i avg;
     avg = _mm_avg_epu8 (a, b);
     return _mm_sub_epi8(a, avg);
 }
 
-uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b);         // VHSUB.s16 q0,q0,q0
-_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b)         // VHSUB.s16 q0,q0,q0
+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
 {
     __m128i avg;
     avg = _mm_avg_epu16 (a, b);
     return _mm_sub_epi16(a, avg);
 }
 
-uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b);         // VHSUB.U32 q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b)         // VHSUB.U32 q0,q0,q0
-{//need to deal with the possibility of internal overflow
+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
+{
+    //need to deal with the possibility of internal overflow
     __m128i a2, b2,r, b_1;
     a2 = _mm_srli_epi32 (a,1);
     b2 = _mm_srli_epi32 (b,1);
@@ -2706,44 +4808,260 @@
 
 //******* Vector subtract high half (truncated) ** ************
 //************************************************************
+int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sum, sum8;
+    sum = _mm_sub_epi16 (a, b);
+    sum8 = _mm_srai_epi16 (sum, 8);
+    sum8 = _mm_packs_epi16(sum8,sum8);
+    return64(sum8);
+}
+
+int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
+{
+    int16x4_t res64;
+    __m128i sum, sum16;
+    sum = _mm_sub_epi32 (a, b);
+    sum16 = _mm_srai_epi32 (sum, 16);
+    sum16 = _mm_packs_epi32(sum16,sum16);
+    return64(sum16);
+}
+
+int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
+{
+    int32x2_t res64;
+    __m128i sub;
+    sub = _mm_sub_epi64 (a, b);
+    sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
+    return64(sub);
+}
+
+uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sum, sum8;
+    sum = _mm_sub_epi16 (a, b);
+    sum8 = _mm_srli_epi16 (sum, 8);
+    sum8 =  _mm_packus_epi16(sum8,sum8);
+    return64(sum8);
+}
+
+uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
+{
+    uint16x4_t res64;
+    __m128i sum, sum16;
+    sum = _mm_sub_epi32 (a, b);
+    sum16 = _mm_srli_epi32 (sum, 16);
+    sum16 =  _MM_PACKUS1_EPI32(sum16);
+    return64(sum16);
+}
+
+uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
+#define vsubhn_u64 vsubhn_s64
 
 //************ Vector rounding subtract high half *********************
 //*********************************************************************
+int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
+{
+    int8x8_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
+    sub = _mm_srai_epi16 (sub, 8); //get high half
+    sub = _mm_add_epi16 (sub, mask1); //actual rounding
+    sub =  _mm_packs_epi16 (sub, sub);
+    return64(sub);
+}
+
+int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
+{
+    //SIMD may be not optimal, serial may be faster
+    int16x4_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
+    sub = _mm_srai_epi32 (sub, 16); //get high half
+    sub = _mm_add_epi32 (sub, mask1); //actual rounding
+    sub = _mm_packs_epi32 (sub, sub);
+    return64(sub);
+}
+
+int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
+_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
+{
+    //SIMD may be not optimal, serial may be faster
+    int32x2_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi64 (a, b);
+    mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
+    mask1 = _mm_srli_epi64(mask1,32); //get  31-th bit 1 or zero
+    sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
+    sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
+    return64(sub);
+}
+
+uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
+_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
+{
+    uint8x8_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi16 (a, b);
+    mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
+    mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
+    sub = _mm_srai_epi16 (sub, 8); //get high half
+    sub = _mm_add_epi16 (sub, mask1); //actual rounding
+    sub = _mm_packus_epi16 (sub, sub);
+    return64(sub);
+}
+
+uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
+_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
+{
+    //SIMD may be not optimal, serial may be faster
+    uint16x4_t res64;
+    __m128i sub, mask1;
+    sub = _mm_sub_epi32 (a, b);
+    mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
+    mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
+    sub = _mm_srai_epi32 (sub, 16); //get high half
+    sub = _mm_add_epi32 (sub, mask1); //actual rounding
+    sub =  _MM_PACKUS1_EPI32 (sub);
+    return64(sub);
+}
+
+uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
+#define vrsubhn_u64 vrsubhn_s64
 
 //*********** Vector saturating doubling multiply subtract long ********************
 //************************************************************************************
+int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
+{
+    //not optimal SIMD soulution, serial may be faster
+    __m128i res32, mask;
+    int32x4_t res;
+    _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    res = vmull_s16(b,  c);
+    res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
+    mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
+    res32 = _mm_xor_si128 (res32,  mask); //res32 saturated for 0x80000000
+    return vqsubq_s32(a, res32); //saturation
+}
+
+int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res64, mask;
+    int64x2_t res;
+    _NEON2SSE_ALIGN_16 uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
+    res = vmull_s32(b,  c);
+    res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
+    mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
+    res64 = _mm_xor_si128 (res64,  mask); //res32 saturated for 0x80000000
+    return vqsubq_s64(a, res64); //saturation
+}
 
 //******************  COMPARISON ***************************************
 //******************* Vector compare equal *************************************
 //****************************************************************************
+uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
+}
 
-uint8x16_t   vceqq_s8(int8x16_t a, int8x16_t b);         // VCEQ.I8 q0, q0, q0
+
+uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
+    return64f(res);
+}
+
+uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
+}
+
+
+uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+uint8x8_t   vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
+#define vceq_p8 vceq_u8
+
+
+uint8x16_t   vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
 #define vceqq_s8 _mm_cmpeq_epi8
 
-uint16x8_t   vceqq_s16(int16x8_t a, int16x8_t b);         // VCEQ.I16 q0, q0, q0
+uint16x8_t   vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
 #define vceqq_s16 _mm_cmpeq_epi16
 
-uint32x4_t   vceqq_s32(int32x4_t a, int32x4_t b);         // VCEQ.I32 q0, q0, q0
+uint32x4_t   vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
 #define vceqq_s32 _mm_cmpeq_epi32
 
-uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b);         // VCEQ.F32 q0, q0, q0
+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
 _NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
 {
     __m128 res;
     res = _mm_cmpeq_ps(a,b);
-    return *(__m128i*)&res;
+    return _M128i(res);
 }
 
-uint8x16_t   vceqq_u8(uint8x16_t a, uint8x16_t b);         // VCEQ.I8 q0, q0, q0
+uint8x16_t   vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
 #define vceqq_u8 _mm_cmpeq_epi8
 
-uint16x8_t   vceqq_u16(uint16x8_t a, uint16x8_t b);         // VCEQ.I16 q0, q0, q0
+uint16x8_t   vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
 #define vceqq_u16 _mm_cmpeq_epi16
 
-uint32x4_t   vceqq_u32(uint32x4_t a, uint32x4_t b);         // VCEQ.I32 q0, q0, q0
+uint32x4_t   vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
 #define vceqq_u32 _mm_cmpeq_epi32
 
-uint8x16_t   vceqq_p8(poly8x16_t a, poly8x16_t b);         // VCEQ.I8 q0, q0, q0
+uint8x16_t   vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
 #define vceqq_p8 _mm_cmpeq_epi8
 
 //******************Vector compare greater-than or equal*************************
@@ -2751,8 +5069,67 @@
 //in IA SIMD no greater-than-or-equal comparison for integers,
 // there is greater-than available only, so we need the following tricks
 
-uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b)         // VCGE.S8 q0, q0, q0
+uint8x8_t vcge_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint16x4_t vcge_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcge_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries
+    return64f(res);
+}
+
+uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b)
+{
+    //serial solution looks faster
+    uint32x2_t res64;
+    return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
+}
+
+
+
+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
 {
     __m128i m1, m2;
     m1 = _mm_cmpgt_epi8 ( a, b);
@@ -2760,8 +5137,8 @@
     return _mm_or_si128  ( m1, m2);
 }
 
-uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b)         // VCGE.S16 q0, q0, q0
+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
 {
     __m128i m1, m2;
     m1 = _mm_cmpgt_epi16 ( a, b);
@@ -2769,8 +5146,8 @@
     return _mm_or_si128   ( m1,m2);
 }
 
-uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b)         // VCGE.S32 q0, q0, q0
+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
 {
     __m128i m1, m2;
     m1 = _mm_cmpgt_epi32 (a, b);
@@ -2778,21 +5155,22 @@
     return _mm_or_si128   (m1, m2);
 }
 
-uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
 _NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
 {
     __m128 res;
-    res = _mm_cmpge_ps(a,b);         //use only 2 first entries
+    res = _mm_cmpge_ps(a,b); //use only 2 first entries
     return *(__m128i*)&res;
 }
 
-uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b)         // VCGE.U8 q0, q0, q0
-{         //no unsigned chars comparison, only signed available,so need the trick
+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
+{
+    //no unsigned chars comparison, only signed available,so need the trick
     #ifdef USE_SSE4
         __m128i cmp;
         cmp = _mm_max_epu8(a, b);
-        return _mm_cmpeq_epi8(cmp, a);         //a>=b
+        return _mm_cmpeq_epi8(cmp, a); //a>=b
     #else
         __m128i c128, as, bs, m1, m2;
         c128 = _mm_set1_epi8 (128);
@@ -2804,13 +5182,14 @@
     #endif
 }
 
-uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.s16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b)         // VCGE.s16 q0, q0, q0
-{         //no unsigned shorts comparison, only signed available,so need the trick
+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
+{
+    //no unsigned shorts comparison, only signed available,so need the trick
     #ifdef USE_SSE4
         __m128i cmp;
         cmp = _mm_max_epu16(a, b);
-        return _mm_cmpeq_epi16(cmp, a);         //a>=b
+        return _mm_cmpeq_epi16(cmp, a); //a>=b
     #else
         __m128i c8000, as, bs, m1, m2;
         c8000 = _mm_set1_epi16 (0x8000);
@@ -2822,13 +5201,14 @@
     #endif
 }
 
-uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b)         // VCGE.U32 q0, q0, q0
-{         //no unsigned ints comparison, only signed available,so need the trick
+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
+{
+    //no unsigned ints comparison, only signed available,so need the trick
     #ifdef USE_SSE4
         __m128i cmp;
         cmp = _mm_max_epu32(a, b);
-        return _mm_cmpeq_epi32(cmp, a);         //a>=b
+        return _mm_cmpeq_epi32(cmp, a); //a>=b
     #else
         //serial solution may be faster
         __m128i c80000000, as, bs, m1, m2;
@@ -2845,34 +5225,78 @@
 //***************************************************************************************
 //in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks
 
-uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b)         // VCGE.S8 q0, q0, q0
+uint8x8_t vcle_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a,  int8x8_t b)
 {
-    __m128i c1, res;
-    c1 = _mm_cmpeq_epi8 (a,a);         //all ones 0xff....
-    res = _mm_cmpgt_epi8 ( a,  b);
-    return _mm_andnot_si128 (res, c1);         //inverse the cmpgt result, get less-than-or-equal
+    int8x8_t res64;
+    return64(vcleq_s8(_pM128i(a), _pM128i(b)));
 }
 
-uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b)         // VCGE.S16 q0, q0, q0
+
+uint16x4_t vcle_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vcleq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcle_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vcleq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
+_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmple_ps(_pM128(a),_pM128(b));
+    return64f(res);
+}
+
+uint8x8_t vcle_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
+#define vcle_u8(a,b) vcge_u8(b,a)
+
+
+uint16x4_t vcle_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
+#define vcle_u16(a,b) vcge_u16(b,a)
+
+
+uint32x2_t vcle_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
+#define vcle_u32(a,b) vcge_u32(b,a)
+
+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
 {
     __m128i c1, res;
-    c1 = _mm_cmpeq_epi16 (a,a);         //all ones 0xff....
+    c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
+    res = _mm_cmpgt_epi8 ( a,  b);
+    return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal
+}
+
+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
+{
+    __m128i c1, res;
+    c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
     res = _mm_cmpgt_epi16 ( a,  b);
     return _mm_andnot_si128 (res, c1);
 }
 
-uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b)         // VCGE.S32 q0, q0, q0
+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
 {
     __m128i c1, res;
-    c1 = _mm_cmpeq_epi32 (a,a);         //all ones 0xff....
+    c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
     res = _mm_cmpgt_epi32 ( a,  b);
     return _mm_andnot_si128 (res, c1);
 }
 
-uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
 _NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
 {
     __m128 res;
@@ -2880,67 +5304,129 @@
     return *(__m128i*)&res;
 }
 
-uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
 #ifdef USE_SSE4
-    _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b)         // VCGE.U8 q0, q0, q0
-    {         //no unsigned chars comparison in SSE, only signed available,so need the trick
-
+    _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
+    {
+        //no unsigned chars comparison in SSE, only signed available,so need the trick
         __m128i cmp;
         cmp = _mm_min_epu8(a, b);
-        return _mm_cmpeq_epi8(cmp, a);         //a<=b
+        return _mm_cmpeq_epi8(cmp, a); //a<=b
     }
 #else
     #define vcleq_u8(a,b) vcgeq_u8(b,a)
 #endif
 
-uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.s16 q0, q0, q0
+
+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
 #ifdef USE_SSE4
-    _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b)         // VCGE.s16 q0, q0, q0
-    {         //no unsigned shorts comparison in SSE, only signed available,so need the trick
+    _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
+    {
+        //no unsigned shorts comparison in SSE, only signed available,so need the trick
         __m128i cmp;
         cmp = _mm_min_epu16(a, b);
-        return _mm_cmpeq_epi16(cmp, a);         //a<=b
+        return _mm_cmpeq_epi16(cmp, a); //a<=b
     }
 #else
     #define vcleq_u16(a,b) vcgeq_u16(b,a)
 #endif
 
-uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
+
+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
 #ifdef USE_SSE4
-    _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b)         // VCGE.U32 q0, q0, q0
-    {         //no unsigned chars comparison in SSE, only signed available,so need the trick
+    _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
+    {
+        //no unsigned chars comparison in SSE, only signed available,so need the trick
         __m128i cmp;
         cmp = _mm_min_epu32(a, b);
-        return _mm_cmpeq_epi32(cmp, a);         //a<=b
+        return _mm_cmpeq_epi32(cmp, a); //a<=b
     }
 #else
 //solution may be not optimal compared with the serial one
     #define vcleq_u32(a,b) vcgeq_u32(b,a)
 #endif
 
+
 //****** Vector compare greater-than ******************************************
 //**************************************************************************
+uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+_NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
+}
 
-uint8x16_t   vcgtq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
+
+uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+_NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+_NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128 res;
+    res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries
+    return64f(res);
+}
+
+uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b); // VCGT.U8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b); // VCGT.s16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b); // VCGT.U32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint8x16_t   vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
 #define vcgtq_s8 _mm_cmpgt_epi8
 
-uint16x8_t   vcgtq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
+uint16x8_t   vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
 #define vcgtq_s16 _mm_cmpgt_epi16
 
-uint32x4_t   vcgtq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
+uint32x4_t   vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
 #define vcgtq_s32 _mm_cmpgt_epi32
 
-uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
 _NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
 {
     __m128 res;
-    res = _mm_cmpgt_ps(a,b);         //use only 2 first entries
+    res = _mm_cmpgt_ps(a,b); //use only 2 first entries
     return *(__m128i*)&res;
 }
 
-uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b)         // VCGT.U8 q0, q0, q0
-{         //no unsigned chars comparison, only signed available,so need the trick
+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
+{
+    //no unsigned chars comparison, only signed available,so need the trick
     __m128i c128, as, bs;
     c128 = _mm_set1_epi8 (128);
     as = _mm_sub_epi8(a,c128);
@@ -2948,9 +5434,10 @@
     return _mm_cmpgt_epi8 (as, bs);
 }
 
-uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.s16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b)         // VCGT.s16 q0, q0, q0
-{         //no unsigned short comparison, only signed available,so need the trick
+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
+{
+    //no unsigned short comparison, only signed available,so need the trick
     __m128i c8000, as, bs;
     c8000 = _mm_set1_epi16 (0x8000);
     as = _mm_sub_epi16(a,c8000);
@@ -2958,9 +5445,10 @@
     return _mm_cmpgt_epi16 ( as, bs);
 }
 
-uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b)         // VCGT.U32 q0, q0, q0
-{         //no unsigned int comparison, only signed available,so need the trick
+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
+{
+    //no unsigned int comparison, only signed available,so need the trick
     __m128i c80000000, as, bs;
     c80000000 = _mm_set1_epi32 (0x80000000);
     as = _mm_sub_epi32(a,c80000000);
@@ -2970,33 +5458,68 @@
 
 //********************* Vector compare less-than **************************
 //*************************************************************************
+uint8x8_t   vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
+#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
 
-uint8x16_t   vcltq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
-#define vcltq_s8(a,b) vcgtq_s8(b, a)         //swap the arguments!!
 
-uint16x8_t   vcltq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
-#define vcltq_s16(a,b) vcgtq_s16(b, a)         //swap the arguments!!
+uint16x4_t   vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
+#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
 
-uint32x4_t   vcltq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
-#define vcltq_s32(a,b) vcgtq_s32(b, a)         //swap the arguments!!
 
-uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
-#define vcltq_f32(a,b) vcgtq_f32(b, a)         //swap the arguments!!
+uint32x2_t   vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
+#define vclt_s32(a,b)  vcgt_s32(b,a) //swap the arguments!!
 
-uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
-#define vcltq_u8(a,b) vcgtq_u8(b, a)         //swap the arguments!!
 
-uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.s16 q0, q0, q0
-#define vcltq_u16(a,b) vcgtq_u16(b, a)         //swap the arguments!!
+uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
+#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
 
-uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
-#define vcltq_u32(a,b) vcgtq_u32(b, a)         //swap the arguments!!
+uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
+#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
+
+uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
+#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
+
+uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
+#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
+
+uint8x16_t   vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
+#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
+
+uint16x8_t   vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
+#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
+
+uint32x4_t   vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
+#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
+
+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
+#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
+
+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
+#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
+
+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
+#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
+
+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
+#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
 
 //*****************Vector compare absolute greater-than or equal ************
 //***************************************************************************
+uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmpge_ps ( a0, b0);
+    return64f(a0);
+}
 
-uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b)         // VACGE.F32 q0, q0, q0
+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
 {
     __m128i c7fffffff;
     __m128 a0, b0;
@@ -3009,9 +5532,21 @@
 
 //********Vector compare absolute less-than or equal ******************
 //********************************************************************
+uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmple_ps (a0, b0);
+    return64f(a0);
+}
 
-uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b)         // VACGE.F32 q0, q0, q0
+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
 {
     __m128i c7fffffff;
     __m128 a0, b0;
@@ -3024,9 +5559,21 @@
 
 //********  Vector compare absolute greater-than    ******************
 //******************************************************************
+uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmpgt_ps (a0, b0);
+    return64f(a0);
+}
 
-uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b)         // VACGT.F32 q0, q0, q0
+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
 {
     __m128i c7fffffff;
     __m128 a0, b0;
@@ -3039,9 +5586,21 @@
 
 //***************Vector compare absolute less-than  ***********************
 //*************************************************************************
+uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
+    a0 = _mm_cmplt_ps (a0, b0);
+    return64f(a0);
+}
 
-uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b)         // VACGT.F32 q0, q0, q0
+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
 {
     __m128i c7fffffff;
     __m128 a0, b0;
@@ -3050,7 +5609,6 @@
     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
     a0 = _mm_cmplt_ps (a0, b0);
     return (*(__m128i*)&a0);
-
 }
 
 //*************************Vector test bits************************************
@@ -3060,90 +5618,169 @@
 corresponding element in the destination vector is set to all ones. Otherwise, it is set to
 all zeros. */
 
-uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b);         // VTST.8 q0, q0, q0
-_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b)         // VTST.8 q0, q0, q0
+uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b); // VTST.8 d0, d0, d0
+_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vtstq_s8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b); // VTST.16 d0, d0, d0
+_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vtstq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b); // VTST.32 d0, d0, d0
+_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vtstq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+uint8x8_t vtst_u8(uint8x8_t a,  uint8x8_t b); // VTST.8 d0, d0, d0
+#define vtst_u8 vtst_s8
+
+uint16x4_t vtst_u16(uint16x4_t a,  uint16x4_t b); // VTST.16 d0, d0, d0
+#define vtst_u16 vtst_s16
+
+uint32x2_t vtst_u32(uint32x2_t a,  uint32x2_t b); // VTST.32 d0, d0, d0
+#define vtst_u32 vtst_s32
+
+
+uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
+#define vtst_p8 vtst_u8
+
+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
 {
     __m128i zero, one, res;
     zero = _mm_setzero_si128 ();
-    one = _mm_cmpeq_epi8(zero,zero);         //0xfff..ffff
+    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
     res = _mm_and_si128 (a, b);
     res =  _mm_cmpeq_epi8 (res, zero);
-    return _mm_xor_si128(res, one);         //invert result
+    return _mm_xor_si128(res, one); //invert result
 }
 
-uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b);         // VTST.16 q0, q0, q0
-_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b)         // VTST.16 q0, q0, q0
+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
 {
     __m128i zero, one, res;
     zero = _mm_setzero_si128 ();
-    one = _mm_cmpeq_epi8(zero,zero);         //0xfff..ffff
+    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
     res = _mm_and_si128 (a, b);
     res =  _mm_cmpeq_epi16 (res, zero);
-    return _mm_xor_si128(res, one);         //invert result
+    return _mm_xor_si128(res, one); //invert result
 }
 
-uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b);         // VTST.32 q0, q0, q0
-_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b)         // VTST.32 q0, q0, q0
+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
 {
     __m128i zero, one, res;
     zero = _mm_setzero_si128 ();
-    one = _mm_cmpeq_epi8(zero,zero);         //0xfff..ffff
+    one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
     res = _mm_and_si128 (a, b);
     res =  _mm_cmpeq_epi32 (res, zero);
-    return _mm_xor_si128(res, one);         //invert result
+    return _mm_xor_si128(res, one); //invert result
 }
 
-uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b);         // VTST.8 q0, q0, q0
+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
 #define vtstq_u8 vtstq_s8
 
-uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b);         // VTST.16 q0, q0, q0
+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
 #define vtstq_u16 vtstq_s16
 
-uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b);         // VTST.32 q0, q0, q0
+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
 #define vtstq_u32 vtstq_s32
 
-uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b);         // VTST.8 q0, q0, q0
+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
 #define vtstq_p8 vtstq_u8
 
 //****************** Absolute difference ********************
 //*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
 //************************************************************
-#if defined(USE_SSSE3)
+int8x8_t vabd_s8(int8x8_t a,  int8x8_t b); // VABD.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vabdq_s8(_pM128i(a), _pM128i(b)));
+}
 
-#endif
+int16x4_t vabd_s16(int16x4_t a,  int16x4_t b); // VABD.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vabdq_s16(_pM128i(a), _pM128i(b)));
+}
 
-#if defined(USE_SSSE3)
-int8x16_t vabdq_s8(int8x16_t a, int8x16_t b);         // VABD.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b)         // VABD.S8 q0,q0,q0
+int32x2_t vabd_s32(int32x2_t a,  int32x2_t b); // VABD.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vabdq_s32(_pM128i(a), _pM128i(b)));
+}
+
+uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b); // VABD.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(vabdq_u8(_pM128i(a), _pM128i(b)));
+}
+
+uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b); // VABD.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(vabdq_u16(_pM128i(a), _pM128i(b)));
+}
+
+uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b); // VABD.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b)
+{
+    uint32x2_t res64;
+    return64(vabdq_u32(_pM128i(a), _pM128i(b)));
+}
+
+float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = vabdq_f32(_pM128(a), _pM128(b));
+    _M64f(res64, res);
+    return res64;
+}
+
+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
 {
     __m128i res;
     res = _mm_sub_epi8 (a, b);
     return _mm_abs_epi8 (res);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int16x8_t vabdq_s16(int16x8_t a, int16x8_t b);         // VABD.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b)         // VABD.S16 q0,q0,q0
+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
 {
     __m128i res;
     res = _mm_sub_epi16 (a,b);
     return _mm_abs_epi16 (res);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int32x4_t vabdq_s32(int32x4_t a, int32x4_t b);         // VABD.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b)         // VABD.S32 q0,q0,q0
+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
 {
     __m128i res;
     res = _mm_sub_epi32 (a,b);
     return _mm_abs_epi32 (res);
 }
-#endif
 
-uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b);         // VABD.U8 q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b)         //no abs for unsigned
+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
 {
     __m128i cmp, difab, difba;
     cmp = vcgtq_u8(a,b);
@@ -3154,7 +5791,7 @@
     return _mm_or_si128(difab, difba);
 }
 
-uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b);         // VABD.s16 q0,q0,q0
+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
 _NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
 {
     __m128i cmp, difab, difba;
@@ -3166,7 +5803,7 @@
     return _mm_or_si128(difab, difba);
 }
 
-uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b);         // VABD.U32 q0,q0,q0
+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
 _NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
 {
     __m128i cmp, difab, difba;
@@ -3178,8 +5815,8 @@
     return _mm_or_si128(difab, difba);
 }
 
-float32x4_t vabdq_f32(float32x4_t a, float32x4_t b);         // VABD.F32 q0,q0,q0
-_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b)         // VABD.F32 q0,q0,q0
+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
 {
     __m128i c1;
     __m128 res;
@@ -3190,41 +5827,126 @@
 
 //************  Absolute difference - long **************************
 //********************************************************************
+int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
+{
+    __m128i a16, b16;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    return vabdq_s16(a16, b16);
+
+}
+
+int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
+{
+    __m128i a32, b32;
+    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
+    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
+    return vabdq_s32(a32, b32);
+}
+
+int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //no optimal SIMD solution, serial looks faster
+    _NEON2SSE_ALIGN_16 int64_t res[2];
+    if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
+    else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
+    if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
+    else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
+    return _mm_load_si128((__m128i*)res);
+}
+
+uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
+{
+    __m128i res;
+    res = vsubl_u8(a,b);
+    return _mm_abs_epi16(res);
+}
+
+uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
+{
+    __m128i res;
+    res = vsubl_u16(a,b);
+    return _mm_abs_epi32(res);
+}
+
+uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    _NEON2SSE_ALIGN_16 uint64_t res[2];
+    if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
+    else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
+    if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
+    else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
+    return _mm_load_si128((__m128i*)res);
+}
 
 //**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
 //*********************************************************************************************
+int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c)
+{
+    int8x8_t res64;
+    return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
+}
 
-#if defined(USE_SSSE3)
-int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VABA.S8 q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c)         // VABA.S8 q0,q0,q0
+int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
+{
+    int16x4_t res64;
+    return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c)
+{
+    int32x2_t res64;
+    return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
+#define vaba_u8 vaba_s8
+
+
+uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
+#define vaba_u16 vaba_s16
+
+uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c)
+{
+    uint32x2_t res64;
+    return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
+}
+
+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
 {
     int8x16_t sub;
     sub = vabdq_s8(b, c);
     return vaddq_s8( a, sub);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VABA.S16 q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c)         // VABA.S16 q0,q0,q0
+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
 {
     int16x8_t sub;
     sub = vabdq_s16(b, c);
     return vaddq_s16( a, sub);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VABA.S32 q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c)         // VABA.S32 q0,q0,q0
+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
 {
     int32x4_t sub;
     sub = vabdq_s32(b, c);
     return vaddq_s32( a, sub);
 }
-#endif
 
-uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VABA.U8 q0,q0,q0
+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
 _NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
 {
     uint8x16_t sub;
@@ -3232,7 +5954,7 @@
     return vaddq_u8( a, sub);
 }
 
-uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VABA.s16 q0,q0,q0
+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
 _NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
 {
     uint16x8_t sub;
@@ -3240,7 +5962,7 @@
     return vaddq_u16( a, sub);
 }
 
-uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VABA.U32 q0,q0,q0
+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
 _NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
 {
     uint32x4_t sub;
@@ -3250,84 +5972,411 @@
 
 //************** Absolute difference and accumulate - long ********************************
 //*************************************************************************************
+int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
+_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
+{
+    __m128i b16, c16, res;
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
+    c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
+    res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
+    return _mm_add_epi16 (a, res);
+}
+
+int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
+_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
+{
+    __m128i b32, c32, res;
+    b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
+    c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
+    res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
+    return _mm_add_epi32 (a, res);
+}
+
+int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res;
+    res = vabdl_s32(b,c);
+    return _mm_add_epi64(a, res);
+}
+
+uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
+_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
+{
+    __m128i b16, c16, res;
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
+    c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
+    res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
+    return _mm_add_epi16 (a, res);
+}
+
+uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
+_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
+{
+    __m128i b32, c32, res;
+    b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
+    c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
+    res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
+    return _mm_add_epi32 (a, res);
+}
+
+uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    __m128i res;
+    res = vabdl_u32(b,c);
+    return _mm_add_epi64(a, res);
+}
 
 //***********************************************************************************
 //****************  Maximum and minimum operations **********************************
 //***********************************************************************************
 //************* Maximum:  vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]    *******
 //***********************************************************************************
+int8x8_t   vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
 
-int8x16_t   vmaxq_s8(int8x16_t a, int8x16_t b);         // VMAX.S8 q0,q0,q0
-#define vmaxq_s8 _MM_MAX_EPI8         //SSE4.1
+int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
+}
 
-int16x8_t   vmaxq_s16(int16x8_t a, int16x8_t b);         // VMAX.S16 q0,q0,q0
+int32x2_t   vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    __m128i res;
+    res =  _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
+
+uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i res;
+    res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
+    return64(res);
+}
+
+float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
+{
+    //serial solution looks faster than  SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
+    res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
+    return res;
+}
+
+int8x16_t   vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
+#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
+
+int16x8_t   vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
 #define vmaxq_s16 _mm_max_epi16
 
-int32x4_t   vmaxq_s32(int32x4_t a, int32x4_t b);         // VMAX.S32 q0,q0,q0
-#define vmaxq_s32 _MM_MAX_EPI32         //SSE4.1
+int32x4_t   vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
+#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
 
-uint8x16_t   vmaxq_u8(uint8x16_t a, uint8x16_t b);         // VMAX.U8 q0,q0,q0
+uint8x16_t   vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
 #define vmaxq_u8 _mm_max_epu8
 
-uint16x8_t   vmaxq_u16(uint16x8_t a, uint16x8_t b);         // VMAX.s16 q0,q0,q0
-#define vmaxq_u16 _MM_MAX_EPU16         //SSE4.1
+uint16x8_t   vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
+#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
 
-uint32x4_t   vmaxq_u32(uint32x4_t a, uint32x4_t b);         // VMAX.U32 q0,q0,q0
-#define vmaxq_u32 _MM_MAX_EPU32         //SSE4.1
+uint32x4_t   vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
+#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
 
-float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b);         // VMAX.F32 q0,q0,q0
+
+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
 #define vmaxq_f32 _mm_max_ps
 
 //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
 //***********************************************************************************************************
+int8x8_t   vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
 
-int8x16_t   vminq_s8(int8x16_t a, int8x16_t b);         // VMIN.S8 q0,q0,q0
-#define vminq_s8 _MM_MIN_EPI8         //SSE4.1
+int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
+}
 
-int16x8_t   vminq_s16(int16x8_t a, int16x8_t b);         // VMIN.S16 q0,q0,q0
+
+int32x2_t   vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
+    return64(res);
+}
+
+uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
+}
+
+
+uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
+{
+    uint16x4_t res64;
+    return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
+}
+
+
+uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b)
+{
+    uint32x2_t res64;
+    __m128i res;
+    res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
+    return64(res);
+}
+
+float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
+{
+    //serial solution looks faster than  SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
+    res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
+    return res;
+}
+
+int8x16_t   vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
+#define vminq_s8 _MM_MIN_EPI8 //SSE4.1
+
+int16x8_t   vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
 #define vminq_s16 _mm_min_epi16
 
-int32x4_t   vminq_s32(int32x4_t a, int32x4_t b);         // VMIN.S32 q0,q0,q0
-#define vminq_s32 _MM_MIN_EPI32         //SSE4.1
+int32x4_t   vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
+#define vminq_s32 _MM_MIN_EPI32 //SSE4.1
 
-uint8x16_t   vminq_u8(uint8x16_t a, uint8x16_t b);         // VMIN.U8 q0,q0,q0
+uint8x16_t   vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
 #define vminq_u8 _mm_min_epu8
 
-uint16x8_t   vminq_u16(uint16x8_t a, uint16x8_t b);         // VMIN.s16 q0,q0,q0
-#define vminq_u16 _MM_MIN_EPU16         //SSE4.1
+uint16x8_t   vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
+#define vminq_u16 _MM_MIN_EPU16 //SSE4.1
 
-uint32x4_t   vminq_u32(uint32x4_t a, uint32x4_t b);         // VMIN.U32 q0,q0,q0
-#define vminq_u32 _MM_MIN_EPU32         //SSE4.1
+uint32x4_t   vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
+#define vminq_u32 _MM_MIN_EPU32 //SSE4.1
 
-float32x4_t vminq_f32(float32x4_t a, float32x4_t b);         // VMIN.F32 q0,q0,q0
+float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
 #define vminq_f32 _mm_min_ps
 
 //*************  Pairwise addition operations. **************************************
 //************************************************************************************
 //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
+int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit and then pack
+    int8x8_t res64;
+    __m128i a16, b16, res;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
+    res = _mm_hadd_epi16 (a16, b16);
+    res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
+    return64(res);
+}
+
+int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    __m128i hadd128;
+    hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
+    hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(hadd128);
+}
+
+
+int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSE_INLINE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    __m128i hadd128;
+    hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
+    hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(hadd128);
+}
+
+
+uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
+{
+    //  no 8 bit hadd in IA32, need to go to 16 bit and then pack
+    uint8x8_t res64;
+//  no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work
+    __m128i mask8, a16, b16, res;
+    mask8 = _mm_set1_epi16(0xff);
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
+    res = _mm_hadd_epi16 (a16, b16);
+    res = _mm_and_si128(res, mask8); //to avoid saturation
+    res = _mm_packus_epi16 (res,res); //use low 64 bits
+    return64(res);
+}
+
+uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
+{
+    // solution may be not optimal, serial execution may be faster
+    // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
+    uint16x4_t res64;
+    __m128i c32767,  cfffe, as, bs, res;
+    c32767 = _mm_set1_epi16 (32767);
+    cfffe = _mm_set1_epi16 (0xfffe);
+    as = _mm_sub_epi16 (_pM128i(a), c32767);
+    bs = _mm_sub_epi16 (_pM128i(b), c32767);
+    res = _mm_hadd_epi16 (as, bs);
+    res = _mm_add_epi16 (res, cfffe);
+    res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(res);
+}
+
+uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
+_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
+{
+    //hadd doesn't work for unsigned values
+    uint32x2_t res64;
+    __m128i ab, ab_sh, res;
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
+    ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
+    res = _mm_add_epi32(ab, ab_sh);
+    res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
+    return64(res);
+}
+
+float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
+{
+    __m128 hadd128;
+    __m64_128 res64;
+    hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
+    hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
+    _M64f(res64, hadd128);
+    return res64;
+}
+
 
 //**************************  Long pairwise add  **********************************
 //*********************************************************************************
 //Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width,
 // and places the final results in the destination vector.
 
-#if defined(USE_SSSE3)
-int16x8_t vpaddlq_s8(int8x16_t a);         // VPADDL.S8 q0,q0
-_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a)         // VPADDL.S8 q0,q0
-{         //no 8 bit hadd in IA32, need to go to 16 bit
+int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
+_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit anyway
+    __m128i a16;
+    int16x4_t res64;
+    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
+    a16 = _mm_hadd_epi16 (a16,  a16); //use low 64 bits
+    return64(a16);
+}
+
+int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
+_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
+{
+    // solution may be not optimal, serial execution may be faster
+    int32x2_t res64;
+    __m128i r32_1;
+    r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
+    r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
+    return64(r32_1);
+}
+
+int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
+{
+    int64x1_t res;
+    res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
+    return res;
+}
+
+uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
+_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
+{
+    //  no 8 bit hadd in IA32, need to go to 16 bit
+//  no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work
+    uint16x4_t res64;
+    __m128i a16;
+    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
+    a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
+    return64(a16);
+}
+
+uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than a SIMD one
+    uint32x2_t res;
+    res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
+    res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
+    return res;
+}
+
+uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
+{
+    uint64x1_t res;
+    res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
+    return res;
+}
+
+int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
+_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit
     __m128i r16_1, r16_2;
-    r16_1 = _MM_CVTEPI8_EPI16 (a);         // SSE 4.1
+    r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
     //swap hi and low part of r to process the remaining data
     r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
     return _mm_hadd_epi16 (r16_1, r16_2);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int32x4_t vpaddlq_s16(int16x8_t a);         // VPADDL.S16 q0,q0
-_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a)         // VPADDL.S16 q0,q0
-{         //no 8 bit hadd in IA32, need to go to 16 bit
+int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
+_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit
     __m128i r32_1, r32_2;
     r32_1 = _MM_CVTEPI16_EPI32(a);
     //swap hi and low part of r to process the remaining data
@@ -3335,10 +6384,9 @@
     r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
     return _mm_hadd_epi32 (r32_1, r32_2);
 }
-#endif
 
-int64x2_t vpaddlq_s32(int32x4_t a);         // VPADDL.S32 q0,q0
-_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)         // VPADDL.S32 q0,q0
+int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
 {
     _NEON2SSE_ALIGN_16 int32_t atmp[4];
     _NEON2SSE_ALIGN_16 int64_t res[2];
@@ -3348,10 +6396,10 @@
     return _mm_load_si128((__m128i*)res);
 }
 
-#if defined(USE_SSSE3)
-uint16x8_t vpaddlq_u8(uint8x16_t a);         // VPADDL.U8 q0,q0
-_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a)         // VPADDL.U8 q0,q0
-{         //no 8 bit hadd in IA32, need to go to 16 bit
+uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
+_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
+{
+    //no 8 bit hadd in IA32, need to go to 16 bit
     __m128i r16_1, r16_2;
     r16_1 = _MM_CVTEPU8_EPI16(a);
     //swap hi and low part of r to process the remaining data
@@ -3359,11 +6407,11 @@
     r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
     return _mm_hadd_epi16 (r16_1, r16_2);
 }
-#endif
 
-uint32x4_t vpaddlq_u16(uint16x8_t a);         // VPADDL.s16 q0,q0
+uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
-{         //serial solution looks faster than a SIMD one
+{
+    //serial solution looks faster than a SIMD one
     _NEON2SSE_ALIGN_16 uint16_t atmp[8];
     _NEON2SSE_ALIGN_16 uint32_t res[4];
     _mm_store_si128((__m128i*)atmp, a);
@@ -3374,7 +6422,7 @@
     return _mm_load_si128((__m128i*)res);
 }
 
-uint64x2_t vpaddlq_u32(uint32x4_t a);         // VPADDL.U32 q0,q0
+uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
 {
     _NEON2SSE_ALIGN_16 uint32_t atmp[4];
@@ -3389,28 +6437,69 @@
 //****************************************************************************************
 //VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
 // and accumulates the  values of the results into the elements of the destination (wide) vector
+int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b); // VPADAL.S8 d0,d0
+_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b)
+{
+    int16x4_t res64;
+    return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
+}
 
-#if defined(USE_SSSE3)
-int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b);         // VPADAL.S8 q0,q0
-_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b)         // VPADAL.S8 q0,q0
+int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b); // VPADAL.S16 d0,d0
+_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b)
+{
+    int32x2_t res64;
+    return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
+_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
+    return res;
+}
+
+uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b); // VPADAL.U8 d0,d0
+_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b)
+{
+    uint16x4_t res64;
+    return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
+}
+
+
+uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b); // VPADAL.s16 d0,d0
+_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b)
+{
+    uint32x2_t res64;
+    return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
+}
+
+uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
+_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
+{
+    uint64x1_t res;
+    res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
+    return res;
+}
+
+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
+_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
 {
     int16x8_t pad;
     pad = vpaddlq_s8(b);
     return _mm_add_epi16 (a, pad);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b);         // VPADAL.S16 q0,q0
-_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b)         // VPADAL.S16 q0,q0
+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
+_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
 {
     int32x4_t pad;
     pad = vpaddlq_s16(b);
     return _mm_add_epi32(a, pad);
 }
-#endif
 
-int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b);         // VPADAL.S32 q0,q0
+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
 _NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
 {
     int64x2_t pad;
@@ -3418,82 +6507,376 @@
     return _mm_add_epi64 (a, pad);
 }
 
-#if defined(USE_SSSE3)
-uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b);         // VPADAL.U8 q0,q0
-_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b)         // VPADAL.U8 q0,q0
+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
+_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
 {
     uint16x8_t pad;
     pad = vpaddlq_u8(b);
     return _mm_add_epi16 (a, pad);
 }
-#endif
 
-uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b);         // VPADAL.s16 q0,q0
+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
 {
     uint32x4_t pad;
     pad = vpaddlq_u16(b);
     return _mm_add_epi32(a, pad);
-}         //no optimal SIMD solution, serial is faster
+} //no optimal SIMD solution, serial is faster
 
-uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b);         // VPADAL.U32 q0,q0
+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
-{         //no optimal SIMD solution, serial is faster
+{
+    //no optimal SIMD solution, serial is faster
     uint64x2_t pad;
     pad = vpaddlq_u32(b);
     return _mm_add_epi64(a, pad);
-}         //no optimal SIMD solution, serial is faster
+} //no optimal SIMD solution, serial is faster
 
 //**********  Folding maximum   *************************************
 //*******************************************************************
 //VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
 //and copies the larger of each pair into the corresponding element in the destination
 //    no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
+int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
+{
+    int8x8_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+    _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding
+    max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
+    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(max); //we need 64 bits only
+}
+
+int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
+{
+    //solution may be not optimal compared with the serial one
+    int16x4_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
+    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13,  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
+    max = _mm_max_epi16 (ab, ab1);
+    max =  _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
+    return64(max);
+}
+
+int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    int32x2_t res;
+    res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
+    res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
+    return res;
+}
+
+uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
+{
+    uint8x8_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+    _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding
+    max = _mm_max_epu8 (ab, ab1); // SSE4.1
+    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(max);
+}
+
+uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
+{
+    //solution may be not optimal compared with the serial one
+    uint16x4_t res64;
+    __m128i ab, ab1, max;
+    _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
+    _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13,  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
+    max = _MM_MAX_EPU16 (ab, ab1);
+    max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
+    return64(max);
+}
+
+uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    uint32x2_t res;
+    res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
+    res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
+    return res;
+} //serial solution looks faster than a SIMD one
+
+float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than  SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
+    res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
+    return res;
+}
 
 // ***************** Folding minimum  ****************************
 // **************************************************************
 //vpmin -> takes minimum of adjacent pairs
+int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
+{
+    int8x8_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+    _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical min finding
+    min =  _MM_MIN_EPI8 (ab, ab1); // SSE4.1
+    min =  _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(min);
+}
+
+int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
+{
+    //solution may be not optimal compared with the serial one
+    int16x4_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
+    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13,  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
+    min = _mm_min_epi16 (ab, ab1);
+    min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
+    return64(min);
+}
+
+int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    int32x2_t res;
+    res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
+    res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
+    return res;
+}
+
+uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
+_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
+{
+    uint8x8_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+    _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding
+    min = _mm_min_epu8 (ab, ab1); // SSE4.1
+    min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
+    return64(min);
+}
+
+uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
+_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
+{
+    //solution may be not optimal compared with the serial one
+    uint16x4_t res64;
+    __m128i ab, ab1, min;
+    _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number
+    _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13,  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+    ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
+    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
+    min = _MM_MIN_EPU16 (ab, ab1);
+    min =    _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
+    return64(min);
+}
+
+uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    uint32x2_t res;
+    res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
+    res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
+    return res;
+}
+
+float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution looks faster than SIMD one
+    float32x2_t res;
+    res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
+    res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
+    return res;
+}
 
 //***************************************************************
 //***********  Reciprocal/Sqrt ************************************
 //***************************************************************
 //****************** Reciprocal estimate *******************************
-
 //the ARM NEON and x86 SIMD results may be slightly different
+float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
+_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = _mm_rcp_ps(_pM128(a));
+    _M64f(res64, res);
+    return res64;
+}
 
-float32x4_t vrecpeq_f32(float32x4_t a);         // VRECPE.F32 q0,q0
-//the ARM NEON and x86 SIMD results may be slightly different
+uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //Input is  fixed point number!!! No reciprocal for ints in IA32 available
+    uint32x2_t res;
+    float resf, r;
+    int i, q, s;
+    for (i =0; i<2; i++){
+        if((a.m64_u32[i] & 0x80000000) == 0) {
+            res.m64_u32[i] = 0xffffffff;
+        }else{
+            resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
+            q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
+            r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
+            s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
+            r =  (float)s / 256.0;
+            res.m64_u32[i] = r * (uint32_t)(1 << 31);
+        }
+    }
+    return res;
+}
+
+float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
 #define vrecpeq_f32 _mm_rcp_ps
 
-uint32x4_t vrecpeq_u32(uint32x4_t a);         // VRECPE.U32 q0,q0
+
+uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
-{         //no reciprocal for ints in IA32 available, neither for  unsigned int to float 4 lanes conversion, so serial solution looks faster
-    _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
+{
+    //Input is  fixed point number!!!
+    //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
+    _NEON2SSE_ALIGN_16 uint32_t atmp[4];
+    _NEON2SSE_ALIGN_16 uint32_t res[4];
+   _NEON2SSE_ALIGN_16 int c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
+    float resf, r;
+    int i, q, s;
+  __m128i res128, mask, zero;
     _mm_store_si128((__m128i*)atmp, a);
-    res[0] = (atmp[0]) ? 1 / atmp[0] : 0xffffffff;
-    res[1] = (atmp[1]) ? 1 / atmp[1] : 0xffffffff;
-    return _mm_load_si128((__m128i*)res);
+    zero = _mm_setzero_si128();
+    for (i =0; i<4; i++){
+        resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31)));  //  2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
+        q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
+        r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
+        s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
+        r =  (float)s / 256.0;
+        res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
+    }
+    res128 = _mm_load_si128((__m128i*)res);
+    mask = _mm_and_si128(a, *(__m128i*)c80000000);
+    mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x7fffffff
+    return _mm_or_si128(res128, mask);
 }
 
 //**********Reciprocal square root estimate ****************
 //**********************************************************
 //no reciprocal square root for ints in IA32 available, neither for unsigned int to float4 lanes conversion, so a serial solution looks faster
+//but the particular implementation for vrsqrte_u32 may vary for various ARM compilers
+////the ARM NEON and x86 SIMD results may be slightly different
+float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
+_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = _mm_rsqrt_ps(_pM128(a));
+    _M64f(res64, res);
+    return res64;
+}
 
-float32x4_t vrsqrteq_f32(float32x4_t a);         // VRSQRTE.F32 q0,q0
-//the ARM NEON and x86 SIMD results may be slightly different
+uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //Input is  fixed point number!!!
+    //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
+   uint32x2_t res;
+   __m128 tmp;
+    float r, resf, coeff;
+    int i,q0, q1, s;;
+    for (i =0; i<2; i++){
+        if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
+            res.m64_u32[i] = 0xffffffff;
+        }else{
+            resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
+            coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
+            q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
+            r = ((float)q0 + 0.5) / coeff;
+            tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
+            _mm_store_ss(&r, tmp);
+            s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
+            r = (float)s / 256.0;
+            res.m64_u32[i] = r * (((uint32_t)1) << 31);
+        }
+    }
+    return res;
+}
+
+float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
 #define vrsqrteq_f32 _mm_rsqrt_ps
 
-uint32x4_t vrsqrteq_u32(uint32x4_t a);         // VRSQRTE.U32 q0,q0
-#define vrsqrteq_u32(a) _mm_castps_si128(_mm_rsqrt_ps(_M128(a)) )
-
+uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //Input is  fixed point number!!!
+    //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
+   _NEON2SSE_ALIGN_16 uint32_t  atmp[4], res[4];
+   _NEON2SSE_ALIGN_16 float c1_31[4] = {(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31),(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31)};
+   _NEON2SSE_ALIGN_16 int c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
+  __m128 tmp;
+  __m128i res128, mask, zero;
+    float r, resf, coeff;
+    int i,q0, q1, s;
+    _mm_store_si128((__m128i*)atmp, a);
+    zero = _mm_setzero_si128();
+    for (i =0; i<4; i++){
+        resf =  (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
+        coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
+        q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
+        r = ((float)q0 + 0.5) / coeff;
+        tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
+        _mm_store_ss(&r, tmp);
+        s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
+        r = (float)s / 256.0;
+        res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
+    }
+    res128 = _mm_load_si128((__m128i*)res);
+    mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
+    mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x3fffffff
+    return _mm_or_si128(res128, mask);
+}
 //************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
 //******************************************************************************************
 //******VRECPS (Vector Reciprocal Step) ***************************************************
 //multiplies the elements of one vector by the corresponding elements of another vector,
 //subtracts each of the results from 2, and places the final results into the elements of the destination vector.
 
-float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b);         // VRECPS.F32 q0, q0, q0
-_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b)         // VRECPS.F32 q0, q0, q0
+float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
+_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
+{
+    float32x4_t res;
+    __m64_128 res64;
+    res = vrecpsq_f32(_pM128(a), _pM128(b));
+    _M64f(res64, res);
+    return res64;
+}
+
+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
+_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
 {
     __m128 f2, mul;
     f2 =  _mm_set1_ps(2.);
@@ -3505,8 +6888,17 @@
 //multiplies the elements of one vector by the corresponding elements of another vector,
 //subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
 
-float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b);         // VRSQRTS.F32 q0, q0, q0
-_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b)         // VRSQRTS.F32 q0, q0, q0
+float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
+_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
+{
+    float32x2_t res;
+    res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
+    res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
+    return res;
+}
+
+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
+_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
 {
     __m128 f3, f05, mul;
     f3 =  _mm_set1_ps(3.);
@@ -3530,54 +6922,110 @@
         else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
         return _mm_load_si128((__m128i*)res);
 
-int8x16_t vshlq_s8(int8x16_t a, int8x16_t b);         // VSHL.S8 q0,q0,q0
+#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
+        int ## TYPE ## x ## LEN ## _t res;  int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
+        for (i = 0; i<LEN; i++) { \
+        if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
+        else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
+        return res;
+
+int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(8, i, 8)
+}
+
+int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(16, i, 4)
+}
+
+int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(32, i, 2)
+}
+
+int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(64, i, 1)
+}
+
+uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(8, u, 8)
+}
+
+uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(16, u, 4)
+}
+
+uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT_64(32, u, 2)
+}
+
+uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro need to have the special processing  for large numbers
+{
+    SERIAL_SHIFT_64(64, u, 1)
+}
+
+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SHIFT(int8_t, int8_t, 16, 16)
 }
 
-int16x8_t vshlq_s16(int16x8_t a, int16x8_t b);         // VSHL.S16 q0,q0,q0
+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SHIFT(int16_t, int16_t, 8, 8)
 }
 
-int32x4_t vshlq_s32(int32x4_t a, int32x4_t b);         // VSHL.S32 q0,q0,q0
+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SHIFT(int32_t, int32_t, 4, 4)
 }
 
-int64x2_t vshlq_s64(int64x2_t a, int64x2_t b);         // VSHL.S64 q0,q0,q0
+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SHIFT(int64_t, int64_t, 2, 2)
 }
 
-uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b);         // VSHL.U8 q0,q0,q0
+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
 }
 
-uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b);         // VSHL.s16 q0,q0,q0
+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
 }
 
-uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b);         // VSHL.U32 q0,q0,q0
+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
 }
 
-uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b);         // VSHL.U64 q0,q0,q0
+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
 }
 
+
 //*********** Vector saturating shift left: (negative values shift right) **********************
 //********************************************************************************************
 //No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
@@ -3614,54 +7062,134 @@
                     res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
         return _mm_load_si128((__m128i*)res);
 
-int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b);         // VQSHL.S8 q0,q0,q0
+#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
+        int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
+        int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
+        for (i = 0; i<LEN; i++) { \
+        if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
+        else{ \
+            if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize_1) { \
+                    res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
+                    if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
+                        res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
+                    else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+        return res;
+
+#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
+        int ## TYPE ## x ## LEN ## _t res;  _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
+        int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
+        for (i = 0; i<LEN; i++) { \
+        if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
+        }else{ \
+            if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
+                else{ \
+                    limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
+                    res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
+        return res;
+
+int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
+}
+
+int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
+}
+
+int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
+}
+
+int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
+}
+
+uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
+}
+
+uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
+}
+
+uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
+}
+
+uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
+}
+
+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
 }
 
-int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b);         // VQSHL.S16 q0,q0,q0
+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
 }
 
-int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b);         // VQSHL.S32 q0,q0,q0
+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
 }
 
-int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b);         // VQSHL.S64 q0,q0,q0
+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
 }
 
-uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b);         // VQSHL.U8 q0,q0,q0
+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
 }
 
-uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b);         // VQSHL.s16 q0,q0,q0
+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
 }
 
-uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b);         // VQSHL.U32 q0,q0,q0
+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
 }
 
-uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b);         // VQSHL.U64 q0,q0,q0
+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
 }
 
+
 //******** Vector rounding shift left: (negative values shift right) **********
 //****************************************************************************
 //No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
@@ -3679,54 +7207,117 @@
                             (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) );    }} \
         return _mm_load_si128((__m128i*)res);
 
-int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b);         // VRSHL.S8 q0,q0,q0
+
+#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
+        int ## TYPE ## x ## LEN ## _t res;  int i;  int lanesize = sizeof(int ## TYPE ## _t) << 3; \
+        for (i = 0; i<LEN; i++) { \
+        if( b.m64_i ## TYPE[i] >= 0) { \
+            if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
+            else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
+        }else{ \
+            res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? res.m64_ ## SIGN ## TYPE[i] = 0 : \
+                            (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
+                            (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) );    }} \
+        return res;
+
+
+int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(8,i,8)
+}
+
+int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(16,i,4)
+}
+
+int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(32,i,2)
+}
+
+int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(64,i,1)
+}
+
+uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(8,u,8)
+}
+
+uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(16,u,4)
+}
+
+uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(32,u,2)
+}
+
+uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT_64(64,u,1)
+}
+
+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
 }
 
-int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b);         // VRSHL.S16 q0,q0,q0
+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
 }
 
-int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b);         // VRSHL.S32 q0,q0,q0
+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
 }
 
-int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b);         // VRSHL.S64 q0,q0,q0
+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
 }
 
-uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b);         // VRSHL.U8 q0,q0,q0
+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
 }
 
-uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b);         // VRSHL.s16 q0,q0,q0
+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
 }
 
-uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b);         // VRSHL.U32 q0,q0,q0
+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
 }
 
-uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b);         // VRSHL.U64 q0,q0,q0
+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
 }
 
+
 //********** Vector saturating rounding shift left: (negative values shift right) ****************
 //*************************************************************************************************
 //No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
@@ -3764,49 +7355,128 @@
                     res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
         return _mm_load_si128((__m128i*)res);
 
-int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b);         // VQRSHL.S8 q0,q0,q0
+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
+        __m64_128 res; int ## TYPE ## _t limit; int i; \
+        int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
+        for (i = 0; i<LEN; i++) { \
+        if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
+        else{ \
+            if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize_1) { \
+                    res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
+                    if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
+                        res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
+                    else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+        return res;
+
+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
+        __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
+        int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
+        for (i = 0; i<LEN; i++) { \
+        if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
+        }else{ \
+            if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
+            else{ \
+                if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
+                else{ \
+                    limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
+                    res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
+        return res;
+
+int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
+}
+
+int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
+}
+
+int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
+}
+
+int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
+}
+
+uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
+}
+
+uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
+}
+
+uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
+}
+
+uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
+}
+
+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
 }
 
-int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b);         // VQRSHL.S16 q0,q0,q0
+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
 }
 
-int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b);         // VQRSHL.S32 q0,q0,q0
+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
 }
 
-int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b);         // VQRSHL.S64 q0,q0,q0
+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
 }
 
-uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b);         // VQRSHL.U8 q0,q0,q0
+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
 }
 
-uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b);         // VQRSHL.s16 q0,q0,q0
+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
 }
 
-uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b);         // VQRSHL.U32 q0,q0,q0
+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
 }
 
-uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b);         // VQRSHL.U64 q0,q0,q0
+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
 {
     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
@@ -3817,249 +7487,535 @@
 // *********************************************************************************
 //**************** Vector shift right by constant*************************************
 //************************************************************************************
+int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit
+    int8x8_t res64;
+    __m128i r;
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_srai_epi16 (r, b); //SSE2
+    r = _mm_packs_epi16 (r,r); //we need 64 bits only
+    return64(r);
+}
 
-int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VSHR.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b)         // VSHR.S8 q0,q0,#8
-{         //no 8 bit shift available, go to 16 bit trick
+int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
+{
+    int16x4_t res64;
+    return64(_mm_srai_epi16(_pM128i(a), b));
+}
+
+
+int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    return64(_mm_srai_epi32(_pM128i(a), b));
+}
+
+int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //no arithmetic shift for 64bit values, serial solution used
+    int64x1_t res;
+    if(b>=64) res.m64_i64[0] = 0;
+    else res.m64_i64[0] = (*(int64_t*)&a) >> b;
+    return res;
+}
+
+uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit
+    uint8x8_t res64;
+    __m128i r;
+    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one
+    r = _mm_packus_epi16 (r,r); //we need 64 bits only
+    return64(r);
+}
+
+uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
+{
+    uint16x4_t res64;
+    return64(_mm_srli_epi16(_pM128i(a), b));
+}
+
+
+uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
+{
+    uint32x2_t res64;
+    return64(_mm_srli_epi32(_pM128i(a), b));
+}
+
+
+uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b)
+{
+    uint64x1_t res64;
+    return64(_mm_srli_epi64(_pM128i(a), b));
+}
+
+
+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
+{
+    //no 8 bit shift available, go to 16 bit trick
     __m128i zero, mask0, a_sign, r, a_sign_mask;
     _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0,  0x00f8, 0x00fc, 0x00fe, 0x00ff};
     zero = _mm_setzero_si128();
-    mask0 = _mm_set1_epi16(mask0_16[b]);         //to mask the bits to be "spoiled"  by 16 bit shift
-    a_sign =  _mm_cmpgt_epi8 (zero, a);         //ff if a<0 or zero if a>0
+    mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
+    a_sign =  _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
     r = _mm_srai_epi16 (a, b);
     a_sign_mask =  _mm_and_si128 (mask0, a_sign);
     r =  _mm_andnot_si128 (mask0, r);
     return _mm_or_si128 (r, a_sign_mask);
 }
 
-int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VSHR.S16 q0,q0,#16
+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
 #define vshrq_n_s16 _mm_srai_epi16
 
-int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VSHR.S32 q0,q0,#32
+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
 #define vshrq_n_s32 _mm_srai_epi32
 
-int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VSHR.S64 q0,q0,#64
+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
 _NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
-{         //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD
+{
+    //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD
     __m128i c1, signmask,a0,  res64;
     _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
-    c1 =  _mm_cmpeq_epi32(a,a);         //0xffffffffffffffff
+    c1 =  _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
     signmask  =  _mm_slli_epi64 (c1, (64 - b));
-    a0 = _mm_or_si128(a, *(__m128i*)mask);         //get the first bit
-    #ifdef USE_SSE4
-        a0 = _mm_cmpeq_epi64 (a, a0);         //SSE4.1
-    #else
-        a0 = _mm_cmpeq_epi32 (a, a0);
-        a0 = _mm_shuffle_epi32 (a0, 1 | (1 << 2) | (3 << 4) | (3 << 6));         //copy the information from hi to low part of the 64 bit data
-    #endif
+    a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
+    a0 = _MM_CMPEQ_EPI64 (a, a0);
     signmask = _mm_and_si128(a0, signmask);
     res64 = _mm_srli_epi64 (a, b);
     return _mm_or_si128(res64, signmask);
 }
 
-uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VSHR.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b)         // VSHR.U8 q0,q0,#8
-{         //no 8 bit shift available, need the special trick
+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
+{
+    //no 8 bit shift available, need the special trick
     __m128i mask0, r;
     _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f,  0xff07, 0xff03, 0xff01, 0xff00};
-    mask0 = _mm_set1_epi16(mask10_16[b]);         //to mask the bits to be "spoiled"  by 16 bit shift
+    mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
     r = _mm_srli_epi16 ( a, b);
     return _mm_and_si128 (r,  mask0);
 }
 
-uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VSHR.s16 q0,q0,#16
+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
 #define vshrq_n_u16 _mm_srli_epi16
 
-uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VSHR.U32 q0,q0,#32
+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
 #define vshrq_n_u32 _mm_srli_epi32
 
-uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VSHR.U64 q0,q0,#64
+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
 #define vshrq_n_u64 _mm_srli_epi64
 
 //*************************** Vector shift left by constant *************************
 //*********************************************************************************
+int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
+{
+    //no 8 bit shift available, go to 16 bit
+    int8x8_t res64;
+    __m128i r;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_slli_epi16 (r, b); //SSE2
+    r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
+    return64(r);
+}
 
-int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
+int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b)
+{
+    int16x4_t res64;
+    return64(_mm_slli_epi16(_pM128i(a), b));
+}
+
+
+int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
+{
+    int32x2_t res64;
+    return64(_mm_slli_epi32(_pM128i(a), b));
+}
+
+
+int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b)
+{
+    int64x1_t res64;
+    return64(_mm_slli_epi64(_pM128i(a), b));
+}
+
+
+uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
+{
+    //no 8 bit shift available, go to 16 bit
+    uint8x8_t res64;
+    __m128i mask8;
+    __m128i r;
+    mask8 = _mm_set1_epi16(0xff);
+    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    r = _mm_slli_epi16 (r, b); //SSE2
+    r = _mm_and_si128(r, mask8); //to avoid saturation
+    r = _mm_packus_epi16 (r,r); //we need 64 bits only
+    return64(r);
+}
+
+uint16x4_t vshl_n_u16(uint16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
+#define vshl_n_u16 vshl_n_s16
+
+
+uint32x2_t vshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
+#define vshl_n_u32 vshl_n_s32
+
+uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
+#define vshl_n_u64 vshl_n_s64
+
+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
 #define vshlq_n_s8 vshlq_n_u8
 
-int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
 #define vshlq_n_s16 _mm_slli_epi16
 
-int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
 #define vshlq_n_s32 _mm_slli_epi32
 
-int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
 #define vshlq_n_s64 _mm_slli_epi64
 
-uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
 _NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
-{         //no 8 bit shift available, need the special trick
+{
+    //no 8 bit shift available, need the special trick
     __m128i mask0, r;
     _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff,  0xe0ff, 0xc0ff, 0x80ff, 0xff};
-    mask0 = _mm_set1_epi16(mask10_16[b]);         //to mask the bits to be "spoiled"  by 16 bit shift
+    mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
     r = _mm_slli_epi16 ( a, b);
     return _mm_and_si128 (r,  mask0);
 }
 
-uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
 #define vshlq_n_u16 vshlq_n_s16
 
-uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
 #define vshlq_n_u32 vshlq_n_s32
 
-uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
 #define vshlq_n_u64 vshlq_n_s64
 
 //************* Vector rounding shift right by constant ******************
 //*************************************************************************
 //No corresponding  x86 intrinsics exist, need to do some tricks
-
-int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VRSHR.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b)         // VRSHR.S8 q0,q0,#8
-{         //no 8 bit shift available, go to 16 bit trick
-    __m128i r, mask1, maskb;
-    _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080};         // 2^b-th bit set to 1
-    r = vshrq_n_s8 (a, b);
-    mask1 = _mm_set1_epi16(mask2b[b]);         // 2^b-th bit set to 1 for 16bit, need it for rounding
-    maskb = _mm_and_si128(a, mask1);         //get b or 0 for rounding
-    maskb =  _mm_srli_epi16 (maskb, b - 1);         // to add 1
-    return _mm_add_epi8(r, maskb);         //actual rounding
+int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit
+    int8x8_t res64;
+    __m128i r, maskb;
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
+    r = _mm_srai_epi16 (r, b);
+    r = _mm_add_epi16 (r, maskb); //actual rounding
+    r = _mm_packs_epi16 (r,r); ////we need 64 bits only
+    return64(r);
 }
 
-int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VRSHR.S16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b)         // VRSHR.S16 q0,q0,#16
+int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
+{
+    int16x4_t res64;
+    return64(vrshrq_n_s16(_pM128i(a), b));
+}
+
+
+int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    return64(vrshrq_n_s32(_pM128i(a), b));
+}
+
+
+int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    //serial solution is faster
+    int64x1_t res;
+    int64_t a_i64 = *( int64_t*)&a;
+    if(b==64) {
+        res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63;
+    } else {
+        int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
+        res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
+    }
+    return res;
+}
+
+uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
+{
+    //no 8 bit shift available, go to 16 bit, solution may be not optimal compared with the serial one
+    uint8x8_t res64;
+    __m128i r, maskb;
+    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
+    r = _mm_srli_epi16 (r, b);
+    r = _mm_add_epi16 (r, maskb); //actual rounding
+    r =  _mm_packus_epi16 (r,r); ////we need 64 bits only
+    return64(r);
+}
+
+uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
+{
+    uint16x4_t res64;
+    return64(vrshrq_n_u16(_pM128i(a), b));
+}
+
+
+uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
+{
+    uint32x2_t res64;
+    return64(vrshrq_n_u32(_pM128i(a), b));
+}
+
+
+uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
+{
+    uint64x1_t res64;
+    return64(vrshrq_n_u64(_pM128i(a), b));
+}
+
+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
+{
+    //no 8 bit shift available, go to 16 bit trick
+    __m128i r, mask1, maskb;
+    _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
+    r = vshrq_n_s8 (a, b);
+    mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
+    maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
+    maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
+    return _mm_add_epi8(r, maskb); //actual rounding
+}
+
+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
 {
     __m128i maskb, r;
-    maskb =  _mm_slli_epi16(a, (16 - b));         //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi16(maskb, 15);         //1 or 0
+    maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16(maskb, 15); //1 or 0
     r = _mm_srai_epi16 (a, b);
-    return _mm_add_epi16 (r, maskb);         //actual rounding
+    return _mm_add_epi16 (r, maskb); //actual rounding
 }
 
-int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VRSHR.S32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b)         // VRSHR.S32 q0,q0,#32
+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
 {
     __m128i maskb,  r;
-    maskb = _mm_slli_epi32 (a, (32 - b));         //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi32 (maskb,31);         //1 or 0
+    maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi32 (maskb,31); //1 or 0
     r = _mm_srai_epi32(a, b);
-    return _mm_add_epi32 (r, maskb);         //actual rounding
+    return _mm_add_epi32 (r, maskb); //actual rounding
 }
 
-int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VRSHR.S64 q0,q0,#64
+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
 _NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
-{         //solution may be not optimal compared with a serial one
+{
+    //solution may be not optimal compared with a serial one
     __m128i maskb;
     int64x2_t r;
-    maskb = _mm_slli_epi64 (a, (64 - b));         //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi64 (maskb,63);         //1 or 0
+    maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi64 (maskb,63); //1 or 0
     r = vshrq_n_s64(a, b);
-    return _mm_add_epi64 (r, maskb);         //actual rounding
+    return _mm_add_epi64 (r, maskb); //actual rounding
 }
 
-uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VRSHR.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b)         // VRSHR.U8 q0,q0,#8
-{         //no 8 bit shift available, go to 16 bit trick
+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
+{
+    //no 8 bit shift available, go to 16 bit trick
     __m128i r, mask1, maskb;
-    _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080};         // 2^b-th bit set to 1
+    _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
     r = vshrq_n_u8 (a, b);
-    mask1 = _mm_set1_epi16(mask2b[b]);         // 2^b-th bit set to 1 for 16bit, need it for rounding
-    maskb = _mm_and_si128(a, mask1);         //get b or 0 for rounding
-    maskb =  _mm_srli_epi16 (maskb, b - 1);         // to add 1
-    return _mm_add_epi8(r, maskb);         //actual rounding
+    mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
+    maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
+    maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
+    return _mm_add_epi8(r, maskb); //actual rounding
 }
 
-uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VRSHR.s16 q0,q0,#16
-_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b)         // VRSHR.S16 q0,q0,#16
+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
 {
     __m128i maskb, r;
-    maskb =  _mm_slli_epi16(a, (16 - b));         //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi16(maskb, 15);         //1 or 0
+    maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16(maskb, 15); //1 or 0
     r = _mm_srli_epi16 (a, b);
-    return _mm_add_epi16 (r, maskb);         //actual rounding
+    return _mm_add_epi16 (r, maskb); //actual rounding
 }
 
-uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VRSHR.U32 q0,q0,#32
-_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b)         // VRSHR.S32 q0,q0,#32
+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
 {
     __m128i maskb,  r;
-    maskb = _mm_slli_epi32 (a, (32 - b));         //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi32 (maskb,31);         //1 or 0
+    maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi32 (maskb,31); //1 or 0
     r = _mm_srli_epi32(a, b);
-    return _mm_add_epi32 (r, maskb);         //actual rounding
+    return _mm_add_epi32 (r, maskb); //actual rounding
 }
 
-uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VRSHR.U64 q0,q0,#64
+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
 _NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
-{         //solution may be not optimal compared with a serial one
+{
+    //solution may be not optimal compared with a serial one
     __m128i maskb,  r;
-    maskb = _mm_slli_epi64 (a, (64 - b));         //to get rounding (b-1)th bit
-    maskb = _mm_srli_epi64 (maskb,63);         //1 or 0
+    maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi64 (maskb,63); //1 or 0
     r = _mm_srli_epi64(a, b);
-    return _mm_add_epi64 (r, maskb);         //actual rounding
+    return _mm_add_epi64 (r, maskb); //actual rounding
 }
 
 //************* Vector shift right by constant and accumulate *********
 //*********************************************************************
+int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
+{
+    int8x8_t shift;
+    shift = vshr_n_s8(b, c);
+    return vadd_s8( a, shift);
+}
 
-int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRA.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c)         // VSRA.S8 q0,q0,#8
+int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
+{
+    int16x4_t shift;
+    shift = vshr_n_s16( b, c);
+    return vadd_s16(a, shift);
+}
+
+int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    int32x2_t shift;
+    shift = vshr_n_s32(b, c);
+    return vadd_s32( a, shift);
+}
+
+int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
+_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
+{
+    //may be not optimal compared with a serial solution
+    int64x1_t shift;
+    shift = vshr_n_s64(b, c);
+    return vadd_s64( a, shift);
+}
+
+uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
+{
+    uint8x8_t shift;
+    shift = vshr_n_u8(b, c);
+    return vadd_u8(a, shift);
+}
+
+uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
+{
+    uint16x4_t shift;
+    shift = vshr_n_u16(b, c);
+    return vadd_u16(a,shift);
+}
+
+uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    uint32x2_t shift;
+    shift = vshr_n_u32(b, c);
+    return vadd_u32( a, shift);
+}
+
+uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
+_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
+{
+    //may be not optimal compared with the serial execution
+    uint64x1_t shift;
+    shift = vshr_n_u64(b, c);
+    return vadd_u64(a, shift);
+}
+
+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
 {
     int8x16_t shift;
     shift = vshrq_n_s8(b, c);
     return vaddq_s8(a, shift);
 }
 
-int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRA.S16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c)         // VSRA.S16 q0,q0,#16
+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
 {
     int16x8_t shift;
     shift = vshrq_n_s16(b, c);
     return vaddq_s16(a, shift);
 }
 
-int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRA.S32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c)         // VSRA.S32 q0,q0,#32
+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
 {
     int32x4_t shift;
     shift = vshrq_n_s32(b, c);
     return vaddq_s32(a, shift);
 }
 
-int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRA.S64 q0,q0,#64
-_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)         // VSRA.S64 q0,q0,#64
+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
 {
     int64x2_t shift;
     shift = vshrq_n_s64(b, c);
     return vaddq_s64( a, shift);
 }
 
-uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRA.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c)         // VSRA.U8 q0,q0,#8
+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
 {
     uint8x16_t shift;
     shift = vshrq_n_u8(b, c);
     return vaddq_u8(a, shift);
 }
 
-uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRA.s16 q0,q0,#16
-_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c)         // VSRA.s16 q0,q0,#16
+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
 {
     uint16x8_t shift;
     shift = vshrq_n_u16(b, c);
     return vaddq_u16(a,  shift);
 }
 
-uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRA.U32 q0,q0,#32
-_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c)         // VSRA.U32 q0,q0,#32
+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
 {
     uint32x4_t shift;
     shift = vshrq_n_u32(b, c);
     return vaddq_u32(a, shift);
 }
 
-uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRA.U64 q0,q0,#64
-_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)         // VSRA.U64 q0,q0,#64
+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
+_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
 {
     uint64x2_t shift;
     shift = vshrq_n_u64(b, c);
@@ -4068,32 +8024,98 @@
 
 //************* Vector rounding shift right by constant and accumulate ****************************
 //************************************************************************************************
+int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
+{
+    int8x8_t shift;
+    shift = vrshr_n_s8(b, c);
+    return vadd_s8( a, shift);
+}
 
-int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VRSRA.S8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c)         // VRSRA.S8 q0,q0,#8
+int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
+{
+    int16x4_t shift;
+    shift = vrshr_n_s16( b, c);
+    return vadd_s16(a, shift);
+}
+
+int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    int32x2_t shift;
+    shift = vrshr_n_s32(b, c);
+    return vadd_s32( a, shift);
+}
+
+int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
+{
+    int64x1_t shift;
+    shift = vrshr_n_s64(b, c);
+    return vadd_s64( a, shift);
+}
+
+uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
+_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
+{
+    uint8x8_t shift;
+    shift = vrshr_n_u8(b, c);
+    return vadd_u8(a, shift);
+}
+
+uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
+_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
+{
+    uint16x4_t shift;
+    shift = vrshr_n_u16(b, c);
+    return vadd_u16(a,shift);
+}
+
+uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
+_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
+{
+    //may be not optimal compared with the serial execution
+    uint32x2_t shift;
+    shift = vrshr_n_u32(b, c);
+    return vadd_u32( a, shift);
+}
+
+uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
+{
+    //may be not optimal compared with the serial execution
+    uint64x1_t shift;
+    shift = vrshr_n_u64(b, c);
+    return vadd_u64( a, shift);
+}
+
+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
 {
     int8x16_t shift;
     shift = vrshrq_n_s8(b, c);
     return vaddq_s8(a, shift);
 }
 
-int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VRSRA.S16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c)         // VRSRA.S16 q0,q0,#16
+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
 {
     int16x8_t shift;
     shift = vrshrq_n_s16(b, c);
     return vaddq_s16(a, shift);
 }
 
-int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VRSRA.S32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c)         // VRSRA.S32 q0,q0,#32
+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
 {
     int32x4_t shift;
     shift = vrshrq_n_s32(b, c);
     return vaddq_s32(a, shift);
 }
 
-int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VRSRA.S64 q0,q0,#64
+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
 _NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
 {
     int64x2_t shift;
@@ -4101,31 +8123,31 @@
     return vaddq_s64(a, shift);
 }
 
-uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VRSRA.U8 q0,q0,#8
-_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c)         // VRSRA.U8 q0,q0,#8
+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
 {
     uint8x16_t shift;
     shift = vrshrq_n_u8(b, c);
     return vaddq_u8(a, shift);
 }
 
-uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VRSRA.s16 q0,q0,#16
-_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c)         // VRSRA.s16 q0,q0,#16
+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
 {
     uint16x8_t shift;
     shift = vrshrq_n_u16(b, c);
     return vaddq_u16(a,  shift);
 }
 
-uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VRSRA.U32 q0,q0,#32
-_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c)         // VRSRA.U32 q0,q0,#32
+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
 {
     uint32x4_t shift;
     shift = vrshrq_n_u32(b, c);
     return vaddq_u32(a, shift);
 }
 
-uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VRSRA.U64 q0,q0,#64
+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
 _NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
 {
     uint64x2_t shift;
@@ -4136,61 +8158,157 @@
 //**********************Vector saturating shift left by constant *****************************
 //********************************************************************************************
 //we don't check const ranges  assuming they are met
+int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
+{
+    //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
+    int8x8_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi16 (a128, b);
+    r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
+    return64(r128);
+}
 
-int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHL.S8 q0,q0,#0
-_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b)         // VQSHL.S8 q0,q0,#0
-{         // go to 16 bit to get the auto saturation (in packs function)
+int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
+{
+    // go to 32 bit to get the auto saturation (in packs function)
+    int16x4_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi32 (a128, b); //shift_res
+    r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
+    return64(r128);
+}
+
+int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
+{
+    //serial execution may be faster
+    int32x2_t res64;
+    return64(vqshlq_n_s32 (_pM128i(a), b));
+}
+
+
+int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here
+    int64x1_t res;
+    int64_t bmask;
+    int64_t a_i64 = *( int64_t*)&a;
+    bmask = ( int64_t)1 << (63 - b); //positive
+    if (a_i64 >= bmask) {
+        res.m64_i64[0] = ~(_SIGNBIT64);
+    } else {
+        res.m64_i64[0]  = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
+    }
+    return res;
+}
+
+
+uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
+{
+    //no 8 bit shift available in IA32 SIMD, go to 16 bit
+    uint8x8_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi16 (a128, b); //shift_res
+    r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
+    return64(r128);
+}
+
+uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
+_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
+{
+    // go to 32 bit to get the auto saturation (in packus function)
+    uint16x4_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi32 (a128, b); //shift_res
+    r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16
+    return64(r128);
+}
+
+uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
+_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b)
+{
+    uint32x2_t res64;
+    return64(vqshlq_n_u32(_pM128i(a), b));
+}
+
+uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    // no effective SIMD solution here
+    uint64x1_t res;
+    uint64_t bmask;
+    uint64_t a_i64 = *(uint64_t*)&a;
+    bmask = ( uint64_t)1 << (64 - b);
+    res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
+    return res;
+}
+
+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
+_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
+{
+    // go to 16 bit to get the auto saturation (in packs function)
     __m128i a128, r128_1, r128_2;
-    a128 = _MM_CVTEPI8_EPI16 (a);         //SSE 4.1
+    a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
     r128_1 = _mm_slli_epi16 (a128, b);
     //swap hi and low part of a128 to process the remaining data
     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     a128 = _MM_CVTEPI8_EPI16 (a128);
     r128_2 = _mm_slli_epi16 (a128, b);
-    return _mm_packs_epi16 (r128_1, r128_2);         //saturated s8
+    return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
 }
 
-int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHL.S16 q0,q0,#0
-_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b)         // VQSHL.S16 q0,q0,#0
-{         // manual saturation solution looks LESS optimal than 32 bits conversion one
-      // go to 32 bit to get the auto saturation (in packs function)
+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
+_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
+{
+    // manual saturation solution looks LESS optimal than 32 bits conversion one
+    // go to 32 bit to get the auto saturation (in packs function)
     __m128i a128, r128_1, r128_2;
-    a128 = _MM_CVTEPI16_EPI32 (a);         //SSE 4.1
-    r128_1 = _mm_slli_epi32 (a128, b);         //shift_res
+    a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi32 (a128, b); //shift_res
     //swap hi and low part of a128 to process the remaining data
     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     a128 = _MM_CVTEPI16_EPI32 (a128);
     r128_2 = _mm_slli_epi32 (a128, b);
-    return _mm_packs_epi32 (r128_1, r128_2);         //saturated s16
+    return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
 }
 
-int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHL.S32 q0,q0,#0
-_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b)         // VQSHL.S32 q0,q0,#0
-{         // no 64 bit saturation option available, special tricks necessary
+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
+_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
+{
+    // no 64 bit saturation option available, special tricks necessary
     __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
-    c1 = _mm_cmpeq_epi32(a,a);         //0xff..ff
-    maskA = _mm_srli_epi32(c1, b + 1);         //mask for positive numbers (32-b+1) zeros and b-1 ones
-    saturation_mask = _mm_cmpgt_epi32 (a, maskA);         //0xff...ff if we need saturation, 0  otherwise
-    c7ffffff_mask  = _mm_srli_epi32(saturation_mask, 1);         //saturated to 0x7f..ff when needed and zeros if not
+    c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
+    maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers (32-b+1) zeros and b-1 ones
+    saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0  otherwise
+    c7ffffff_mask  = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
     shift_res = _mm_slli_epi32 (a, b);
     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
     //result with positive numbers saturated
     shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
     //treat negative numbers
-    maskA = _mm_slli_epi32(c1, 31 - b);         //mask for negative numbers b-1 ones  and (32-b+1)  zeros
-    saturation_mask = _mm_cmpgt_epi32 (maskA,a);         //0xff...ff if we need saturation, 0  otherwise
-    c7ffffff_mask  = _mm_slli_epi32(saturation_mask, 31);         //saturated to 0x80..00 when needed and zeros if not
+    maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones  and (32-b+1)  zeros
+    saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0  otherwise
+    c7ffffff_mask  = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
     return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
 }
 
-int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHL.S64 q0,q0,#0
+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-{         // no effective SIMD solution here
+{
+    // no effective SIMD solution here
     _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
     int64_t bmask;
     int i;
-    bmask = ( int64_t)1 << (63 - b);         //positive
+    bmask = ( int64_t)1 << (63 - b); //positive
     _mm_store_si128((__m128i*)atmp, a);
     for (i = 0; i<2; i++) {
         if (atmp[i] >= bmask) {
@@ -4202,110 +8320,158 @@
     return _mm_load_si128((__m128i*)res);
 }
 
-uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VQSHL.U8 q0,q0,#0
-_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)         // VQSHL.U8 q0,q0,#0
-{         // go to 16 bit to get the auto saturation (in packs function)
+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
+{
+    // go to 16 bit to get the auto saturation (in packs function)
     __m128i a128, r128_1, r128_2;
-    a128 = _MM_CVTEPU8_EPI16 (a);         //SSE 4.1
+    a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
     r128_1 = _mm_slli_epi16 (a128, b);
     //swap hi and low part of a128 to process the remaining data
     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     a128 = _MM_CVTEPU8_EPI16 (a128);
     r128_2 = _mm_slli_epi16 (a128, b);
-    return _mm_packus_epi16 (r128_1, r128_2);         //saturated u8
+    return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
 }
 
-uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VQSHL.s16 q0,q0,#0
-_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b)         // VQSHL.s16 q0,q0,#0
-{         // manual saturation solution looks more optimal than 32 bits conversion one
+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
+_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
+{
+    // manual saturation solution looks more optimal than 32 bits conversion one
     __m128i cb, c8000, a_signed, saturation_mask,  shift_res;
     cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
     c8000 = _mm_set1_epi16 (0x8000);
 //no unsigned shorts comparison in SSE, only signed available, so need the trick
-    a_signed = _mm_sub_epi16(a, c8000);         //go to signed
+    a_signed = _mm_sub_epi16(a, c8000); //go to signed
     saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
     shift_res = _mm_slli_epi16 (a, b);
     return _mm_or_si128 (shift_res, saturation_mask);
 }
 
-uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VQSHL.U32 q0,q0,#0
-_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b)         // VQSHL.U32 q0,q0,#0
-{         // manual saturation solution, no 64 bit saturation option, the serial version may be faster
+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
+_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
+{
+    // manual saturation solution, no 64 bit saturation option, the serial version may be faster
     __m128i cb, c80000000, a_signed, saturation_mask,  shift_res;
     cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
     c80000000 = _mm_set1_epi32 (0x80000000);
 //no unsigned ints comparison in SSE, only signed available, so need the trick
-    a_signed = _mm_sub_epi32(a, c80000000);         //go to signed
+    a_signed = _mm_sub_epi32(a, c80000000); //go to signed
     saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
     shift_res = _mm_slli_epi32 (a, b);
     return _mm_or_si128 (shift_res, saturation_mask);
 }
 
-uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VQSHL.U64 q0,q0,#0
+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
-{         // no effective SIMD solution here
+{
+    // no effective SIMD solution here
     _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
     uint64_t bmask;
     int i;
     bmask = ( uint64_t)1 << (64 - b);
     _mm_store_si128((__m128i*)atmp, a);
     for (i = 0; i<2; i++) {
-        res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b;         //if b=0 we are fine with any a
+        res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
     }
     return _mm_load_si128((__m128i*)res);
 }
 
 //**************Vector signed->unsigned saturating shift left by constant *************
 //*************************************************************************************
+uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
+_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
+{
+    //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
+    uint8x8_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi16 (a128, b);
+    r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
+    return64(r128);
+}
 
-uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHLU.S8 q0,q0,#0
-_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b)         // VQSHLU.S8 q0,q0,#0
+uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
+_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
+{
+    uint16x4_t res64;
+    __m128i a128, r128;
+    a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
+    r128 = _mm_slli_epi32 (a128, b); //shift_res
+    r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only
+    return64(r128);
+}
+
+uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b)
+{
+    int32x2_t res64;
+    return64( vqshluq_n_s32(_pM128i(a), b));
+}
+
+uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
+{
+    uint64x1_t res;
+    uint64_t limit;
+    if (a.m64_i64[0]<=0) {
+        res.m64_u64[0] = 0;
+    } else {
+        limit = (uint64_t) 1 << (64 - b);
+        res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? res.m64_u64[0] = ~((uint64_t)0) : a.m64_i64[0] << b;
+    }
+    return res;
+}
+
+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
 {
     __m128i a128, r128_1, r128_2;
-    a128 = _MM_CVTEPI8_EPI16 (a);         //SSE 4.1
+    a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
     r128_1 = _mm_slli_epi16 (a128, b);
     //swap hi and low part of a128 to process the remaining data
     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     a128 = _MM_CVTEPI8_EPI16 (a128);
     r128_2 = _mm_slli_epi16 (a128, b);
-    return _mm_packus_epi16 (r128_1, r128_2);         //saturated u8
+    return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
 }
 
-#if defined(USE_SSSE3)
-uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHLU.S16 q0,q0,#0
-_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b)         // VQSHLU.S16 q0,q0,#0
-{         // manual saturation solution looks LESS optimal than 32 bits conversion one
+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
+_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
+{
+    // manual saturation solution looks LESS optimal than 32 bits conversion one
     __m128i a128, r128_1, r128_2;
-    a128 = _MM_CVTEPI16_EPI32 (a);         //SSE 4.1
-    r128_1 = _mm_slli_epi32 (a128, b);         //shift_res
+    a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
+    r128_1 = _mm_slli_epi32 (a128, b); //shift_res
     //swap hi and low part of a128 to process the remaining data
     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
     a128 = _MM_CVTEPI16_EPI32 (a128);
     r128_2 = _mm_slli_epi32 (a128, b);
-    return _MM_PACKUS_EPI32 (r128_1, r128_2);         //saturated s16
+    return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16
 }
-#endif
 
-uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHLU.S32 q0,q0,#0
-_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b)         // VQSHLU.S32 q0,q0,#0
-{         //solution may be  not optimal compared with the serial one
+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
+_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
+{
+    //solution may be  not optimal compared with the serial one
     __m128i zero, maskA, maskGT0, a0,  a_masked, a_shift;
     zero = _mm_setzero_si128();
     maskA = _mm_cmpeq_epi32(a, a);
-    maskA = _mm_slli_epi32(maskA,(32 - b));         // b ones and (32-b)zeros
+    maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
     //saturate negative numbers to zero
-    maskGT0   = _mm_cmpgt_epi32 (a, zero);         // //0xffffffff if positive number and zero otherwise (negative numbers)
-    a0 = _mm_and_si128 (a,  maskGT0);         //negative are zeros now
+    maskGT0   = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers)
+    a0 = _mm_and_si128 (a,  maskGT0); //negative are zeros now
     //saturate positive to 0xffffffff
     a_masked = _mm_and_si128 (a0, maskA);
-    a_masked = _mm_cmpgt_epi32 (a_masked, zero);         //0xffffffff if saturation necessary 0 otherwise
+    a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
     a_shift = _mm_slli_epi32 (a0, b);
-    return _mm_or_si128 (a_shift, a_masked);         //actual saturation
+    return _mm_or_si128 (a_shift, a_masked); //actual saturation
 }
 
-uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHLU.S64 q0,q0,#0
+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b),  _NEON2SSE_REASON_SLOW_SERIAL)
-{         // no effective SIMD solution here, serial execution looks faster
+{
+    // no effective SIMD solution here, serial execution looks faster
     _NEON2SSE_ALIGN_16 int64_t atmp[2];
     _NEON2SSE_ALIGN_16 uint64_t res[2];
     uint64_t limit;
@@ -4324,23 +8490,437 @@
 
 //************** Vector narrowing  shift right by constant **************
 //**********************************************************************
+int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    r16  = vshrq_n_s16(a,b);
+    r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r16);
+}
+
+int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    r32  = vshrq_n_s32(a,b);
+    r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r32);
+}
+
+int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    __m128i r64;
+    r64  = vshrq_n_s64(a,b);
+    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i mask, r16;
+    mask = _mm_set1_epi16(0xff);
+    r16  = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16 = _mm_and_si128(r16, mask); //to avoid saturation
+    r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
+    return64(r16);
+}
+
+uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i mask, r32;
+    mask = _mm_set1_epi32(0xffff);
+    r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
+    r32 = _mm_and_si128(r32, mask); //to avoid saturation
+    r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+    uint32x2_t res64;
+    __m128i r64;
+    r64  = vshrq_n_u64(a,b);
+    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
 
 //************** Vector signed->unsigned narrowing saturating shift right by constant ********
 //*********************************************************************************************
+uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i r16;
+    r16  = vshrq_n_s16(a,b);
+    r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
+    return64(r16);
+}
+
+uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i r32;
+    r32  = vshrq_n_s32(a,b);
+    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow(signed to unsigned), use low 64 bits only
+    return64(r32);
+}
+
+uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
+{
+    _NEON2SSE_ALIGN_16 int64_t atmp[2];
+    uint32x2_t res;
+    int64_t res64;
+    _mm_store_si128((__m128i*)atmp, a);
+    if (atmp[0] < 0) {
+        res.m64_u32[0] = 0;
+    } else {
+        res64 = (atmp[0] >> b);
+        res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
+    }
+    if (atmp[1] < 0) {
+        res.m64_u32[1] = 0;
+    } else {
+        res64 = (atmp[1] >> b);
+        res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
+    }
+    return res;
+}
 
 //**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
+uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
+{
+    //solution may be not optimal compared with the serial one
+    __m128i r16;
+    uint8x8_t res64;
+    r16 = vrshrq_n_s16(a,b);
+    r16 =  _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
+    return64(r16);
+}
+
+uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
+{
+    //solution may be not optimal compared with the serial one
+    __m128i r32;
+    uint16x4_t res64;
+    r32 = vrshrq_n_s32(a,b);
+    r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow (signed to unsigned), use low 64 bits only
+    return64(r32);
+}
+
+uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
+{
+    _NEON2SSE_ALIGN_16 int64_t atmp[2];
+    uint32x2_t res;
+    int64_t res64;
+    _mm_store_si128((__m128i*)atmp, a);
+    if (atmp[0] < 0) {
+        res.m64_u32[0] = 0;
+    } else {
+        res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
+        res.m64_u32[0] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
+    }
+    if (atmp[1] < 0) {
+        res.m64_u32[1] = 0;
+    } else {
+        res64 = (atmp[1] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
+        res.m64_u32[1] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
+    }
+    return res;
+}
 
 //***** Vector narrowing saturating shift right by constant ******
 //*****************************************************************
+int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+    r16  = vshrq_n_s16(a,b);
+    r16  = _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    r32  = vshrq_n_s32(a,b);
+    r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //no optimal SIMD solution found
+    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
+    int32x2_t res;
+    _mm_store_si128((__m128i*)atmp, a);
+    res64[0] = (atmp[0] >> b);
+    res64[1] = (atmp[1] >> b);
+    if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
+    if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
+    if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
+    if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
+    res.m64_i32[0] = (int32_t)res64[0];
+    res.m64_i32[1] = (int32_t)res64[1];
+    return res;
+}
+
+uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i r16;
+    r16  = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i r32;
+    r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
+    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+    //serial solution may be faster
+    uint32x2_t res64;
+    __m128i r64, res_hi, zero;
+    zero = _mm_setzero_si128();
+    r64  = vshrq_n_u64(a,b);
+    res_hi = _mm_srli_epi64(r64,  32);
+    res_hi = _mm_cmpgt_epi32(res_hi, zero);
+    r64 = _mm_or_si128(r64, res_hi);
+    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
 
 //********* Vector rounding narrowing shift right by constant *************************
 //****************************************************************************************
+int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    r16  = vrshrq_n_s16(a,b);
+    r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r16);
+}
+
+int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    r32  = vrshrq_n_s32(a,b);
+    r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
+    return64(r32);
+}
+
+int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    __m128i r64;
+    r64  = vrshrq_n_s64(a,b);
+    r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
+
+uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i mask, r16;
+    mask = _mm_set1_epi16(0xff);
+    r16  = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16 = _mm_and_si128(r16, mask); //to avoid saturation
+    r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i mask, r32;
+    mask = _mm_set1_epi32(0xffff);
+    r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
+    r32 = _mm_and_si128(r32, mask); //to avoid saturation
+    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
+{
+    uint32x2_t res64;
+    __m128i r64;
+    r64  = vrshrq_n_u64(a,b);
+    r64  =  _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
 
 //************* Vector rounding narrowing saturating shift right by constant ************
 //****************************************************************************************
+int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
+_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
+{
+    int8x8_t res64;
+    __m128i r16;
+    r16  = vrshrq_n_s16(a,b);
+    r16  =  _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
+_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
+{
+    int16x4_t res64;
+    __m128i r32;
+    r32  = vrshrq_n_s32(a,b);
+    r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //no optimal SIMD solution found
+    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
+    int32x2_t res;
+    _mm_store_si128((__m128i*)atmp, a);
+    maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
+    res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
+    maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
+    res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
+    if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
+    if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
+    if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
+    if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
+    res.m64_i32[0] = (int32_t)res64[0];
+    res.m64_i32[1] = (int32_t)res64[1];
+    return res;
+}
+
+uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
+_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
+{
+    uint8x8_t res64;
+    __m128i r16;
+    r16  = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
+    r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
+    return64(r16);
+}
+
+uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
+_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
+{
+    uint16x4_t res64;
+    __m128i r32;
+    r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
+    r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
+    return64(r32);
+}
+
+uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
+_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
+{
+    //serial solution may be faster
+    uint32x2_t res64;
+    __m128i r64, res_hi, zero;
+    zero = _mm_setzero_si128();
+    r64  = vrshrq_n_u64(a,b);
+    res_hi = _mm_srli_epi64(r64,  32);
+    res_hi = _mm_cmpgt_epi32(res_hi, zero);
+    r64 = _mm_or_si128(r64, res_hi);
+    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(r64);
+}
 
 //************** Vector widening shift left by constant ****************
 //************************************************************************
+int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
+_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
+{
+    __m128i r;
+    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
+    return _mm_slli_epi16 (r, b);
+}
+
+int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
+_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
+{
+    __m128i r;
+    r =  _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
+    return _mm_slli_epi32 (r, b);
+}
+
+int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
+_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
+{
+    __m128i r;
+    r =  _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
+    return _mm_slli_epi64 (r, b);
+}
+
+uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
+_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
+{
+    //no uint8 to uint16 conversion available, manual conversion used
+    __m128i zero,  r;
+    zero = _mm_setzero_si128 ();
+    r = _mm_unpacklo_epi8(_pM128i(a), zero);
+    return _mm_slli_epi16 (r, b);
+}
+
+uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
+_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
+{
+    //no uint16 to uint32 conversion available, manual conversion used
+    __m128i zero,  r;
+    zero = _mm_setzero_si128 ();
+    r = _mm_unpacklo_epi16(_pM128i(a), zero);
+    return _mm_slli_epi32 (r, b);
+}
+
+uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
+_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
+{
+    //no uint32 to uint64 conversion available, manual conversion used
+    __m128i zero,  r;
+    zero = _mm_setzero_si128 ();
+    r = _mm_unpacklo_epi32(_pM128i(a), zero);
+    return _mm_slli_epi64 (r, b);
+}
 
 //************************************************************************************
 //**************************** Shifts with insert ************************************
@@ -4351,138 +8931,247 @@
 //**************** Vector shift right and insert ************************************
 //Actually the "c" left bits from "a" are the only bits remained from "a"  after the shift.
 //All other bits are taken from b shifted.
+int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c)
+{
+    int8x8_t res64;
+    return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
+}
 
-int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
-_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c)         // VSRI.8 q0,q0,#8
+
+int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c)
+{
+    int16x4_t res64;
+    return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
+}
+
+
+int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c)
+{
+    int32x2_t res64;
+    return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
+}
+
+
+int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
+{
+    int64x1_t res;
+    if (c ==64)
+        res = a;
+    else{
+        res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros
+    }
+    return res;
+}
+
+uint8x8_t vsri_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+#define vsri_n_u8 vsri_n_s8
+
+uint16x4_t vsri_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+#define vsri_n_u16 vsri_n_s16
+
+uint32x2_t vsri_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
+#define vsri_n_u32 vsri_n_s32
+
+
+uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
+#define vsri_n_u64 vsri_n_s64
+
+poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
+#define vsri_n_p8 vsri_n_u8
+
+poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
+#define vsri_n_p16 vsri_n_u16
+
+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
 {
     __m128i maskA, a_masked;
     uint8x16_t b_shift;
-    _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};         //"a" bits mask, 0 bit not used
-    maskA = _mm_set1_epi8(maskLeft[c]);         // c ones and (8-c)zeros
+    _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
+    maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
     a_masked = _mm_and_si128 (a, maskA);
-    b_shift = vshrq_n_u8( b, c);         // c zeros on the left in b due to logical shift
-    return _mm_or_si128 (a_masked, b_shift);         //combine (insert b into a)
+    b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
+    return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
 }
 
-int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
-_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c)         // VSRI.16 q0,q0,#16
-{         //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
+{
+    //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
     uint16x8_t b_shift;
     uint16x8_t a_c;
-    b_shift = vshrq_n_u16( b, c);         // c zeros on the left in b due to logical shift
+    b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
     a_c = vshrq_n_u16( a, (16 - c));
-    a_c  = _mm_slli_epi16(a_c, (16 - c));         //logical shift provides right "c" bits zeros in a
-    return _mm_or_si128 (a_c, b_shift);         //combine (insert b into a)
+    a_c  = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
 }
 
-int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
-_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c)         // VSRI.32 q0,q0,#32
-{         //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
+{
+    //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
     uint32x4_t b_shift;
     uint32x4_t a_c;
-    b_shift = vshrq_n_u32( b, c);         // c zeros on the left in b due to logical shift
+    b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
     a_c = vshrq_n_u32( a, (32 - c));
-    a_c  = _mm_slli_epi32(a_c, (32 - c));         //logical shift provides right "c" bits zeros in a
-    return _mm_or_si128 (a_c, b_shift);         //combine (insert b into a)
+    a_c  = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
 }
 
-int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
 _NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
-{         //serial solution may be faster
+{
+    //serial solution may be faster
     uint64x2_t b_shift;
     uint64x2_t a_c;
-    b_shift = _mm_srli_epi64(b, c);         // c zeros on the left in b due to logical shift
+    b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
     a_c = _mm_srli_epi64(a, (64 - c));
-    a_c  = _mm_slli_epi64(a_c, (64 - c));         //logical shift provides right "c" bits zeros in a
-    return _mm_or_si128 (a_c, b_shift);         //combine (insert b into a)
+    a_c  = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
 }
 
-uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
 #define vsriq_n_u8 vsriq_n_s8
 
-uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
 #define vsriq_n_u16 vsriq_n_s16
 
-uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
 #define vsriq_n_u32 vsriq_n_s32
 
-uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
 #define vsriq_n_u64 vsriq_n_s64
 
-poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
 #define vsriq_n_p8 vsriq_n_u8
 
-poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
 #define vsriq_n_p16 vsriq_n_u16
 
 //***** Vector shift left and insert *********************************************
 //*********************************************************************************
 //Actually the "c" right bits from "a" are the only bits remained from "a"  after the shift.
 //All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted".
+int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c)
+{
+    int8x8_t res64;
+    return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
+}
 
-int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
-_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c)         // VSLI.8 q0,q0,#0
+
+int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c)
+{
+    int16x4_t res64;
+    return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
+}
+
+
+int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c)
+{
+    int32x2_t res64;
+    return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
+}
+
+int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
+{
+    int64x1_t res;
+    res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
+    return res;
+}
+
+
+uint8x8_t vsli_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+#define vsli_n_u8 vsli_n_s8
+
+uint16x4_t vsli_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+#define vsli_n_u16 vsli_n_s16
+
+uint32x2_t vsli_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
+#define vsli_n_u32 vsli_n_s32
+
+uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
+#define vsli_n_u64 vsli_n_s64
+
+poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
+#define vsli_n_p8 vsli_n_u8
+
+poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
+#define vsli_n_p16 vsli_n_u16
+
+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
+_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
 {
     __m128i maskA, a_masked;
     int8x16_t b_shift;
-    _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f};         //"a" bits mask
-    maskA = _mm_set1_epi8(maskRight[c]);         // (8-c)zeros and c ones
+    _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
+    maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
     b_shift = vshlq_n_s8( b, c);
     a_masked = _mm_and_si128 (a, maskA);
-    return _mm_or_si128 (b_shift, a_masked);         //combine (insert b into a)
+    return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
 }
 
-int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
-_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c)         // VSLI.16 q0,q0,#0
-{         //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
+_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
+{
+    //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
     int16x8_t b_shift;
     int16x8_t a_c;
     b_shift = vshlq_n_s16( b, c);
     a_c = vshlq_n_s16( a, (16 - c));
     a_c  = _mm_srli_epi16(a_c, (16 - c));
-    return _mm_or_si128 (b_shift, a_c);         //combine (insert b into a)
+    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
 }
 
-int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
-_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c)         // VSLI.32 q0,q0,#0
-{         //solution may be  not optimal compared with the serial one
-      //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
+_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
+{
+    //solution may be  not optimal compared with the serial one
+    //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
     int32x4_t b_shift;
     int32x4_t a_c;
     b_shift = vshlq_n_s32( b, c);
     a_c = vshlq_n_s32( a, (32 - c));
     a_c  = _mm_srli_epi32(a_c, (32 - c));
-    return _mm_or_si128 (b_shift, a_c);         //combine (insert b into a)
+    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
 }
 
-int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
-_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c)         // VSLI.64 q0,q0,#0
-{         //solution may be  not optimal compared with the serial one
-      //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
+_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
+{
+    //solution may be  not optimal compared with the serial one
+    //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
     int64x2_t b_shift;
     int64x2_t a_c;
     b_shift = vshlq_n_s64( b, c);
     a_c = vshlq_n_s64( a, (64 - c));
     a_c  = _mm_srli_epi64(a_c, (64 - c));
-    return _mm_or_si128 (b_shift, a_c);         //combine (insert b into a)
+    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
 }
 
-uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
 #define vsliq_n_u8 vsliq_n_s8
 
-uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
 #define vsliq_n_u16 vsliq_n_s16
 
-uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
 #define vsliq_n_u32 vsliq_n_s32
 
-uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
 #define vsliq_n_u64 vsliq_n_s64
 
-poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
 #define vsliq_n_p8 vsliq_n_u8
 
-poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
 #define vsliq_n_p16 vsliq_n_u16
 
 // ***********************************************************************************************
@@ -4498,31 +9187,31 @@
 #define LOAD_SI128(ptr) \
         ( ((unsigned long)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr));
 
-uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
 #define vld1q_u8 LOAD_SI128
 
-uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
 #define vld1q_u16 LOAD_SI128
 
-uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
 #define vld1q_u32 LOAD_SI128
 
-uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
 #define vld1q_u64 LOAD_SI128
 
-int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
 #define vld1q_s8 LOAD_SI128
 
-int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
 #define vld1q_s16 LOAD_SI128
 
-int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
 #define vld1q_s32 LOAD_SI128
 
-int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
 #define vld1q_s64 LOAD_SI128
 
-float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr);         // VLD1.16 {d0, d1}, [r0]
+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
 /* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
 {__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
@@ -4530,114 +9219,292 @@
 f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
 }*/
 
-float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
 _NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
 {
-    if( (((unsigned long)(ptr)) & 15 ) == 0 )         //16 bits aligned
+    if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16 bits aligned
         return _mm_load_ps(ptr);
     else
         return _mm_loadu_ps(ptr);
 }
 
-poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
 #define vld1q_p8  LOAD_SI128
 
-poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
 #define vld1q_p16 LOAD_SI128
 
+uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_u8(ptr)  *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
+
+uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_u16 vld1_u8
+
+uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
+#define vld1_u32 vld1_u8
+
+
+uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_u64 vld1_u8
+
+int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_s8 vld1_u8
+
+int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_s16 vld1_u16
+
+int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
+#define vld1_s32 vld1_u32
+
+int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_s64 vld1_u64
+
+float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
 
+float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
+{
+    float32x2_t res;
+    res.m64_f32[0] = *(ptr);
+    res.m64_f32[1] = *(ptr + 1);
+    return res;
+}
+
+poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
+#define vld1_p8 vld1_u8
+
+poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
+#define vld1_p16 vld1_u16
+
 //***********************************************************************************************************
 //******* Lane load functions - insert the data at  vector's given position (lane) *************************
 //***********************************************************************************************************
-uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane);         // VLD1.8 {d0[0]}, [r0]
+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
 #define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
 
-uint16x8_t vld1q_lane_u16(__transfersize(1)    uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
+uint16x8_t vld1q_lane_u16(__transfersize(1)    uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
 #define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
 
-uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
 #define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
 
-uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane);         // VLD1.64 {d0}, [r0]
-#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane);         // _p;
+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
+#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p;
 
-int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane);         // VLD1.8 {d0[0]}, [r0]
+
+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
 #define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
 
-int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
 #define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
 
-int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
 #define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
 
+float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
 //current IA SIMD doesn't support float16
 
-float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
 _NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
-{         //we need to deal with  ptr  16bit NOT aligned case
+{
+    //we need to deal with  ptr  16bit NOT aligned case
     __m128 p;
     p = _mm_set1_ps(*(ptr));
     return _MM_INSERT_PS(vec,  p, _INSERTPS_NDX(0, lane));
 }
 
-int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane);         // VLD1.64 {d0}, [r0]
+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
 #define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
 
-poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane);         // VLD1.8 {d0[0]}, [r0]
+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
 #define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
 
-poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
 #define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
 
-//serial solution may be faster
+uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
+{
+    uint8x8_t res;
+    res = vec;
+    res.m64_u8[lane] = *(ptr);
+    return res;
+}
 
+uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
+{
+    uint16x4_t res;
+    res = vec;
+    res.m64_u16[lane] = *(ptr);
+    return res;
+}
+
+uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
+{
+    uint32x2_t res;
+    res = vec;
+    res.m64_u32[lane] = *(ptr);
+    return res;
+}
+
+uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
+_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
+{
+    uint64x1_t res;
+    res.m64_u64[0] = *(ptr);
+    return res;
+}
+
+
+int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
+
+int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
+
+int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
+
+float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
 //current IA SIMD doesn't support float16
 
+float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
+{
+    float32x2_t res;
+    res = vec;
+    res.m64_f32[lane] = *(ptr);
+    return res;
+}
+
+int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
+#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
+
+poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
+#define vld1_lane_p8 vld1_lane_u8
+
+poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
+#define vld1_lane_p16 vld1_lane_s16
+
 // ****************** Load single value ( set all lanes of vector with same value from memory)**********************
 // ******************************************************************************************************************
-uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
 #define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
 
-uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
 #define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
 
-uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
 #define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
 
-uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr);         // VLD1.64 {d0}, [r0]
+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
 _NEON2SSE_INLINE uint64x2_t   vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
 {
     _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
     return LOAD_SI128(val);
 }
 
-int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
 #define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
 
-int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
 #define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
 
-int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
 #define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
 
-int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr);         // VLD1.64 {d0}, [r0]
+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
 #define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
 
-float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr);         // VLD1.16 {d0[]}, [r0]
+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
 //current IA SIMD doesn't support float16, need to go to 32 bits
 
-float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
 #define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
 
-poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
 #define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
 
-poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
 #define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
 
+uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint8x8_t res;
+    int i;
+    for(i = 0; i<8; i++) {
+        res.m64_u8[i] =  *(ptr);
+    }
+    return res;
+}
+
+uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint16x4_t res;
+    int i;
+    for(i = 0; i<4; i++) {
+        res.m64_u16[i] =  *(ptr);
+    }
+    return res;
+}
+
+uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint32x2_t res;
+    res.m64_u32[0] = *(ptr);
+    res.m64_u32[1] = *(ptr);
+    return res;
+}
+
+uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
+_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
+{
+    uint64x1_t res;
+    res.m64_u64[0] = *(ptr);
+    return res;
+}
+
+int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
+
+
+int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
+
+
+int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
+
+
+int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
+#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
+
+float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
 //current IA SIMD doesn't support float16
 
+float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
+_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
+{
+    float32x2_t res;
+    res.m64_f32[0] = *(ptr);
+    res.m64_f32[1] = res.m64_f32[0];
+    return res; // use last 64bits only
+}
+
+poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
+#define vld1_dup_p8 vld1_dup_u8
+
+
+poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
+#define vld1_dup_p16 vld1_dup_u16
+
+
 //*************************************************************************************
 //********************************* Store **********************************************
 //*************************************************************************************
@@ -4646,80 +9513,148 @@
 #define STORE_SI128(ptr, val) \
         (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
 
-void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val);         // VST1.8 {d0, d1}, [r0]
+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
 #define vst1q_u8 STORE_SI128
 
-void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val);         // VST1.16 {d0, d1}, [r0]
+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
 #define vst1q_u16 STORE_SI128
 
-void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val);         // VST1.32 {d0, d1}, [r0]
+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
 #define vst1q_u32 STORE_SI128
 
-void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val);         // VST1.64 {d0, d1}, [r0]
+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
 #define vst1q_u64 STORE_SI128
 
-void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val);         // VST1.8 {d0, d1}, [r0]
+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
 #define vst1q_s8 STORE_SI128
 
-void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val);         // VST1.16 {d0, d1}, [r0]
+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
 #define vst1q_s16 STORE_SI128
 
-void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val);         // VST1.32 {d0, d1}, [r0]
+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
 #define vst1q_s32 STORE_SI128
 
-void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val);         // VST1.64 {d0, d1}, [r0]
+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
 #define vst1q_s64 STORE_SI128
 
-void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val);         // VST1.16 {d0, d1}, [r0]
+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently
 
-void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val);         // VST1.32 {d0, d1}, [r0]
+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
 _NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
 {
-    if( ((unsigned long)(ptr) & 15)  == 0 )         //16 bits aligned
+    if( ((unsigned long)(ptr) & 15)  == 0 ) //16 bits aligned
         _mm_store_ps (ptr, val);
     else
         _mm_storeu_ps (ptr, val);
 }
 
-void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val);         // VST1.8 {d0, d1}, [r0]
+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
 #define vst1q_p8  vst1q_u8
 
-void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val);         // VST1.16 {d0, d1}, [r0]
+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
 #define vst1q_p16 vst1q_u16
 
+void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
+{
+    int i;
+    for (i = 0; i<8; i++) {
+        *(ptr + i) = ((uint8_t*)&val)[i];
+    }
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
+{
+    int i;
+    for (i = 0; i<4; i++) {
+        *(ptr + i) = ((uint16_t*)&val)[i];
+    }
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
+{
+    int i;
+    for (i = 0; i<2; i++) {
+        *(ptr + i) = ((uint32_t*)&val)[i];
+    }
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
+{
+    *(ptr) = *((uint64_t*)&val);
+    //_mm_storel_epi64((__m128i*)ptr, val);
+    return;
+}
+
+void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
+#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
+
+void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
+#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
+
+void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
+#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
+
+void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
+#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
+
+void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
 //current IA SIMD doesn't support float16
 
+void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
+{
+    *(ptr) =   val.m64_f32[0];
+    *(ptr + 1) = val.m64_f32[1];
+    return;
+}
+
+void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
+#define vst1_p8 vst1_u8
+
+void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
+#define vst1_p16 vst1_u16
+
 //***********Store a lane of a vector into memory (extract given lane) *********************
 //******************************************************************************************
-void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane);         // VST1.8 {d0[0]}, [r0]
+void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
 #define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
 
-void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
+void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
 #define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
 
-void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane);         // VST1.32 {d0[0]}, [r0]
+void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
 #define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
 
-void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane);         // VST1.64 {d0}, [r0]
+void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
 #define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
 
-void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane);         // VST1.8 {d0[0]}, [r0]
+void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
 #define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
 
-void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
+void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
 #define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
 
-void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane);         // VST1.32 {d0[0]}, [r0]
+void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
 #define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
 
-void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane);         // VST1.64 {d0}, [r0]
+void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
 #define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
 
-void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
+void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
 //current IA SIMD doesn't support float16
 
-void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane);         // VST1.32 {d0[0]}, [r0]
+void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
 _NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
 {
     int32_t ilane;
@@ -4727,22 +9662,73 @@
     *(ptr) =  *((float*)&ilane);
 }
 
-void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane);         // VST1.8 {d0[0]}, [r0]
+void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
 #define vst1q_lane_p8   vst1q_lane_u8
 
-void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
+void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
 #define vst1q_lane_p16   vst1q_lane_s16
 
+void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
+{
+    *(ptr) = val.m64_u8[lane];
+}
+
+void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
+{
+    *(ptr) = val.m64_u16[lane];
+}
+
+void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
+{
+    *(ptr) = val.m64_u32[lane];
+}
+
+void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
+_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
+{
+    *(ptr) = val.m64_u64[0];
+}
+
+void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+#define  vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
+
+void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
+
+void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+#define vst1_lane_s32(ptr, val, lane)  vst1_lane_u32((uint32_t*)ptr, val, lane)
+
+
+void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
+#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
+
+
+void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
 //current IA SIMD doesn't support float16
 
+void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
+{
+    *(ptr) = val.m64_f32[lane];
+}
+
+void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
+#define vst1_lane_p8 vst1_lane_u8
+
+void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
+#define vst1_lane_p16 vst1_lane_s16
+
 //***********************************************************************************************
 //**************** Loads and stores of an N-element structure **********************************
 //***********************************************************************************************
 //These intrinsics load or store an n-element structure. The array structures are defined in the beginning
 //We assume ptr is NOT aligned in general case, for more details see  "Loads and stores of a single vector functions"
 //****************** 2 elements load  *********************************************
-uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
-_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr)         // VLD2.8 {d0, d2}, [r0]
+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
 {
     uint8x16x2_t v;
     v.val[0] = vld1q_u8(ptr);
@@ -4751,9 +9737,8 @@
     return v;
 }
 
-#if defined(USE_SSSE3)
-uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
-_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr)         // VLD2.16 {d0, d2}, [r0]
+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
 {
     uint16x8x2_t v;
     v.val[0] = vld1q_u16( ptr);
@@ -4761,10 +9746,9 @@
     v = vuzpq_s16(v.val[0], v.val[1]);
     return v;
 }
-#endif
 
-uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
-_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr)         // VLD2.32 {d0, d2}, [r0]
+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
 {
     uint32x4x2_t v;
     v.val[0] = vld1q_u32 ( ptr);
@@ -4776,19 +9760,18 @@
 int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
 #define  vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
 
-#if defined(USE_SSSE3)
-int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
 #define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
-#endif
 
-int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
 #define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
 
-float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr);         // VLD2.16 {d0, d2}, [r0]
+
+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
 
-float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
-_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr)         // VLD2.32 {d0, d2}, [r0]
+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
 {
     float32x4x2_t v;
     v.val[0] =  vld1q_f32 (ptr);
@@ -4797,114 +9780,106 @@
     return v;
 }
 
-poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
 #define  vld2q_p8 vld2q_u8
 
-#if defined(USE_SSSE3)
-poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
 #define vld2q_p16 vld2q_u16
-#endif
 
-#if defined(USE_SSSE3)
-uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
 _NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
 {
     uint8x8x2_t v;
     _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
     __m128i ld128;
-    ld128 = vld1q_u8(ptr);         //merge two 64-bits in 128 bit
-    v.val[0] = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd);
-    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit
+    ld128 =  _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd);
+    vst1q_u8((v.val), ld128); //  v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
     return v;
 }
-#endif
 
-#if defined(USE_SSSE3)
-uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
 _NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
 {
-    uint16x4x2_t v;
+    _NEON2SSE_ALIGN_16 uint16x4x2_t v;
     _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
     __m128i ld128;
-    ld128 = vld1q_u16(ptr);         //merge two 64-bits in 128 bit
-    v.val[0] = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd);
-    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit
+    ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd);
+    vst1q_u16((v.val), ld128);
     return v;
 }
-#endif
 
-uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
 _NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
 {
-    uint32x2x2_t v;
+    _NEON2SSE_ALIGN_16 uint32x2x2_t v;
     __m128i ld128;
-    ld128 = vld1q_u32(ptr);         //merge two 64-bits in 128 bit
-    v.val[0] = _mm_shuffle_epi32(ld128,  0 | (2 << 2) | (1 << 4) | (3 << 6));
-    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit
+    ld128 = _mm_shuffle_epi32(ld128,  0 | (2 << 2) | (1 << 4) | (3 << 6));
+    vst1q_u32((v.val), ld128);
     return v;
 }
 
-uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
 _NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
 {
     uint64x1x2_t v;
-    v.val[0] = vld1q_u64(ptr);
-    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    v.val[0].m64_u64[0] = *(ptr);
+    v.val[1].m64_u64[0] = *(ptr + 1);
     return v;
 }
 
-#if defined(USE_SSSE3)
-int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
 #define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
 
-int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
 #define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
-#endif
 
-int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
 #define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
 
-int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
 #define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
 
-float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr);         // VLD2.16 {d0, d1}, [r0]
+float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
 
-float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
 _NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
 {
     float32x2x2_t v;
-    v.val[0] = vld1q_f32(ptr);
-    v.val[0] = _mm_shuffle_ps(v.val[0], v.val[0], _MM_SHUFFLE(3,1, 2, 0));
-    v.val[1] = _mm_movehl_ps(v.val[0],v.val[0]);
+    v.val[0].m64_f32[0] = *(ptr);
+    v.val[0].m64_f32[1] = *(ptr + 2);
+    v.val[1].m64_f32[0] = *(ptr + 1);
+    v.val[1].m64_f32[1] = *(ptr + 3);
     return v;
 }
 
-#if defined(USE_SSSE3)
-poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
 #define vld2_p8 vld2_u8
 
-poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
 #define vld2_p16 vld2_u16
-#endif
 
 //******************** Triplets ***************************************
 //*********************************************************************
-#if defined(USE_SSSE3)
-uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
-_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr)         // VLD3.8 {d0, d2, d4}, [r0]
-{  //a0,a1,a2,a3,...a7,a8,...a15,  b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
-   //a:0,3,6,9,12,15,b:2,5,8,11,14,  c:1,4,7,10,13
-   //a:1,4,7,10,13,  b:0,3,6,9,12,15,c:2,5,8,11,14,
-   //a:2,5,8,11,14,  b:1,4,7,10,13,  c:0,3,6,9,12,15
+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
+{
+    //a0,a1,a2,a3,...a7,a8,...a15,  b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
+    //a:0,3,6,9,12,15,b:2,5,8,11,14,  c:1,4,7,10,13
+    //a:1,4,7,10,13,  b:0,3,6,9,12,15,c:2,5,8,11,14,
+    //a:2,5,8,11,14,  b:1,4,7,10,13,  c:0,3,6,9,12,15
     uint8x16x3_t v;
     __m128i tmp0, tmp1,tmp2, tmp3;
     _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
     _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
     _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
 
-    v.val[0] =  vld1q_u8 (ptr);        //a0,a1,a2,a3,...a7, ...a15
-    v.val[1] =  vld1q_u8 ((ptr + 16));	//b0,b1,b2,b3...b7, ...b15
-    v.val[2] =  vld1q_u8 ((ptr + 32));  //c0,c1,c2,c3,...c7,...c15
+    v.val[0] =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
+    v.val[1] =  vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
+    v.val[2] =  vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
 
     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11
     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
@@ -4915,43 +9890,42 @@
     tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
     tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
     v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
-    v.val[0] = _mm_or_si128(v.val[0],tmp3) ;//a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
+    v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
 
-    tmp3 = _mm_slli_si128(tmp0, 5);//0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
+    tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
     tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
     v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
-    v.val[1] = _mm_slli_si128(v.val[1], 5);//0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
-    v.val[1] = _mm_or_si128(v.val[1],tmp3);//a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
-    v.val[1] =	_mm_slli_si128(v.val[1],5);//0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
-    v.val[1] = _mm_srli_si128(v.val[1], 5);//a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
+    v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
+    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
+    v.val[1] =  _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
+    v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
     tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
-    tmp3 = _mm_slli_si128(tmp3,11);//0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
-    v.val[1] = _mm_or_si128(v.val[1],tmp3);//a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
+    tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
+    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
 
     tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
     v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
-    v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
-    v.val[2] = _mm_or_si128(v.val[2],tmp3);//0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
+    v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
+    v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
     tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
-    v.val[2] = _mm_or_si128(v.val[2],tmp0);//a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
+    v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
     return v;
 }
-#endif
 
-#if defined(USE_SSSE3)
-uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
-_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr)         // VLD3.16 {d0, d2, d4}, [r0]
-{  //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
+{
+    //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
     uint16x8x3_t v;
     __m128i tmp0, tmp1,tmp2, tmp3;
     _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
     _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
     _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
 
-    v.val[0] =  vld1q_u16 (ptr);        //a0,a1,a2,a3,...a7,
-    v.val[1] =  vld1q_u16 ((ptr + 8));	//b0,b1,b2,b3...b7
-    v.val[2] =  vld1q_u16 ((ptr + 16));  //c0,c1,c2,c3,...c7
+    v.val[0] =  vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
+    v.val[1] =  vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
+    v.val[2] =  vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
 
     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
@@ -4962,38 +9936,38 @@
     tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
     tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
     v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
-    v.val[0] = _mm_or_si128(v.val[0],tmp3);//a0,a3,a6,b1,b4,b7,c2,c5
+    v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
 
-    tmp3 = _mm_slli_si128(tmp0, 4);//0,0,a0,a3,a6,a1,a4,a7
+    tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
     tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
     v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
     v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
-    v.val[1] = _mm_or_si128(v.val[1],tmp3);//a1,a4,a7,b2,b5,b0,b3,b6,
-    v.val[1] =	_mm_slli_si128(v.val[1],6);//0,0,0,a1,a4,a7,b2,b5,
-    v.val[1] = _mm_srli_si128(v.val[1], 6);//a1,a4,a7,b2,b5,0,0,0,
-    tmp3 = _mm_srli_si128(tmp2,4);  //c0,c3,c6, c1,c4,c7,0,0
-    tmp3 = _mm_slli_si128(tmp3,10);  //0,0,0,0,0,c0,c3,c6,
+    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
+    v.val[1] =  _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
+    v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
+    tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
+    tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
 
     tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
     v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
-    v.val[2] = _mm_slli_si128(v.val[2],4);//0,0, b0,b3,b6,0,0,0
-    v.val[2] = _mm_or_si128(v.val[2],tmp3);//0,0, b0,b3,b6,c1,c4,c7,
+    v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
+    v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
     tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
-    v.val[2] = _mm_or_si128(v.val[2],tmp0);//a2,a5,b0,b3,b6,c1,c4,c7,
+    v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
     return v;
 }
-#endif
 
-uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
-_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr)         // VLD3.32 {d0, d2, d4}, [r0]
-{//a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
+{
+    //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
     uint32x4x3_t v;
     __m128i tmp0, tmp1,tmp2, tmp3;
-    v.val[0] =  vld1q_u32 (ptr);        //a0,a1,a2,a3,
-    v.val[1] =  vld1q_u32 ((ptr + 4));	//b0,b1,b2,b3
-    v.val[2] =  vld1q_u32 ((ptr + 8));  //c0,c1,c2,c3,
+    v.val[0] =  vld1q_u32 (ptr); //a0,a1,a2,a3,
+    v.val[1] =  vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
+    v.val[2] =  vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
 
     tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
     tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
@@ -5008,28 +9982,27 @@
     return v;
 }
 
-#if defined(USE_SSSE3)
-int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
 #define  vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
 
-int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
 #define  vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
-#endif
 
-int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
 #define  vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
 
-float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
 
-float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
-_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr)         // VLD3.32 {d0, d2, d4}, [r0]
-{ //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
+{
+    //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
     float32x4x3_t v;
     __m128 tmp0, tmp1,tmp2, tmp3;
-    v.val[0] =  vld1q_f32 (ptr);        //a0,a1,a2,a3,
-    v.val[1] =  vld1q_f32 ((ptr + 4));	//b0,b1,b2,b3
-    v.val[2] =  vld1q_f32 ((ptr + 8));  //c0,c1,c2,c3,
+    v.val[0] =  vld1q_f32 (ptr); //a0,a1,a2,a3,
+    v.val[1] =  vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
+    v.val[2] =  vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
 
     tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
     tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
@@ -5044,162 +10017,171 @@
     return v;
 }
 
-#if defined(USE_SSSE3)
-poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
 #define vld3q_p8 vld3q_u8
 
-poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
 #define vld3q_p16 vld3q_u16
-#endif
 
-#if defined(USE_SSSE3)
-uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
-_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr)         // VLD3.8 {d0, d1, d2}, [r0]
-{ //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
+{
+    //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
     uint8x8x3_t v;
-    __m128i tmp0, tmp1;
+    __m128i val0, val1, val2, tmp0, tmp1;
     _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
     _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
-    v.val[0] =  vld1q_u8 (ptr);        //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
+    val0 =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
+    val2 =  _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
 
-    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
-    tmp1 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
-    v.val[0] = _mm_slli_si128(tmp0,10);
-    v.val[0] = _mm_srli_si128(v.val[0],10);  //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
-    v.val[2] = _mm_slli_si128(tmp1,6);//0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
-    v.val[0] = _mm_or_si128(v.val[0],v.val[2]) ;//a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
+    tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
+    tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
+    val0 = _mm_slli_si128(tmp0,10);
+    val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
+    val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
+    val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
+    _M64(v.val[0], val0);
+    val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
+    val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
+    val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
+    val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
+    val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
+    _M64(v.val[1], val1);
 
-    v.val[1] = _mm_slli_si128(tmp0,5);  //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
-    v.val[1] = _mm_srli_si128(v.val[1],11);  //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
-    v.val[2] = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
-    v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
-    v.val[1] = _mm_or_si128(v.val[1],v.val[2]) ;//a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
-
-    tmp0 = _mm_srli_si128(tmp0,11);  //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
-    v.val[2] = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
-    v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,c1,c4,c7,
-    v.val[2] = _mm_or_si128(tmp0, v.val[2]) ;//a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
+    tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
+    val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
+    val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
+    val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
+    _M64(v.val[2], val2);
     return v;
 }
-#endif
 
-#if defined(USE_SSSE3)
-uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
-_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr)         // VLD3.16 {d0, d1, d2}, [r0]
-{ //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
+{
+    //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
     uint16x4x3_t v;
-    __m128i tmp0, tmp1;
+    __m128i val0, val1, val2, tmp0, tmp1;
     _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
-    v.val[0] =  vld1q_u16 (ptr);        //a0,a1,a2,a3,  b0,b1,b2,b3
+    val0 =  vld1q_u16 (ptr); //a0,a1,a2,a3,  b0,b1,b2,b3
+    val2 =  _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
 
-    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
-    tmp1 = _mm_shufflelo_epi16(v.val[2], 201); //11 00 10 01     : c1, c2, c0, c3,
-    v.val[0] = _mm_slli_si128(tmp0,10);
-    v.val[0] = _mm_srli_si128(v.val[0],10);  //a0, a3, b2, 0,0, 0,0,
-    v.val[2] = _mm_slli_si128(tmp1,14);//0,0,0,0,0,0,0,c1
-    v.val[2] = _mm_srli_si128(v.val[2],8);//0,0,0,c1,0,0,0,0
-    v.val[0] = _mm_or_si128(v.val[0],v.val[2]) ;//a0, a3, b2, c1, x,x,x,x
+    tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
+    tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01     : c1, c2, c0, c3,
+    val0 = _mm_slli_si128(tmp0,10);
+    val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
+    val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
+    val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
+    val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
+    _M64(v.val[0], val0);
 
-    v.val[1] = _mm_slli_si128(tmp0,4);  //0,0,0,0,0,a1, b0, b3
-    v.val[1] = _mm_srli_si128(v.val[1],10);  //a1, b0, b3, 0,0, 0,0,
-    v.val[2] = _mm_srli_si128(tmp1,2);//c2, 0,0,0,0,0,0,0,
-    v.val[2] = _mm_slli_si128(v.val[2],6);//0,0,0,c2,0,0,0,0
-    v.val[1] = _mm_or_si128(v.val[1],v.val[2]); //a1, b0, b3, c2, x,x,x,x
+    val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
+    val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
+    val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
+    val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
+    val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
+    _M64(v.val[1], val1);
 
     tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
     tmp1 = _mm_srli_si128(tmp1,4);
-    tmp1 = _mm_slli_si128(tmp1,4);  //0,0,c0, c3,
-    v.val[2] = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
+    tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
+    val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
+    _M64(v.val[2], val2);
     return v;
 }
-#endif
 
-uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
-_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr)         // VLD3.32 {d0, d1, d2}, [r0]
-{ //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
+{
+    //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
     uint32x2x3_t v;
-    v.val[0] =  vld1q_u32 (ptr);        //a0,a1,  b0,b1,
+    __m128i val0, val1, val2;
+    val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
+    val2 =   _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
 
-    v.val[0] = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6));  //a0,b1, a1, b0
-    v.val[2] =  _mm_slli_si128(v.val[2], 8);  //x, x,c0,c1,
-    v.val[1] =  _mm_unpackhi_epi32(v.val[0],v.val[2]); //a1,c0, b0, c1
-    v.val[2] =  _mm_srli_si128(v.val[1], 8);  //b0, c1, x, x,
+    val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
+    _M64(v.val[0], val0);
+    val2 =  _mm_slli_si128(val2, 8); //x, x,c0,c1,
+    val1 =  _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
+    _M64(v.val[1], val1);
+    val2 =  _mm_srli_si128(val1, 8); //b0, c1, x, x,
+    _M64(v.val[2], val2);
     return v;
 }
-uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
-_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2}, [r0]
+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
 {
     uint64x1x3_t v;
-    v.val[0] = vld1q_u64 (ptr);
-    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    v.val[0].m64_u64[0] = *(ptr);
+    v.val[1].m64_u64[0] = *(ptr + 1);
+    v.val[2].m64_u64[0] = *(ptr + 2);
     return v;
 }
 
-#if defined(USE_SSSE3)
-int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
 #define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
 
-int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
 #define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
-#endif
 
-int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
 #define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
 
-int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
 #define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
 
-float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
 
-float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
 _NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
-{ //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
+{
+    //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
     float32x2x3_t v;
-    v.val[0] =  vld1q_f32 (ptr);        //a0,a1,  b0,b1,
+    v.val[0].m64_f32[0] = *(ptr);
+    v.val[0].m64_f32[1] = *(ptr + 3);
 
-    v.val[0] = _mm_shuffle_ps(v.val[0],v.val[0], _MM_SHUFFLE(2,1, 3, 0));  //a0,b1, a1, b0
-    v.val[2] =  _mm_movelh_ps(v.val[2], v.val[2]);  //x, x,c0,c1,
-    v.val[1] =  _mm_unpackhi_ps(v.val[0],v.val[2]); //a1,c0, b0, c1
-    v.val[2] =  _mm_movehl_ps(v.val[1], v.val[1]);  //b0, c1, x, x,
+    v.val[1].m64_f32[0] = *(ptr + 1);
+    v.val[1].m64_f32[1] = *(ptr + 4);
+
+    v.val[2].m64_f32[0] = *(ptr + 2);
+    v.val[2].m64_f32[1] = *(ptr + 5);
     return v;
 }
 
-#if defined(USE_SSSE3)
-poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
 #define vld3_p8 vld3_u8
 
-poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
 #define vld3_p16 vld3_u16
-#endif
 
 //***************  Quadruples load ********************************
 //*****************************************************************
-uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr)         // VLD4.8 {d0, d2, d4, d6}, [r0]
+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
 {
     uint8x16x4_t v;
-   __m128i tmp3, tmp2, tmp1, tmp0;
+    __m128i tmp3, tmp2, tmp1, tmp0;
 
     v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
-    v.val[1] = vld1q_u8 ( (ptr + 16));//b0, b1,b2,...b7.... b15
-    v.val[2] = vld1q_u8 ( (ptr + 32));//c0, c1,c2,...c7....c15
+    v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
+    v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
     v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
 
-    tmp0= _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
-    tmp1= _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
-    tmp2= _mm_unpackhi_epi8(v.val[0],v.val[1]);//a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
-    tmp3= _mm_unpackhi_epi8(v.val[2],v.val[3]);//c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
+    tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
+    tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
+    tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
+    tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
 
     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8,  a1,a9, b1,b9, ....a3,a11, b3,b11
     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
     v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
     v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
 
-    tmp0 =  _mm_unpacklo_epi32(v.val[0] , v.val[2] ); ///a0,a8, b0,b8, c0,c8,  d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
-    tmp1 =  _mm_unpackhi_epi32(v.val[0] , v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
-    tmp2 =  _mm_unpacklo_epi32(v.val[1] , v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
-    tmp3 =  _mm_unpackhi_epi32(v.val[1] , v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
+    tmp0 =  _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8,  d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
+    tmp1 =  _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
+    tmp2 =  _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
+    tmp3 =  _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
 
     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
@@ -5208,23 +10190,23 @@
     return v;
 }
 
-uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr)         // VLD4.16 {d0, d2, d4, d6}, [r0]
+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
 {
     uint16x8x4_t v;
     __m128i tmp3, tmp2, tmp1, tmp0;
-    tmp0  =  vld1q_u16 (ptr);       //a0,a1,a2,...a7
+    tmp0  =  vld1q_u16 (ptr); //a0,a1,a2,...a7
     tmp1  =  vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
     tmp2  =  vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
     tmp3  =  vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
-    v.val[0]= _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
-    v.val[1]= _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
-    v.val[2]= _mm_unpackhi_epi16(tmp0,tmp1);//a4,b4, a5,b5, a6,b6, a7,b7
-    v.val[3]= _mm_unpackhi_epi16(tmp2,tmp3);//c4,d4, c5,d5, c6,d6, c7,d7
-    tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]);//a0,a4, b0,b4, a1,a5, b1,b5
+    v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
+    v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
+    v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
+    v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
+    tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
     tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
     tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
-    tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]);//c2,c6, d2,d6, c3,c7, d3,d7
+    tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
     v.val[0] =  _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
     v.val[1] =  _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
     v.val[2] =  _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
@@ -5232,8 +10214,8 @@
     return v;
 }
 
-uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr)         // VLD4.32 {d0, d2, d4, d6}, [r0]
+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
 {
     uint32x4x4_t v;
     __m128i tmp3, tmp2, tmp1, tmp0;
@@ -5252,20 +10234,20 @@
     return v;
 }
 
-int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
 #define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
 
-int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
 #define  vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
 
-int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
 #define  vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
 
-float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
 
-float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
-_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr)         // VLD4.32 {d0, d2, d4, d6}, [r0]
+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
 {
     float32x4x4_t v;
     __m128 tmp3, tmp2, tmp1, tmp0;
@@ -5285,346 +10267,377 @@
     return v;
 }
 
-poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
 #define vld4q_p8 vld4q_u8
 
-poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
 #define vld4q_p16 vld4q_s16
 
-#if defined(USE_SSSE3)
-uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr)         // VLD4.8 {d0, d1, d2, d3}, [r0]
+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
 {
     uint8x8x4_t v;
     __m128i sh0, sh1;
+    __m128i val0,  val2;
     _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
 
-    v.val[0] = vld1q_u8(( ptr));         //load first 64-bits in val[0] and val[1]
-    v.val[1] = vld1q_u8(( ptr + 16));         //load third and forth 64-bits in val[2], val[3]
+    val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1]
+    val2 = vld1q_u8(( ptr + 16)); //load third and forth 64-bits in val[2], val[3]
 
-    sh0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask4_8);
-    sh1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask4_8);
-    v.val[0] = _mm_unpacklo_epi32(sh0,sh1);         //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
-    v.val[2] = _mm_unpackhi_epi32(sh0,sh1);         //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
-    v.val[1] = _mm_shuffle_epi32(v.val[0],_SWAP_HI_LOW32);
-    v.val[3] = _mm_shuffle_epi32(v.val[2],_SWAP_HI_LOW32);
-
+    sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
+    sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
+    val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
+    vst1q_u8(&v.val[0], val0 );
+    val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
+    vst1q_u8(&v.val[2], val2 );
     return v;
 }
-#endif
 
-#if defined(USE_SSSE3)
-uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr)         // VLD4.16 {d0, d1, d2, d3}, [r0]
+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
 {
     uint16x4x4_t v;
     __m128i sh0, sh1;
-    _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};         //0, 4, 1, 5, 2, 6, 3, 7
-    v.val[0] = vld1q_u16 ( (ptr));         //load first 64-bits in val[0] and val[1]
-    v.val[2] = vld1q_u16 ( (ptr + 8));         //load third and forth 64-bits in val[2], val[3]
-    sh0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask4_16);
-    sh1 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask4_16);
-    v.val[0] = _mm_unpacklo_epi32(sh0,sh1);         //0,4,8,12, 1,5,9,13
-    v.val[2] = _mm_unpackhi_epi32(sh0,sh1);         //2,6,10,14, 3,7,11,15
-    v.val[1] = _mm_shuffle_epi32(v.val[0],_SWAP_HI_LOW32);
-    v.val[3] = _mm_shuffle_epi32(v.val[2],_SWAP_HI_LOW32);
+    __m128i val0, val2;
+    _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
+    val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1]
+    val2 = vld1q_u16 ( (ptr + 8)); //load third and forth 64-bits in val[2], val[3]
+    sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
+    sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
+    val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
+    vst1q_u16(&v.val[0], val0 );
+    val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
+    vst1q_u16(&v.val[2], val2 );
     return v;
 }
-#endif
 
-uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
 _NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
-{   //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
-     uint32x4x4_t v, res;
-    v.val[0] =  vld1q_u32 (ptr);        //a0,a1,  b0,b1,
-    v.val[2] =  vld1q_u32 ((ptr + 4));  //c0,c1, d0,d1
-    res.val[0] = _mm_unpacklo_epi32(v.val[0],v.val[2]);  //a0, c0, a1,c1,
-    res.val[2] = _mm_unpackhi_epi32(v.val[0],v.val[2]);  //b0,d0, b1, d1
-    res.val[1] = _mm_shuffle_epi32(res.val[0],_SWAP_HI_LOW32); //a1,c1, a0, c0,
-    res.val[3] = _mm_shuffle_epi32(res.val[2],_SWAP_HI_LOW32);//b1, d1,b0,d0,
-    return res;
+{
+    //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
+    uint32x2x4_t v;
+    __m128i val0, val01, val2;
+    val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
+    val2 =  vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
+    val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
+    val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
+    vst1q_u32(&v.val[0], val01);
+    vst1q_u32(&v.val[2], val2 );
+    return v;
 }
 
-uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2, d3}, [r0]
+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
 {
     uint64x1x4_t v;
-    v.val[0] = vld1q_u64( (ptr));         //load first 64-bits in val[0] and val[1]
-    v.val[2] = vld1q_u64( (ptr + 2));         //load third and forth 64-bits in val[2], val[3]
+    v.val[0].m64_u64[0] = *(ptr); //load first 64-bits in val[0] and val[1]
+    v.val[1].m64_u64[0] = *(ptr + 1); //load first 64-bits in val[0] and val[1]
+    v.val[2].m64_u64[0] = *(ptr + 2); //load third and forth 64-bits in val[2], val[3]
+    v.val[3].m64_u64[0] = *(ptr + 3); //load third and forth 64-bits in val[2], val[3]
     return v;
 }
 
-#if defined(USE_SSSE3)
-int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
 #define  vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
 
-int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
 #define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
-#endif
 
-int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
 #define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
 
-int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
 #define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
 
-float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
 
-float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr)         // VLD4.32 {d0, d1, d2, d3}, [r0]
-{         //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
-    float32x2x4_t v, res;
-    v.val[0] =  vld1q_f32 ((float*) ptr);         //a0,a1,  b0,b1,
-    v.val[2] =  vld1q_f32 ((float*) (ptr + 4));         //c0,c1, d0,d1
-    res.val[0] = _mm_unpacklo_ps(v.val[0],v.val[2]);         //a0, c0, a1,c1,
-    res.val[2] = _mm_unpackhi_ps(v.val[0],v.val[2]);         //b0,d0, b1, d1
-    res.val[1] = _mm_movehl_ps(res.val[0],res.val[0]);          // a1,c1, a0, c0,
-    res.val[3] = _mm_movehl_ps(res.val[2],res.val[2]);         // b1, d1, b0,d0,
+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
+{
+    //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
+    float32x2x4_t res;
+    res.val[0].m64_f32[0] = *(ptr);
+    res.val[0].m64_f32[1] = *(ptr + 4);
+    res.val[1].m64_f32[0] = *(ptr + 1);
+    res.val[1].m64_f32[1] = *(ptr + 5);
+    res.val[2].m64_f32[0] = *(ptr + 2);
+    res.val[2].m64_f32[1] = *(ptr + 6);
+    res.val[3].m64_f32[0] = *(ptr + 3);
+    res.val[3].m64_f32[1] = *(ptr + 7);
     return res;
 }
 
-#if defined(USE_SSSE3)
-poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
 #define vld4_p8 vld4_u8
 
-poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
 #define vld4_p16 vld4_u16
-#endif
 
 //************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
 //*******************************************************************************************************************
-uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
-_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr)         // VLD2.8 {d0[], d1[]}, [r0]
+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
 {
     uint8x8x2_t v;
-    v.val[0] = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
-    v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
-    v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
-    v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
-    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    __m128i val0, val1;
+    val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
+    val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
+    val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
+    val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+    vst1q_u8(v.val, val0);
     return v;
 }
 
-uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
-_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr)         // VLD2.16 {d0[], d1[]}, [r0]
+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
 {
     uint16x4x2_t v;
-    v.val[1] = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
-    v.val[0] = _mm_shufflelo_epi16(v.val[1], 0); //00 00 00 00 (all 0)
-    v.val[1] = _mm_shufflelo_epi16(v.val[1], 85);//01 01 01 01 (all 1)
+    __m128i val0, val1;
+    val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
+    val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
+    _M64(v.val[0], val0);
+    val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
+    _M64(v.val[1], val1);
     return v;
 }
 
-uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
-_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr)         // VLD2.32 {d0[], d1[]}, [r0]
+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
 {
     uint32x2x2_t v;
-    v.val[0] = LOAD_SI128(ptr); //0,1,x,x
-    v.val[0] = _mm_shuffle_epi32(v.val[0],   0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
-    v.val[1] = _mm_srli_si128(v.val[0], 8); //1,1,0x0,0x0
+    __m128i val0;
+    val0 = LOAD_SI128(ptr); //0,1,x,x
+    val0 = _mm_shuffle_epi32(val0,   0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
+    vst1q_u32(v.val, val0);
     return v;
 }
 
-uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
 #define vld2_dup_u64 vld2_u64
 
-int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
 #define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
 
-int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
 #define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
 
-int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
 #define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
 
-int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
 #define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
 
-float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
+float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
 
-float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
-_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr)         // VLD2.32 {d0[], d1[]}, [r0]
+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
 {
     float32x2x2_t v;
-    v.val[0] = vld1q_f32(ptr);  //0,1,x,x
-    v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,x,x
-    v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,x,x
+    v.val[0].m64_f32[0] = *(ptr); //0,0
+    v.val[0].m64_f32[1] = *(ptr); //0,0
+    v.val[1].m64_f32[0] = *(ptr + 1); //1,1
+    v.val[1].m64_f32[1] = *(ptr + 1); //1,1
     return v;
 }
 
-poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
 #define vld2_dup_p8 vld2_dup_u8
 
-poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
 #define vld2_dup_p16 vld2_dup_s16
 
 //************* Duplicate (or propagate)triplets: *******************
 //********************************************************************
 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
-uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr)         // VLD3.8 {d0[], d1[], d2[]}, [r0]
+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
 {
     uint8x8x3_t v;
-    v.val[0] = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
-    v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
-    v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
-    v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
-    v.val[2] = _mm_unpackhi_epi32(v.val[1],v.val[1]);// 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
-    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    __m128i val0, val1, val2;
+    val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
+    val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
+    val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
+    val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+    val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
+    vst1q_u8(v.val, val0);
+    _M64(v.val[2], val2);
     return v;
 }
 
-uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr)         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
 {
     uint16x4x3_t v;
-    v.val[2] = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
-    v.val[0] = _mm_shufflelo_epi16(v.val[2], 0); //00 00 00 00 (all 0)
-    v.val[1] = _mm_shufflelo_epi16(v.val[2], 85);//01 01 01 01 (all 1)
-    v.val[2] = _mm_shufflelo_epi16(v.val[2], 170);//10 10 10 10 (all 2)
+    __m128i val0, val1, val2;
+    val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
+    val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
+    val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
+    val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
+    _M64(v.val[0], val0);
+    _M64(v.val[1], val1);
+    _M64(v.val[2], val2);
     return v;
 }
 
-uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr)         // VLD3.32 {d0[], d1[], d2[]}, [r0]
+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
 {
     uint32x2x3_t v;
-    v.val[2] = LOAD_SI128(ptr); //0,1,2,x
-    v.val[0] = _mm_shuffle_epi32(v.val[2],   0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
-    v.val[1] = _mm_shuffle_epi32(v.val[2],   1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
-    v.val[2] = _mm_srli_si128(v.val[0], 8); //2,2,0x0,0x0
+    __m128i val0, val1, val2;
+    val2 = LOAD_SI128(ptr); //0,1,2,x
+    val0 = _mm_shuffle_epi32(val2,   0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
+    val1 = _mm_shuffle_epi32(val2,   1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
+    val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
+    _M64(v.val[0], val0);
+    _M64(v.val[1], val1);
+    _M64(v.val[2], val2);
     return v;
 }
 
-uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
-_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2}, [r0]
+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
 {
     uint64x1x3_t v;
-    v.val[0] = LOAD_SI128(ptr);//0,1,
-    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); //1,0
-    v.val[2] = LOAD_SI128((ptr + 2));  //2,x
+    v.val[0].m64_u64[0] = *(ptr);
+    v.val[1].m64_u64[0] = *(ptr + 1);
+    v.val[2].m64_u64[0] = *(ptr + 2);
     return v;
 }
 
-int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
 #define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
 
-int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
 #define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
 
-int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
 #define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
 
-int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
 #define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
 
-float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+
+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
 
-float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
-_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr)         // VLD3.32 {d0[], d1[], d2[]}, [r0]
+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
 {
     float32x2x3_t v;
-    v.val[0] = vld1q_f32(ptr);  //0,1,2,x
-    v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,x,x
-    v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,2,2
-    v.val[2] = _mm_movehl_ps(v.val[0], v.val[0]); //2,2,0,0,
+    int i;
+    for (i = 0; i<3; i++) {
+        v.val[i].m64_f32[0] = *(ptr + i);
+        v.val[i].m64_f32[1] = *(ptr + i);
+    }
     return v;
 }
 
-poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
 #define vld3_dup_p8 vld3_dup_u8
 
-poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
 #define vld3_dup_p16 vld3_dup_s16
 
+
 //************* Duplicate (or propagate) quadruples: *******************
 //***********************************************************************
 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes  and  ptr[3] to all val[3] lanes
-uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr)         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
 {
     uint8x8x4_t v;
-    v.val[0] = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
-    v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
-    v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
-    v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
-    v.val[2] = _mm_unpackhi_epi32(v.val[1],v.val[1]);// 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
-    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
-    v.val[3] = _mm_shuffle_epi32(v.val[2], _SWAP_HI_LOW32);
+    __m128i val0, val1, val2;
+    val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
+    val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
+    val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
+    val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+    val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
+    vst1q_u8(&v.val[0], val0);
+    vst1q_u8(&v.val[2], val2);
     return v;
 }
 
-uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr)         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
 {
     uint16x4x4_t v;
-    v.val[3] = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
-    v.val[0] = _mm_shufflelo_epi16(v.val[3], 0); //00 00 00 00 (all 0)
-    v.val[1] = _mm_shufflelo_epi16(v.val[3], 85);//01 01 01 01 (all 1)
-    v.val[2] = _mm_shufflelo_epi16(v.val[3], 170);//10 10 10 10 (all 2)
-    v.val[3] = _mm_shufflelo_epi16(v.val[3], 255);//11 11 11 11 (all 3)
+    __m128i val0, val1, val2, val3;
+    val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
+    val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
+    val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
+    val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
+    val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
+    _M64(v.val[0], val0);
+    _M64(v.val[1], val1);
+    _M64(v.val[2], val2);
+    _M64(v.val[3], val3);
     return v;
 }
 
-uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr)         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
 {
     uint32x2x4_t v;
-    v.val[3] = LOAD_SI128(ptr) ; //0,1,2,3
-    v.val[0] = _mm_shuffle_epi32(v.val[3],   0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
-    v.val[1] = _mm_shuffle_epi32(v.val[3],   1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
-    v.val[2] = _mm_shuffle_epi32(v.val[3],   2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
-    v.val[3] = _mm_shuffle_epi32(v.val[3],   3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2
+    __m128i val0, val1, val2, val3;
+    val3 = LOAD_SI128(ptr); //0,1,2,3
+    val0 = _mm_shuffle_epi32(val3,   0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
+    val1 = _mm_shuffle_epi32(val3,   1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
+    val2 = _mm_shuffle_epi32(val3,   2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
+    val3 = _mm_shuffle_epi32(val3,   3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2
+    _M64(v.val[0], val0);
+    _M64(v.val[1], val1);
+    _M64(v.val[2], val2);
+    _M64(v.val[3], val3);
     return v;
 }
 
-uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
-_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2, d3}, [r0]
+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
 {
     uint64x1x4_t v;
-    v.val[0] = LOAD_SI128(ptr); //0,1,
-    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); //1,0
-    v.val[2] = LOAD_SI128((ptr + 2)); //2,3
-    v.val[3] = _mm_shuffle_epi32(v.val[2], _SWAP_HI_LOW32); //3,2
+    v.val[0].m64_u64[0] = *(ptr);
+    v.val[1].m64_u64[0] = *(ptr + 1);
+    v.val[2].m64_u64[0] = *(ptr + 2);
+    v.val[3].m64_u64[0] = *(ptr + 3);
     return v;
 }
 
-int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
 #define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
 
-int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
 #define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
 
-int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
 #define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
 
-int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
 #define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
 
-float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
 
-float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
-_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr)         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
 {
     float32x2x4_t v;
-    v.val[0] = vld1q_f32(ptr);  //0,1,2,3
-    v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,3,3
-    v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,2,2
-    v.val[2] = _mm_movehl_ps(v.val[0], v.val[0]); //2,2,0,0,
-    v.val[3] = _mm_movehl_ps(v.val[1], v.val[1]); //3,3,1,1,
+    int i;
+    for (i = 0; i<4; i++) {
+        v.val[i].m64_f32[0] = *(ptr + i);
+        v.val[i].m64_f32[1] = *(ptr + i);
+    }
     return v;
 }
 
-poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
 #define vld4_dup_p8 vld4_dup_u8
 
-poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
 #define vld4_dup_p16 vld4_dup_u16
 
+
 //**********************************************************************************
 //*******************Lane loads for  an N-element structures ***********************
 //**********************************************************************************
@@ -5636,7 +10649,7 @@
 //to fix it the all functions below work with  xxxxxx_2t pointers and the corresponding original functions are redefined
 
 //uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
-_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane)         // VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
 {
     uint16x8x2_t v;
     v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
@@ -5646,7 +10659,7 @@
 #define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
 
 //uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
-_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane)         // VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
 {
     uint32x4x2_t v;
     v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
@@ -5678,59 +10691,80 @@
 //float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
 //current IA SIMD doesn't support float16
 
-//float32x4x2_t vld2q_lane_f32(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
-_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane)         // VLD2.32 {d0[0], d2[0]}, [r0]
+//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
 {
     float32x4x2_t v;
     v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
     v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
     return v;
 }
-#define vld2q_lane_f32(ptr, src, lane) vld2q_lane_f32_ptr(ptr, &src, lane)
+#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
 
 //poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
 #define vld2q_lane_p16 vld2q_lane_u16
 
 //uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane)         // VLD2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
 {
-    uint8x8x2_t val;
-    val.val[0] = _MM_INSERT_EPI8 (src->val[0], (int)ptr[0], lane);
-    val.val[1] = _MM_INSERT_EPI8 (src->val[1], (int)ptr[1], lane);
-    return val;
+    uint8x8x2_t v;
+    v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
+    v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
+    return v;
 }
 #define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane)
 
 //uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-#define vld2_lane_u16 vld2q_lane_u16
+_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t* src, __constrange(0,3) int lane)
+{
+    uint16x4x2_t v;
+    v.val[0]  =  vld1_lane_u16(ptr, src->val[0], lane);
+    v.val[1]  = vld1_lane_u16((ptr + 1), src->val[1], lane);
+    return v;
+}
+#define vld2_lane_u16(ptr, src, lane) vld2_lane_u16_ptr(ptr, &src, lane)
 
 //uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
-#define vld2_lane_u32 vld2q_lane_u32
+_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t* src, __constrange(0,1) int lane)
+{
+    uint32x2x2_t v;
+    v.val[0]  =  vld1_lane_u32(ptr, src->val[0], lane);
+    v.val[1]  = vld1_lane_u32((ptr + 1), src->val[1], lane);
+    return v;
+}
+#define vld2_lane_u32(ptr, src, lane) vld2_lane_u32_ptr(ptr, &src, lane)
 
 //int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane);         // VLD2.8 {d0[0], d1[0]}, [r0]
+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
 #define vld2_lane_s8(ptr, src, lane)  vld2_lane_u8(( uint8_t*) ptr, src, lane)
 
 //int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
 #define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
 
 //int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
-int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
 #define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
 
 //float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
 //current IA SIMD doesn't support float16
 
-float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
-#define vld2_lane_f32 vld2q_lane_f32
+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane)
+{
+    float32x2x2_t v;
+    v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
+    v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
+    return v;
+}
+#define vld2_lane_f32(ptr, src, lane) vld2_lane_f32_ptr(ptr, &src, lane)
 
 //poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
-poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane);         // VLD2.8 {d0[0], d1[0]}, [r0]
+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
 #define vld2_lane_p8 vld2_lane_u8
 
 //poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
-poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
 #define vld2_lane_p16 vld2_lane_u16
 
 //*********** Lane triplets **********************
@@ -5739,7 +10773,7 @@
 //we assume src is 16 bit aligned
 
 //uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane)         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
 {
     uint16x8x3_t v;
     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
@@ -5750,7 +10784,7 @@
 #define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
 
 //uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane)         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
 {
     uint32x4x3_t v;
     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
@@ -5761,7 +10795,7 @@
 #define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
 
 //int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane)         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
 {
     int16x8x3_t v;
     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
@@ -5772,7 +10806,7 @@
 #define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
 
 //int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane)         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
 {
     int32x4x3_t v;
     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
@@ -5782,12 +10816,13 @@
 }
 #define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
 
-float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
 //current IA SIMD doesn't support float16
 #define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
 
+
 //float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
-_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane)         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
 {
     float32x4x3_t v;
     v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
@@ -5795,64 +10830,67 @@
     v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
     return v;
 }
-#define vld3q_lane_f32(ptr, src, lane) vld3q_lane_f32_ptr(ptr, &src, lane)
+#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
 
-poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
 #define vld3q_lane_p16 vld3q_lane_u16
 
 //uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane)         // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
 {
     uint8x8x3_t v;
-    v.val[0] = _MM_INSERT_EPI8 (src->val[0], ptr[0], lane);
-    v.val[1] = _MM_INSERT_EPI8 (src->val[1], ptr[1], lane);
-    v.val[2] = _MM_INSERT_EPI8 (src->val[2], ptr[2], lane);
+    v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
+    v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
+    v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
     return v;
 }
 #define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane)
 
 //uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t   const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane)         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
 {
     uint16x4x3_t v;
-    v.val[0] = _MM_INSERT_EPI16 (src->val[0], ptr[0], lane);
-    v.val[1] = _MM_INSERT_EPI16 (src->val[1], ptr[1], lane);
-    v.val[2] = _MM_INSERT_EPI16 (src->val[2], ptr[2], lane);
+    v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
+    v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
+    v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
     return v;
 }
 #define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane)
 
 //uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane)         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-{         //need to merge into 128 bit anyway
+_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+{
+    //need to merge into 128 bit anyway
     uint32x2x3_t v;
-    v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
-    v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
-    v.val[2] = _MM_INSERT_EPI32 (src->val[2], ptr[2], lane);
+    v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);;
+    v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);;
+    v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);;
     return v;
 }
 #define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane)
 
-int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane);         // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
 #define vld3_lane_s8(ptr, src, lane)  vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane)
 
-int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
 #define vld3_lane_s16(ptr, src, lane)  vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane)
 
-int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane);         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
 #define vld3_lane_s32(ptr, src, lane)  vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane)
 
-float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
 //current IA SIMD doesn't support float16
 
 //float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
-_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane)         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
 {
     float32x2x3_t v;
-    v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
+    v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
+    v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
+    v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
     return v;
 }
-#define vld3_lane_f32(ptr, src, lane) vld3_lane_f32_ptr(ptr, &src, lane)
+#define vld3_lane_f32(ptr,src,lane) vld3_lane_f32_ptr(ptr,&src,lane)
 
 //poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
 #define vld3_lane_p8 vld3_lane_u8
@@ -5890,15 +10928,15 @@
 #define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
 
 //int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
 #define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
 
 //int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
 #define vld4q_lane_s32(ptr, src, lane)  vld4q_lane_u32(( uint32_t*) ptr, src, lane)
 
 //float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
 //current IA SIMD doesn't support float16
 
 //float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
@@ -5911,20 +10949,20 @@
     v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
     return v;
 }
-#define vld4q_lane_f32(ptr, src, lane) vld4q_lane_f32_ptr(ptr, &src, lane)
+#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
 
 //poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
-poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
 #define vld4q_lane_p16 vld4q_lane_u16
 
 //uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
 _NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane)
 {
     uint8x8x4_t v;
-    v.val[0] = _MM_INSERT_EPI8 (src->val[0], ptr[0], lane);
-    v.val[1] = _MM_INSERT_EPI8 (src->val[1], ptr[1], lane );
-    v.val[2] = _MM_INSERT_EPI8 (src->val[2], ptr[2], lane );
-    v.val[3] = _MM_INSERT_EPI8 (src->val[3], ptr[3], lane );
+    v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
+    v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
+    v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
+    v.val[3] = vld1_lane_u8((ptr + 3), src->val[3], lane);
     return v;
 }
 #define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane)
@@ -5933,10 +10971,10 @@
 _NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane)
 {
     uint16x4x4_t v;
-    v.val[0] = _MM_INSERT_EPI16 (src->val[0], ptr[0], lane);
-    v.val[1] = _MM_INSERT_EPI16 (src->val[1], ptr[1], lane );
-    v.val[2] = _MM_INSERT_EPI16 (src->val[2], ptr[2], lane );
-    v.val[3] = _MM_INSERT_EPI16 (src->val[3], ptr[3], lane );
+    v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
+    v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
+    v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
+    v.val[3] = vld1_lane_u16((ptr + 3), src->val[3], lane);
     return v;
 }
 #define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane)
@@ -5945,10 +10983,10 @@
 _NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane)
 {
     uint32x2x4_t v;
-    v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
-    v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane );
-    v.val[2] = _MM_INSERT_EPI32 (src->val[2], ptr[2], lane );
-    v.val[3] = _MM_INSERT_EPI32 (src->val[3], ptr[3], lane );
+    v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
+    v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
+    v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
+    v.val[3] = vld1_lane_u32((ptr + 3), src->val[3], lane);
     return v;
 }
 #define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane)
@@ -5971,11 +11009,16 @@
 
 //float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
 _NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane)
-{         //serial solution may be faster
+{
+    //serial solution may be faster
     float32x2x4_t v;
+    v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
+    v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
+    v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
+    v.val[3] = vld1_lane_f32((ptr + 3), src->val[3], lane);
     return v;
 }
-#define vld4_lane_f32(ptr, src, lane) vld4_lane_f32_ptr(ptr, &src, lane)
+#define vld4_lane_f32(ptr,src,lane) vld4_lane_f32_ptr(ptr,&src,lane)
 
 //poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
 poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);
@@ -6060,37 +11103,37 @@
 //void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
 _NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val)
 {
-    uint8x8x2_t v;
-    v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
-    vst1q_u8 (ptr, v.val[0]);
+    __m128i v0;
+    v0 = _mm_unpacklo_epi8(_pM128i(val->val[0]), _pM128i(val->val[1]));
+    vst1q_u8 (ptr, v0);
 }
 #define vst2_u8(ptr, val) vst2_u8_ptr(ptr, &val)
 
 //void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
 _NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val)
 {
-    uint16x4x2_t v;
-    v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
-    vst1q_u16 (ptr, v.val[0]);
+    __m128i v0;
+    v0 = _mm_unpacklo_epi16(_pM128i(val->val[0]), _pM128i(val->val[1]));
+    vst1q_u16 (ptr, v0);
 }
 #define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val)
 
 //void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
 _NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val)
 {
-    uint32x2x2_t v;
-    v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
-    vst1q_u32 (ptr, v.val[0]);
+    __m128i v0;
+    v0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1]));
+    vst1q_u32 (ptr, v0);
 }
 #define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val)
 
+
 //void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
 void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);
 _NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val)
 {
-    uint64x1x2_t v;
-    v.val[0] = _mm_unpacklo_epi64(val->val[0], val->val[1]);
-    vst1q_u64(ptr, v.val[0]);
+    *(ptr) = val->val[0].m64_u64[0];
+    *(ptr + 1) = val->val[1].m64_u64[0];
 }
 #define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val)
 
@@ -6109,12 +11152,13 @@
 //void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
 //current IA SIMD doesn't support float16
 
-void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
+//void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
 _NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val)
 {
-    float32x4x2_t v;
-    v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
-    vst1q_f32 (ptr, v.val[0]);
+    *(ptr) =   val->val[0].m64_f32[0];
+    *(ptr + 1) = val->val[1].m64_f32[0];
+    *(ptr + 2) = val->val[0].m64_f32[1];
+    *(ptr + 3) = val->val[1].m64_f32[1];
 }
 #define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val)
 
@@ -6127,7 +11171,6 @@
 //******************** Triplets store  *****************************************
 //******************************************************************************
 //void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
-#if defined(USE_SSSE3)
 _NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
 {
     uint8x16x3_t v;
@@ -6139,30 +11182,28 @@
     _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
     _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
 
-    v0 =  _mm_unpacklo_epi8(val->val[0], val->val[1]);         //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
-    v2 =  _mm_unpackhi_epi8(val->val[0], val->val[1]);         //24,25,  27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
-    v1 =  _mm_alignr_epi8(v2, v0, 11);         //12,13, 15,16, 18,19, 21,22, 24,25,  27,28, 30,31, 33,34
-    v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0);         //make holes for the v.val[2] data embedding
-    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo);         //make plugs for the v.val[2] data embedding
-    cff = _mm_cmpeq_epi8(v0, v0);         //all ff
+    v0 =  _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
+    v2 =  _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25,  27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
+    v1 =  _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25,  27,28, 30,31, 33,34
+    v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
+    cff = _mm_cmpeq_epi8(v0, v0); //all ff
     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
     vst1q_u8(ptr,   v.val[0]);
-    v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1);         //make holes for the v.val[2] data embedding
-    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med);         //make plugs for the v.val[2] data embedding
+    v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
     vst1q_u8((ptr + 16),  v.val[1]);
-    v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2);         //make holes for the v.val[2] data embedding
-    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi);         //make plugs for the v.val[2] data embedding
+    v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
     vst1q_u8((ptr + 32),  v.val[2]);
 }
 #define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
-#endif
 
-#if defined(USE_SSSE3)
 //void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
 _NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
 {
@@ -6175,32 +11216,32 @@
     _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
     _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
 
-    v0 =  _mm_unpacklo_epi16(val->val[0], val->val[1]);         //0,1, 3,4, 6,7, 9,10
-    v2 =  _mm_unpackhi_epi16(val->val[0], val->val[1]);         //12,13, 15,16, 18,19, 21,22,
-    v1 =  _mm_alignr_epi8(v2, v0, 12);         //9,10, 12,13, 15,16, 18,19
-    v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0);         //make holes for the v.val[2] data embedding
-    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo);         //make plugs for the v.val[2] data embedding
-    cff = _mm_cmpeq_epi16(v0, v0);         //all ff
+    v0 =  _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
+    v2 =  _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
+    v1 =  _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
+    v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
+    cff = _mm_cmpeq_epi16(v0, v0); //all ff
     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
     vst1q_u16(ptr,      v.val[0]);
-    v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1);         //make holes for the v.val[2] data embedding
-    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med);         //make plugs for the v.val[2] data embedding
+    v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
     vst1q_u16((ptr + 8),  v.val[1]);
-    v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2);         //make holes for the v.val[2] data embedding
-    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi);         //make plugs for the v.val[2] data embedding
+    v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
     vst1q_u16((ptr + 16), v.val[2]);
 }
 #define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
-#endif
 
 //void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
 _NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
-{   //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
+{
+    //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
     uint32x4x3_t v;
     __m128i tmp0, tmp1,tmp2;
     tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
@@ -6218,7 +11259,6 @@
 }
 #define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
 
-#if defined(USE_SSSE3)
 //void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
 void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
 #define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
@@ -6226,7 +11266,6 @@
 //void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
 void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
 #define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
-#endif
 
 //void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
 void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
@@ -6239,7 +11278,7 @@
 //void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
 _NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
 {
-     float32x4x3_t  v;
+    float32x4x3_t v;
     __m128 tmp0, tmp1,tmp2;
     tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
     tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
@@ -6256,7 +11295,6 @@
 }
 #define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
 
-#if defined(USE_SSSE3)
 //void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
 void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
 #define vst3q_p8 vst3q_u8
@@ -6264,81 +11302,78 @@
 //void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
 void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
 #define vst3q_p16 vst3q_u16
-#endif
 
 //void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0]
-#if defined(USE_SSSE3)
 _NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val)
 {
-    uint8x8x3_t v;
-    __m128i tmp, sh0, sh1;
+    __m128i tmp, sh0, sh1, val0, val2;
     _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
     _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
     _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
     _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
-    tmp = _mm_unpacklo_epi64(val->val[0], val->val[1]);
-    sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask0);         //for bi>15 bi is wrapped (bi-=15)
-    sh1 =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask0);
-    v.val[0] = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
-    vst1q_u8(ptr,   v.val[0]);         //store as 128 bit structure
-    sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask1);         //for bi>15 bi is wrapped (bi-=15)
-    sh1 =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask1);
-    v.val[1] = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
+    tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]) );
+    sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15)
+    val2 = _pM128i(val->val[2]);
+    sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask0);
+    val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
+    vst1q_u8(ptr,   val0); //store as 128 bit structure
+    sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15)
+    sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask1);
+    val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
+    _M64((*(__m64_128*)(ptr + 16)),  val2); //need it to fit into *ptr memory
 }
 #define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val)
-#endif
 
 //void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0]
-#if defined(USE_SSSE3)
 _NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val)
 {
-    uint16x4x3_t v;
-    __m128i tmp;
+    __m128i tmp, val0, val1, val2;
     _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
     _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23,   0,0,0,0,0,0,0,0};
-    _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff};         //if all ones we take the result from v.val[0]  otherwise from v.val[1]
-    _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff};         //if all ones we take the result from v.val[1]  otherwise from v.val[0]
-    tmp = _mm_unpacklo_epi64(val->val[0], val->val[1]);
-    v.val[0] = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
-    v.val[1] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask0);
-    v.val[0] = _MM_BLENDV_EPI8(v.val[1], v.val[0], *(__m128i*)mask0f);
-    vst1q_u16(ptr,     v.val[0]);         //store as 128 bit structure
-    v.val[0] = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
-    v.val[1] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask1);
-    v.val[1] = _MM_BLENDV_EPI8(v.val[0], v.val[1],  *(__m128i*)mask1f);         //change the operands order
+    _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0]  otherwise from v.val[1]
+    _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1]  otherwise from v.val[0]
+    tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]));
+    val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
+    val2 = _pM128i(val->val[2]);
+    val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
+    val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
+    vst1q_u16(ptr,    val0); //store as 128 bit structure
+    val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
+    val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
+    val1 = _MM_BLENDV_EPI8(val0, val1,  *(__m128i*)mask1f); //change the operands order
+    _M64((*(__m64_128*)(ptr + 8)),  val1); //need it to fit into *ptr memory
 }
 #define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val)
-#endif
 
 //void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
 _NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val)
-{         //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
-    uint32x2x3_t res;
-    res.val[0] = _mm_unpacklo_epi64(val->val[1], val->val[2]);         //val[0]: 1,4,2,5
-    res.val[0] = _mm_shuffle_epi32(res.val[0], 0 | (2 << 2) | (1 << 4) | (3 << 6));         //1,2,4,5
-    res.val[1] = _mm_srli_si128(res.val[0], 8);         //4,5, x,x
-    res.val[0] = _mm_unpacklo_epi32(val->val[0], res.val[0]);         //0,1,3,2
-    res.val[0] = _mm_shuffle_epi32(res.val[0], 0 | (1 << 2) | (3 << 4) | (2 << 6));         //0,1,2, 3
-    vst1q_u32(ptr, res.val[0]);         //store as 128 bit structure
+{
+    //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
+    __m128i val0, val1;
+    val0 = _mm_unpacklo_epi64(_pM128i(val->val[1]), _pM128i(val->val[2])); //val[0]: 1,4,2,5
+    val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
+    val1 = _mm_srli_si128(val0, 8); //4,5, x,x
+    _M64((*(__m64_128*)(ptr + 4)),  val1);
+    val0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), val0); //0,1,3,2
+    val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
+    vst1q_u32(ptr, val0); //store as 128 bit structure
 }
 #define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val)
 
 //void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0]
 _NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val)
 {
-    __m128i tmp;
-    tmp =  _mm_unpacklo_epi64(val->val[0], val->val[1]);
-    vst1q_u64(ptr, tmp);         //store as 128 bit structure
+    *(ptr) = val->val[0].m64_u64[0];
+    *(ptr + 1) = val->val[1].m64_u64[0];
+    *(ptr + 2) = val->val[2].m64_u64[0];
 }
 #define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val)
 
-#if defined(USE_SSSE3)
 //void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val)  // VST3.8 {d0, d1, d2}, [r0]
 #define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val)
 
 //void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val)  // VST3.16 {d0, d1, d2}, [r0]
 #define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val)
-#endif
 
 //void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
 #define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val)
@@ -6347,23 +11382,22 @@
 #define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val)
 
 //void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
-void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
 
 //void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
 _NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val)
-{         //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
-    float32x2x3_t res;
-    res.val[0] = _mm_castsi128_ps(_mm_unpacklo_epi64(_mm_castps_si128(val->val[1]), _mm_castps_si128(val->val[2])) );
-    res.val[0] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(3,1,2,0));         //1,2,4,5
-    res.val[1] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(1,0,3,2));         //4,5, 1,2
-    res.val[0] = _mm_unpacklo_ps(val->val[0], res.val[0]);         //0,1,3, 2
-    res.val[0] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(2,3,1,0));         //0,1,2, 3
-    vst1q_f32(ptr, res.val[0]);         //store as 128 bit structure
+{
+    //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;   -> 0,2, 4,1, 3,5
+    *(ptr) =   val->val[0].m64_f32[0];
+    *(ptr + 1) = val->val[1].m64_f32[0];
+    *(ptr + 2) = val->val[2].m64_f32[0];
+    *(ptr + 3) = val->val[0].m64_f32[1];
+    *(ptr + 4) = val->val[1].m64_f32[1];
+    *(ptr + 5) = val->val[2].m64_f32[1];
 }
 #define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val)
 
-#if defined(USE_SSSE3)
 //void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
 void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);
 #define vst3_p8 vst3_u8
@@ -6371,7 +11405,6 @@
 //void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
 void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);
 #define vst3_p16 vst3_s16
-#endif
 
 //***************  Quadruples store ********************************
 //*********************************************************************
@@ -6379,17 +11412,17 @@
 _NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
 {
     __m128i tmp1, tmp2, res;
-    tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]);         //  0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
-    tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]);         //  2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
-    res = _mm_unpacklo_epi16(tmp1, tmp2);         //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
+    tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //  0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
+    tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); //  2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
+    res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
     vst1q_u8(ptr,  res);
-    res = _mm_unpackhi_epi16(tmp1, tmp2);         //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
+    res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
     vst1q_u8((ptr + 16), res);
-    tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]);         //
-    tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]);         //
-    res = _mm_unpacklo_epi16(tmp1, tmp2);         //
+    tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //
+    tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //
+    res = _mm_unpacklo_epi16(tmp1, tmp2); //
     vst1q_u8((ptr + 32), res);
-    res = _mm_unpackhi_epi16(tmp1, tmp2);         //
+    res = _mm_unpackhi_epi16(tmp1, tmp2); //
     vst1q_u8((ptr + 48), res);
 }
 #define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
@@ -6399,12 +11432,12 @@
 {
     uint16x8x4_t v;
     __m128i tmp1, tmp2;
-    tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
-    tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
+    tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
     v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
     v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
-    tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
-    tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
+    tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
     v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
     v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
     vst1q_u16(ptr,     v.val[0]);
@@ -6419,12 +11452,12 @@
 {
     uint16x8x4_t v;
     __m128i tmp1, tmp2;
-    tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
-    tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
+    tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
     v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
     v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
-    tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
-    tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
+    tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
     v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
     v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
     vst1q_u32(ptr,      v.val[0]);
@@ -6481,50 +11514,50 @@
 //void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0]
 _NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val)
 {
-    uint8x8x4_t v;
-    __m128i sh0, sh1;
-    sh0 = _mm_unpacklo_epi8(val->val[0],val->val[1]);         // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
-    sh1 = _mm_unpacklo_epi8(val->val[2],val->val[3]);         // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
-    v.val[0] = _mm_unpacklo_epi16(sh0,sh1);         // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
-    v.val[2] = _mm_unpackhi_epi16(sh0,sh1);         //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
-    vst1q_u8(ptr,      v.val[0]);
-    vst1q_u8((ptr + 16),  v.val[2]);
+    __m128i sh0, sh1, val0, val2;
+    sh0 = _mm_unpacklo_epi8(_pM128i(val->val[0]),_pM128i(val->val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
+    sh1 = _mm_unpacklo_epi8(_pM128i(val->val[2]),_pM128i(val->val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
+    val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
+    val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
+    vst1q_u8(ptr,    val0);
+    vst1q_u8((ptr + 16),  val2);
 }
 #define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val)
 
 //void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0]
 _NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val)
 {
-    uint16x4x4_t v;
-    __m128i sh0, sh1;
-    sh0 = _mm_unpacklo_epi16(val->val[0],val->val[1]);         //a0,a1,b0,b1,c0,c1,d0,d1,
-    sh1 = _mm_unpacklo_epi16(val->val[2],val->val[3]);         //a2,a3,b2,b3,c2,c3,d2,d3
-    v.val[0] = _mm_unpacklo_epi32(sh0,sh1);         // a0,a1,a2,a3,b0,b1,b2,b3
-    v.val[2] = _mm_unpackhi_epi32(sh0,sh1);         // c0,c1,c2,c3,d0,d1,d2,d3
-    vst1q_u16(ptr,      v.val[0]);         //store as 128 bit structure
-    vst1q_u16((ptr + 8),  v.val[2]);
+    __m128i sh0, sh1, val0, val2;
+    sh0 = _mm_unpacklo_epi16(_pM128i(val->val[0]),_pM128i(val->val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
+    sh1 = _mm_unpacklo_epi16(_pM128i(val->val[2]),_pM128i(val->val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
+    val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
+    val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
+    vst1q_u16(ptr,      val0); //store as 128 bit structure
+    vst1q_u16((ptr + 8),  val2);
 }
 #define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val)
 
 //void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
 _NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val)
-{         //0,4,   1,5,  2,6,  3,7
-    uint32x2x4_t v;
-    __m128i sh0, sh1;
-    sh0 = _mm_unpacklo_epi32(val->val[0], val->val[1]);         //0,1,4,5
-    sh1 = _mm_unpacklo_epi32(val->val[2], val->val[3]);         //2,3,6,7
-    v.val[0] = _mm_unpacklo_epi64(sh0,sh1);         //
-    v.val[1] = _mm_unpackhi_epi64(sh0,sh1);         //
-    vst1q_u32(ptr,     v.val[0]);         //store as 128 bit structure
-    vst1q_u32((ptr + 4),  v.val[1]);
+{
+    //0,4,   1,5,  2,6,  3,7
+    __m128i sh0, sh1, val0, val1;
+    sh0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); //0,1,4,5
+    sh1 = _mm_unpacklo_epi32(_pM128i(val->val[2]), _pM128i(val->val[3])); //2,3,6,7
+    val0 = _mm_unpacklo_epi64(sh0,sh1); //
+    val1 = _mm_unpackhi_epi64(sh0,sh1); //
+    vst1q_u32(ptr,     val0); //store as 128 bit structure
+    vst1q_u32((ptr + 4),  val1);
 }
 #define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val)
 
 //void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0]
 _NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val)
 {
-    vst1q_u64(ptr,    val->val[0]);
-    vst1q_u64((ptr + 2), val->val[2]);
+    *(ptr) =  val->val[0].m64_u64[0];
+    *(ptr + 1) =  val->val[1].m64_u64[0];
+    *(ptr + 2) =  val->val[2].m64_u64[0];
+    *(ptr + 3) =  val->val[3].m64_u64[0];
 }
 #define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val)
 
@@ -6547,14 +11580,16 @@
 
 //void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
 _NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val)
-{         //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
-    float32x2x4_t v;
-    v.val[0] = _mm_unpacklo_ps(val->val[0],val->val[1]);
-    v.val[2] = _mm_unpacklo_ps(val->val[2],val->val[3]);
-    v.val[1] = _mm_movelh_ps (v.val[0], v.val[2]);         //a0, c0, a1,c1,
-    v.val[3] = _mm_movehl_ps (v.val[2],v.val[0]);         //b0,d0, b1, d1
-    vst1q_f32(ptr,     v.val[1]);         //store as 128 bit structure
-    vst1q_f32((ptr + 4),  v.val[3]);
+{
+    //0,4,   1,5,  2,6,  3,7 -> 0,1, 2,3, 4,5, 6,7
+    *(ptr) =   val->val[0].m64_f32[0];
+    *(ptr + 1) = val->val[1].m64_f32[0];
+    *(ptr + 2) = val->val[2].m64_f32[0];
+    *(ptr + 3) = val->val[3].m64_f32[0];
+    *(ptr + 4) = val->val[0].m64_f32[1];
+    *(ptr + 5) = val->val[1].m64_f32[1];
+    *(ptr + 6) = val->val[2].m64_f32[1];
+    *(ptr + 7) = val->val[3].m64_f32[1];
 }
 #define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val)
 
@@ -6602,19 +11637,38 @@
     vst1q_lane_f32(ptr, val->val[0], lane);
     vst1q_lane_f32((ptr + 1), val->val[1], lane);
 }
-#define vst2q_lane_f32(ptr, val, lane) vst2q_lane_f32_ptr(ptr, &val, lane)
+#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
 
 //void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
 void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
 #define vst2q_lane_p16 vst2q_lane_s16
 
+//void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
+{
+    *(ptr) = val->val[0].m64_u8[lane];
+    *(ptr + 1) = val->val[1].m64_u8[lane];
+}
+#define vst2_lane_u8(ptr, val, lane) vst2_lane_u8_ptr(ptr, &val, lane)
+
 //void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
-void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16 {d0[0], d1[0]}, [r0]
-#define vst2_lane_u16 vst2q_lane_u16
+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane)
+{
+    *(ptr) = val->val[0].m64_u16[lane];
+    *(ptr + 1) = val->val[1].m64_u16[lane];
+}
+#define vst2_lane_u16(ptr, val, lane) vst2_lane_u16_ptr(ptr, &val, lane)
 
 //void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
-void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32 {d0[0], d1[0]}, [r0]
-#define vst2_lane_u32 vst2q_lane_u32
+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane)
+{
+    *(ptr) = val->val[0].m64_u32[lane];
+    *(ptr + 1) = val->val[1].m64_u32[lane];
+}
+#define vst2_lane_u32(ptr, val, lane) vst2_lane_u32_ptr(ptr, &val, lane)
 
 //void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
 void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);
@@ -6622,17 +11676,22 @@
 
 //void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
 void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);
-#define vst2_lane_s16 vst2q_lane_s16
+#define vst2_lane_s16(ptr, val, lane)  vst2_lane_u16((uint16_t*)ptr, val, lane)
 
 //void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
 void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);
-#define vst2_lane_s32 vst2q_lane_s32
+#define vst2_lane_s32(ptr, val, lane)  vst2_lane_u32((uint32_t*)ptr, val, lane)
 
 //void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
 //current IA SIMD doesn't support float16
 
-void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32 {d0[0], d1[0]}, [r0]
-#define vst2_lane_f32 vst2q_lane_f32
+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane)
+{
+    *(ptr) = val->val[0].m64_f32[lane];
+    *(ptr + 1) = val->val[1].m64_f32[lane];
+}
+#define vst2_lane_f32(ptr,src,lane) vst2_lane_f32_ptr(ptr,&src,lane)
 
 //void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
 #define vst2_lane_p8 vst2_lane_u8
@@ -6677,7 +11736,38 @@
     vst1q_lane_f32((ptr + 1),   val->val[1], lane);
     vst1q_lane_f32((ptr + 2), val->val[2], lane);
 }
-#define vst3q_lane_f32(ptr, val, lane) vst3q_lane_f32_ptr(ptr, &val, lane)
+#define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
+
+//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);
+#define vst3q_lane_p16 vst3q_lane_s16
+
+//void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t* val, __constrange(0,7) int lane)
+{
+    *(ptr) =     val->val[0].m64_u8[lane];
+    *(ptr + 1) = val->val[1].m64_u8[lane];
+    *(ptr + 2) = val->val[2].m64_u8[lane];
+}
+#define vst3_lane_u8(ptr, val, lane) vst3_lane_u8_ptr(ptr, &val, lane)
+
+//void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t* val, __constrange(0,3) int lane)
+{
+    *(ptr) =     val->val[0].m64_u16[lane];
+    *(ptr + 1) = val->val[1].m64_u16[lane];
+    *(ptr + 2) = val->val[2].m64_u16[lane];
+}
+#define vst3_lane_u16(ptr, val, lane) vst3_lane_u16_ptr(ptr, &val, lane)
+
+//void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t* val, __constrange(0,1) int lane)
+{
+    *(ptr) =     val->val[0].m64_u32[lane];
+    *(ptr + 1) = val->val[1].m64_u32[lane];
+    *(ptr + 2) = val->val[2].m64_u32[lane];
+}
+#define vst3_lane_u32(ptr, val, lane) vst3_lane_u32_ptr(ptr, &val, lane)
 
 //void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
 void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);
@@ -6697,7 +11787,13 @@
 
 //void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
 void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);
-#define vst3_lane_f32 vst3q_lane_f32
+_NEON2SSE_INLINE void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane)
+{
+    *(ptr) = val->val[0].m64_f32[lane];
+    *(ptr + 1) = val->val[1].m64_f32[lane];
+    *(ptr + 2) = val->val[2].m64_f32[lane];
+}
+#define vst3_lane_f32(ptr,val,lane) vst3_lane_f32_ptr(ptr,&val,lane)
 
 //void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
 void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);
@@ -6745,7 +11841,7 @@
     vst1q_lane_f32((ptr + 2), val->val[2], lane);
     vst1q_lane_f32((ptr + 3), val->val[3], lane);
 }
-#define vst4q_lane_f32(ptr, val, lane) vst4q_lane_f32_ptr(ptr, &val, lane)
+#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
 
 //void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
 void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
@@ -6754,31 +11850,30 @@
 //void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
 _NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane)
 {
-    vst1q_lane_u8(ptr,   val->val[0], lane);
-    vst1q_lane_u8((ptr + 1),  val->val[1], lane);
-    vst1q_lane_u8((ptr + 2), val->val[2], lane);
-    vst1q_lane_u8((ptr + 3), val->val[3], lane);
+    *(ptr) =     val->val[0].m64_u8[lane];
+    *(ptr + 1) = val->val[1].m64_u8[lane];
+    *(ptr + 2) = val->val[2].m64_u8[lane];
+    *(ptr + 3) = val->val[3].m64_u8[lane];
 }
 #define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane)
 
 //void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
 _NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane)
 {
-    vst1q_lane_u16(ptr,   val->val[0], lane);
-    vst1q_lane_u16((ptr + 1),val->val[1], lane);
-    vst1q_lane_u16((ptr + 2), val->val[2], lane);
-    vst1q_lane_u16((ptr + 3), val->val[3], lane);
+    *(ptr) =     val->val[0].m64_u16[lane];
+    *(ptr + 1) = val->val[1].m64_u16[lane];
+    *(ptr + 2) = val->val[2].m64_u16[lane];
+    *(ptr + 3) = val->val[3].m64_u16[lane];
 }
 #define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane)
 
 //void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
 _NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane)
 {
-    vst1q_lane_u32(ptr,   val->val[0], lane);
-    vst1q_lane_u32((ptr + 1), val->val[1], lane);
-    vst1q_lane_u32((ptr + 2), val->val[2], lane);
-    vst1q_lane_u32((ptr + 3), val->val[3], lane);
-
+    *(ptr) =     val->val[0].m64_u32[lane];
+    *(ptr + 1) = val->val[1].m64_u32[lane];
+    *(ptr + 2) = val->val[2].m64_u32[lane];
+    *(ptr + 3) = val->val[3].m64_u32[lane];
 }
 #define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane)
 
@@ -6795,8 +11890,15 @@
 void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
 //current IA SIMD doesn't support float16
 
-//void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
-#define vst4_lane_f32 vst4q_lane_f32
+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t* val, __constrange(0,1) int lane)
+{
+    *(ptr) = val->val[0].m64_f32[lane];
+    *(ptr + 1) = val->val[1].m64_f32[lane];
+    *(ptr + 2) = val->val[2].m64_f32[lane];
+    *(ptr + 3) = val->val[3].m64_f32[lane];
+}
+#define vst4_lane_f32(ptr,val,lane) vst4_lane_f32_ptr(ptr,&val,lane)
 
 //void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
 void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);
@@ -6810,32 +11912,59 @@
 //************************ Extract lanes from a vector ********************************************
 //**************************************************************************************************
 //These intrinsics extract a single lane (element) from a vector.
+uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+#define vget_lane_u8(vec, lane) vec.m64_u8[lane]
 
-uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
+uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
+#define vget_lane_u16(vec, lane) vec.m64_u16[lane]
+
+
+uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+#define vget_lane_u32(vec, lane) vec.m64_u32[lane]
+
+int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
+#define vget_lane_s8(vec, lane) vec.m64_i8[lane]
+
+int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
+#define vget_lane_s16(vec, lane) vec.m64_i16[lane]
+
+int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+#define vget_lane_s32(vec, lane) vec.m64_i32[lane]
+
+poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
+#define vget_lane_p8 vget_lane_u8
+
+poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
+#define vget_lane_p16 vget_lane_u16
+
+float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
+#define vget_lane_f32(vec, lane) vec.m64_f32[lane]
+
+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
 #define vgetq_lane_u8 _MM_EXTRACT_EPI8
 
-uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.s16 r0, d0[0]
+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
 #define  vgetq_lane_u16 _MM_EXTRACT_EPI16
 
-uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
 #define vgetq_lane_u32 _MM_EXTRACT_EPI32
 
-int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane);         // VMOV.S8 r0, d0[0]
+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
 #define vgetq_lane_s8 vgetq_lane_u8
 
-int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane);         // VMOV.S16 r0, d0[0]
+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
 #define vgetq_lane_s16 vgetq_lane_u16
 
-int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
 #define vgetq_lane_s32 vgetq_lane_u32
 
-poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
 #define vgetq_lane_p8 vgetq_lane_u8
 
-poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.s16 r0, d0[0]
+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
 #define vgetq_lane_p16 vgetq_lane_u16
 
-float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
 _NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
 {
     int32_t ilane;
@@ -6843,10 +11972,17 @@
     return *(float*)&ilane;
 }
 
-int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
+int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+#define vget_lane_s64(vec, lane) vec.m64_i64[0]
+
+uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
+#define vget_lane_u64(vec, lane) vec.m64_u64[0]
+
+
+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
 #define vgetq_lane_s64 (int64_t) vgetq_lane_u64
 
-uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
 #define vgetq_lane_u64 _MM_EXTRACT_EPI64
 
 // ***************** Set lanes within a vector ********************************************
@@ -6854,7 +11990,69 @@
 //These intrinsics set a single lane (element) within a vector.
 //same functions as vld1_lane_xx ones, but take the value to be set directly.
 
-uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
+uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
+{
+    uint8_t val;
+    val = value;
+    return vld1_lane_u8(&val, vec,  lane);
+}
+
+uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
+{
+    uint16_t val;
+    val = value;
+    return vld1_lane_u16(&val, vec,  lane);
+}
+
+uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
+{
+    uint32_t val;
+    val = value;
+    return vld1_lane_u32(&val, vec,  lane);
+}
+
+int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
+{
+    int8_t val;
+    val = value;
+    return vld1_lane_s8(&val, vec,  lane);
+}
+
+int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
+{
+    int16_t val;
+    val = value;
+    return vld1_lane_s16(&val, vec,  lane);
+}
+
+int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
+{
+    int32_t val;
+    val = value;
+    return vld1_lane_s32(&val, vec,  lane);
+}
+
+poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
+#define vset_lane_p8  vset_lane_u8
+
+poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
+#define vset_lane_p16  vset_lane_u16
+
+float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
+{
+    float32_t val;
+    val = value;
+    return vld1_lane_f32(&val, vec,  lane);
+}
+
+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
 _NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
 {
     uint8_t val;
@@ -6862,7 +12060,7 @@
     return vld1q_lane_u8(&val, vec,  lane);
 }
 
-uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
 _NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
 {
     uint16_t val;
@@ -6870,7 +12068,7 @@
     return vld1q_lane_u16(&val, vec,  lane);
 }
 
-uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
 _NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
 {
     uint32_t val;
@@ -6878,7 +12076,7 @@
     return vld1q_lane_u32(&val, vec,  lane);
 }
 
-int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
 _NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
 {
     int8_t val;
@@ -6886,7 +12084,7 @@
     return vld1q_lane_s8(&val, vec,  lane);
 }
 
-int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
 _NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
 {
     int16_t val;
@@ -6894,7 +12092,7 @@
     return vld1q_lane_s16(&val, vec,  lane);
 }
 
-int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
 _NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
 {
     int32_t val;
@@ -6902,20 +12100,37 @@
     return vld1q_lane_s32(&val, vec,  lane);
 }
 
-poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
 #define vsetq_lane_p8 vsetq_lane_u8
 
-poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
 #define vsetq_lane_p16 vsetq_lane_u16
 
-float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
 _NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
 {
     float32_t val;
     val = value;
+    return vld1q_lane_f32(&val, vec,  lane);
 }
 
-int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
+int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
+{
+    int64_t val;
+    val = value;
+    return vld1_lane_s64(&val, vec,  lane);
+}
+
+uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
+_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
+{
+    uint64_t val;
+    val = value;
+    return vld1_lane_u64(&val, vec,  lane);
+}
+
+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
 _NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
 {
     uint64_t val;
@@ -6923,134 +12138,580 @@
     return vld1q_lane_s64(&val, vec,  lane);
 }
 
-uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
 #define vsetq_lane_u64 vsetq_lane_s64
 
 // *******************************************************************************
 // **************** Initialize a vector from bit pattern ***************************
 // *******************************************************************************
 //These intrinsics create a vector from a literal bit pattern.
+int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_s8(a)  (*(__m64_128*)&(a))
 
+
+int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_s16  vcreate_s8
+
+int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_s32  vcreate_s8
+
+float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
 //no IA32 SIMD avalilable
 
+float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_f32(a)  (*(__m64_128*)&(a))
+
+uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u8 vcreate_s8
+
+uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u16 vcreate_s16
+
+uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u32 vcreate_s32
+
+uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_u64  vcreate_s8
+
+
+poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_p8 vcreate_u8
+
+poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_p16 vcreate_u16
+
+int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
+#define vcreate_s64 vcreate_u64
+
 //********************* Set all lanes to same value ********************************
 //*********************************************************************************
 //These intrinsics set all lanes to the same value.
+uint8x8_t   vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t  vdup_n_u8(uint8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint8x8_t res;
+    int i;
+    for (i = 0; i<8; i++) {
+        res.m64_u8[i] = value;
+    }
+    return res;
+}
 
-uint8x16_t   vdupq_n_u8(uint8_t value);         // VDUP.8 q0,r0
+uint16x4_t   vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t  vdup_n_u16(uint16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint16x4_t res;
+    int i;
+    for (i = 0; i<4; i++) {
+        res.m64_u16[i] = value;
+    }
+    return res;
+}
+
+uint32x2_t   vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t  vdup_n_u32(uint32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint32x2_t res;
+    res.m64_u32[0] = value;
+    res.m64_u32[1] = value;
+    return res;
+}
+
+int8x8_t   vdup_n_s8(int8_t value); // VDUP.8 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t  vdup_n_s8(int8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int8x8_t res;
+    int i;
+    for (i = 0; i<8; i++) {
+        res.m64_i8[i] = value;
+    }
+    return res;
+}
+
+int16x4_t   vdup_n_s16(int16_t value); // VDUP.16 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t  vdup_n_s16(int16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int16x4_t res;
+    int i;
+    for (i = 0; i<4; i++) {
+        res.m64_i16[i] = value;
+    }
+    return res;
+}
+
+int32x2_t   vdup_n_s32(int32_t value); // VDUP.32 d0,r0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t  vdup_n_s32(int32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32x2_t res;
+    res.m64_i32[0] = value;
+    res.m64_i32[1] = value;
+    return res;
+}
+
+poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
+#define vdup_n_p8 vdup_n_u8
+
+poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
+#define vdup_n_p16 vdup_n_s16
+
+float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
+_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
+{
+    float32x2_t res;
+    res.m64_f32[0] = value;
+    res.m64_f32[1] = value;
+    return res;
+}
+
+uint8x16_t   vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
 #define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
 
-uint16x8_t   vdupq_n_u16(uint16_t value);         // VDUP.16 q0,r0
+uint16x8_t   vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
 #define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
 
-uint32x4_t   vdupq_n_u32(uint32_t value);         // VDUP.32 q0,r0
+uint32x4_t   vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
 #define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
 
-int8x16_t   vdupq_n_s8(int8_t value);         // VDUP.8 q0,r0
+int8x16_t   vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
 #define vdupq_n_s8 _mm_set1_epi8
 
-int16x8_t   vdupq_n_s16(int16_t value);         // VDUP.16 q0,r0
+int16x8_t   vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
 #define vdupq_n_s16 _mm_set1_epi16
 
-int32x4_t   vdupq_n_s32(int32_t value);         // VDUP.32 q0,r0
+int32x4_t   vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
 #define vdupq_n_s32 _mm_set1_epi32
 
-poly8x16_t vdupq_n_p8(poly8_t value);         // VDUP.8 q0,r0
+poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
 #define  vdupq_n_p8 vdupq_n_u8
 
-poly16x8_t vdupq_n_p16(poly16_t value);         // VDUP.16 q0,r0
+poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
 #define  vdupq_n_p16 vdupq_n_u16
 
-float32x4_t vdupq_n_f32(float32_t value);         // VDUP.32 q0,r0
+float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
 #define vdupq_n_f32 _mm_set1_ps
 
-int64x2_t   vdupq_n_s64(int64_t value);         // VMOV d0,r0,r0
+int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
+_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
+{
+    int64x1_t res;
+    res.m64_i64[0] = value;
+    return res;
+}
+
+uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
+_NEON2SSE_INLINE uint64x1_t  vdup_n_u64(uint64_t value)
+{
+    uint64x1_t res;
+    res.m64_u64[0] = value;
+    return res;
+}
+
+int64x2_t   vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
 _NEON2SSE_INLINE int64x2_t   vdupq_n_s64(int64_t value)
 {
-    _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value};         //value may be an immediate
+    _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
     return LOAD_SI128(value2);
 }
 
-uint64x2_t   vdupq_n_u64(uint64_t value);         // VMOV d0,r0,r0
+uint64x2_t   vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
 _NEON2SSE_INLINE uint64x2_t   vdupq_n_u64(uint64_t value)
 {
-    _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value};         //value may be an immediate
+    _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
     return LOAD_SI128(val);
 }
 
 //****  Set all lanes to same value  ************************
 //Same functions as above - just aliaces.********************
 //Probably they reflect the fact that 128-bit functions versions use VMOV instruction **********
+uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
+#define vmov_n_u8 vdup_n_s8
 
-uint8x16_t vmovq_n_u8(uint8_t value);         // VDUP.8 q0,r0
+uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
+#define vmov_n_u16 vdup_n_s16
+
+uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
+#define vmov_n_u32 vdup_n_u32
+
+int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
+#define vmov_n_s8 vdup_n_s8
+
+int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
+#define vmov_n_s16 vdup_n_s16
+
+int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
+#define vmov_n_s32 vdup_n_s32
+
+poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
+#define vmov_n_p8 vdup_n_u8
+
+poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
+#define vmov_n_p16 vdup_n_s16
+
+float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
+#define vmov_n_f32 vdup_n_f32
+
+uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
 #define vmovq_n_u8 vdupq_n_u8
 
-uint16x8_t vmovq_n_u16(uint16_t value);         // VDUP.16 q0,r0
+uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
 #define vmovq_n_u16 vdupq_n_s16
 
-uint32x4_t vmovq_n_u32(uint32_t value);         // VDUP.32 q0,r0
+uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
 #define vmovq_n_u32 vdupq_n_u32
 
-int8x16_t vmovq_n_s8(int8_t value);         // VDUP.8 q0,r0
+int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
 #define vmovq_n_s8 vdupq_n_s8
 
-int16x8_t vmovq_n_s16(int16_t value);         // VDUP.16 q0,r0
+int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
 #define vmovq_n_s16 vdupq_n_s16
 
-int32x4_t vmovq_n_s32(int32_t value);         // VDUP.32 q0,r0
+int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
 #define vmovq_n_s32 vdupq_n_s32
 
-poly8x16_t vmovq_n_p8(poly8_t value);         // VDUP.8 q0,r0
+poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
 #define vmovq_n_p8 vdupq_n_u8
 
-poly16x8_t vmovq_n_p16(poly16_t value);         // VDUP.16 q0,r0
+poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
 #define vmovq_n_p16 vdupq_n_s16
 
-float32x4_t vmovq_n_f32(float32_t value);         // VDUP.32 q0,r0
+float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
 #define vmovq_n_f32 vdupq_n_f32
 
-int64x2_t vmovq_n_s64(int64_t value);         // VMOV d0,r0,r0
+int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
+#define vmov_n_s64 vdup_n_s64
+
+uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
+#define vmov_n_u64 vdup_n_u64
+
+int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
 #define vmovq_n_s64 vdupq_n_s64
 
-uint64x2_t vmovq_n_u64(uint64_t value);         // VMOV d0,r0,r0
+uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
 #define vmovq_n_u64 vdupq_n_u64
 
 //**************Set all lanes to the value of one lane of a vector *************
 //****************************************************************************
 //here shuffle is better solution than lane extraction followed by set1 function
+uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
+{
+    uint8x8_t res;
+    uint8_t valane;
+    int i = 0;
+    valane = vec.m64_u8[lane];
+    for (i = 0; i<8; i++) {
+        res.m64_u8[i] = valane;
+    }
+    return res;
+}
+
+uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
+{
+    uint16x4_t res;
+    uint16_t valane;
+    valane = vec.m64_u16[lane];
+    res.m64_u16[0] = valane;
+    res.m64_u16[1] = valane;
+    res.m64_u16[2] = valane;
+    res.m64_u16[3] = valane;
+    return res;
+}
+
+uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
+{
+    uint32x2_t res;
+    res.m64_u32[0] = vec.m64_u32[lane];
+    res.m64_u32[1] = res.m64_u32[0];
+    return res;
+}
+
+int8x8_t vdup_lane_s8(int8x8_t vec,  __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+#define vdup_lane_s8 vdup_lane_u8
+
+int16x4_t vdup_lane_s16(int16x4_t vec,  __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+#define vdup_lane_s16 vdup_lane_u16
+
+int32x2_t vdup_lane_s32(int32x2_t vec,  __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+#define vdup_lane_s32 vdup_lane_u32
+
+poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
+#define vdup_lane_p8 vdup_lane_u8
+
+poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
+#define vdup_lane_p16 vdup_lane_s16
+
+float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
+_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
+{
+    float32x2_t res;
+    res.m64_f32[0] = vec.m64_f32[lane];
+    res.m64_f32[1] = res.m64_f32[0];
+    return res;
+}
+
+uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
+{
+    _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane};
+    return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
+}
+
+uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
+{
+    //we could use 8bit shuffle for 16 bit as well
+    const int8_t lane16 = ((int8_t) lane) << 1;
+    _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1,
+                                                lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1};
+    return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
+}
+
+uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+#define vdupq_lane_u32(vec,  lane) _mm_shuffle_epi32 (_pM128i(vec),  lane | (lane << 2) | (lane << 4) | (lane << 6))
+
+int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+#define vdupq_lane_s8 vdupq_lane_u8
+
+int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+#define vdupq_lane_s16 vdupq_lane_u16
+
+int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+#define vdupq_lane_s32 vdupq_lane_u32
+
+poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
+#define vdupq_lane_p8 vdupq_lane_u8
+
+poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
+#define vdupq_lane_p16 vdupq_lane_s16
+
+float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
+#define  vdupq_lane_f32(vec, lane)  _mm_load1_ps((vec.m64_f32 + lane))
+
+int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+#define vdup_lane_s64(vec,lane) vec
+
+uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
+#define vdup_lane_u64(vec,lane) vec
+
+int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
+{
+    __m128i vec128;
+    vec128 = _pM128i(vec);
+    return _mm_unpacklo_epi64(vec128,vec128);
+}
+
+uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
+#define vdupq_lane_u64 vdupq_lane_s64
 
 // ********************************************************************
 // ********************  Combining vectors *****************************
 // ********************************************************************
 //These intrinsics join two 64 bit vectors into a single 128bit vector.
+int8x16_t   vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
+#define vcombine_s8(low, high)   _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
 
+int16x8_t   vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
+#define vcombine_s16(low, high)    _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
+
+int32x4_t   vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
+#define vcombine_s32(low, high)   _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
+
+int64x2_t   vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
+#define vcombine_s64(low, high)   _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
+
+float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
 //current IA SIMD doesn't support float16
 
+float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
+_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
+{
+    __m128i res;
+    res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
+    return _M128(res);
+}
+
+uint8x16_t   vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
+#define vcombine_u8 vcombine_s8
+
+uint16x8_t   vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
+#define vcombine_u16 vcombine_s16
+
+uint32x4_t   vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
+#define vcombine_u32 vcombine_s32
+
+uint64x2_t   vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
+#define vcombine_u64 vcombine_s64
+
+poly8x16_t   vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
+#define vcombine_p8 vcombine_u8
+
+poly16x8_t   vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
+#define vcombine_p16 vcombine_u16
+
 //**********************************************************************
 //************************* Splitting vectors **************************
 //**********************************************************************
 //**************** Get high part ******************************************
 //These intrinsics split a 128 bit vector into 2 component 64 bit vectors
+int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _mm_unpackhi_epi64(a,a); //SSE2
+    return64(res);
+}
 
+int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res =  _mm_unpackhi_epi64(a,a); //SSE2
+    return64(res);
+}
+
+int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res =  _mm_unpackhi_epi64(a,a); //SSE2
+    return64(res);
+}
+
+int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
+{
+    int64x1_t res64;
+    __m128i res;
+    res =  _mm_unpackhi_epi64(a,a); //SSE2
+    return64(res);
+}
+
+float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
 // IA32 SIMD doesn't work with 16bit floats currently
 
+float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
+{
+    __m128i res;
+    __m64_128 res64;
+    res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
+    return64(res);
+}
+
+uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
+#define vget_high_u8 vget_high_s8
+
+uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
+#define vget_high_u16 vget_high_s16
+
+uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
+#define vget_high_u32 vget_high_s32
+
+uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
+#define vget_high_u64 vget_high_s64
+
+poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
+#define vget_high_p8 vget_high_u8
+
+poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
+#define vget_high_p16 vget_high_u16
+
 //********************** Get low part **********************
 //**********************************************************
+int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
+{
+    int16x4_t res64;
+    return64(a);
+}
 
+int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
+{
+    int16x4_t res64;
+    return64(a);
+}
+
+int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
+{
+    int32x2_t res64;
+    return64(a);
+}
+
+int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
+_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
+{
+    int64x1_t res64;
+    return64 (a);
+}
+
+float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
 // IA32 SIMD doesn't work with 16bit floats currently
 
+float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
+_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
+{
+    float32x2_t res64;
+    _M64f(res64, a);
+    return res64;
+}
+
+uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
+#define vget_low_u8 vget_low_s8
+
+uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
+#define vget_low_u16 vget_low_s16
+
+uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
+#define vget_low_u32 vget_low_s32
+
+uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
+#define vget_low_u64 vget_low_s64
+
+poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
+#define vget_low_p8 vget_low_u8
+
+poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
+#define vget_low_p16 vget_low_s16
+
 //**************************************************************************
 //************************ Converting vectors **********************************
 //**************************************************************************
 //************* Convert from float ***************************************
 // need to set _MM_SET_ROUNDING_MODE ( x) accordingly
+int32x2_t   vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
+_NEON2SSE_INLINE int32x2_t   vcvt_s32_f32(float32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res =  _mm_cvttps_epi32(_pM128(a)); //use low 64 bits of result only
+    return64(res);
+}
 
-int32x4_t   vcvtq_s32_f32(float32x4_t a);         // VCVT.S32.F32 q0, q0
-#define vcvtq_s32_f32 _mm_cvtps_epi32
+uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
+_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
+{
+    //may be not effective compared with a serial SIMD solution
+    uint32x2_t res64;
+    __m128i res;
+    res = vcvtq_u32_f32(_pM128(a));
+    return64(res);
+}
 
-uint32x4_t vcvtq_u32_f32(float32x4_t a);         // VCVT.U32.F32 q0, q0
-_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a)         // VCVT.U32.F32 q0, q0
-{         //No single instruction SSE solution  but we could implement it as following:
+int32x4_t   vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
+#define vcvtq_s32_f32 _mm_cvttps_epi32
+
+uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
+_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
+{
+    //No single instruction SSE solution  but we could implement it as following:
     __m128i resi;
     __m128 zero,  mask, a_pos, mask_f_max_si, res;
     _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
@@ -7058,30 +12719,85 @@
     mask = _mm_cmpgt_ps(a, zero);
     a_pos = _mm_and_ps(a, mask);
     mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff);
-    res =  _mm_sub_ps(a_pos, mask_f_max_si);         //if the input fits to signed we don't subtract anything
-    resi = _mm_cvtps_epi32(res);
+    res =  _mm_sub_ps(a_pos, mask_f_max_si); //if the input fits to signed we don't subtract anything
+    resi = _mm_cvttps_epi32(res);
     return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si);
 }
 
 // ***** Convert to the fixed point  with   the number of fraction bits specified by b ***********
 //*************************************************************************************************
-//Intel SIMD doesn't support fixed point
+int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
+_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
+{
+    int32x2_t res64;
+    return64(vcvtq_n_s32_f32(_pM128(a),b));
+}
 
-int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.S32.F32 q0, q0, #32
-uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.U32.F32 q0, q0, #32
+uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
+_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
+{
+    uint32x2_t res;
+    float convconst;
+    convconst = (float)((uint32_t)1 << b);
+    res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
+    res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
+    return res;
+}
+
+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
+_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
+{
+    float convconst;
+    _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    __m128 cconst128;
+    __m128i mask, res;
+    convconst = (float)(1 << b);
+    cconst128 = vdupq_n_f32(convconst);
+    res =  _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
+    mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
+    return _mm_xor_si128 (res,  mask); //res saturated for 0x80000000
+}
+
+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
+_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
+{
+    float convconst;
+    __m128 cconst128;
+    convconst = (float)(1 << b);
+    cconst128 = vdupq_n_f32(convconst);
+    return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
+}
 
 //***************** Convert to float *************************
 //*************************************************************
+float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
+_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
+{
+    float32x2_t res;
+    res.m64_f32[0] = (float) a.m64_i32[0];
+    res.m64_f32[1] = (float) a.m64_i32[1];
+    return res;
+}
 
-float32x4_t vcvtq_f32_s32(int32x4_t a);         // VCVT.F32.S32 q0, q0
+float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
+_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
+{
+    float32x2_t res;
+    res.m64_f32[0] = (float) a.m64_u32[0];
+    res.m64_f32[1] = (float) a.m64_u32[1];
+    return res;
+}
+
+float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
 #define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
 
-float32x4_t vcvtq_f32_u32(uint32x4_t a);         // VCVT.F32.U32 q0, q0
-_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a)         // VCVT.F32.U32 q0, q0
-{         //solution may be not optimal
+float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
+_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
+{
+    //solution may be not optimal
     __m128 two16, fHi, fLo;
     __m128i hi, lo;
-    two16 = _mm_set1_ps((float)0x10000);         //2^16
+    two16 = _mm_set1_ps((float)0x10000); //2^16
     // Avoid double rounding by doing two exact conversions
     // of high and low 16-bit segments
     hi = _mm_srli_epi32(a, 16);
@@ -7092,24 +12808,228 @@
     return _mm_add_ps(fHi, fLo);
 }
 
+// ***** Convert to the float from fixed point  with   the number of fraction bits specified by b ***********
+float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
+{
+    float32x2_t res;
+    float convconst;
+    convconst = (float)(1. / ((uint32_t)1 << b));
+    res.m64_f32[0] =  a.m64_i32[0] * convconst;
+    res.m64_f32[1] = a.m64_i32[1] * convconst;
+    return res;
+}
+
+float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
+_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
+{
+    float32x2_t res;
+    float convconst;
+    convconst = (float)(1. / ((uint32_t)1 << b));
+    res.m64_f32[0] =  a.m64_u32[0] * convconst;
+    res.m64_f32[1] = a.m64_u32[1] * convconst;
+    return res;
+}
+
+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
+{
+    float convconst;
+    __m128 cconst128, af;
+    convconst = (float)(1. / ((uint32_t)1 << b));
+    af = _mm_cvtepi32_ps(a);
+    cconst128 = vdupq_n_f32(convconst);
+    return _mm_mul_ps(af,cconst128);
+}
+
+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
+_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
+{
+    float convconst;
+    __m128 cconst128, af;
+    convconst = (float)(1. / (1 << b));
+    af = vcvtq_f32_u32(a);
+    cconst128 = vdupq_n_f32(convconst);
+    return _mm_mul_ps(af,cconst128);
+}
+
 //**************Convert between floats ***********************
 //************************************************************
-
+float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
 //Intel SIMD doesn't support 16bits floats curently
 
+float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
 //Intel SIMD doesn't support 16bits floats curently, the only solution is to store 16bit floats and load as 32 bits
 
 //************Vector narrow integer conversion (truncation) ******************
 //****************************************************************************
+int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
+_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
+{
+    int8x8_t res64;
+    __m128i res;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
+    return64(res);
+}
+
+int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
+_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
+{
+    int16x4_t res64;
+    __m128i res;
+    _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9,  12,13,  2,3, 6,7,10,11,14,15};
+    res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
+    return64(res);
+}
+
+int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
+_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
+{
+    //may be not effective compared with a serial implementation
+    int32x2_t res64;
+    __m128i res;
+    res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
+    return64(res);
+}
+
+uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
+#define vmovn_u16 vmovn_s16
+
+uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
+#define vmovn_u32 vmovn_s32
+
+uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
+#define vmovn_u64 vmovn_s64
 
 //**************** Vector long move   ***********************
 //***********************************************************
+int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
+#define vmovl_s8(a) _MM_CVTEPI8_EPI16(_pM128i(a)) //SSE4.1
+
+int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
+#define vmovl_s16(a) _MM_CVTEPI16_EPI32(_pM128i(a)) //SSE4.1
+
+int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
+#define vmovl_s32(a)  _MM_CVTEPI32_EPI64(_pM128i(a)) //SSE4.1
+
+uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
+#define vmovl_u8(a) _MM_CVTEPU8_EPI16(_pM128i(a)) //SSE4.1
+
+uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
+#define vmovl_u16(a) _MM_CVTEPU16_EPI32(_pM128i(a)) //SSE4.1
+
+uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
+#define vmovl_u32(a)  _MM_CVTEPU32_EPI64(_pM128i(a)) //SSE4.1
 
 //*************Vector saturating narrow integer*****************
 //**************************************************************
+int8x8_t   vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
+_NEON2SSE_INLINE int8x8_t   vqmovn_s16(int16x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _mm_packs_epi16(a, a);
+    return64(res);
+}
 
+int16x4_t   vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
+_NEON2SSE_INLINE int16x4_t   vqmovn_s32(int32x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = _mm_packs_epi32(a, a);
+    return64(res);
+}
+
+int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
+{
+    int32x2_t res;
+    _NEON2SSE_ALIGN_16 int64_t atmp[2];
+    _mm_store_si128((__m128i*)atmp, a);
+    if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
+    if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
+    if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
+    if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
+    res.m64_i32[0] = (int32_t)atmp[0];
+    res.m64_i32[1] = (int32_t)atmp[1];
+    return res;
+}
+
+uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
+_NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
+{
+    //no uint16 to uint8 conversion in SSE, need truncate to max signed first
+    uint8x8_t res64;
+    __m128i c7fff, a_trunc;
+    c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
+    a_trunc =  _mm_and_si128(a,  c7fff); // a truncated to max signed
+    a_trunc =  _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
+    return64(a_trunc);
+}
+
+uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
+_NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
+{
+    //no uint32 to uint16 conversion in SSE, need truncate to max signed first
+    uint16x4_t res64;
+    __m128i c7fffffff, a_trunc;
+    c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31-th bit set to zero
+    a_trunc =  _mm_and_si128(a,  c7fffffff); // a truncated to max signed
+    a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
+    return64(a_trunc);
+}
+
+uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
+_NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
+{
+    //serial solution may be faster
+    uint32x2_t res64;
+    __m128i res_hi, mask;
+    mask = _mm_setzero_si128();
+    res_hi = _mm_srli_epi64(a, 32);
+    res_hi = _mm_cmpeq_epi32(res_hi, mask);
+    mask = _mm_cmpeq_epi32(mask,mask); //all fff
+    mask = _mm_andnot_si128(res_hi,mask); //inverst res_hi to get >32 bits numbers
+    res_hi = _mm_or_si128(a, mask);
+    res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(res_hi);
+}
 //************* Vector saturating narrow integer signed->unsigned **************
 //*****************************************************************************
+uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
+_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
+{
+    uint8x8_t res64;
+    __m128i res;
+    res = _mm_packus_epi16(a, a); //use low 64bits only
+    return64(res);
+}
+
+uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
+_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
+{
+    uint16x4_t res64;
+    __m128i res;
+    res = _MM_PACKUS1_EPI32(a); //use low 64bits only
+    return64(res);
+}
+
+uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
+_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
+{
+    uint32x2_t res64;
+    __m128i res_hi,res_lo, zero, cmp;
+    zero = _mm_setzero_si128();
+    res_hi = _mm_srli_epi64(a,  32);
+    cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
+    res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
+    cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive
+    res_lo =  _mm_or_si128(res_lo, cmp); //if cmp positive we are out of 32bits need to saturaate to 0xffffffff
+    res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
+    return64(res_lo);
+}
 
 // ********************************************************
 // **************** Table look up **************************
@@ -7117,17 +13037,218 @@
 //VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
 //in a table and generate a new vector. Indexes out of range return 0.
 //for Intel SIMD we need to set the MSB to 1 for zero return
+uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    __m128i c7, maskgt, bmask, b128;
+    c7 = _mm_set1_epi8 (7);
+    b128 = _pM128i(b);
+    maskgt = _mm_cmpgt_epi8(b128,c7);
+    bmask = _mm_or_si128(b128,maskgt);
+    bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
+    return64(bmask);
+}
+
+int8x8_t vtbl1_s8(int8x8_t a,  int8x8_t b); // VTBL.8 d0, {d0}, d0
+#define vtbl1_s8 vtbl1_u8
+
+poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
+#define vtbl1_p8 vtbl1_u8
 
 //Special trick to avoid __declspec(align('8')) won't be aligned" error
+//uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b)
+{
+    uint8x8_t res64;
+    __m128i c15, a01, maskgt15, bmask, b128;
+    c15 = _mm_set1_epi8 (15);
+    b128 = _pM128i(b);
+    maskgt15 = _mm_cmpgt_epi8(b128,c15);
+    bmask = _mm_or_si128(b128, maskgt15);
+    a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1]));
+    a01 =  _mm_shuffle_epi8(a01, bmask);
+    return64(a01);
+}
+#define vtbl2_u8(a, b) vtbl2_u8_ptr(&a, b)
+
+//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+#define vtbl2_s8 vtbl2_u8
+
+//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
+#define vtbl2_p8 vtbl2_u8
 
 //Special trick to avoid __declspec(align('16')) won't be aligned" error
+//uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl3_u8_ptr(uint8x8x3_t* a, uint8x8_t b)
+{
+    //solution may be not optimal
+    uint8x8_t res64;
+    __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
+    c15 = _mm_set1_epi8 (15);
+    c23 = _mm_set1_epi8 (23);
+    b128 = _pM128i(b);
+    maskgt23 = _mm_cmpgt_epi8(b128,c23);
+    bmask = _mm_or_si128(b128, maskgt23);
+    maskgt15 = _mm_cmpgt_epi8(b128,c15);
+    a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
+    sh0 =  _mm_shuffle_epi8(a01, bmask);
+    sh1 =  _mm_shuffle_epi8(_pM128i(a->val[2]), bmask); //for bi>15 bi is wrapped (bi-=15)
+    sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
+    return64(sh0);
+}
+#define vtbl3_u8(a,b) vtbl3_u8_ptr(&a,b)
+
+//int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+int8x8_t vtbl3_s8_ptr(int8x8x3_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+#define vtbl3_s8 vtbl3_u8
+
+//poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+poly8x8_t vtbl3_p8_ptr(poly8x8x3_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
+#define vtbl3_p8 vtbl3_u8
+
+//uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+_NEON2SSE_INLINE uint8x8_t vtbl4_u8_ptr(uint8x8x4_t* a, uint8x8_t b)
+{
+    //solution may be not optimal
+    uint8x8_t res64;
+    __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
+    c15 = _mm_set1_epi8 (15);
+    c31 = _mm_set1_epi8 (31);
+    b128 = _pM128i(b);
+    maskgt31 = _mm_cmpgt_epi8(b128,c31);
+    bmask = _mm_or_si128(b128, maskgt31);
+    maskgt15 = _mm_cmpgt_epi8(b128,c15);
+    a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
+    a23 = _mm_unpacklo_epi64(_pM128i(a->val[2]),_pM128i(a->val[3]));
+    sh0 =  _mm_shuffle_epi8(a01, bmask);
+    sh1 =  _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=15)
+    sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
+    return64(sh0);
+}
+#define vtbl4_u8(a,b) vtbl4_u8_ptr(&a,b)
+
+//int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+int8x8_t vtbl4_s8_ptr(int8x8x4_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+#define vtbl4_s8 vtbl4_u8
+
+//poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+poly8x8_t vtbl4_p8_ptr(poly8x8x4_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
+#define vtbl4_p8 vtbl4_u8
 
 //****************** Extended table look up intrinsics ***************************
 //**********************************************************************************
 //VTBX (Vector Table Extension) works in the same way as VTBL do,
 // except that indexes out of range leave the destination element unchanged.
 
+uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
+{
+    uint8x8_t res64;
+    __m128i c7, maskgt, sh, c128;
+    c7 = _mm_set1_epi8 (7);
+    c128 = _pM128i(c);
+    maskgt = _mm_cmpgt_epi8(c128,c7);
+    c7 = _mm_and_si128(maskgt,_pM128i(a));
+    sh = _mm_shuffle_epi8(_pM128i(b),c128);
+    sh = _mm_andnot_si128(maskgt,sh);
+    sh =  _mm_or_si128(sh,c7);
+    return64(sh);
+}
+
+int8x8_t vtbx1_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
+#define vtbx1_s8 vtbx1_u8
+
+poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
+#define vtbx1_p8 vtbx1_u8
+
 //Special trick to avoid __declspec(align('8')) won't be aligned" error
+//uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c)
+{
+    uint8x8_t res64;
+    __m128i c15, b01, maskgt15, sh, c128;
+    c15 = _mm_set1_epi8 (15);
+    c128 = _pM128i(c);
+    maskgt15 = _mm_cmpgt_epi8(c128, c15);
+    c15 = _mm_and_si128(maskgt15, _pM128i(a));
+    b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1]));
+    sh =  _mm_shuffle_epi8(b01, c128);
+    sh = _mm_andnot_si128(maskgt15, sh);
+    sh =  _mm_or_si128(sh,c15);
+    return64(sh);
+}
+#define vtbx2_u8(a, b, c) vtbx2_u8_ptr(a, &b, c)
+
+//int8x8_t vtbx2_s8(int8x8_t a,  int8x8x2_t b, int8x8_t c);  // VTBX.8 d0, {d0, d1}, d0
+#define vtbx2_s8 vtbx2_u8
+
+//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
+#define vtbx2_p8 vtbx2_u8
+
+//uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t* b, uint8x8_t c)
+{
+    //solution may be not optimal
+    uint8x8_t res64;
+    __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128;
+    c15 = _mm_set1_epi8 (15);
+    c23 = _mm_set1_epi8 (23);
+    c128 = _pM128i(c);
+    maskgt15 = _mm_cmpgt_epi8(c128,c15);
+    maskgt23 = _mm_cmpgt_epi8(c128,c23);
+    c23 = _mm_and_si128(maskgt23, _pM128i(a));
+    b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
+    sh0 =  _mm_shuffle_epi8(b01, c128);
+    sh1 =  _mm_shuffle_epi8(_pM128i(b->val[2]), c128); //for bi>15 bi is wrapped (bi-=15)
+    sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
+    sh0 = _mm_andnot_si128(maskgt23,sh0);
+    sh0 = _mm_or_si128(sh0,c23);
+    return64(sh0);
+}
+#define vtbx3_u8(a, b, c) vtbx3_u8_ptr(a, &b, c)
+
+//int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t* b, int8x8_t c);
+#define vtbx3_s8 vtbx3_u8
+
+//poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
+poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t* b, uint8x8_t c);
+#define vtbx3_p8 vtbx3_u8
+
+//uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2, d3}, d0
+_NEON2SSE_INLINE uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t* b, uint8x8_t c)
+{
+    //solution may be not optimal
+    uint8x8_t res64;
+    __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
+    c15 = _mm_set1_epi8 (15);
+    c31 = _mm_set1_epi8 (31);
+    c128 = _pM128i(c);
+    maskgt15 = _mm_cmpgt_epi8(c128,c15);
+    maskgt31 = _mm_cmpgt_epi8(c128,c31);
+    c31 = _mm_and_si128(maskgt31, _pM128i(a));
+
+    b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
+    b23 = _mm_unpacklo_epi64(_pM128i(b->val[2]),_pM128i(b->val[3]));
+    sh0 =  _mm_shuffle_epi8(b01, c128);
+    sh1 =  _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=15)
+    sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
+    sh0 = _mm_andnot_si128(maskgt31,sh0);
+    sh0 =  _mm_or_si128(sh0,c31);
+    return64(sh0);
+}
+#define vtbx4_u8(a, b, c) vtbx4_u8_ptr(a, &b, c)
+
+//int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t* b, int8x8_t c);
+#define vtbx4_s8 vtbx4_u8
+
+//poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
+poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t* b, uint8x8_t c);
+#define vtbx4_p8 vtbx4_u8
 
 //*************************************************************************************************
 // *************************** Operations with a scalar value *********************************
@@ -7135,83 +13256,603 @@
 
 //******* Vector multiply accumulate by scalar *************************************************
 //**********************************************************************************************
+int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
+{
+    int16_t c;
+    int16x4_t scalar;
+    c = vget_lane_s16(v, l);
+    scalar = vdup_n_s16(c);
+    return vmla_s16(a, b, scalar);
+}
+
+int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
+{
+    int32_t c;
+    int32x2_t scalar;
+    c = vget_lane_s32(v, l);
+    scalar = vdup_n_s32(c);
+    return vmla_s32(a, b, scalar);
+}
+
+uint16x4_t vmla_lane_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
+#define vmla_lane_u16 vmla_lane_s16
+
+
+uint32x2_t vmla_lane_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
+#define vmla_lane_u32 vmla_lane_s32
+
+float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
+{
+    float32_t vlane;
+    float32x2_t c;
+    vlane = vget_lane_f32(v, l);
+    c = vdup_n_f32(vlane);
+    return vmla_f32(a,b,c);
+}
+
+int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
+{
+    int16_t vlane;
+    int16x8_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdupq_n_s16(vlane);
+    return vmlaq_s16(a,b,c);
+}
+
+int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
+{
+    int32_t vlane;
+    int32x4_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdupq_n_s32(vlane);
+    return vmlaq_s32(a,b,c);
+}
+
+uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
+#define vmlaq_lane_u16 vmlaq_lane_s16
+
+uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
+#define vmlaq_lane_u32 vmlaq_lane_s32
+
+float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
+{
+    float32_t vlane;
+    float32x4_t c;
+    vlane = vget_lane_f32(v, l);
+    c = vdupq_n_f32(vlane);
+    return vmlaq_f32(a,b,c);
+}
 
 //***************** Vector widening multiply accumulate by scalar **********************
 //***************************************************************************************
+int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
+{
+    int16_t vlane;
+    int16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vmlal_s16(a, b, c);
+}
+
+int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
+{
+    int32_t vlane;
+    int32x2_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdup_n_s32(vlane);
+    return vmlal_s32(a, b, c);
+}
+
+uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
+{
+    uint16_t vlane;
+    uint16x4_t c;
+    vlane = vget_lane_u16(v, l);
+    c = vdup_n_u16(vlane);
+    return vmlal_u16(a, b, c);
+}
+
+uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
+{
+    uint32_t vlane;
+    uint32x2_t c;
+    vlane = vget_lane_u32(v, l);
+    c = vdup_n_u32(vlane);
+    return vmlal_u32(a, b, c);
+}
 
 // ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
 // ************************************************************************************************
+int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
+{
+    int16_t vlane;
+    int16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vqdmlal_s16(a, b, c);
+}
+
+int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
+{
+    int32_t vlane;
+    uint32x2_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdup_n_s32(vlane);
+    return vqdmlal_s32(a, b, c);
+}
 
 // ****** Vector multiply subtract by scalar *****************
 // *************************************************************
+int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
+{
+    int16_t vlane;
+    int16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vmls_s16(a, b, c);
+}
+
+int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
+{
+    int32_t vlane;
+    int32x2_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdup_n_s32(vlane);
+    return vmls_s32(a, b, c);
+}
+
+uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
+{
+    uint16_t vlane;
+    uint16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vmls_s16(a, b, c);
+}
+
+uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
+{
+    uint32_t vlane;
+    uint32x2_t c;
+    vlane = vget_lane_u32(v, l);
+    c = vdup_n_u32(vlane);
+    return vmls_u32(a, b, c);
+}
+
+float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
+{
+    float32_t vlane;
+    float32x2_t c;
+    vlane = (float) vget_lane_f32(v, l);
+    c = vdup_n_f32(vlane);
+    return vmls_f32(a,b,c);
+}
+
+int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
+{
+    int16_t vlane;
+    int16x8_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdupq_n_s16(vlane);
+    return vmlsq_s16(a, b,c);
+}
+
+int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
+{
+    int32_t vlane;
+    int32x4_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdupq_n_s32(vlane);
+    return vmlsq_s32(a,b,c);
+}
+
+uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
+{
+    uint16_t vlane;
+    uint16x8_t c;
+    vlane = vget_lane_u16(v, l);
+    c = vdupq_n_u16(vlane);
+    return vmlsq_u16(a,b,c);
+}
+
+uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
+{
+    uint32_t vlane;
+    uint32x4_t c;
+    vlane = vget_lane_u32(v, l);
+    c = vdupq_n_u32(vlane);
+    return vmlsq_u32(a,b,c);
+}
+
+float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
+{
+    float32_t vlane;
+    float32x4_t c;
+    vlane = (float) vget_lane_f32(v, l);
+    c = vdupq_n_f32(vlane);
+    return vmlsq_f32(a,b,c);
+}
 
 // **** Vector widening multiply subtract by scalar ****
 // ****************************************************
+int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
+{
+    int16_t vlane;
+    int16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vmlsl_s16(a, b, c);
+}
+
+int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
+{
+    int32_t vlane;
+    int32x2_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdup_n_s32(vlane);
+    return vmlsl_s32(a, b, c);
+}
+
+uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0]
+{
+    uint16_t vlane;
+    uint16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vmlsl_s16(a, b, c);
+}
+
+uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
+{
+    uint32_t vlane;
+    uint32x2_t c;
+    vlane = vget_lane_u32(v, l);
+    c = vdup_n_u32(vlane);
+    return vmlsl_u32(a, b, c);
+}
 
 //********* Vector widening saturating doubling multiply subtract by scalar **************************
 //******************************************************************************************************
+int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
+{
+    int16_t vlane;
+    int16x4_t c;
+    vlane = vget_lane_s16(v, l);
+    c = vdup_n_s16(vlane);
+    return vqdmlsl_s16(a, b, c);
+}
 
+int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32_t vlane;
+    int32x2_t c;
+    vlane = vget_lane_s32(v, l);
+    c = vdup_n_s32(vlane);
+    return vqdmlsl_s32(a, b, c);
+}
 //********** Vector multiply with scalar *****************************
+int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
+{
+    int16x4_t b16x4;
+    b16x4 = vdup_n_s16(b);
+    return vmul_s16(a, b16x4);
+}
 
-int16x8_t vmulq_n_s16(int16x8_t a, int16_t b);         // VMUL.I16 q0,q0,d0[0]
-_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b)         // VMUL.I16 q0,q0,d0[0]
+int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
+_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
+{
+    //serial solution looks faster
+    int32x2_t b32x2;
+    b32x2 = vdup_n_s32(b);
+    return vmul_s32(a, b32x2);
+}
+
+float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
+_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
+{
+    float32x2_t b32x2;
+    b32x2 = vdup_n_f32(b);
+    return vmul_f32(a, b32x2);
+}
+
+uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
+_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
+{
+    uint16x4_t b16x4;
+    b16x4 = vdup_n_s16(b);
+    return vmul_s16(a, b16x4);
+}
+
+uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
+_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
+{
+    //serial solution looks faster
+    uint32x2_t b32x2;
+    b32x2 = vdup_n_u32(b);
+    return vmul_u32(a, b32x2);
+}
+
+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
 {
     int16x8_t b16x8;
     b16x8 = vdupq_n_s16(b);
     return vmulq_s16(a, b16x8);
 }
 
-int32x4_t vmulq_n_s32(int32x4_t a, int32_t b);         // VMUL.I32 q0,q0,d0[0]
-_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b)         // VMUL.I32 q0,q0,d0[0]
+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
+_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
 {
     int32x4_t b32x4;
     b32x4 = vdupq_n_s32(b);
     return vmulq_s32(a, b32x4);
 }
 
-float32x4_t vmulq_n_f32(float32x4_t a, float32_t b);         // VMUL.F32 q0,q0,d0[0]
-_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b)         // VMUL.F32 q0,q0,d0[0]
+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
+_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
 {
     float32x4_t b32x4;
     b32x4 = vdupq_n_f32(b);
     return vmulq_f32(a, b32x4);
 }
 
-uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b);         // VMUL.I16 q0,q0,d0[0]
-_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b)         // VMUL.I16 q0,q0,d0[0]
+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
+_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
 {
     uint16x8_t b16x8;
     b16x8 = vdupq_n_s16(b);
     return vmulq_s16(a, b16x8);
 }
 
-uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b);         // VMUL.I32 q0,q0,d0[0]
-_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b)         // VMUL.I32 q0,q0,d0[0]
+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
+_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
 {
     uint32x4_t b32x4;
     b32x4 = vdupq_n_u32(b);
     return vmulq_u32(a, b32x4);
 }
 
+//********** Vector multiply lane *****************************
+int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
+_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
+{
+    int16x4_t b16x4;
+    int16_t vlane;
+    vlane = vget_lane_s16(b, c);
+    b16x4 = vdup_n_s16(vlane);
+    return vmul_s16(a, b16x4);
+}
+
+int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
+{
+    int32x2_t b32x2;
+    int32_t vlane;
+    vlane = vget_lane_s32(b, c);
+    b32x2 = vdup_n_s32(vlane);
+    return vmul_s32(a, b32x2);
+}
+
+float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
+{
+    float32x2_t b32x2;
+    float32_t vlane;
+    vlane = vget_lane_f32(b, c);
+    b32x2 = vdup_n_f32(vlane);
+    return vmul_f32(a, b32x2);
+}
+
+uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
+#define vmul_lane_u16 vmul_lane_s16
+
+uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
+#define vmul_lane_u32 vmul_lane_s32
+
+int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
+_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
+{
+    int16x8_t b16x8;
+    int16_t vlane;
+    vlane = vget_lane_s16(b, c);
+    b16x8 = vdupq_n_s16(vlane);
+    return vmulq_s16(a, b16x8);
+}
+
+int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
+{
+    int32x4_t b32x4;
+    int32_t vlane;
+    vlane = vget_lane_s32(b, c);
+    b32x4 = vdupq_n_s32(vlane);
+    return vmulq_s32(a, b32x4);
+}
+
+float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
+_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
+{
+    float32x4_t b32x4;
+    float32_t vlane;
+    vlane = vget_lane_f32(b, c);
+    b32x4 = vdupq_n_f32(vlane);
+    return vmulq_f32(a, b32x4);
+}
+
+uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
+#define vmulq_lane_u16 vmulq_lane_s16
+
+uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
+#define vmulq_lane_u32 vmulq_lane_s32
+
 //**** Vector long multiply with scalar ************
+int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
+{
+    int16x4_t b16x4;
+    b16x4 = vdup_n_s16(val2);
+    return vmull_s16(vec1, b16x4);
+}
+
+int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
+{
+    int32x2_t b32x2;
+    b32x2 = vdup_n_s32(val2);
+    return vmull_s32(vec1, b32x2);
+}
+
+uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0]
+_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0]
+{
+    uint16x4_t b16x4;
+    b16x4 = vdup_n_s16(val2);
+    return vmull_s16(vec1, b16x4);
+}
+
+uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
+_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
+{
+    uint32x2_t b32x2;
+    b32x2 = vdup_n_u32(val2);
+    return vmull_u32(vec1, b32x2);
+}
 
 //**** Vector long multiply by scalar ****
+int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
+{
+    int16_t vlane;
+    int16x4_t b;
+    vlane = vget_lane_s16(val2, val3);
+    b = vdup_n_s16(vlane);
+    return vmull_s16(vec1, b);
+}
+
+int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
+{
+    int32_t vlane;
+    int32x2_t b;
+    vlane = vget_lane_s32(val2, val3);
+    b = vdup_n_s32(vlane);
+    return vmull_s32(vec1, b);
+}
+
+uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0]
+_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0]
+{
+    uint16_t vlane;
+    uint16x4_t b;
+    vlane = vget_lane_s16(val2, val3);
+    b = vdup_n_s16(vlane);
+    return vmull_s16(vec1, b);
+}
+
+uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
+_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
+{
+    uint32_t vlane;
+    uint32x2_t b;
+    vlane = vget_lane_u32(val2, val3);
+    b = vdup_n_u32(vlane);
+    return vmull_u32(vec1, b);
+}
 
 //********* Vector saturating doubling long multiply with scalar  *******************
+int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
+{
+    //the serial soulution may be faster due to saturation
+    int16x4_t b;
+    b = vdup_n_s16(val2);
+    return vqdmull_s16(vec1, b);
+}
+
+int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32x2_t b;
+    b = vdup_n_s32(val2);
+    return vqdmull_s32(vec1,b); //slow serial function!!!!
+}
 
 //************* Vector saturating doubling long multiply by scalar ***********************************************
+int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
+{
+    int16_t c;
+    int16x4_t scalar;
+    c = vget_lane_s16(val2, val3);
+    scalar = vdup_n_s16(c);
+    return vqdmull_s16(vec1, scalar);
+}
+
+
+int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULL.S32 q0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32_t c;
+    int32x2_t scalar;
+    c = vget_lane_s32(val2, val3);
+    scalar = vdup_n_s32(c);
+    return vqdmull_s32(vec1,scalar); //slow serial function!!!!
+}
 
 // *****Vector saturating doubling multiply high with scalar *****
+int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2); //  VQDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2)
+{
+    int16x4_t res64;
+    return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
+}
 
-int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2);         //  VQDMULH.S16 q0,q0,d0[0]
-_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2)         //  VQDMULH.S16 q0,q0,d0[0]
-{         //solution may be not optimal
+int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2); //  VQDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2)
+{
+    int32x2_t res64;
+    return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
+}
+
+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); //  VQDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) //  VQDMULH.S16 q0,q0,d0[0]
+{
+    //solution may be not optimal
     int16x8_t scalar;
     scalar = vdupq_n_s16(val2);
     return vqdmulhq_s16(vec1, scalar);
 }
 
-int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2);         //  VQDMULH.S32 q0,q0,d0[0]
+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); //  VQDMULH.S32 q0,q0,d0[0]
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
 {
     int32x4_t scalar;
@@ -7220,57 +13861,185 @@
 }
 
 //***** Vector saturating doubling multiply high by scalar ****************
+int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 d0,d0,d0[0]
+{
+    //solution may be not optimal
+    int16_t vlane;
+    int16x4_t scalar;
+    vlane = vget_lane_s16(val2, val3);
+    scalar = vdup_n_s16(vlane);
+    return vqdmulh_s16(vec1, scalar);
+}
+
+int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    int32_t vlane;
+    int32x2_t scalar;
+    vlane = vget_lane_s32(val2, val3);
+    scalar = vdup_n_s32(vlane);
+    return vqdmulh_s32(vec1, scalar);
+}
+
+int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 q0,q0,d0[0]
+{
+    //solution may be not optimal
+    int16_t vlane;
+    int16x8_t scalar;
+    vlane = vget_lane_s16(val2, val3);
+    scalar = vdupq_n_s16(vlane );
+    return vqdmulhq_s16(vec1, scalar);
+}
+
+int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 q0,q0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //solution may be not optimal
+    int32_t vlane;
+    int32x4_t scalar;
+    vlane = vgetq_lane_s32(_pM128i(val2), val3);
+    scalar = vdupq_n_s32(vlane );
+    return vqdmulhq_s32(vec1, scalar);
+}
 
 //******** Vector saturating rounding doubling multiply high with scalar ***
+int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
+{
+    //solution may be not optimal
+    int16x4_t scalar;
+    scalar = vdup_n_s16(val2);
+    return vqrdmulh_s16(vec1, scalar);
+}
 
-#if defined(USE_SSSE3)
-int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2);         // VQRDMULH.S16 q0,q0,d0[0]
-_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2)         // VQRDMULH.S16 q0,q0,d0[0]
-{         //solution may be not optimal
+int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    int32x2_t scalar;
+    scalar = vdup_n_s32(val2);
+    return vqrdmulh_s32(vec1, scalar);
+}
+
+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
+{
+    //solution may be not optimal
     int16x8_t scalar;
     scalar = vdupq_n_s16(val2);
     return vqrdmulhq_s16(vec1, scalar);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2);         // VQRDMULH.S32 q0,q0,d0[0]
+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
 {
     int32x4_t scalar;
     scalar = vdupq_n_s32(val2);
     return vqrdmulhq_s32(vec1, scalar);
 }
-#endif
 
 //********* Vector rounding saturating doubling multiply high by scalar  ****
+int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
+_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
+{
+    //solution may be not optimal
+    int16_t vlane;
+    int16x4_t scalar;
+    vlane = vget_lane_s16(val2, val3);
+    scalar = vdup_n_s16(vlane);
+    return vqrdmulh_s16(vec1, scalar);
+}
+
+int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    int32_t vlane;
+    int32x2_t scalar;
+    vlane = vget_lane_s32(val2, val3);
+    scalar = vdup_n_s32(vlane);
+    return vqrdmulh_s32(vec1, scalar);
+}
+
+int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
+{
+    //solution may be not optimal
+    int16_t vlane;
+    int16x8_t scalar;
+    vlane = vget_lane_s16(val2, val3);
+    scalar = vdupq_n_s16(vlane);
+    return vqrdmulhq_s16(vec1, scalar);
+}
+
+int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    //solution may be not optimal
+    int32_t vlane;
+    int32x4_t scalar;
+    vlane = vgetq_lane_s32(_pM128i(val2), val3);
+    scalar = vdupq_n_s32(vlane );
+    return vqrdmulhq_s32(vec1, scalar);
+}
 
 //**************Vector multiply accumulate with scalar *******************
+int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
+{
+    int16x4_t scalar;
+    scalar = vdup_n_s16(c);
+    return vmla_s16(a, b, scalar);
+}
 
-int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLA.I16 q0, q0, d0[0]
-_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c)         // VMLA.I16 q0, q0, d0[0]
+int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
+{
+    int32x2_t scalar;
+    scalar = vdup_n_s32(c);
+    return vmla_s32(a, b, scalar);
+}
+
+uint16x4_t vmla_n_u16(uint16x4_t a,  uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
+#define vmla_n_u16 vmla_n_s16
+
+
+uint32x2_t vmla_n_u32(uint32x2_t a,  uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
+#define vmla_n_u32 vmla_n_s32
+
+
+float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
+{
+    float32x2_t scalar;
+    scalar = vdup_n_f32(c);
+    return vmla_f32(a, b, scalar);
+}
+
+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
 {
     int16x8_t scalar;
     scalar = vdupq_n_s16(c);
     return vmlaq_s16(a,b,scalar);
 }
 
-int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLA.I32 q0, q0, d0[0]
-_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c)         // VMLA.I32 q0, q0, d0[0]
+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
 {
     int32x4_t scalar;
     scalar = vdupq_n_s32(c);
     return vmlaq_s32(a,b,scalar);
 }
 
-uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLA.I16 q0, q0, d0[0]
+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
 #define vmlaq_n_u16 vmlaq_n_s16
 
-uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLA.I32 q0, q0, d0[0]
+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
 #define vmlaq_n_u32 vmlaq_n_s32
 
-float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLA.F32 q0, q0, d0[0]
-_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c)         // VMLA.F32 q0, q0, d0[0]
+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
 {
     float32x4_t scalar;
     scalar = vdupq_n_f32(c);
@@ -7278,44 +14047,131 @@
 }
 
 //************Vector widening multiply accumulate with scalar****************************
+int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
+{
+    int16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vmlal_s16(a, b, vc);
+}
+
+int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
+{
+    int32x2_t vc;
+    vc = vdup_n_s32(c);
+    return vmlal_s32(a, b, vc);
+}
+
+uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0]
+{
+    uint16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vmlal_s16(a, b, vc);
+}
+
+uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
+{
+    uint32x2_t vc;
+    vc = vdup_n_u32(c);
+    return vmlal_u32(a, b, vc);
+}
 
 //************ Vector widening saturating doubling multiply accumulate with scalar **************
+int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
+{
+    //not optimal SIMD soulution, serial may be faster
+    int16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vqdmlal_s16(a, b, vc);
+}
+
+int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32x2_t vc;
+    vc = vdup_n_s32(c);
+    return vqdmlal_s32(a, b, vc);
+}
 
 //******** Vector multiply subtract with scalar **************
+int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
+{
+    int16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vmls_s16(a, b, vc);
+}
 
-int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLS.I16 q0, q0, d0[0]
-_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c)         // VMLS.I16 q0, q0, d0[0]
+int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
+{
+    int32x2_t vc;
+    vc = vdup_n_s32(c);
+    return vmls_s32(a, b, vc);
+}
+
+uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
+_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
+{
+    uint16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vmls_s16(a, b, vc);
+}
+
+uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
+_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
+{
+    uint32x2_t vc;
+    vc = vdup_n_u32(c);
+    return vmls_u32(a, b, vc);
+}
+
+float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
+_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
+{
+    float32x2_t res;
+    res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
+    res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
+    return res;
+}
+
+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
 {
     int16x8_t vc;
     vc = vdupq_n_s16(c);
     return vmlsq_s16(a, b,vc);
 }
 
-int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLS.I32 q0, q0, d0[0]
-_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c)         // VMLS.I32 q0, q0, d0[0]
+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
 {
     int32x4_t vc;
     vc = vdupq_n_s32(c);
     return vmlsq_s32(a,b,vc);
 }
 
-uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLS.I16 q0, q0, d0[0]
-_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c)         // VMLS.I16 q0, q0, d0[0]
+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
 {
     uint32x4_t vc;
     vc = vdupq_n_u32(c);
     return vmlsq_u32(a,b,vc);
 }
 
-uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLS.I32 q0, q0, d0[0]
-_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c)         // VMLS.I32 q0, q0, d0[0]
+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
 {
     uint32x4_t vc;
     vc = vdupq_n_u32(c);
     return vmlsq_u32(a,b,vc);
 }
 
-float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLS.F32 q0, q0, d0[0]
+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
 _NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
 {
     float32x4_t vc;
@@ -7324,156 +14180,353 @@
 }
 
 //**** Vector widening multiply subtract with scalar ******
+int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
+{
+    int16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vmlsl_s16(a, b, vc);
+}
+
+int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
+{
+    int32x2_t vc;
+    vc = vdup_n_s32(c);
+    return vmlsl_s32(a, b, vc);
+}
+
+uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0]
+{
+    uint16x4_t vc;
+    vc = vdup_n_u16(c);
+    return vmlsl_u16(a, b, vc);
+}
+
+uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
+_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
+{
+    uint32x2_t vc;
+    vc = vdup_n_u32(c);
+    return vmlsl_u32(a, b, vc);
+}
 
 //***** Vector widening saturating doubling multiply subtract with scalar *********
 //**********************************************************************************
+int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
+_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
+{
+    int16x4_t vc;
+    vc = vdup_n_s16(c);
+    return vqdmlsl_s16(a, b, vc);
+}
+
+int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32x2_t vc;
+    vc = vdup_n_s32(c);
+    return vqdmlsl_s32(a, b, vc);
+}
 
 //*******************  Vector extract ***********************************************
 //*************************************************************************************
 //VEXT (Vector Extract) extracts  elements from the bottom end of the second operand
 //vector and the top end of the first, concatenates them, and places the result in the destination vector
 //c elements from the bottom end of the second operand and (8-c) from the top end of the first
+int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int8x8_t res;
+    int i;
+    for (i = 0; i<8 - c; i++) {
+        res.m64_i8[i] = a.m64_i8[i + c];
+    }
+    for(i = 0; i<c; i++) {
+        res.m64_i8[8 - c + i] = b.m64_i8[i];
+    }
+    return res;
+}
 
-#if defined(USE_SSSE3)
+uint8x8_t vext_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+#define vext_u8 vext_s8
 //same result tested
 
-#endif
+poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
+#define vext_p8 vext_u8
 
-#if defined(USE_SSSE3)
-int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
+int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+_NEON2SSE_INLINE int16x4_t  _NEON2SSE_PERFORMANCE_WARNING (vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int16x4_t res;
+    int i;
+    for (i = 0; i<4 - c; i++) {
+        res.m64_i16[i] = a.m64_i16[i + c];
+    }
+    for(i = 0; i<c; i++) {
+        res.m64_i16[4 - c + i] = b.m64_i16[i];
+    }
+    return res;
+}
+
+uint16x4_t vext_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+#define vext_u16 vext_s16
+
+poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
+#define vext_p16 vext_s16
+
+int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    int32x2_t res;
+    if (c==0) {
+        res.m64_i32[0] = a.m64_i32[0];
+        res.m64_i32[1] = a.m64_i32[1];
+    } else {
+        res.m64_i32[0] = a.m64_i32[1];
+        res.m64_i32[1] = b.m64_i32[0];
+    }
+    return res;
+}
+
+float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    float32x2_t res;
+    if (c==0) {
+        res.m64_f32[0] = a.m64_f32[0];
+        res.m64_f32[1] = a.m64_f32[1];
+    } else {
+        res.m64_f32[0] = a.m64_f32[1];
+        res.m64_f32[1] = b.m64_f32[0];
+    }
+    return res;
+}
+
+uint32x2_t vext_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
+#define vext_u32 vext_s32
+
+
+int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+#define vext_s64(a,b,c) a
+
+uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
+#define vext_u64(a,b,c) a
+
+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
 #define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
 
-uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
 #define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
 
-poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
 #define vextq_p8 vextq_s8
 
-int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
 #define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
 
-uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
 #define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
 
-poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
 #define vextq_p16 vextq_s16
-#endif
 
-#if defined(USE_SSSE3)
-int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
 #define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
 
-uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
 #define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
 
-int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
+float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) float c); // VEXT.32 q0,q0,q0,#0
+#define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
+
+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
 #define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
 
-uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
 #define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
-#endif
 
 //************ Reverse vector elements (swap endianness)*****************
 //*************************************************************************
 //VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
+int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
+_NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vrev64q_s8(_pM128i(vec));
+    return64(res);
+}
 
-#if defined(USE_SSSE3)
-int8x16_t vrev64q_s8(int8x16_t vec);         // VREV64.8 q0,q0
-_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec)         // VREV64.8 q0,q0
+int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
+_NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vrev64q_s16(_pM128i(vec));
+    return64(res);
+}
+
+int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
+{
+    int32x2_t res;
+    res.m64_i32[0] = vec.m64_i32[1];
+    res.m64_i32[1] = vec.m64_i32[0];
+    return res;
+}
+
+uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
+#define vrev64_u8 vrev64_s8
+
+uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
+#define vrev64_u16 vrev64_s16
+
+uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
+#define vrev64_u32 vrev64_s32
+
+poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
+#define vrev64_p8 vrev64_u8
+
+poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
+#define vrev64_p16 vrev64_u16
+
+float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
+_NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
+{
+    float32x2_t res;
+    res.m64_f32[0] = vec.m64_f32[1];
+    res.m64_f32[1] = vec.m64_f32[0];
+    return res;
+}
+
+int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
 {
     _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int16x8_t vrev64q_s16(int16x8_t vec);         // VREV64.16 q0,q0
-_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec)         // VREV64.16 q0,q0
-{         //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask
+int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
+_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
+{
+    //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask
     _NEON2SSE_ALIGN_16 int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
     return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
 }
-#endif
 
-int32x4_t vrev64q_s32(int32x4_t vec);         // VREV64.32 q0,q0
-_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec)         // VREV64.32 q0,q0
+int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
+_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
 {
     return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
 }
 
-#if defined(USE_SSSE3)
-uint8x16_t vrev64q_u8(uint8x16_t vec);         // VREV64.8 q0,q0
+uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
 #define vrev64q_u8 vrev64q_s8
 
-uint16x8_t vrev64q_u16(uint16x8_t vec);         // VREV64.16 q0,q0
+uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
 #define vrev64q_u16 vrev64q_s16
-#endif
 
-uint32x4_t vrev64q_u32(uint32x4_t vec);         // VREV64.32 q0,q0
+uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
 #define vrev64q_u32 vrev64q_s32
 
-#if defined(USE_SSSE3)
-poly8x16_t vrev64q_p8(poly8x16_t vec);         // VREV64.8 q0,q0
+poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
 #define vrev64q_p8 vrev64q_u8
 
-poly16x8_t vrev64q_p16(poly16x8_t vec);         // VREV64.16 q0,q0
-#define vrev64q_p16 vrev64q_s16
-#endif
+poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
+#define vrev64q_p16 vrev64q_u16
 
-float32x4_t vrev64q_f32(float32x4_t vec);         // VREV64.32 q0,q0
+float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
 #define vrev64q_f32(vec) _mm_shuffle_ps (vec,  vec, _MM_SHUFFLE(2,3, 0,1))
 
 //********************  32 bit shuffles **********************
 //************************************************************
+int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
+_NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vrev32q_s8(_pM128i(vec));
+    return64(res);
+}
 
-#if defined(USE_SSSE3)
-int8x16_t vrev32q_s8(int8x16_t vec);         // VREV32.8 q0,q0
-_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec)         // VREV32.8 q0,q0
+int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
+_NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vrev32q_s16(_pM128i(vec));
+    return64(res);
+}
+
+uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
+#define vrev32_u8 vrev32_s8
+
+uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
+#define vrev32_u16 vrev32_s16
+
+poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
+#define vrev32_p8 vrev32_u8
+
+poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
+#define vrev32_p16 vrev32_u16
+
+int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
 {
     _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int16x8_t vrev32q_s16(int16x8_t vec);         // VREV32.16 q0,q0
-_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec)         // VREV32.16 q0,q0
+int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
+_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
 {
     _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
 }
-#endif
 
-#if defined(USE_SSSE3)
-uint8x16_t vrev32q_u8(uint8x16_t vec);         // VREV32.8 q0,q0
+uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
 #define vrev32q_u8 vrev32q_s8
 
-uint16x8_t vrev32q_u16(uint16x8_t vec);         // VREV32.16 q0,q0
+uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
 #define vrev32q_u16 vrev32q_s16
 
-poly8x16_t vrev32q_p8(poly8x16_t vec);         // VREV32.8 q0,q0
+poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
 #define vrev32q_p8 vrev32q_u8
-#endif
+
+poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
+#define vrev32q_p16 vrev32q_u16
 
 //*************  16 bit shuffles **********************
 //******************************************************
+int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
+_NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vrev16q_s8(_pM128i(vec));
+    return64(res);
+}
 
-#if defined(USE_SSSE3)
-int8x16_t vrev16q_s8(int8x16_t vec);         // VREV16.8 q0,q0
-_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec)         // VREV16.8 q0,q0
+uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
+#define vrev16_u8 vrev16_s8
+
+poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
+#define vrev16_p8 vrev16_u8
+
+int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
 {
     _NEON2SSE_ALIGN_16 int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev8);
 }
-#endif
 
-#if defined(USE_SSSE3)
-uint8x16_t vrev16q_u8(uint8x16_t vec);         // VREV16.8 q0,q0
+uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
 #define vrev16q_u8 vrev16q_s8
 
-poly8x16_t vrev16q_p8(poly8x16_t vec);         // VREV16.8 q0,q0
+poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
 #define vrev16q_p8 vrev16q_u8
-#endif
 
 //*********************************************************************
 //**************** Other single operand arithmetic *******************
@@ -7481,18 +14534,56 @@
 
 //*********** Absolute: Vd[i] = |Va[i]| **********************************
 //************************************************************************
+int8x8_t   vabs_s8(int8x8_t a); // VABS.S8 d0,d0
+_NEON2SSE_INLINE int8x8_t   vabs_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = _mm_abs_epi8(_pM128i(a));
+    return64(res);
+}
 
-int8x16_t   vabsq_s8(int8x16_t a);         // VABS.S8 q0,q0
+
+int16x4_t   vabs_s16(int16x4_t a); // VABS.S16 d0,d0
+_NEON2SSE_INLINE int16x4_t   vabs_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = _mm_abs_epi16(_pM128i(a));
+    return64(res);
+}
+
+int32x2_t   vabs_s32(int32x2_t a); // VABS.S32 d0,d0
+_NEON2SSE_INLINE int32x2_t   vabs_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = _mm_abs_epi32(_pM128i(a));
+    return64(res);
+}
+
+float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
+_NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
+{
+    float32x4_t res;
+    __m64_128 res64;
+    _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+    res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
+    _M64f(res64, res);
+    return res64;
+}
+
+int8x16_t   vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
 #define vabsq_s8 _mm_abs_epi8
 
-int16x8_t   vabsq_s16(int16x8_t a);         // VABS.S16 q0,q0
+int16x8_t   vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
 #define vabsq_s16 _mm_abs_epi16
 
-int32x4_t   vabsq_s32(int32x4_t a);         // VABS.S32 q0,q0
+int32x4_t   vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
 #define vabsq_s32 _mm_abs_epi32
 
-float32x4_t vabsq_f32(float32x4_t a);         // VABS.F32 q0,q0
-_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a)         // VABS.F32 q0,q0
+float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
+_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
 {
     _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
     return _mm_and_ps (a, *(__m128*)c7fffffff);
@@ -7501,74 +14592,131 @@
 //****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
 //**********************************************************************
 //For signed-integer data types, the absolute value of the most negative value is not representable by the data type, saturation takes place
+int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
+_NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vqabsq_s8(_pM128i(a));
+    return64(res);
+}
 
-#if defined(USE_SSSE3)
-int8x16_t vqabsq_s8(int8x16_t a);         // VQABS.S8 q0,q0
-_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a)         // VQABS.S8 q0,q0
+int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
+_NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vqabsq_s16(_pM128i(a));
+    return64(res);
+}
+
+int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
+_NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vqabsq_s32(_pM128i(a));
+    return64(res);
+}
+
+int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
+_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
 {
     __m128i c_128, abs, abs_cmp;
-    c_128 = _mm_set1_epi8 (0x80);         //-128
+    c_128 = _mm_set1_epi8 (0x80); //-128
     abs = _mm_abs_epi8 (a);
     abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
     return _mm_xor_si128 (abs,  abs_cmp);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int16x8_t vqabsq_s16(int16x8_t a);         // VQABS.S16 q0,q0
-_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a)         // VQABS.S16 q0,q0
+int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
+_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
 {
     __m128i c_32768, abs, abs_cmp;
-    c_32768 = _mm_set1_epi16 (0x8000);         //-32768
+    c_32768 = _mm_set1_epi16 (0x8000); //-32768
     abs = _mm_abs_epi16 (a);
     abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
     return _mm_xor_si128 (abs,  abs_cmp);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int32x4_t vqabsq_s32(int32x4_t a);         // VQABS.S32 q0,q0
-_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a)         // VQABS.S32 q0,q0
+int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
+_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
 {
     __m128i c80000000, abs, abs_cmp;
-    c80000000 = _mm_set1_epi32 (0x80000000);         //most negative value
+    c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
     abs = _mm_abs_epi32 (a);
     abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
     return _mm_xor_si128 (abs,  abs_cmp);
 }
-#endif
 
 //*************** Negate: Vd[i] = - Va[i] *************************************
 //*****************************************************************************
 //several Negate implementations possible for SIMD.
 //e.//function _mm_sign function(a, negative numbers vector), but the following one gives good performance:
+int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0
+_NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vnegq_s8(_pM128i(a));
+    return64(res);
+}
 
-int8x16_t vnegq_s8(int8x16_t a);         // VNE//q0,q0
-_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a)         // VNE//q0,q0
+int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0
+_NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vnegq_s16(_pM128i(a));
+    return64(res);
+}
+
+int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0
+_NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vnegq_s32(_pM128i(a));
+    return64(res);
+}
+
+float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0
+_NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNE//d0,d0
+{
+    float32x4_t res;
+    __m64_128 res64;
+    _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
+    _M64f(res64, res);
+    return res64;
+}
+
+int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0
+_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNE//q0,q0
 {
     __m128i zero;
     zero = _mm_setzero_si128 ();
     return _mm_sub_epi8 (zero, a);
-}         //or _mm_sign_epi8 (a, negative numbers vector)
+} //or _mm_sign_epi8 (a, negative numbers vector)
 
-int16x8_t vnegq_s16(int16x8_t a);         // VNE//q0,q0
-_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a)         // VNE//q0,q0
+int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0
+_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNE//q0,q0
 {
     __m128i zero;
     zero = _mm_setzero_si128 ();
     return _mm_sub_epi16 (zero, a);
-}         //or _mm_sign_epi16 (a, negative numbers vector)
+} //or _mm_sign_epi16 (a, negative numbers vector)
 
-int32x4_t vnegq_s32(int32x4_t a);         // VNE//q0,q0
-_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a)         // VNE//q0,q0
+int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0
+_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNE//q0,q0
 {
     __m128i zero;
     zero = _mm_setzero_si128 ();
     return _mm_sub_epi32 (zero, a);
-}         //or _mm_sign_epi32 (a, negative numbers vector)
+} //or _mm_sign_epi32 (a, negative numbers vector)
 
-float32x4_t vnegq_f32(float32x4_t a);         // VNE//q0,q0
-_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a)         // VNE//q0,q0
+float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0
+_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNE//q0,q0
 {
     _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
     return _mm_xor_ps (a, *(__m128*) c80000000);
@@ -7577,30 +14725,57 @@
 //************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
 //***************************************************************************************
 //For signed-integer data types, the negation of the most negative value can't be produced without saturation, while with saturation it is max positive
+int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0
+_NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vqnegq_s8(_pM128i(a));
+    return64(res);
+}
 
-int8x16_t vqnegq_s8(int8x16_t a);         // VQNE//q0,q0
-_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a)         // VQNE//q0,q0
+int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0
+_NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vqnegq_s16(_pM128i(a));
+    return64(res);
+}
+
+int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0
+_NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vqnegq_s32(_pM128i(a));
+    return64(res);
+}
+
+int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0
+_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNE//q0,q0
 {
     __m128i zero;
     zero = _mm_setzero_si128 ();
-    return _mm_subs_epi8 (zero, a);         //saturating substraction
+    return _mm_subs_epi8 (zero, a); //saturating substraction
 }
 
-int16x8_t vqnegq_s16(int16x8_t a);         // VQNE//q0,q0
-_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a)         // VQNE//q0,q0
+int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0
+_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNE//q0,q0
 {
     __m128i zero;
     zero = _mm_setzero_si128 ();
-    return _mm_subs_epi16 (zero, a);         //saturating substraction
+    return _mm_subs_epi16 (zero, a); //saturating substraction
 }
 
-int32x4_t vqnegq_s32(int32x4_t a);         // VQNE//q0,q0
-_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a)         // VQNE//q0,q0
-{         //solution may be not optimal compared with a serial
+int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0
+_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNE//q0,q0
+{
+    //solution may be not optimal compared with a serial
     __m128i c80000000, zero, sub, cmp;
-    c80000000 = _mm_set1_epi32 (0x80000000);         //most negative value
+    c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
     zero = _mm_setzero_si128 ();
-    sub =  _mm_sub_epi32 (zero, a);         //substraction
+    sub =  _mm_sub_epi32 (zero, a); //substraction
     cmp = _mm_cmpeq_epi32 (a, c80000000);
     return _mm_xor_si128 (sub,  cmp);
 }
@@ -7608,47 +14783,79 @@
 //****************** Count leading zeros ********************************
 //**************************************************************************
 //no corresponding vector intrinsics in IA32, need to implement it.  While the implementation is effective for 8 bits, it may be not for 16 and 32 bits
+int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
+_NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vclzq_s8(_pM128i(a));
+    return64(res);
+}
 
-#if defined(USE_SSSE3)
-int8x16_t vclzq_s8(int8x16_t a);         // VCLZ.I8 q0,q0
+int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
+_NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vclzq_s16(_pM128i(a));
+    return64(res);
+}
+
+int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
+_NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vclzq_s32(_pM128i(a));
+    return64(res);
+}
+
+
+uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
+#define vclz_u8 vclz_s8
+
+uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
+#define vclz_u16 vclz_s16
+
+uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
+#define vclz_u32 vclz_s32
+
+int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
 _NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
 {
     _NEON2SSE_ALIGN_16 int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
-                                       /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
-                                       /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
-                                       /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0};
+                                                    /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
+                                                    /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
+                                                    /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0                          };
     __m128i maskLOW, c4, lowclz, mask, hiclz;
-    maskLOW = _mm_set1_epi8(0x0f);         //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
+    maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
     c4 = _mm_set1_epi8(4);
-    lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a);         //uses low 4 bits anyway
-    mask =  _mm_srli_epi16(a, 4);         //get high 4 bits as low bits
-    mask = _mm_and_si128(mask, maskLOW);         //low 4 bits, need masking to avoid zero if MSB is set
-    hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask);         //uses low 4 bits anyway
-    mask = _mm_cmpeq_epi8(hiclz, c4);         // shows the need to add lowclz zeros
+    lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
+    mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
+    mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
+    hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
+    mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
     lowclz = _mm_and_si128(lowclz,mask);
     return _mm_add_epi8(lowclz, hiclz);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int16x8_t vclzq_s16(int16x8_t a);         // VCLZ.I16 q0,q0
+int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
 _NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
 {
     __m128i c7, res8x16, res8x16_swap;
     _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
     _NEON2SSE_ALIGN_16 uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
-    c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5);         //7
+    c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
     res8x16 = vclzq_s8(a);
-    res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab);         //horisontal pairs swap
-    res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit);         //lowclz
-    res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit);         //hiclz
-    c7 = _mm_cmpgt_epi16(res8x16_swap, c7);         // shows the need to add lowclz zeros
-    res8x16 = _mm_and_si128(res8x16, c7);         //lowclz
+    res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horisontal pairs swap
+    res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
+    res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
+    c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
+    res8x16 = _mm_and_si128(res8x16, c7); //lowclz
     return _mm_add_epi16(res8x16_swap, res8x16);
 }
-#endif
 
-int32x4_t vclzq_s32(int32x4_t a);         // VCLZ.I32 q0,q0
+int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
 _NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
 {
     __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
@@ -7658,49 +14865,47 @@
     c3f = _mm_set1_epi32(0x3f);
     c32 = _mm_set1_epi32(32);
     tmp = _mm_srli_epi32(a, 1);
-    res = _mm_or_si128(tmp, a);         //atmp[i] |= (atmp[i] >> 1);
+    res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
     tmp = _mm_srli_epi32(res, 2);
-    res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 2);
+    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
     tmp = _mm_srli_epi32(res, 4);
-    res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 4);
+    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
     tmp = _mm_srli_epi32(res, 8);
-    res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 8);
+    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
     tmp = _mm_srli_epi32(res, 16);
-    res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 16);
+    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
 
     tmp = _mm_srli_epi32(res, 1);
     tmp = _mm_and_si128(tmp, c55555555);
-    res = _mm_sub_epi32(res, tmp);         //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
+    res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
 
     tmp = _mm_srli_epi32(res, 2);
     tmp = _mm_and_si128(tmp, c33333333);
     tmp1 = _mm_and_si128(res, c33333333);
-    res = _mm_add_epi32(tmp, tmp1);         //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
+    res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
 
     tmp = _mm_srli_epi32(res, 4);
     tmp = _mm_add_epi32(tmp, res);
-    res = _mm_and_si128(tmp, c0f0f0f0f);         //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
+    res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
 
     tmp = _mm_srli_epi32(res, 8);
-    res = _mm_add_epi32(tmp, res);         //atmp[i] += (atmp[i] >> 8);
+    res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
 
     tmp = _mm_srli_epi32(res, 16);
-    res = _mm_add_epi32(tmp, res);         //atmp[i] += (atmp[i] >> 16);
+    res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
 
-    res = _mm_and_si128(res, c3f);         //atmp[i] = atmp[i] & 0x0000003f;
+    res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
 
-    return _mm_sub_epi32(c32, res);         //res[i] = 32 - atmp[i];
+    return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
 }
 
-#if defined(USE_SSSE3)
-uint8x16_t vclzq_u8(uint8x16_t a);         // VCLZ.I8 q0,q0
+uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
 #define vclzq_u8 vclzq_s8
 
-uint16x8_t vclzq_u16(uint16x8_t a);         // VCLZ.I16 q0,q0
+uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
 #define vclzq_u16 vclzq_s16
-#endif
 
-uint32x4_t vclzq_u32(uint32x4_t a);         // VCLZ.I32 q0,q0
+uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
 #define vclzq_u32 vclzq_s32
 
 //************** Count leading sign bits **************************
@@ -7709,17 +14914,42 @@
 // the topmost bit, that are the same as the topmost bit, in each element in a vector
 //No corresponding vector intrinsics in IA32, need to implement it.
 //While the implementation is effective for 8 bits, it may be not for 16 and 32 bits
+int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
+_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vclsq_s8(_pM128i(a));
+    return64(res);
+}
 
-#if defined(USE_SSSE3)
-int8x16_t vclsq_s8(int8x16_t a);         // VCLS.S8 q0,q0
+int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
+_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vclsq_s16(_pM128i(a));
+    return64(res);
+}
+
+int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
+_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vclsq_s32(_pM128i(a));
+    return64(res);
+}
+
+int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
 _NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
 {
     __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
-    cff = _mm_cmpeq_epi8 (a,a);         //0xff
+    cff = _mm_cmpeq_epi8 (a,a); //0xff
     c80 = _mm_set1_epi8(0x80);
     c1 = _mm_set1_epi8(1);
     a_mask = _mm_and_si128(a, c80);
-    a_mask = _mm_cmpeq_epi8(a_mask, c80);         //0xff if negative input and 0 if positive
+    a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
     a_neg = _mm_xor_si128(a, cff);
     a_neg = _mm_and_si128(a_mask, a_neg);
     a_pos = _mm_andnot_si128(a_mask, a);
@@ -7727,18 +14957,16 @@
     a_comb = vclzq_s8(a_comb);
     return _mm_sub_epi8(a_comb, c1);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int16x8_t vclsq_s16(int16x8_t a);         // VCLS.S16 q0,q0
+int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
 _NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
 {
     __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
     cffff = _mm_cmpeq_epi16(a,a);
-    c8000 =  _mm_slli_epi16(cffff, 15);         //0x8000
-    c1 = _mm_srli_epi16(cffff,15);         //0x1
+    c8000 =  _mm_slli_epi16(cffff, 15); //0x8000
+    c1 = _mm_srli_epi16(cffff,15); //0x1
     a_mask = _mm_and_si128(a, c8000);
-    a_mask = _mm_cmpeq_epi16(a_mask, c8000);         //0xffff if negative input and 0 if positive
+    a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
     a_neg = _mm_xor_si128(a, cffff);
     a_neg = _mm_and_si128(a_mask, a_neg);
     a_pos = _mm_andnot_si128(a_mask, a);
@@ -7746,17 +14974,16 @@
     a_comb = vclzq_s16(a_comb);
     return _mm_sub_epi16(a_comb, c1);
 }
-#endif
 
-int32x4_t vclsq_s32(int32x4_t a);         // VCLS.S32 q0,q0
+int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
 _NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
 {
     __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
     cffffffff = _mm_cmpeq_epi32(a,a);
-    c80000000 =  _mm_slli_epi32(cffffffff, 31);         //0x80000000
-    c1 = _mm_srli_epi32(cffffffff,31);         //0x1
+    c80000000 =  _mm_slli_epi32(cffffffff, 31); //0x80000000
+    c1 = _mm_srli_epi32(cffffffff,31); //0x1
     a_mask = _mm_and_si128(a, c80000000);
-    a_mask = _mm_cmpeq_epi32(a_mask, c80000000);         //0xffffffff if negative input and 0 if positive
+    a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
     a_neg = _mm_xor_si128(a, cffffffff);
     a_neg = _mm_and_si128(a_mask, a_neg);
     a_pos = _mm_andnot_si128(a_mask, a);
@@ -7770,261 +14997,488 @@
 //No corresponding SIMD solution. One option is to get a elements, convert it to 32 bits and then use SSE4.2  _mm_popcnt__u32 (unsigned int v) for each element
 //another option is to do the following algorithm:
 
-#if defined(USE_SSSE3)
-uint8x16_t vcntq_u8(uint8x16_t a);         // VCNT.8 q0,q0
+uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
+_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
+{
+    uint8x8_t res64;
+    __m128i res;
+    res = vcntq_u8(_pM128i(a));
+    return64(res);
+}
+
+int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
+#define vcnt_s8 vcnt_u8
+
+poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
+#define vcnt_p8 vcnt_u8
+
+uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
 _NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
 {
     _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
-                                            /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
-                                            /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
-                                            /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4};
+                                                        /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
+                                                        /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
+                                                        /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4                                   };
     __m128i maskLOW, mask, lowpopcnt, hipopcnt;
-    maskLOW = _mm_set1_epi8(0x0f);         //low 4 bits, need masking to avoid zero if MSB is set
+    maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
     mask = _mm_and_si128(a, maskLOW);
-    lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask);         //uses low 4 bits anyway
-    mask =  _mm_srli_epi16(a, 4);         //get high 4 bits as low bits
-    mask = _mm_and_si128(mask, maskLOW);         //low 4 bits, need masking to avoid zero if MSB is set
-    hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask);         //uses low 4 bits anyway
+    lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
+    mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
+    mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
+    hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
     return _mm_add_epi8(lowpopcnt, hipopcnt);
 }
-#endif
 
-#if defined(USE_SSSE3)
-int8x16_t vcntq_s8(int8x16_t a);         // VCNT.8 q0,q0
+int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
 #define vcntq_s8 vcntq_u8
 
-poly8x16_t vcntq_p8(poly8x16_t a);         // VCNT.8 q0,q0
+poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
 #define vcntq_p8 vcntq_u8
-#endif
 
 //**************************************************************************************
 //*********************** Logical operations ****************************************
 //**************************************************************************************
 //************************** Bitwise not ***********************************
 //several Bitwise not implementations possible for SIMD. Eg "xor" with all ones, but the following one gives good performance
+int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
+_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vmvnq_s8(_pM128i(a));
+    return64(res);
+}
 
-int8x16_t vmvnq_s8(int8x16_t a);         // VMVN q0,q0
-_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a)         // VMVN q0,q0
+int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
+_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
+{
+    int16x4_t res64;
+    __m128i res;
+    res = vmvnq_s16(_pM128i(a));
+    return64(res);
+}
+
+int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
+_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
+{
+    int32x2_t res64;
+    __m128i res;
+    res = vmvnq_s32(_pM128i(a));
+    return64(res);
+}
+
+uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
+#define vmvn_u8 vmvn_s8
+
+uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
+#define vmvn_u16 vmvn_s16
+
+uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
+#define vmvn_u32 vmvn_s32
+
+poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
+#define vmvn_p8 vmvn_u8
+
+int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
+_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
 {
     __m128i c1;
-    c1 = _mm_cmpeq_epi8 (a,a);         //0xff
+    c1 = _mm_cmpeq_epi8 (a,a); //0xff
     return _mm_andnot_si128 (a, c1);
 }
 
-int16x8_t vmvnq_s16(int16x8_t a);         // VMVN q0,q0
-_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a)         // VMVN q0,q0
+int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
+_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
 {
     __m128i c1;
-    c1 = _mm_cmpeq_epi16 (a,a);         //0xffff
+    c1 = _mm_cmpeq_epi16 (a,a); //0xffff
     return _mm_andnot_si128 (a, c1);
 }
 
-int32x4_t vmvnq_s32(int32x4_t a);         // VMVN q0,q0
-_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a)         // VMVN q0,q0
+int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
+_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
 {
     __m128i c1;
-    c1 = _mm_cmpeq_epi32 (a,a);         //0xffffffff
+    c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
     return _mm_andnot_si128 (a, c1);
 }
 
-uint8x16_t vmvnq_u8(uint8x16_t a);         // VMVN q0,q0
+uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
 #define vmvnq_u8 vmvnq_s8
 
-uint16x8_t vmvnq_u16(uint16x8_t a);         // VMVN q0,q0
+uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
 #define vmvnq_u16 vmvnq_s16
 
-uint32x4_t vmvnq_u32(uint32x4_t a);         // VMVN q0,q0
+uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
 #define vmvnq_u32 vmvnq_s32
 
-poly8x16_t vmvnq_p8(poly8x16_t a);         // VMVN q0,q0
+poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
 #define vmvnq_p8 vmvnq_u8
 
 //****************** Bitwise and ***********************
 //******************************************************
+int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
+}
 
-int8x16_t   vandq_s8(int8x16_t a, int8x16_t b);         // VAND q0,q0,q0
+int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
+}
+
+int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
+}
+
+
+int64x1_t vand_s64(int64x1_t a,  int64x1_t b); // VAND d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
+    return res;
+}
+
+uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
+#define vand_u8 vand_s8
+
+uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
+#define vand_u16 vand_s16
+
+uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
+#define vand_u32 vand_s32
+
+uint64x1_t vand_u64(uint64x1_t a,  uint64x1_t b); // VAND d0,d0,d0
+#define vand_u64 vand_s64
+
+
+int8x16_t   vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
 #define vandq_s8 _mm_and_si128
 
-int16x8_t   vandq_s16(int16x8_t a, int16x8_t b);         // VAND q0,q0,q0
+int16x8_t   vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
 #define vandq_s16 _mm_and_si128
 
-int32x4_t   vandq_s32(int32x4_t a, int32x4_t b);         // VAND q0,q0,q0
+int32x4_t   vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
 #define vandq_s32 _mm_and_si128
 
-int64x2_t   vandq_s64(int64x2_t a, int64x2_t b);         // VAND q0,q0,q0
+int64x2_t   vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
 #define vandq_s64 _mm_and_si128
 
-uint8x16_t   vandq_u8(uint8x16_t a, uint8x16_t b);         // VAND q0,q0,q0
+uint8x16_t   vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
 #define vandq_u8 _mm_and_si128
 
-uint16x8_t   vandq_u16(uint16x8_t a, uint16x8_t b);         // VAND q0,q0,q0
+uint16x8_t   vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
 #define vandq_u16 _mm_and_si128
 
-uint32x4_t   vandq_u32(uint32x4_t a, uint32x4_t b);         // VAND q0,q0,q0
+uint32x4_t   vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
 #define vandq_u32 _mm_and_si128
 
-uint64x2_t   vandq_u64(uint64x2_t a, uint64x2_t b);         // VAND q0,q0,q0
+uint64x2_t   vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
 #define vandq_u64 _mm_and_si128
 
 //******************** Bitwise or *********************************
 //******************************************************************
+int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
+}
 
-int8x16_t   vorrq_s8(int8x16_t a, int8x16_t b);         // VORR q0,q0,q0
+
+int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
+{
+    int16x4_t res64;
+    return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
+}
+
+
+int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2_t res64;
+    return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
+}
+
+
+int64x1_t vorr_s64(int64x1_t a,  int64x1_t b); // VORR d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
+    return res;
+}
+
+uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
+#define vorr_u8 vorr_s8
+
+uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
+#define vorr_u16 vorr_s16
+
+uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
+#define vorr_u32 vorr_s32
+
+uint64x1_t vorr_u64(uint64x1_t a,  uint64x1_t b); // VORR d0,d0,d0
+#define vorr_u64 vorr_s64
+
+int8x16_t   vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
 #define vorrq_s8 _mm_or_si128
 
-int16x8_t   vorrq_s16(int16x8_t a, int16x8_t b);         // VORR q0,q0,q0
+int16x8_t   vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
 #define vorrq_s16 _mm_or_si128
 
-int32x4_t   vorrq_s32(int32x4_t a, int32x4_t b);         // VORR q0,q0,q0
+int32x4_t   vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
 #define vorrq_s32 _mm_or_si128
 
-int64x2_t   vorrq_s64(int64x2_t a, int64x2_t b);         // VORR q0,q0,q0
+int64x2_t   vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
 #define vorrq_s64 _mm_or_si128
 
-uint8x16_t   vorrq_u8(uint8x16_t a, uint8x16_t b);         // VORR q0,q0,q0
+uint8x16_t   vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
 #define vorrq_u8 _mm_or_si128
 
-uint16x8_t   vorrq_u16(uint16x8_t a, uint16x8_t b);         // VORR q0,q0,q0
+uint16x8_t   vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
 #define vorrq_u16 _mm_or_si128
 
-uint32x4_t   vorrq_u32(uint32x4_t a, uint32x4_t b);         // VORR q0,q0,q0
+uint32x4_t   vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
 #define vorrq_u32 _mm_or_si128
 
-uint64x2_t   vorrq_u64(uint64x2_t a, uint64x2_t b);         // VORR q0,q0,q0
+uint64x2_t   vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
 #define vorrq_u64 _mm_or_si128
 
 //************* Bitwise exclusive or (EOR or XOR) ******************
 //*******************************************************************
+int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
+_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
+}
 
-int8x16_t   veorq_s8(int8x16_t a, int8x16_t b);         // VEOR q0,q0,q0
+int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
+#define veor_s16 veor_s8
+
+int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
+#define veor_s32 veor_s8
+
+int64x1_t veor_s64(int64x1_t a,  int64x1_t b); // VEOR d0,d0,d0
+_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a,  int64x1_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
+    return res;
+}
+
+uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
+#define veor_u8 veor_s8
+
+uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
+#define veor_u16 veor_s16
+
+uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
+#define veor_u32 veor_s32
+
+uint64x1_t veor_u64(uint64x1_t a,  uint64x1_t b); // VEOR d0,d0,d0
+#define veor_u64 veor_s64
+
+int8x16_t   veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
 #define veorq_s8 _mm_xor_si128
 
-int16x8_t   veorq_s16(int16x8_t a, int16x8_t b);         // VEOR q0,q0,q0
+int16x8_t   veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
 #define veorq_s16 _mm_xor_si128
 
-int32x4_t   veorq_s32(int32x4_t a, int32x4_t b);         // VEOR q0,q0,q0
+int32x4_t   veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
 #define veorq_s32 _mm_xor_si128
 
-int64x2_t   veorq_s64(int64x2_t a, int64x2_t b);         // VEOR q0,q0,q0
+int64x2_t   veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
 #define veorq_s64 _mm_xor_si128
 
-uint8x16_t   veorq_u8(uint8x16_t a, uint8x16_t b);         // VEOR q0,q0,q0
+uint8x16_t   veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
 #define veorq_u8 _mm_xor_si128
 
-uint16x8_t   veorq_u16(uint16x8_t a, uint16x8_t b);         // VEOR q0,q0,q0
+uint16x8_t   veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
 #define veorq_u16 _mm_xor_si128
 
-uint32x4_t   veorq_u32(uint32x4_t a, uint32x4_t b);         // VEOR q0,q0,q0
+uint32x4_t   veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
 #define veorq_u32 _mm_xor_si128
 
-uint64x2_t   veorq_u64(uint64x2_t a, uint64x2_t b);         // VEOR q0,q0,q0
+uint64x2_t   veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
 #define veorq_u64 _mm_xor_si128
 
 //********************** Bit Clear **********************************
 //*******************************************************************
 //Logical AND complement (AND negation or AND NOT)
+int8x8_t   vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
+_NEON2SSE_INLINE int8x8_t   vbic_s8(int8x8_t a, int8x8_t b)
+{
+    int8x8_t res64;
+    return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
+}
 
-//notice arguments "swap"
+int16x4_t   vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
+#define vbic_s16 vbic_s8
 
-//notice arguments "swap"
+int32x2_t   vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
+#define vbic_s32 vbic_s8
 
-//notice arguments "swap"
+int64x1_t   vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
+_NEON2SSE_INLINE int64x1_t   vbic_s64(int64x1_t a, int64x1_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
+    return res;
+}
 
-//notice arguments "swap"
+uint8x8_t   vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
+#define vbic_u8 vbic_s8
 
-//notice arguments "swap"
+uint16x4_t   vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
+#define vbic_u16 vbic_s16
 
-//notice arguments "swap"
+uint32x2_t   vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
+#define vbic_u32 vbic_s32
 
-//notice arguments "swap"
+uint64x1_t   vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
+#define vbic_u64 vbic_s64
 
-//notice arguments "swap"
+int8x16_t   vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
+#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
 
-int8x16_t   vbicq_s8(int8x16_t a, int8x16_t b);         // VBIC q0,q0,q0
-#define vbicq_s8(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+int16x8_t   vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
+#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
 
-int16x8_t   vbicq_s16(int16x8_t a, int16x8_t b);         // VBIC q0,q0,q0
-#define vbicq_s16(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+int32x4_t   vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
+#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
 
-int32x4_t   vbicq_s32(int32x4_t a, int32x4_t b);         // VBIC q0,q0,q0
-#define vbicq_s32(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+int64x2_t   vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
+#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
 
-int64x2_t   vbicq_s64(int64x2_t a, int64x2_t b);         // VBIC q0,q0,q0
-#define vbicq_s64(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+uint8x16_t   vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
+#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
 
-uint8x16_t   vbicq_u8(uint8x16_t a, uint8x16_t b);         // VBIC q0,q0,q0
-#define vbicq_u8(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+uint16x8_t   vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
+#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
 
-uint16x8_t   vbicq_u16(uint16x8_t a, uint16x8_t b);         // VBIC q0,q0,q0
-#define vbicq_u16(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+uint32x4_t   vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
+#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
 
-uint32x4_t   vbicq_u32(uint32x4_t a, uint32x4_t b);         // VBIC q0,q0,q0
-#define vbicq_u32(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
-
-uint64x2_t   vbicq_u64(uint64x2_t a, uint64x2_t b);         // VBIC q0,q0,q0
-#define vbicq_u64(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+uint64x2_t   vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
+#define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
 
 //**************** Bitwise OR complement ********************************
 //**************************************** ********************************
 //no exact IA 32 match, need to implement it as following
+int8x8_t vorn_s8(int8x8_t a,  int8x8_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a,  int8x8_t b)
+{
+    int8x8_t res64;
+    return64(vornq_s8(_pM128i(a), _pM128i(b)));
+}
 
-int8x16_t vornq_s8(int8x16_t a, int8x16_t b);         // VORN q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b)         // VORN q0,q0,q0
+
+int16x4_t vorn_s16(int16x4_t a,  int16x4_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a,  int16x4_t b)
+{
+    int16x4_t res64;
+    return64(vornq_s16(_pM128i(a), _pM128i(b)));
+}
+
+
+int32x2_t vorn_s32(int32x2_t a,  int32x2_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a,  int32x2_t b)
+{
+    int32x2_t res64;
+    return64(vornq_s32(_pM128i(a), _pM128i(b)));
+}
+
+
+int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
+{
+    int64x1_t res;
+    res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
+    return res;
+}
+
+uint8x8_t vorn_u8(uint8x8_t a,  uint8x8_t b); // VORN d0,d0,d0
+#define vorn_u8 vorn_s8
+
+
+uint16x4_t vorn_u16(uint16x4_t a,  uint16x4_t b); // VORN d0,d0,d0
+#define vorn_u16 vorn_s16
+
+uint32x2_t vorn_u32(uint32x2_t a,  uint32x2_t b); // VORN d0,d0,d0
+#define vorn_u32 vorn_s32
+
+uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
+#define vorn_u64 vorn_s64
+
+
+int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
 {
     __m128i b1;
-    b1 = vmvnq_s8( b);         //bitwise not for b
+    b1 = vmvnq_s8( b); //bitwise not for b
     return _mm_or_si128 (a, b1);
 }
 
-int16x8_t vornq_s16(int16x8_t a, int16x8_t b);         // VORN q0,q0,q0
-_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b)         // VORN q0,q0,q0
+int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
 {
     __m128i b1;
-    b1 = vmvnq_s16( b);         //bitwise not for b
+    b1 = vmvnq_s16( b); //bitwise not for b
     return _mm_or_si128 (a, b1);
 }
 
-int32x4_t vornq_s32(int32x4_t a, int32x4_t b);         // VORN q0,q0,q0
-_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b)         // VORN q0,q0,q0
+int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
 {
     __m128i b1;
-    b1 = vmvnq_s32( b);         //bitwise not for b
+    b1 = vmvnq_s32( b); //bitwise not for b
     return _mm_or_si128 (a, b1);
 }
 
-int64x2_t vornq_s64(int64x2_t a, int64x2_t b);         // VORN q0,q0,q0
+int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
 _NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
 {
     __m128i c1, b1;
-    c1 = _mm_cmpeq_epi8 (a, a);         //all ones 0xfffffff...fffff
+    c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
     b1 = _mm_andnot_si128 (b, c1);
     return _mm_or_si128 (a, b1);
 }
 
-uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b);         // VORN q0,q0,q0
-_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b)         // VORN q0,q0,q0
+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
 {
     __m128i b1;
-    b1 = vmvnq_u8( b);         //bitwise not for b
+    b1 = vmvnq_u8( b); //bitwise not for b
     return _mm_or_si128 (a, b1);
 }
 
-uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b);         // VORN q0,q0,q0
-_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b)         // VORN q0,q0,q0
+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
 {
     __m128i b1;
-    b1 = vmvnq_s16( b);         //bitwise not for b
+    b1 = vmvnq_s16( b); //bitwise not for b
     return _mm_or_si128 (a, b1);
 }
 
-uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b);         // VORN q0,q0,q0
-_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b)         // VORN q0,q0,q0
+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
 {
     __m128i b1;
-    b1 = vmvnq_u32( b);         //bitwise not for b
+    b1 = vmvnq_u32( b); //bitwise not for b
     return _mm_or_si128 (a, b1);
 }
-uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b);         // VORN q0,q0,q0
+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
 #define vornq_u64 vornq_s64
 
 //********************* Bitwise Select *****************************
@@ -8041,9 +15495,61 @@
 //if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
 
 //VBSL only is implemented for SIMD
+int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
+_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
+{
+    int8x8_t res64;
+    __m128i res;
+    res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
+    return64(res);
+}
 
-int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c);         // VBSL q0,q0,q0
-_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c)         // VBSL q0,q0,q0
+int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
+#define vbsl_s16 vbsl_s8
+
+int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
+#define vbsl_s32 vbsl_s8
+
+int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
+_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
+{
+    int64x1_t res;
+    res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
+    return res;
+}
+
+uint8x8_t vbsl_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
+#define vbsl_u8 vbsl_s8
+
+uint16x4_t vbsl_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
+#define vbsl_u16 vbsl_s8
+
+uint32x2_t vbsl_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
+#define vbsl_u32 vbsl_s8
+
+uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
+#define vbsl_u64 vbsl_s64
+
+float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
+_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
+{
+    __m128 sel1, sel2;
+    __m64_128 res64;
+    sel1 = _mm_and_ps   (_pM128(a), _pM128(b));
+    sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
+    sel1 = _mm_or_ps (sel1, sel2);
+    _M64f(res64, sel1);
+    return res64;
+}
+
+poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
+#define  vbsl_p8 vbsl_s8
+
+poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
+#define  vbsl_p16 vbsl_s8
+
+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
 {
     __m128i sel1, sel2;
     sel1 = _mm_and_si128   (a, b);
@@ -8051,29 +15557,29 @@
     return _mm_or_si128 (sel1, sel2);
 }
 
-int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c);         // VBSL q0,q0,q0
+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
 #define vbslq_s16 vbslq_s8
 
-int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c);         // VBSL q0,q0,q0
+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
 #define vbslq_s32 vbslq_s8
 
-int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c);         // VBSL q0,q0,q0
+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
 #define vbslq_s64 vbslq_s8
 
-uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VBSL q0,q0,q0
+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
 #define vbslq_u8 vbslq_s8
 
-uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VBSL q0,q0,q0
+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
 #define vbslq_u16 vbslq_s8
 
-uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VBSL q0,q0,q0
+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
 #define vbslq_u32 vbslq_s8
 
-uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c);         // VBSL q0,q0,q0
+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
 #define vbslq_u64 vbslq_s8
 
-float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c);         // VBSL q0,q0,q0
-_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c)         // VBSL q0,q0,q0
+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
 {
     __m128 sel1, sel2;
     sel1 = _mm_and_ps   (*(__m128*)&a, b);
@@ -8081,10 +15587,10 @@
     return _mm_or_ps (sel1, sel2);
 }
 
-poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c);         // VBSL q0,q0,q0
+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
 #define vbslq_p8 vbslq_u8
 
-poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c);         // VBSL q0,q0,q0
+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
 #define vbslq_p16 vbslq_s8
 
 //************************************************************************************
@@ -8094,89 +15600,182 @@
 //************************************************************************************
 //VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
 // making the result look as (a0, b0, a2, b2, a4, b4,....) (a1, b1, a3, b3, a5, b5,.....)
+int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
+_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
+{
+    int8x8x2_t val;
+    __m128i tmp, val0;
+    _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; //mask8_trnsp
+    tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
+    val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask16_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
+    vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
+    return val;
+}
 
-#if defined(USE_SSSE3)
+int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
+_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
+{
+    int16x4x2_t val;
+    __m128i tmp, val0;
+    _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
+    tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
+    val0 =  _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
+    vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
+    return val;
+}
+
+int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
+_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
+{
+    int32x2x2_t val;
+    __m128i val0;
+    val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
+    vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
+    return val;
+}
+
+uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
+#define vtrn_u8 vtrn_s8
+
+uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
+#define vtrn_u16 vtrn_s16
+
+uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
+#define vtrn_u32 vtrn_s32
+
+float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
+_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
+{
+    float32x2x2_t val;
+    val.val[0].m64_f32[0] = a.m64_f32[0];
+    val.val[0].m64_f32[1] = b.m64_f32[0];
+    val.val[1].m64_f32[0] = a.m64_f32[1];
+    val.val[1].m64_f32[1] = b.m64_f32[1];
+    return val; //a0,b0,a1,b1
+}
+
+poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
+#define  vtrn_p8 vtrn_u8
+
+poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
+#define  vtrn_p16 vtrn_s16
+
 //int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
-_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b)         // VTRN.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
 {
     int8x16x2_t r8x16;
     __m128i a_sh, b_sh;
     _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
-    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd);         //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
-    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd);         //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
+    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
 
-    r8x16.val[0] =  _mm_unpacklo_epi8(a_sh, b_sh);         //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
-    r8x16.val[1] =  _mm_unpackhi_epi8(a_sh, b_sh);         // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
+    r8x16.val[0] =  _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
+    r8x16.val[1] =  _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
     return r8x16;
 }
-#endif
 
-#if defined(USE_SSSE3)
-int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b);         // VTRN.16 q0,q0
-_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b)         // VTRN.16 q0,q0
+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
 {
     int16x8x2_t v16x8;
     __m128i a_sh, b_sh;
     _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
-    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd);         //a0, a2, a4, a6,  a1, a3, a5, a7
-    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd);         //b0, b2, b4, b6,  b1, b3, b5, b7
-    v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh);         //a0, b0, a2, b2, a4, b4, a6, b6
-    v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh);         //a1, b1, a3, b3, a5, b5, a7, b7
+    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
+    v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
+    v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
     return v16x8;
 }
-#endif
 
-int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b);         // VTRN.32 q0,q0
-_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b)         // VTRN.32 q0,q0
-{         //may be not optimal solution compared with serial
+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
+{
+    //may be not optimal solution compared with serial
     int32x4x2_t v32x4;
     __m128i a_sh, b_sh;
-    a_sh = _mm_shuffle_epi32 (a, 216);         //a0, a2, a1, a3
-    b_sh = _mm_shuffle_epi32 (b, 216);         //b0, b2, b1, b3
+    a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
+    b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
 
-    v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh);         //a0, b0, a2, b2
-    v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh);         //a1, b1, a3,  b3
+    v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
+    v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3,  b3
     return v32x4;
 }
 
-#if defined(USE_SSSE3)
-uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b);         // VTRN.8 q0,q0
+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
 #define vtrnq_u8 vtrnq_s8
 
-uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b);         // VTRN.16 q0,q0
+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
 #define vtrnq_u16 vtrnq_s16
-#endif
 
-uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b);         // VTRN.32 q0,q0
+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
 #define vtrnq_u32 vtrnq_s32
 
-float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b);         // VTRN.32 q0,q0
-_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b)         // VTRN.32 q0,q0
-{         //may be not optimal solution compared with serial
+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
+{
+    //may be not optimal solution compared with serial
     float32x4x2_t f32x4;
     __m128 a_sh, b_sh;
-    a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0));         //a0, a2, a1, a3, need to check endiness
-    b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0));         //b0, b2, b1, b3, need to check endiness
+    a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endiness
+    b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endiness
 
-    f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh);         //a0, b0, a2, b2
-    f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh);         //a1, b1, a3,  b3
+    f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
+    f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3,  b3
     return f32x4;
 }
 
-#if defined(USE_SSSE3)
-poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b);         // VTRN.8 q0,q0
+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
 #define vtrnq_p8 vtrnq_s8
 
-poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b);         // VTRN.16 q0,q0
+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
 #define vtrnq_p16 vtrnq_s16
-#endif
 
 //***************** Interleave elements ***************************
 //*****************************************************************
 //output has (a0,b0,a1,b1, a2,b2,.....)
+int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
+_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
+{
+    int8x8x2_t val;
+    __m128i val0;
+    val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
+    vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+    return val;
+}
 
-int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b);         // VZIP.8 q0,q0
-_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b)         // VZIP.8 q0,q0
+int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
+_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
+{
+    int16x4x2_t val;
+    __m128i val0;
+    val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
+    vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+    return val;
+}
+
+int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
+#define vzip_s32 vtrn_s32
+
+uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
+#define vzip_u8 vzip_s8
+
+uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
+#define vzip_u16 vzip_s16
+
+uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
+#define vzip_u32 vzip_s32
+
+float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
+#define vzip_f32 vtrn_f32
+
+poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
+#define vzip_p8 vzip_u8
+
+poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
+#define vzip_p16 vzip_u16
+
+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
 {
     int8x16x2_t r8x16;
     r8x16.val[0] =  _mm_unpacklo_epi8(a, b);
@@ -8184,8 +15783,8 @@
     return r8x16;
 }
 
-int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b);         // VZIP.16 q0,q0
-_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b)         // VZIP.16 q0,q0
+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
 {
     int16x8x2_t r16x8;
     r16x8.val[0] =  _mm_unpacklo_epi16(a, b);
@@ -8193,8 +15792,8 @@
     return r16x8;
 }
 
-int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b);         // VZIP.32 q0,q0
-_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b)         // VZIP.32 q0,q0
+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
 {
     int32x4x2_t r32x4;
     r32x4.val[0] =  _mm_unpacklo_epi32(a, b);
@@ -8202,17 +15801,17 @@
     return r32x4;
 }
 
-uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b);         // VZIP.8 q0,q0
+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
 #define vzipq_u8 vzipq_s8
 
-uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b);         // VZIP.16 q0,q0
+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
 #define vzipq_u16 vzipq_s16
 
-uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b);         // VZIP.32 q0,q0
+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
 #define vzipq_u32 vzipq_s32
 
-float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b);         // VZIP.32 q0,q0
-_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b)         // VZIP.32 q0,q0
+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
 {
     float32x4x2_t f32x4;
     f32x4.val[0] =   _mm_unpacklo_ps ( a,  b);
@@ -8220,93 +15819,166 @@
     return f32x4;
 }
 
-poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b);         // VZIP.8 q0,q0
+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
 #define vzipq_p8 vzipq_u8
 
-poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b);         // VZIP.16 q0,q0
+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
 #define vzipq_p16 vzipq_u16
 
 //*********************** De-Interleave elements *************************
 //*************************************************************************
 //As the result of these functions first val  contains (a0,a2,a4,....,b0,b2, b4,...) and the second val (a1,a3,a5,....b1,b3,b5...)
 //no such functions in IA32 SIMD, shuffle is required
+int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
+_NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
+{
+    int8x8x2_t val;
+    __m128i tmp, val0;
+    _NEON2SSE_ALIGN_16 int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13,  2, 6, 10, 14, 3, 7, 11,15};
+    tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
+    val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6),  (a1, a3, a5, a7, b1,b3, b5, b7)
+    vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+    return val;
+}
 
-#if defined(USE_SSSE3)
-int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b);         // VUZP.8 q0,q0
-_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b)         // VUZP.8 q0,q0
+int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
+_NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
+{
+    int16x4x2_t val;
+    __m128i tmp, val0;
+    _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1,  8,9,  2,3, 10,11,  4,5, 12,13, 6,7, 14,15};
+    tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
+    val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
+    vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+    return val;
+}
+
+int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
+_NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
+{
+    int32x2x2_t val;
+    __m128i val0;
+    val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
+    vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
+    return val;
+}
+
+uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
+#define vuzp_u8 vuzp_s8
+
+uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
+#define vuzp_u16 vuzp_s16
+
+uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
+#define vuzp_u32 vuzp_s32
+
+float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
+#define vuzp_f32 vzip_f32
+
+poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
+#define vuzp_p8 vuzp_u8
+
+poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
+#define vuzp_p16 vuzp_u16
+
+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
 {
     int8x16x2_t v8x16;
     __m128i a_sh, b_sh;
     _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
-    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd);         //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
-    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd);         //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
+    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
     //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
-    v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh);         ///a0, a2, a4, a6, a8, a10, a12, a14,  b0, b2, b4, b6, b8, b10, b12, b14,
-    v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh);         //a1, a3, a5, a7, a9, a11, a13, a15,  b1, b3, b5, b7, b9, b11, b13, b15
+    v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14,  b0, b2, b4, b6, b8, b10, b12, b14,
+    v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15,  b1, b3, b5, b7, b9, b11, b13, b15
     return v8x16;
 }
-#endif
 
-#if defined(USE_SSSE3)
-int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b);         // VUZP.16 q0,q0
-_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b)         // VUZP.16 q0,q0
+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
 {
     int16x8x2_t v16x8;
     __m128i a_sh, b_sh;
     _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
-    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd);         //a0, a2, a4, a6,  a1, a3, a5, a7
-    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd);         //b0, b2, b4, b6,  b1, b3, b5, b7
-    v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh);         //a0, a2, a4, a6, b0, b2, b4, b6
-    v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh);         //a1, a3, a5, a7, b1, b3, b5, b7
+    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
+    v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
+    v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
     return v16x8;
 }
-#endif
 
-int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b);         // VUZP.32 q0,q0
-_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b)         // VUZP.32 q0,q0
-{         //may be not optimal solution compared with serial
+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
+{
+    //may be not optimal solution compared with serial
     int32x4x2_t v32x4;
     __m128i a_sh, b_sh;
-    a_sh = _mm_shuffle_epi32 (a, 216);         //a0, a2, a1, a3
-    b_sh = _mm_shuffle_epi32 (b, 216);         //b0, b2, b1, b3
+    a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
+    b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
 
-    v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh);         //a0, a2, b0, b2
-    v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh);         //a1, a3, b1, b3
+    v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
+    v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
     return v32x4;
 }
 
-#if defined(USE_SSSE3)
-uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b);         // VUZP.8 q0,q0
+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
 #define vuzpq_u8 vuzpq_s8
 
-uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b);         // VUZP.16 q0,q0
+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
 #define vuzpq_u16 vuzpq_s16
-#endif
 
-uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b);         // VUZP.32 q0,q0
+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
 #define vuzpq_u32 vuzpq_s32
 
-float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b);         // VUZP.32 q0,q0
-_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b)         // VUZP.32 q0,q0
+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
 {
     float32x4x2_t v32x4;
-    v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0));         //a0, a2, b0, b2 , need to check endianess however
-    v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1));         //a1, a3, b1, b3, need to check endianess however
+    v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2 , need to check endianess however
+    v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianess however
     return v32x4;
 }
 
-#if defined(USE_SSSE3)
-poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b);         // VUZP.8 q0,q0
+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
 #define vuzpq_p8 vuzpq_u8
 
-poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b);         // VUZP.16 q0,q0
+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
 #define vuzpq_p16 vuzpq_u16
-#endif
 
 //##############################################################################################
 //*********************** Reinterpret cast intrinsics.******************************************
 //##############################################################################################
 // Not a part of oficial NEON instruction set but available in gcc compiler *********************
+poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
+#define vreinterpret_p8_u32
+
+poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
+#define vreinterpret_p8_u16
+
+poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
+#define vreinterpret_p8_u8
+
+poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
+#define vreinterpret_p8_s32
+
+poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
+#define vreinterpret_p8_s16
+
+poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
+#define vreinterpret_p8_s8
+
+poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
+#define vreinterpret_p8_u64
+
+poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
+#define vreinterpret_p8_s64
+
+poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
+#define vreinterpret_p8_f32
+
+poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
+#define vreinterpret_p8_p16
 
 poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
 #define vreinterpretq_p8_u32
@@ -8338,6 +16010,36 @@
 poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
 #define vreinterpretq_p8_p16
 
+poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
+#define vreinterpret_p16_u32
+
+poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
+#define vreinterpret_p16_u16
+
+poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
+#define vreinterpret_p16_u8
+
+poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
+#define vreinterpret_p16_s32
+
+poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
+#define vreinterpret_p16_s16
+
+poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
+#define vreinterpret_p16_s8
+
+poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
+#define vreinterpret_p16_u64
+
+poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
+#define vreinterpret_p16_s64
+
+poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
+#define vreinterpret_p16_f32
+
+poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
+#define vreinterpret_p16_p8
+
 poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
 #define vreinterpretq_p16_u32
 
@@ -8366,6 +16068,42 @@
 #define vreinterpretq_p16_p8  vreinterpretq_s16_p8
 
 //****  Integer to float  ******
+float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
+#define vreinterpret_f32_u32(t) (*(__m64_128*)&(t))
+
+
+float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
+#define vreinterpret_f32_u16 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
+#define vreinterpret_f32_u8 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_s32 (int32x2_t t);
+#define vreinterpret_f32_s32 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_s16 (int16x4_t t);
+#define vreinterpret_f32_s16 vreinterpret_f32_u32
+
+float32x2_t vreinterpret_f32_s8 (int8x8_t t);
+#define vreinterpret_f32_s8 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_u64(uint64x1_t t);
+#define vreinterpret_f32_u64 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_s64 (int64x1_t t);
+#define vreinterpret_f32_s64 vreinterpret_f32_u32
+
+
+float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
+#define vreinterpret_f32_p16 vreinterpret_f32_u32
+
+float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
+#define vreinterpret_f32_p8 vreinterpret_f32_u32
 
 float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
 #define  vreinterpretq_f32_u32(t) *(__m128*)&(t)
@@ -8399,6 +16137,35 @@
 
 //*** Integer type conversions ******************
 //no conversion necessary for the following functions because it is same data type
+int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
+#define vreinterpret_s64_u32
+
+int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
+#define vreinterpret_s64_u16
+
+int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
+#define vreinterpret_s64_u8
+
+int64x1_t vreinterpret_s64_s32 (int32x2_t t);
+#define  vreinterpret_s64_s32
+
+int64x1_t vreinterpret_s64_s16 (int16x4_t t);
+#define vreinterpret_s64_s16
+
+int64x1_t vreinterpret_s64_s8 (int8x8_t t);
+#define  vreinterpret_s64_s8
+
+int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
+#define  vreinterpret_s64_u64
+
+int64x1_t vreinterpret_s64_f32 (float32x2_t t);
+#define  vreinterpret_s64_f32
+
+int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
+#define vreinterpret_s64_p16
+
+int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
+#define vreinterpret_s64_p8
 
 int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
 #define vreinterpretq_s64_u32
@@ -8430,6 +16197,36 @@
 int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
 #define vreinterpretq_s64_p8
 
+uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
+#define vreinterpret_u64_u32
+
+uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
+#define vreinterpret_u64_u16
+
+uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
+#define vreinterpret_u64_u8
+
+uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
+#define vreinterpret_u64_s32
+
+uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
+#define vreinterpret_u64_s16
+
+uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
+#define vreinterpret_u64_s8
+
+uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
+#define vreinterpret_u64_s64
+
+uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
+#define vreinterpret_u64_f32
+
+uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
+#define vreinterpret_u64_p16
+
+uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
+#define vreinterpret_u64_p8
+
 uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
 #define vreinterpretq_u64_u32
 
@@ -8460,6 +16257,36 @@
 uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
 #define vreinterpretq_u64_p8
 
+int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
+#define vreinterpret_s8_u32
+
+int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
+#define vreinterpret_s8_u16
+
+int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
+#define vreinterpret_s8_u8
+
+int8x8_t vreinterpret_s8_s32 (int32x2_t t);
+#define vreinterpret_s8_s32
+
+int8x8_t vreinterpret_s8_s16 (int16x4_t t);
+#define vreinterpret_s8_s16
+
+int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
+#define vreinterpret_s8_u64
+
+int8x8_t vreinterpret_s8_s64 (int64x1_t t);
+#define vreinterpret_s8_s64
+
+int8x8_t vreinterpret_s8_f32 (float32x2_t t);
+#define vreinterpret_s8_f32
+
+int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
+#define vreinterpret_s8_p16
+
+int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
+#define vreinterpret_s8_p8
+
 int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
 #define vreinterpretq_s8_u32
 
@@ -8490,6 +16317,37 @@
 int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
 #define vreinterpretq_s8_p8
 
+int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
+#define vreinterpret_s16_u32
+
+int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
+#define vreinterpret_s16_u16
+
+int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
+#define vreinterpret_s16_u8
+
+int16x4_t vreinterpret_s16_s32 (int32x2_t t);
+#define vreinterpret_s16_s32
+
+int16x4_t vreinterpret_s16_s8 (int8x8_t t);
+#define vreinterpret_s16_s8
+
+int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
+#define vreinterpret_s16_u64
+
+int16x4_t vreinterpret_s16_s64 (int64x1_t t);
+#define vreinterpret_s16_s64
+
+int16x4_t vreinterpret_s16_f32 (float32x2_t t);
+#define vreinterpret_s16_f32
+
+
+int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
+#define vreinterpret_s16_p16
+
+int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
+#define vreinterpret_s16_p8
+
 int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
 #define vreinterpretq_s16_u32
 
@@ -8520,6 +16378,36 @@
 int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
 #define vreinterpretq_s16_p8
 
+int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
+#define vreinterpret_s32_u32
+
+int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
+#define vreinterpret_s32_u16
+
+int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
+#define vreinterpret_s32_u8
+
+int32x2_t vreinterpret_s32_s16 (int16x4_t t);
+#define vreinterpret_s32_s16
+
+int32x2_t vreinterpret_s32_s8 (int8x8_t t);
+#define vreinterpret_s32_s8
+
+int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
+#define vreinterpret_s32_u64
+
+int32x2_t vreinterpret_s32_s64 (int64x1_t t);
+#define vreinterpret_s32_s64
+
+int32x2_t vreinterpret_s32_f32 (float32x2_t t);
+#define vreinterpret_s32_f32
+
+int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
+#define vreinterpret_s32_p16
+
+int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
+#define vreinterpret_s32_p8
+
 int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
 #define vreinterpretq_s32_u32
 
@@ -8542,7 +16430,7 @@
 #define vreinterpretq_s32_s64
 
 int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
-#define vreinterpretq_s32_f32(t)  _mm_castps_si128(t)         //(*(__m128i*)&(t))
+#define vreinterpretq_s32_f32(t)  _mm_castps_si128(t) //(*(__m128i*)&(t))
 
 int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
 #define vreinterpretq_s32_p16
@@ -8550,6 +16438,36 @@
 int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
 #define vreinterpretq_s32_p8
 
+uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
+#define vreinterpret_u8_u32
+
+uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
+#define vreinterpret_u8_u16
+
+uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
+#define vreinterpret_u8_s32
+
+uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
+#define vreinterpret_u8_s16
+
+uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
+#define vreinterpret_u8_s8
+
+uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
+#define vreinterpret_u8_u64
+
+uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
+#define vreinterpret_u8_s64
+
+uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
+#define vreinterpret_u8_f32
+
+uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
+#define vreinterpret_u8_p16
+
+uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
+#define vreinterpret_u8_p8
+
 uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
 #define vreinterpretq_u8_u32
 
@@ -8574,12 +16492,43 @@
 uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
 #define vreinterpretq_u8_f32(t) _M128i(t)
 
+
 uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
 #define vreinterpretq_u8_p16
 
 uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
 #define vreinterpretq_u8_p8
 
+uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
+#define vreinterpret_u16_u32
+
+uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
+#define vreinterpret_u16_u8
+
+uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
+#define vreinterpret_u16_s32
+
+uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
+#define vreinterpret_u16_s16
+
+uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
+#define vreinterpret_u16_s8
+
+uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
+#define vreinterpret_u16_u64
+
+uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
+#define vreinterpret_u16_s64
+
+uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
+#define vreinterpret_u16_f32
+
+uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
+#define vreinterpret_u16_p16
+
+uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
+#define vreinterpret_u16_p8
+
 uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
 #define vreinterpretq_u16_u32
 
@@ -8610,6 +16559,36 @@
 uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
 #define vreinterpretq_u16_p8
 
+uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
+#define vreinterpret_u32_u16
+
+uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
+#define vreinterpret_u32_u8
+
+uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
+#define vreinterpret_u32_s32
+
+uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
+#define vreinterpret_u32_s16
+
+uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
+#define vreinterpret_u32_s8
+
+uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
+#define vreinterpret_u32_u64
+
+uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
+#define vreinterpret_u32_s64
+
+uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
+#define vreinterpret_u32_f32
+
+uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
+#define vreinterpret_u32_p16
+
+uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
+#define vreinterpret_u32_p8
+
 uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
 #define vreinterpretq_u32_u16
 
diff --git a/lib/gcc/x86_64-linux-android/4.9/include/avx512fintrin.h b/lib/gcc/x86_64-linux-android/4.9/include/avx512fintrin.h
index 314895a..c4caa5a 100644
--- a/lib/gcc/x86_64-linux-android/4.9/include/avx512fintrin.h
+++ b/lib/gcc/x86_64-linux-android/4.9/include/avx512fintrin.h
@@ -8103,6 +8103,22 @@
   return __builtin_ia32_movntdqa512 ((__v8di *)__P);
 }
 
+/* Constants for mantissa extraction */
+typedef enum
+{
+  _MM_MANT_NORM_1_2,		/* interval [1, 2)      */
+  _MM_MANT_NORM_p5_2,		/* interval [0.5, 2)    */
+  _MM_MANT_NORM_p5_1,		/* interval [0.5, 1)    */
+  _MM_MANT_NORM_p75_1p5		/* interval [0.75, 1.5) */
+} _MM_MANTISSA_NORM_ENUM;
+
+typedef enum
+{
+  _MM_MANT_SIGN_src,		/* sign = sign(SRC)     */
+  _MM_MANT_SIGN_zero,		/* sign = 0             */
+  _MM_MANT_SIGN_nan		/* DEST = NaN if sign(SRC) = 1 */
+} _MM_MANTISSA_SIGN_ENUM;
+
 #ifdef __OPTIMIZE__
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -8182,22 +8198,6 @@
 						    (__mmask8) __U, __R);
 }
 
-/* Constants for mantissa extraction */
-typedef enum
-{
-  _MM_MANT_NORM_1_2,		/* interval [1, 2)      */
-  _MM_MANT_NORM_p5_2,		/* interval [0.5, 2)    */
-  _MM_MANT_NORM_p5_1,		/* interval [0.5, 1)    */
-  _MM_MANT_NORM_p75_1p5		/* interval [0.75, 1.5) */
-} _MM_MANTISSA_NORM_ENUM;
-
-typedef enum
-{
-  _MM_MANT_SIGN_src,		/* sign = sign(SRC)     */
-  _MM_MANT_SIGN_zero,		/* sign = 0             */
-  _MM_MANT_SIGN_nan		/* DEST = NaN if sign(SRC) = 1 */
-} _MM_MANTISSA_SIGN_ENUM;
-
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_getmant_round_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
diff --git a/lib/gcc/x86_64-linux-android/4.9/include/ia32intrin.h b/lib/gcc/x86_64-linux-android/4.9/include/ia32intrin.h
index 5e7c893..614b0fa 100644
--- a/lib/gcc/x86_64-linux-android/4.9/include/ia32intrin.h
+++ b/lib/gcc/x86_64-linux-android/4.9/include/ia32intrin.h
@@ -256,11 +256,7 @@
 
 #define _bswap64(a)		__bswapq(a)
 #define _popcnt64(a)		__popcntq(a)
-#define _lrotl(a,b)		__rolq((a), (b))
-#define _lrotr(a,b)		__rorq((a), (b))
 #else
-#define _lrotl(a,b)		__rold((a), (b))
-#define _lrotr(a,b)		__rord((a), (b))
 
 /* Read flags register */
 extern __inline unsigned int
@@ -280,6 +276,16 @@
 
 #endif
 
+/* On LP64 systems, longs are 64-bit.  Use the appropriate rotate
+ * function.  */
+#ifdef __LP64__
+#define _lrotl(a,b)		__rolq((a), (b))
+#define _lrotr(a,b)		__rorq((a), (b))
+#else
+#define _lrotl(a,b)		__rold((a), (b))
+#define _lrotr(a,b)		__rord((a), (b))
+#endif
+
 #define _bit_scan_forward(a)	__bsfd(a)
 #define _bit_scan_reverse(a)	__bsrd(a)
 #define _bswap(a)		__bswapd(a)
diff --git a/lib/gcc/x86_64-linux-android/4.9/include/quadmath.h b/lib/gcc/x86_64-linux-android/4.9/include/quadmath.h
new file mode 100644
index 0000000..aa9ef51
--- /dev/null
+++ b/lib/gcc/x86_64-linux-android/4.9/include/quadmath.h
@@ -0,0 +1,200 @@
+/* GCC Quad-Precision Math Library
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+   Written by Francois-Xavier Coudert  <fxcoudert@gcc.gnu.org>
+
+This file is part of the libquadmath library.
+Libquadmath is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public
+License as published by the Free Software Foundation; either
+version 2 of the License, or (at your option) any later version.
+
+Libquadmath is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with libquadmath; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+#ifndef QUADMATH_H
+#define QUADMATH_H
+
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Define the complex type corresponding to __float128
+   ("_Complex __float128" is not allowed) */
+typedef _Complex float __attribute__((mode(TC))) __complex128;
+
+#ifdef __cplusplus
+# define __quadmath_throw throw ()
+# define __quadmath_nth(fct) fct throw ()
+#else
+# define __quadmath_throw __attribute__((__nothrow__))
+# define __quadmath_nth(fct) __attribute__((__nothrow__)) fct
+#endif
+
+/* Prototypes for real functions */
+extern __float128 acosq (__float128) __quadmath_throw;
+extern __float128 acoshq (__float128) __quadmath_throw;
+extern __float128 asinq (__float128) __quadmath_throw;
+extern __float128 asinhq (__float128) __quadmath_throw;
+extern __float128 atanq (__float128) __quadmath_throw;
+extern __float128 atanhq (__float128) __quadmath_throw;
+extern __float128 atan2q (__float128, __float128) __quadmath_throw;
+extern __float128 cbrtq (__float128) __quadmath_throw;
+extern __float128 ceilq (__float128) __quadmath_throw;
+extern __float128 copysignq (__float128, __float128) __quadmath_throw;
+extern __float128 coshq (__float128) __quadmath_throw;
+extern __float128 cosq (__float128) __quadmath_throw;
+extern __float128 erfq (__float128) __quadmath_throw;
+extern __float128 erfcq (__float128) __quadmath_throw;
+extern __float128 expq (__float128) __quadmath_throw;
+extern __float128 expm1q (__float128) __quadmath_throw;
+extern __float128 fabsq (__float128) __quadmath_throw;
+extern __float128 fdimq (__float128, __float128) __quadmath_throw;
+extern int finiteq (__float128) __quadmath_throw;
+extern __float128 floorq (__float128) __quadmath_throw;
+extern __float128 fmaq (__float128, __float128, __float128) __quadmath_throw;
+extern __float128 fmaxq (__float128, __float128) __quadmath_throw;
+extern __float128 fminq (__float128, __float128) __quadmath_throw;
+extern __float128 fmodq (__float128, __float128) __quadmath_throw;
+extern __float128 frexpq (__float128, int *) __quadmath_throw;
+extern __float128 hypotq (__float128, __float128) __quadmath_throw;
+extern int isinfq (__float128) __quadmath_throw;
+extern int ilogbq (__float128) __quadmath_throw;
+extern int isnanq (__float128) __quadmath_throw;
+extern __float128 j0q (__float128) __quadmath_throw;
+extern __float128 j1q (__float128) __quadmath_throw;
+extern __float128 jnq (int, __float128) __quadmath_throw;
+extern __float128 ldexpq (__float128, int) __quadmath_throw;
+extern __float128 lgammaq (__float128) __quadmath_throw;
+extern long long int llrintq (__float128) __quadmath_throw;
+extern long long int llroundq (__float128) __quadmath_throw;
+extern __float128 logq (__float128) __quadmath_throw;
+extern __float128 log10q (__float128) __quadmath_throw;
+extern __float128 log2q (__float128) __quadmath_throw;
+extern __float128 log1pq (__float128) __quadmath_throw;
+extern long int lrintq (__float128) __quadmath_throw;
+extern long int lroundq (__float128) __quadmath_throw;
+extern __float128 modfq (__float128, __float128 *) __quadmath_throw;
+extern __float128 nanq (const char *) __quadmath_throw;
+extern __float128 nearbyintq (__float128) __quadmath_throw;
+extern __float128 nextafterq (__float128, __float128) __quadmath_throw;
+extern __float128 powq (__float128, __float128) __quadmath_throw;
+extern __float128 remainderq (__float128, __float128) __quadmath_throw;
+extern __float128 remquoq (__float128, __float128, int *) __quadmath_throw;
+extern __float128 rintq (__float128) __quadmath_throw;
+extern __float128 roundq (__float128) __quadmath_throw;
+extern __float128 scalblnq (__float128, long int) __quadmath_throw;
+extern __float128 scalbnq (__float128, int) __quadmath_throw;
+extern int signbitq (__float128) __quadmath_throw;
+extern void sincosq (__float128, __float128 *, __float128 *) __quadmath_throw;
+extern __float128 sinhq (__float128) __quadmath_throw;
+extern __float128 sinq (__float128) __quadmath_throw;
+extern __float128 sqrtq (__float128) __quadmath_throw;
+extern __float128 tanq (__float128) __quadmath_throw;
+extern __float128 tanhq (__float128) __quadmath_throw;
+extern __float128 tgammaq (__float128) __quadmath_throw;
+extern __float128 truncq (__float128) __quadmath_throw;
+extern __float128 y0q (__float128) __quadmath_throw;
+extern __float128 y1q (__float128) __quadmath_throw;
+extern __float128 ynq (int, __float128) __quadmath_throw;
+
+
+/* Prototypes for complex functions */
+extern __float128 cabsq (__complex128) __quadmath_throw;
+extern __float128 cargq (__complex128) __quadmath_throw;
+extern __float128 cimagq (__complex128) __quadmath_throw;
+extern __float128 crealq (__complex128) __quadmath_throw;
+extern __complex128 cacosq (__complex128) __quadmath_throw;
+extern __complex128 cacoshq (__complex128) __quadmath_throw;
+extern __complex128 casinq (__complex128) __quadmath_throw;
+extern __complex128 casinhq (__complex128) __quadmath_throw;
+extern __complex128 catanq (__complex128) __quadmath_throw;
+extern __complex128 catanhq (__complex128) __quadmath_throw;
+extern __complex128 ccosq (__complex128) __quadmath_throw;
+extern __complex128 ccoshq (__complex128) __quadmath_throw;
+extern __complex128 cexpq (__complex128) __quadmath_throw;
+extern __complex128 cexpiq (__float128) __quadmath_throw;
+extern __complex128 clogq (__complex128) __quadmath_throw;
+extern __complex128 clog10q (__complex128) __quadmath_throw;
+extern __complex128 conjq (__complex128) __quadmath_throw;
+extern __complex128 cpowq (__complex128, __complex128) __quadmath_throw;
+extern __complex128 cprojq (__complex128) __quadmath_throw;
+extern __complex128 csinq (__complex128) __quadmath_throw;
+extern __complex128 csinhq (__complex128) __quadmath_throw;
+extern __complex128 csqrtq (__complex128) __quadmath_throw;
+extern __complex128 ctanq (__complex128) __quadmath_throw;
+extern __complex128 ctanhq (__complex128) __quadmath_throw;
+
+
+/* Prototypes for string <-> __float128 conversion functions */
+extern __float128 strtoflt128 (const char *, char **) __quadmath_throw;
+extern int quadmath_snprintf (char *str, size_t size,
+			      const char *format, ...) __quadmath_throw;
+
+
+/* Macros */
+#define FLT128_MAX 1.18973149535723176508575932662800702e4932Q
+#define FLT128_MIN 3.36210314311209350626267781732175260e-4932Q
+#define FLT128_EPSILON 1.92592994438723585305597794258492732e-34Q
+#define FLT128_DENORM_MIN 6.475175119438025110924438958227646552e-4966Q
+#define FLT128_MANT_DIG 113
+#define FLT128_MIN_EXP (-16381)
+#define FLT128_MAX_EXP 16384
+#define FLT128_DIG 33
+#define FLT128_MIN_10_EXP (-4931)
+#define FLT128_MAX_10_EXP 4932
+
+
+#define HUGE_VALQ __builtin_huge_valq()
+/* The following alternative is valid, but brings the warning:
+   (floating constant exceeds range of ‘__float128’)  */
+/* #define HUGE_VALQ (__extension__ 0x1.0p32767Q) */
+
+#define M_Eq		2.7182818284590452353602874713526625Q  /* e */
+#define M_LOG2Eq	1.4426950408889634073599246810018921Q  /* log_2 e */
+#define M_LOG10Eq	0.4342944819032518276511289189166051Q  /* log_10 e */
+#define M_LN2q		0.6931471805599453094172321214581766Q  /* log_e 2 */
+#define M_LN10q		2.3025850929940456840179914546843642Q  /* log_e 10 */
+#define M_PIq		3.1415926535897932384626433832795029Q  /* pi */
+#define M_PI_2q		1.5707963267948966192313216916397514Q  /* pi/2 */
+#define M_PI_4q		0.7853981633974483096156608458198757Q  /* pi/4 */
+#define M_1_PIq		0.3183098861837906715377675267450287Q  /* 1/pi */
+#define M_2_PIq		0.6366197723675813430755350534900574Q  /* 2/pi */
+#define M_2_SQRTPIq	1.1283791670955125738961589031215452Q  /* 2/sqrt(pi) */
+#define M_SQRT2q	1.4142135623730950488016887242096981Q  /* sqrt(2) */
+#define M_SQRT1_2q	0.7071067811865475244008443621048490Q  /* 1/sqrt(2) */
+
+#define __quadmath_extern_inline \
+  extern inline __attribute__ ((__gnu_inline__))
+
+__quadmath_extern_inline __float128
+__quadmath_nth (cimagq (__complex128 __z))
+{
+  return __imag__ __z;
+}
+
+__quadmath_extern_inline __float128
+__quadmath_nth (crealq (__complex128 __z))
+{
+  return __real__ __z;
+}
+
+__quadmath_extern_inline __complex128
+__quadmath_nth (conjq (__complex128 __z))
+{
+  return __extension__ ~__z;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/lib/gcc/x86_64-linux-android/4.9/include/quadmath_weak.h b/lib/gcc/x86_64-linux-android/4.9/include/quadmath_weak.h
new file mode 100644
index 0000000..986079a
--- /dev/null
+++ b/lib/gcc/x86_64-linux-android/4.9/include/quadmath_weak.h
@@ -0,0 +1,137 @@
+/* GCC Quad-Precision Math Library
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
+   Written by Tobias Burnus  <burnus@net-b.de>
+
+This file is part of the libquadmath library.
+Libquadmath is free software; you can redistribute it and/or
+modify it under the terms of the GNU Library General Public
+License as published by the Free Software Foundation; either
+version 2 of the License, or (at your option) any later version.
+
+Libquadmath is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Library General Public License for more details.
+
+You should have received a copy of the GNU Library General Public
+License along with libquadmath; see the file COPYING.LIB.  If
+not, write to the Free Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
+Boston, MA 02110-1301, USA.  */
+
+#ifndef QUADMATH_WEAK_H
+#define QUADMATH_WEAK_H
+
+#include "quadmath.h"
+
+#if SUPPORTS_WEAK
+# define __qmath2(name,name2,type) \
+  static __typeof(type) name __attribute__ ((__weakref__(#name2)));
+# define __qmath_(name) __qmath_ ## name
+#else
+# define __qmath2(name,name2,type)
+# define __qmath_(name) name
+#endif
+
+/* __qmath_foo is a weak reference to symbol foo.  */
+#define __qmath3(name) __qmath2(__qmath_ ## name,name,name)
+
+/* Prototypes for real functions.  */
+__qmath3 (acosq)
+__qmath3 (acoshq)
+__qmath3 (asinq)
+__qmath3 (asinhq)
+__qmath3 (atanq)
+__qmath3 (atanhq)
+__qmath3 (atan2q)
+__qmath3 (cbrtq)
+__qmath3 (ceilq)
+__qmath3 (copysignq)
+__qmath3 (coshq)
+__qmath3 (cosq)
+__qmath3 (erfq)
+__qmath3 (erfcq)
+__qmath3 (expq)
+__qmath3 (expm1q)
+__qmath3 (fabsq)
+__qmath3 (fdimq)
+__qmath3 (finiteq)
+__qmath3 (floorq)
+__qmath3 (fmaq)
+__qmath3 (fmaxq)
+__qmath3 (fminq)
+__qmath3 (fmodq)
+__qmath3 (frexpq)
+__qmath3 (hypotq)
+__qmath3 (ilogbq)
+__qmath3 (isinfq)
+__qmath3 (isnanq)
+__qmath3 (j0q)
+__qmath3 (j1q)
+__qmath3 (jnq)
+__qmath3 (ldexpq)
+__qmath3 (lgammaq)
+__qmath3 (llrintq)
+__qmath3 (llroundq)
+__qmath3 (logq)
+__qmath3 (log10q)
+__qmath3 (log1pq)
+__qmath3 (log2q)
+__qmath3 (lrintq)
+__qmath3 (lroundq)
+__qmath3 (modfq)
+__qmath3 (nanq)
+__qmath3 (nearbyintq)
+__qmath3 (nextafterq)
+__qmath3 (powq)
+__qmath3 (remainderq)
+__qmath3 (remquoq)
+__qmath3 (rintq)
+__qmath3 (roundq)
+__qmath3 (scalblnq)
+__qmath3 (scalbnq)
+__qmath3 (signbitq)
+__qmath3 (sincosq)
+__qmath3 (sinhq)
+__qmath3 (sinq)
+__qmath3 (sqrtq)
+__qmath3 (tanq)
+__qmath3 (tanhq)
+__qmath3 (tgammaq)
+__qmath3 (truncq)
+__qmath3 (y0q)
+__qmath3 (y1q)
+__qmath3 (ynq)
+
+
+/* Prototypes for complex functions.  */
+__qmath3 (cabsq)
+__qmath3 (cargq)
+__qmath3 (cimagq)
+__qmath3 (crealq)
+__qmath3 (cacosq)
+__qmath3 (cacoshq)
+__qmath3 (casinq)
+__qmath3 (casinhq)
+__qmath3 (catanq)
+__qmath3 (catanhq)
+__qmath3 (ccosq)
+__qmath3 (ccoshq)
+__qmath3 (cexpq)
+__qmath3 (cexpiq)
+__qmath3 (clogq)
+__qmath3 (clog10q)
+__qmath3 (conjq)
+__qmath3 (cpowq)
+__qmath3 (cprojq)
+__qmath3 (csinq)
+__qmath3 (csinhq)
+__qmath3 (csqrtq)
+__qmath3 (ctanq)
+__qmath3 (ctanhq)
+
+
+/* Prototypes for string <-> flt128 conversion functions.  */
+__qmath3 (strtoflt128)
+__qmath3 (quadmath_snprintf)
+
+#endif
diff --git a/lib/gcc/x86_64-linux-android/4.9/libgcc.a b/lib/gcc/x86_64-linux-android/4.9/libgcc.a
index eca4a22..6fea478 100644
--- a/lib/gcc/x86_64-linux-android/4.9/libgcc.a
+++ b/lib/gcc/x86_64-linux-android/4.9/libgcc.a
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/libgcov.a b/lib/gcc/x86_64-linux-android/4.9/libgcov.a
index 971c904..d7c7213 100644
--- a/lib/gcc/x86_64-linux-android/4.9/libgcov.a
+++ b/lib/gcc/x86_64-linux-android/4.9/libgcov.a
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/x32/crtbegin.o b/lib/gcc/x86_64-linux-android/4.9/x32/crtbegin.o
index 2842d16..4029a2c 100644
--- a/lib/gcc/x86_64-linux-android/4.9/x32/crtbegin.o
+++ b/lib/gcc/x86_64-linux-android/4.9/x32/crtbegin.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/x32/crtbeginS.o b/lib/gcc/x86_64-linux-android/4.9/x32/crtbeginS.o
index db7fb2e..b5b8967 100644
--- a/lib/gcc/x86_64-linux-android/4.9/x32/crtbeginS.o
+++ b/lib/gcc/x86_64-linux-android/4.9/x32/crtbeginS.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/x32/crtbeginT.o b/lib/gcc/x86_64-linux-android/4.9/x32/crtbeginT.o
index 2842d16..4029a2c 100644
--- a/lib/gcc/x86_64-linux-android/4.9/x32/crtbeginT.o
+++ b/lib/gcc/x86_64-linux-android/4.9/x32/crtbeginT.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/x32/crtend.o b/lib/gcc/x86_64-linux-android/4.9/x32/crtend.o
index 05fe6f7..96ca016 100644
--- a/lib/gcc/x86_64-linux-android/4.9/x32/crtend.o
+++ b/lib/gcc/x86_64-linux-android/4.9/x32/crtend.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/x32/crtendS.o b/lib/gcc/x86_64-linux-android/4.9/x32/crtendS.o
index 05fe6f7..96ca016 100644
--- a/lib/gcc/x86_64-linux-android/4.9/x32/crtendS.o
+++ b/lib/gcc/x86_64-linux-android/4.9/x32/crtendS.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/x32/crtfastmath.o b/lib/gcc/x86_64-linux-android/4.9/x32/crtfastmath.o
index 6041f6b..f62e448 100644
--- a/lib/gcc/x86_64-linux-android/4.9/x32/crtfastmath.o
+++ b/lib/gcc/x86_64-linux-android/4.9/x32/crtfastmath.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/x32/crtprec32.o b/lib/gcc/x86_64-linux-android/4.9/x32/crtprec32.o
index a876c44..c8cfa52 100644
--- a/lib/gcc/x86_64-linux-android/4.9/x32/crtprec32.o
+++ b/lib/gcc/x86_64-linux-android/4.9/x32/crtprec32.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/x32/crtprec64.o b/lib/gcc/x86_64-linux-android/4.9/x32/crtprec64.o
index 788d7ce..fd09196 100644
--- a/lib/gcc/x86_64-linux-android/4.9/x32/crtprec64.o
+++ b/lib/gcc/x86_64-linux-android/4.9/x32/crtprec64.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/x32/crtprec80.o b/lib/gcc/x86_64-linux-android/4.9/x32/crtprec80.o
index 74ca907..1b13e92 100644
--- a/lib/gcc/x86_64-linux-android/4.9/x32/crtprec80.o
+++ b/lib/gcc/x86_64-linux-android/4.9/x32/crtprec80.o
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/x32/libgcc.a b/lib/gcc/x86_64-linux-android/4.9/x32/libgcc.a
index 33fd595..7ae68bf 100644
--- a/lib/gcc/x86_64-linux-android/4.9/x32/libgcc.a
+++ b/lib/gcc/x86_64-linux-android/4.9/x32/libgcc.a
Binary files differ
diff --git a/lib/gcc/x86_64-linux-android/4.9/x32/libgcov.a b/lib/gcc/x86_64-linux-android/4.9/x32/libgcov.a
index 973c66e..958e8a2 100644
--- a/lib/gcc/x86_64-linux-android/4.9/x32/libgcov.a
+++ b/lib/gcc/x86_64-linux-android/4.9/x32/libgcov.a
Binary files differ
diff --git a/lib64/libiberty.a b/lib64/libiberty.a
index 7e81e85..291272b 100644
--- a/lib64/libiberty.a
+++ b/lib64/libiberty.a
Binary files differ
diff --git a/libexec/gcc/x86_64-linux-android/4.9/cc1 b/libexec/gcc/x86_64-linux-android/4.9/cc1
index 8224c06..97d3ede 100755
--- a/libexec/gcc/x86_64-linux-android/4.9/cc1
+++ b/libexec/gcc/x86_64-linux-android/4.9/cc1
Binary files differ
diff --git a/libexec/gcc/x86_64-linux-android/4.9/cc1plus b/libexec/gcc/x86_64-linux-android/4.9/cc1plus
index c6716ee..bfef1c3 100755
--- a/libexec/gcc/x86_64-linux-android/4.9/cc1plus
+++ b/libexec/gcc/x86_64-linux-android/4.9/cc1plus
Binary files differ
diff --git a/libexec/gcc/x86_64-linux-android/4.9/collect2 b/libexec/gcc/x86_64-linux-android/4.9/collect2
index 35cbdc1..3b56aa6 100755
--- a/libexec/gcc/x86_64-linux-android/4.9/collect2
+++ b/libexec/gcc/x86_64-linux-android/4.9/collect2
Binary files differ
diff --git a/libexec/gcc/x86_64-linux-android/4.9/libfunction_reordering_plugin.so b/libexec/gcc/x86_64-linux-android/4.9/libfunction_reordering_plugin.so
new file mode 120000
index 0000000..6818e7a
--- /dev/null
+++ b/libexec/gcc/x86_64-linux-android/4.9/libfunction_reordering_plugin.so
@@ -0,0 +1 @@
+libfunction_reordering_plugin.so.0.0.0
\ No newline at end of file
diff --git a/libexec/gcc/x86_64-linux-android/4.9/libfunction_reordering_plugin.so.0 b/libexec/gcc/x86_64-linux-android/4.9/libfunction_reordering_plugin.so.0
new file mode 120000
index 0000000..6818e7a
--- /dev/null
+++ b/libexec/gcc/x86_64-linux-android/4.9/libfunction_reordering_plugin.so.0
@@ -0,0 +1 @@
+libfunction_reordering_plugin.so.0.0.0
\ No newline at end of file
diff --git a/libexec/gcc/x86_64-linux-android/4.9/libfunction_reordering_plugin.so.0.0.0 b/libexec/gcc/x86_64-linux-android/4.9/libfunction_reordering_plugin.so.0.0.0
new file mode 100755
index 0000000..ad6397f
--- /dev/null
+++ b/libexec/gcc/x86_64-linux-android/4.9/libfunction_reordering_plugin.so.0.0.0
Binary files differ
diff --git a/libexec/gcc/x86_64-linux-android/4.9/lto-wrapper b/libexec/gcc/x86_64-linux-android/4.9/lto-wrapper
index 741fa53..2e65eb8 100755
--- a/libexec/gcc/x86_64-linux-android/4.9/lto-wrapper
+++ b/libexec/gcc/x86_64-linux-android/4.9/lto-wrapper
Binary files differ
diff --git a/libexec/gcc/x86_64-linux-android/4.9/lto1 b/libexec/gcc/x86_64-linux-android/4.9/lto1
index 9ff0c07..b7ba30d 100755
--- a/libexec/gcc/x86_64-linux-android/4.9/lto1
+++ b/libexec/gcc/x86_64-linux-android/4.9/lto1
Binary files differ
diff --git a/libexec/gcc/x86_64-linux-android/4.9/plugin/gengtype b/libexec/gcc/x86_64-linux-android/4.9/plugin/gengtype
index 88acc84..7781a02 100755
--- a/libexec/gcc/x86_64-linux-android/4.9/plugin/gengtype
+++ b/libexec/gcc/x86_64-linux-android/4.9/plugin/gengtype
Binary files differ
diff --git a/x86_64-linux-android/lib/ldscripts/elf32_x86_64.xr b/x86_64-linux-android/lib/ldscripts/elf32_x86_64.xr
index de6c8da..ca82fef 100644
--- a/x86_64-linux-android/lib/ldscripts/elf32_x86_64.xr
+++ b/x86_64-linux-android/lib/ldscripts/elf32_x86_64.xr
@@ -75,7 +75,6 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
   .preinit_array   0 :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf32_x86_64.xu b/x86_64-linux-android/lib/ldscripts/elf32_x86_64.xu
index 5c5fd9f..881dafd 100644
--- a/x86_64-linux-android/lib/ldscripts/elf32_x86_64.xu
+++ b/x86_64-linux-android/lib/ldscripts/elf32_x86_64.xu
@@ -75,7 +75,6 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
   .preinit_array   0 :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_i386.xr b/x86_64-linux-android/lib/ldscripts/elf_i386.xr
index 0062568..6a84459 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_i386.xr
+++ b/x86_64-linux-android/lib/ldscripts/elf_i386.xr
@@ -72,7 +72,6 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
   .preinit_array   0 :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_i386.xu b/x86_64-linux-android/lib/ldscripts/elf_i386.xu
index 5726faa..42cca6d 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_i386.xu
+++ b/x86_64-linux-android/lib/ldscripts/elf_i386.xu
@@ -72,7 +72,6 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
   .preinit_array   0 :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.x b/x86_64-linux-android/lib/ldscripts/elf_k1om.x
index f20c044..2990d71 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.x
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.x
@@ -88,7 +88,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xbn b/x86_64-linux-android/lib/ldscripts/elf_k1om.xbn
index 1694fe1..c0d2912 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xbn
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xbn
@@ -87,7 +87,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xc b/x86_64-linux-android/lib/ldscripts/elf_k1om.xc
index 93708bb..19b532a 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xc
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xc
@@ -88,7 +88,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xd b/x86_64-linux-android/lib/ldscripts/elf_k1om.xd
index ab78d27..b6b3a99 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xd
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xd
@@ -87,7 +87,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xdc b/x86_64-linux-android/lib/ldscripts/elf_k1om.xdc
index 6943233..78d8292 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xdc
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xdc
@@ -88,7 +88,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xdw b/x86_64-linux-android/lib/ldscripts/elf_k1om.xdw
index ed7be82..bd1d78f 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xdw
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xdw
@@ -88,7 +88,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xn b/x86_64-linux-android/lib/ldscripts/elf_k1om.xn
index 977e042..1f64c0a 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xn
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xn
@@ -87,7 +87,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xr b/x86_64-linux-android/lib/ldscripts/elf_k1om.xr
index fb08769..fb38fee 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xr
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xr
@@ -79,7 +79,6 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
   .preinit_array   0 :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xs b/x86_64-linux-android/lib/ldscripts/elf_k1om.xs
index e953374..7acdd5d 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xs
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xs
@@ -84,7 +84,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   .preinit_array     :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xsc b/x86_64-linux-android/lib/ldscripts/elf_k1om.xsc
index f98f7a9..03787d8 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xsc
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xsc
@@ -84,7 +84,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   .preinit_array     :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xsw b/x86_64-linux-android/lib/ldscripts/elf_k1om.xsw
index e620cd3..82edf36 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xsw
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xsw
@@ -83,7 +83,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   .preinit_array     :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xu b/x86_64-linux-android/lib/ldscripts/elf_k1om.xu
index 321a32a..e78e1c7 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xu
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xu
@@ -79,7 +79,6 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
   .preinit_array   0 :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_k1om.xw b/x86_64-linux-android/lib/ldscripts/elf_k1om.xw
index 2dea8c1..2b43914 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_k1om.xw
+++ b/x86_64-linux-android/lib/ldscripts/elf_k1om.xw
@@ -88,7 +88,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.x b/x86_64-linux-android/lib/ldscripts/elf_l1om.x
index ae3103c..7794a13 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.x
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.x
@@ -88,7 +88,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xbn b/x86_64-linux-android/lib/ldscripts/elf_l1om.xbn
index 433891e..b6befd6 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xbn
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xbn
@@ -87,7 +87,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xc b/x86_64-linux-android/lib/ldscripts/elf_l1om.xc
index 6d3e65e..1b05981 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xc
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xc
@@ -88,7 +88,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xd b/x86_64-linux-android/lib/ldscripts/elf_l1om.xd
index 36b49fb..89fb0e4 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xd
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xd
@@ -87,7 +87,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xdc b/x86_64-linux-android/lib/ldscripts/elf_l1om.xdc
index 33a359e..b91a287 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xdc
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xdc
@@ -88,7 +88,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xdw b/x86_64-linux-android/lib/ldscripts/elf_l1om.xdw
index 412a745..8a8704b 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xdw
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xdw
@@ -88,7 +88,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xn b/x86_64-linux-android/lib/ldscripts/elf_l1om.xn
index 2a035b3..4d6db18 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xn
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xn
@@ -87,7 +87,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xr b/x86_64-linux-android/lib/ldscripts/elf_l1om.xr
index f9bd3e9..ead9c53 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xr
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xr
@@ -79,7 +79,6 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
   .preinit_array   0 :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xs b/x86_64-linux-android/lib/ldscripts/elf_l1om.xs
index bc8107a..d9cbe01 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xs
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xs
@@ -84,7 +84,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   .preinit_array     :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xsc b/x86_64-linux-android/lib/ldscripts/elf_l1om.xsc
index 3b6e2b5..e252f9f 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xsc
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xsc
@@ -84,7 +84,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   .preinit_array     :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xsw b/x86_64-linux-android/lib/ldscripts/elf_l1om.xsw
index 23422b2..f583d68 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xsw
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xsw
@@ -83,7 +83,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   .preinit_array     :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xu b/x86_64-linux-android/lib/ldscripts/elf_l1om.xu
index 1a53594..c6247bb 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xu
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xu
@@ -79,7 +79,6 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
   .preinit_array   0 :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_l1om.xw b/x86_64-linux-android/lib/ldscripts/elf_l1om.xw
index dccca0a..44ebc93 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_l1om.xw
+++ b/x86_64-linux-android/lib/ldscripts/elf_l1om.xw
@@ -88,7 +88,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.x b/x86_64-linux-android/lib/ldscripts/elf_x86_64.x
index b7ad5ec..575d2ab 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.x
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.x
@@ -85,7 +85,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xbn b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xbn
index 67a92e0..f0f36ad 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xbn
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xbn
@@ -84,7 +84,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xc b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xc
index 7b2a9c5..9ac8216 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xc
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xc
@@ -86,7 +86,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xd b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xd
index 0f41ca0..944b71a 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xd
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xd
@@ -84,7 +84,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xdc b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xdc
index 6aedb53..4b8571a 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xdc
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xdc
@@ -86,7 +86,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xdw b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xdw
index 4631e1d..c5e0ce0 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xdw
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xdw
@@ -86,7 +86,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xn b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xn
index 6aa7208..6fde39b 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xn
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xn
@@ -84,7 +84,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xr b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xr
index 7f92872..b7f1e1c 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xr
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xr
@@ -75,7 +75,6 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
   .preinit_array   0 :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xs b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xs
index d75a7f8..2d2b8ff 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xs
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xs
@@ -81,7 +81,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   .preinit_array     :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xsc b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xsc
index 379c1cd..5c41a31 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xsc
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xsc
@@ -84,7 +84,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   .preinit_array     :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xsw b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xsw
index ada2e50..3f77038 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xsw
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xsw
@@ -83,7 +83,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   .preinit_array     :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xu b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xu
index b1d4b5b..07a5422 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xu
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xu
@@ -75,7 +75,6 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
   .preinit_array   0 :
   {
     KEEP (*(.preinit_array))
diff --git a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xw b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xw
index 21309d9..e131e33 100644
--- a/x86_64-linux-android/lib/ldscripts/elf_x86_64.xw
+++ b/x86_64-linux-android/lib/ldscripts/elf_x86_64.xw
@@ -86,7 +86,7 @@
      could instead move the label definition inside the section, but
      the linker would then create the section even if it turns out to
      be empty, which isn't pretty.  */
-  . = ALIGN(32 / 8);
+  . = ALIGN(64 / 8);
   PROVIDE_HIDDEN (__preinit_array_start = .);
   .preinit_array     :
   {
diff --git a/x86_64-linux-android/lib/libatomic.a b/x86_64-linux-android/lib/libatomic.a
index 2e7be55..e201e3b 100644
--- a/x86_64-linux-android/lib/libatomic.a
+++ b/x86_64-linux-android/lib/libatomic.a
Binary files differ
diff --git a/x86_64-linux-android/lib/libgomp.a b/x86_64-linux-android/lib/libgomp.a
index ddf6331..d16c3c7 100644
--- a/x86_64-linux-android/lib/libgomp.a
+++ b/x86_64-linux-android/lib/libgomp.a
Binary files differ
diff --git a/x86_64-linux-android/lib/libquadmath.a b/x86_64-linux-android/lib/libquadmath.a
new file mode 100644
index 0000000..0633dd2
--- /dev/null
+++ b/x86_64-linux-android/lib/libquadmath.a
Binary files differ
diff --git a/x86_64-linux-android/lib64/libatomic.a b/x86_64-linux-android/lib64/libatomic.a
index 6098208..9ddf8fc 100644
--- a/x86_64-linux-android/lib64/libatomic.a
+++ b/x86_64-linux-android/lib64/libatomic.a
Binary files differ
diff --git a/x86_64-linux-android/lib64/libgomp.a b/x86_64-linux-android/lib64/libgomp.a
index d04ed56..b81c20a 100644
--- a/x86_64-linux-android/lib64/libgomp.a
+++ b/x86_64-linux-android/lib64/libgomp.a
Binary files differ
diff --git a/x86_64-linux-android/lib64/libquadmath.a b/x86_64-linux-android/lib64/libquadmath.a
new file mode 100644
index 0000000..2d4d367
--- /dev/null
+++ b/x86_64-linux-android/lib64/libquadmath.a
Binary files differ
diff --git a/x86_64-linux-android/libx32/libatomic.a b/x86_64-linux-android/libx32/libatomic.a
index a0561dd..e95c272 100644
--- a/x86_64-linux-android/libx32/libatomic.a
+++ b/x86_64-linux-android/libx32/libatomic.a
Binary files differ
diff --git a/x86_64-linux-android/libx32/libgomp.a b/x86_64-linux-android/libx32/libgomp.a
index a63d951..a3954c2 100644
--- a/x86_64-linux-android/libx32/libgomp.a
+++ b/x86_64-linux-android/libx32/libgomp.a
Binary files differ
diff --git a/x86_64-linux-android/libx32/libquadmath.a b/x86_64-linux-android/libx32/libquadmath.a
new file mode 100644
index 0000000..df41d8a
--- /dev/null
+++ b/x86_64-linux-android/libx32/libquadmath.a
Binary files differ