Added dynamic dispatch for x86 (#755)

Added dynamic dispatch for x86
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 420bbf3..3fbbce8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -129,7 +129,7 @@
 
 SET(src
   THGeneral.c THAllocator.c THStorage.c THTensor.c THBlas.c THLapack.c
-  THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c)
+  THLogAdd.c THRandom.c THFile.c THDiskFile.c THMemoryFile.c THAtomic.c THVector.c)
 
 SET(src ${src} ${hdr} ${simd})
 ADD_LIBRARY(TH SHARED ${src})
@@ -331,7 +331,7 @@
   generic/THTensorMath.h
   generic/THTensorRandom.c
   generic/THTensorRandom.h
-  generic/THVector.c
+  generic/THVectorDispatch.c
   DESTINATION "${TH_INSTALL_INCLUDE_SUBDIR}/TH/generic")
 
 
diff --git a/THTensor.c b/THTensor.c
index b0ab0a5..2878fc9 100644
--- a/THTensor.c
+++ b/THTensor.c
@@ -1,6 +1,7 @@
 #include "THAtomic.h"
 #include "THTensor.h"
 #include "THVector.h"
+
 #include "THBlas.h"
 #include "THLapack.h"
 #include "THRandom.h"
diff --git a/THVector.c b/THVector.c
new file mode 100644
index 0000000..6179d89
--- /dev/null
+++ b/THVector.c
@@ -0,0 +1,17 @@
+#include "THVector.h"
+#include "generic/simd/simd.h"
+
+#ifdef __NEON__
+#include "vector/NEON.c"
+#endif
+
+#if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+        || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+#include "vector/SSE.c"
+#endif
+
+#include "generic/THVectorDefault.c"
+#include "THGenerateAllTypes.h"
+
+#include "generic/THVectorDispatch.c"
+#include "THGenerateAllTypes.h"
diff --git a/THVector.h b/THVector.h
index 1344e75..e29917b 100644
--- a/THVector.h
+++ b/THVector.h
@@ -5,570 +5,9 @@
 
 #define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME)
 
-#if defined USE_SSE2 || defined USE_SSE3 || defined USE_SSSE3 \
-  || defined USE_SSE4_1 || defined USE_SSE4_2
+/* We are going to use dynamic dispatch, and want only to generate declarations
+ * of the vector functions */
+#include "generic/THVector.h"
+#include "THGenerateAllTypes.h"
 
-#ifdef USE_SSE2
-#include <emmintrin.h>
-#endif
-
-#ifdef USE_SSE3
-#include <pmmintrin.h>
-#endif
-
-#ifdef USE_SSSE3
-#include <tmmintrin.h>
-#endif
-
-#if defined (USE_SSE4_2) || defined (USE_SSE4_1)
-#include <smmintrin.h>
-#endif
-
-#define THDoubleVector_fill(x, c, n) {          \
-    long i;                                     \
-    long off;                                   \
-    __m128d XMM0 = _mm_set1_pd(c);              \
-    for (i=0; i<=((n)-8); i+=8) {               \
-      _mm_storeu_pd((x)+i  , XMM0);             \
-      _mm_storeu_pd((x)+i+2, XMM0);             \
-      _mm_storeu_pd((x)+i+4, XMM0);             \
-      _mm_storeu_pd((x)+i+6, XMM0);             \
-    }                                           \
-    off = (n) - ((n)%8);                        \
-    for (i=0; i<((n)%8); i++) {                 \
-      x[off+i] = c;                             \
-    }                                           \
-  }
-
-
-#define THDoubleVector_add(y, x, c, n) {        \
-    long i = 0;                                 \
-    __m128d XMM7 = _mm_set1_pd(c);              \
-    __m128d XMM0,XMM2;                          \
-    for (; i<=((n)-2); i+=2) {                  \
-      XMM0 = _mm_loadu_pd((x)+i);               \
-      XMM2 = _mm_loadu_pd((y)+i);               \
-      XMM0 = _mm_mul_pd(XMM0, XMM7);            \
-      XMM2 = _mm_add_pd(XMM2, XMM0);            \
-      _mm_storeu_pd((y)+i  , XMM2);             \
-    }                                           \
-    for (; i<(n); i++) {                        \
-      y[i] += c * x[i];                         \
-    }                                           \
-  }
-
-#define THDoubleVector_diff(z, x, y, n) {       \
-    long i;                                     \
-    for (i=0; i<=((n)-8); i+=8) {               \
-      __m128d XMM0 = _mm_loadu_pd((x)+i  );     \
-      __m128d XMM1 = _mm_loadu_pd((x)+i+2);     \
-      __m128d XMM2 = _mm_loadu_pd((x)+i+4);     \
-      __m128d XMM3 = _mm_loadu_pd((x)+i+6);     \
-      __m128d XMM4 = _mm_loadu_pd((y)+i  );     \
-      __m128d XMM5 = _mm_loadu_pd((y)+i+2);     \
-      __m128d XMM6 = _mm_loadu_pd((y)+i+4);     \
-      __m128d XMM7 = _mm_loadu_pd((y)+i+6);     \
-      XMM0 = _mm_sub_pd(XMM0, XMM4);            \
-      XMM1 = _mm_sub_pd(XMM1, XMM5);            \
-      XMM2 = _mm_sub_pd(XMM2, XMM6);            \
-      XMM3 = _mm_sub_pd(XMM3, XMM7);            \
-      _mm_storeu_pd((z)+i  , XMM0);             \
-      _mm_storeu_pd((z)+i+2, XMM1);             \
-      _mm_storeu_pd((z)+i+4, XMM2);             \
-      _mm_storeu_pd((z)+i+6, XMM3);             \
-    }                                           \
-    long off = (n) - ((n)%8);                   \
-    for (i=0; i<((n)%8); i++) {                 \
-      z[off+i] = x[off+i] - y[off+i];           \
-    }                                           \
-  }
-
-#define THDoubleVector_scale(y, c, n) {         \
-    long i;                                     \
-    __m128d XMM7 = _mm_set1_pd(c);              \
-    for (i=0; i<=((n)-4); i+=4) {               \
-      __m128d XMM0 = _mm_loadu_pd((y)+i  );     \
-      __m128d XMM1 = _mm_loadu_pd((y)+i+2);     \
-      XMM0 = _mm_mul_pd(XMM0, XMM7);            \
-      XMM1 = _mm_mul_pd(XMM1, XMM7);            \
-      _mm_storeu_pd((y)+i  , XMM0);             \
-      _mm_storeu_pd((y)+i+2, XMM1);             \
-    }                                           \
-    long off = (n) - ((n)%4);                   \
-    for (i=0; i<((n)%4); i++) {                 \
-      y[off+i] *= c;                            \
-    }                                           \
-  }
-
-#define THDoubleVector_mul(y, x, n) {           \
-    long i;                                     \
-    for (i=0; i<=((n)-8); i+=8) {               \
-      __m128d XMM0 = _mm_loadu_pd((x)+i  );     \
-      __m128d XMM1 = _mm_loadu_pd((x)+i+2);     \
-      __m128d XMM2 = _mm_loadu_pd((x)+i+4);     \
-      __m128d XMM3 = _mm_loadu_pd((x)+i+6);     \
-      __m128d XMM4 = _mm_loadu_pd((y)+i  );     \
-      __m128d XMM5 = _mm_loadu_pd((y)+i+2);     \
-      __m128d XMM6 = _mm_loadu_pd((y)+i+4);     \
-      __m128d XMM7 = _mm_loadu_pd((y)+i+6);     \
-      XMM4 = _mm_mul_pd(XMM4, XMM0);            \
-      XMM5 = _mm_mul_pd(XMM5, XMM1);            \
-      XMM6 = _mm_mul_pd(XMM6, XMM2);            \
-      XMM7 = _mm_mul_pd(XMM7, XMM3);            \
-      _mm_storeu_pd((y)+i  , XMM4);             \
-      _mm_storeu_pd((y)+i+2, XMM5);             \
-      _mm_storeu_pd((y)+i+4, XMM6);             \
-      _mm_storeu_pd((y)+i+6, XMM7);             \
-    }                                           \
-    long off = (n) - ((n)%8);                   \
-    for (i=0; i<((n)%8); i++) {                 \
-      y[off+i] *= x[off+i];                     \
-    }                                           \
-  }
-
-#define THFloatVector_fill(x, c, n) {           \
-    long i;                                     \
-    __m128 XMM0 = _mm_set_ps1(c);               \
-    long off;                                   \
-    for (i=0; i<=((n)-16); i+=16) {             \
-      _mm_storeu_ps((x)+i  ,  XMM0);            \
-      _mm_storeu_ps((x)+i+4,  XMM0);            \
-      _mm_storeu_ps((x)+i+8,  XMM0);            \
-      _mm_storeu_ps((x)+i+12, XMM0);            \
-    }                                           \
-    off = (n) - ((n)%16);                       \
-    for (i=0; i<((n)%16); i++) {                \
-      x[off+i] = c;                             \
-    }                                           \
-  }
-
-#define THFloatVector_add(y, x, c, n) {         \
-    long i = 0;                                 \
-    __m128 XMM7 = _mm_set_ps1(c);               \
-    __m128 XMM0,XMM2;                           \
-    for (; i<=((n)-4); i+=4) {                  \
-      XMM0 = _mm_loadu_ps((x)+i);               \
-      XMM2 = _mm_loadu_ps((y)+i);               \
-      XMM0 = _mm_mul_ps(XMM0, XMM7);            \
-      XMM2 = _mm_add_ps(XMM2, XMM0);            \
-      _mm_storeu_ps((y)+i  , XMM2);             \
-    }                                           \
-    for (; i<(n); i++) {                        \
-      y[i] += c * x[i];                         \
-    }                                           \
-  }
-
-#define THFloatVector_diff(z, x, y, n) {        \
-    long i;                                     \
-    for (i=0; i<=((n)-16); i+=16) {             \
-      __m128 XMM0 = _mm_loadu_ps((x)+i   );     \
-      __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);     \
-      __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);     \
-      __m128 XMM3 = _mm_loadu_ps((x)+i+12);     \
-      __m128 XMM4 = _mm_loadu_ps((y)+i   );     \
-      __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);     \
-      __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);     \
-      __m128 XMM7 = _mm_loadu_ps((y)+i+12);     \
-      XMM0 = _mm_sub_ps(XMM0, XMM4);            \
-      XMM1 = _mm_sub_ps(XMM1, XMM5);            \
-      XMM2 = _mm_sub_ps(XMM2, XMM6);            \
-      XMM3 = _mm_sub_ps(XMM3, XMM7);            \
-      _mm_storeu_ps((z)+i   , XMM0);            \
-      _mm_storeu_ps((z)+i+ 4, XMM1);            \
-      _mm_storeu_ps((z)+i+ 8, XMM2);            \
-      _mm_storeu_ps((z)+i+12, XMM3);            \
-    }                                           \
-    long off = (n) - ((n)%16);                  \
-    for (i=0; i<((n)%16); i++) {                \
-      z[off+i] = x[off+i] - y[off+i];           \
-    }                                           \
-  }
-
-#define THFloatVector_scale(y, c, n) {          \
-    long i;                                     \
-    __m128 XMM7 = _mm_set_ps1(c);               \
-    for (i=0; i<=((n)-8); i+=8) {               \
-      __m128 XMM0 = _mm_loadu_ps((y)+i  );      \
-      __m128 XMM1 = _mm_loadu_ps((y)+i+4);      \
-      XMM0 = _mm_mul_ps(XMM0, XMM7);            \
-      XMM1 = _mm_mul_ps(XMM1, XMM7);            \
-      _mm_storeu_ps((y)+i  , XMM0);             \
-      _mm_storeu_ps((y)+i+4, XMM1);             \
-    }                                           \
-    long off = (n) - ((n)%8);                   \
-    for (i=0; i<((n)%8); i++) {                 \
-      y[off+i] *= c;                            \
-    }                                           \
-  }
-
-#define THFloatVector_mul(y, x, n) {            \
-    long i;                                     \
-    for (i=0; i<=((n)-16); i+=16) {             \
-      __m128 XMM0 = _mm_loadu_ps((x)+i   );     \
-      __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);     \
-      __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);     \
-      __m128 XMM3 = _mm_loadu_ps((x)+i+12);     \
-      __m128 XMM4 = _mm_loadu_ps((y)+i   );     \
-      __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);     \
-      __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);     \
-      __m128 XMM7 = _mm_loadu_ps((y)+i+12);     \
-      XMM4 = _mm_mul_ps(XMM4, XMM0);            \
-      XMM5 = _mm_mul_ps(XMM5, XMM1);            \
-      XMM6 = _mm_mul_ps(XMM6, XMM2);            \
-      XMM7 = _mm_mul_ps(XMM7, XMM3);            \
-      _mm_storeu_ps((y)+i   , XMM4);            \
-      _mm_storeu_ps((y)+i+ 4, XMM5);            \
-      _mm_storeu_ps((y)+i+ 8, XMM6);            \
-      _mm_storeu_ps((y)+i+12, XMM7);            \
-    }                                           \
-    long off = (n) - ((n)%16);                  \
-    for (i=0; i<((n)%16); i++) {                \
-      y[off+i] *= x[off+i];                     \
-    }                                           \
-  }
-
-#elif defined __NEON__
-/* ARM NEON Assembly routine for operating on floats */
-
-#define THFloatVector_fill(x, c, n) {                   \
-        float ctemp = c;                                \
-        float * caddr = &ctemp;                         \
-        __asm__ __volatile__ (                          \
-            "mov         r0, %0           @ \n\t"       \
-            "ldr         r4, [%1]         @ \n\t"       \
-            "vdup.32     q12, r4          @ \n\t"       \
-            "vdup.32     q13, r4          @ \n\t"       \
-            "lsrs        r4, %2, #3       @ \n\t"       \
-            "beq         3f               @ \n\t"       \
-            "1:                           @ \n\t"       \
-            "vst1.32     {d24-d27}, [r0]! @ \n\t"       \
-            "subs        r4, r4, #1       @ \n\t"       \
-            "bne         1b               @ \n\t"       \
-            "3:                           @ \n\t"       \
-            "ands        r4, %2, #7       @ \n\t"       \
-            "beq         5f               @ \n\t"       \
-            "4:                           @ \n\t"       \
-            "subs        r4, r4, #1       @ \n\t"       \
-            "vst1.32     {d24[0]}, [r0]!  @ \n\t"       \
-            "bne         4b               @ \n\t"       \
-            "5:                           @ "           \
-            :                                           \
-            :"r" (x), "r"(caddr),"r"(n)                 \
-            : "cc", "r0", "r4",  "memory",              \
-              "q12",                                    \
-              "d24", "d25", "d26", "d27"                \
-            );                                          \
-    }
-
-#define THFloatVector_diff(z, x, y, n) {                                \
-        __asm__ __volatile__ (                                          \
-            "mov         r0, %2           @ \n\t"                       \
-            "mov         r1, %1           @ \n\t"                       \
-            "mov         r2, %0           @ \n\t"                       \
-            "lsrs        r4, %3, #3       @ \n\t"                       \
-            "beq         3f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "1:                           @ \n\t"                       \
-            "vsub.f32    q12, q8, q0      @ \n\t"                       \
-            "vsub.f32    q13, q9, q1      @ \n\t"                       \
-            "subs        r4, r4, #1       @ \n\t"                       \
-            "beq         2f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
-            "b           1b               @ \n\t"                       \
-            "2:                           @ \n\t"                       \
-            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
-            "3:                           @ \n\t"                       \
-            "ands        r4, %3, #7       @ \n\t"                       \
-            "beq         5f               @ \n\t"                       \
-            "4:                           @ \n\t"                       \
-            "subs        r4, r4, #1       @ \n\t"                       \
-            "vld1.32     {d16[0]}, [r1]!  @ \n\t"                       \
-            "vld1.32     {d0[0]}, [r0]!   @ \n\t"                       \
-            "vsub.f32    d24, d16, d0     @ \n\t"                       \
-            "vst1.32     {d24[0]}, [r2]!  @ \n\t"                       \
-            "bne         4b               @ \n\t"                       \
-            "5:                           @ "                           \
-            :                                                           \
-            :"r" (z), "r" (x),"r" (y), "r"(n)                           \
-            : "cc", "r0", "r1", "r2", "r4", "memory",                   \
-              "q0", "q1", "q8", "q9", "q12", "q13",                     \
-              "d0", "d1", "d2", "d3",                                   \
-              "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"    \
-            );                                                          \
-    }
-
-#define THFloatVector_scale(y, c, n) {                                  \
-        float ctemp = c;                                                \
-        float * caddr = &ctemp;                                         \
-        __asm__ __volatile__ (                                          \
-            "mov         r0, %0           @ \n\t"                       \
-            "mov         r2, r0           @ \n\t"                       \
-            "ldr         r5, [%1]         @ \n\t"                       \
-            "vdup.32     q14, r5          @ \n\t"                       \
-            "lsrs        r5, %2, #5       @ \n\t"                       \
-            "beq         3f               @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
-            "vld1.32     {d8-d11}, [r0]!  @ \n\t"                       \
-            "vld1.32     {d12-d15}, [r0]! @ \n\t"                       \
-            "1:                           @ \n\t"                       \
-            "vmul.f32    q0, q0, q14      @ \n\t"                       \
-            "vmul.f32    q1, q1, q14      @ \n\t"                       \
-            "vmul.f32    q2, q2, q14      @ \n\t"                       \
-            "vmul.f32    q3, q3, q14      @ \n\t"                       \
-            "vmul.f32    q4, q4, q14      @ \n\t"                       \
-            "vmul.f32    q5, q5, q14      @ \n\t"                       \
-            "vmul.f32    q6, q6, q14      @ \n\t"                       \
-            "vmul.f32    q7, q7, q14      @ \n\t"                       \
-            "subs        r5, r5, #1       @ \n\t"                       \
-            "beq         2f               @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
-            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
-            "vst1.32     {d8-d11}, [r2]!  @ \n\t"                       \
-            "vld1.32     {d8-d11}, [r0]!  @ \n\t"                       \
-            "vst1.32     {d12-d15}, [r2]! @ \n\t"                       \
-            "vld1.32     {d12-d15}, [r0]! @ \n\t"                       \
-            "b           1b               @ \n\t"                       \
-            "2:                           @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
-            "vst1.32     {d8-d11}, [r2]!  @ \n\t"                       \
-            "vst1.32     {d12-d15}, [r2]! @ \n\t"                       \
-            "3:                           @ \n\t"                       \
-            "lsrs        r5, %2, #4       @ \n\t"                       \
-            "ands        r5, r5, #1       @ \n\t"                       \
-            "beq         4f               @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
-            "vmul.f32    q0, q0, q14      @ \n\t"                       \
-            "vmul.f32    q1, q1, q14      @ \n\t"                       \
-            "vmul.f32    q2, q2, q14      @ \n\t"                       \
-            "vmul.f32    q3, q3, q14      @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
-            "4:                           @ \n\t"                       \
-            "lsrs        r5, %2, #3       @ \n\t"                       \
-            "ands        r5, r5, #1       @ \n\t"                       \
-            "beq         5f               @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vmul.f32    q0, q0, q14      @ \n\t"                       \
-            "vmul.f32    q1, q1, q14      @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "5:                           @ \n\t"                       \
-            "ands        r5, %2, #7       @ \n\t"                       \
-            "beq         7f               @ \n\t"                       \
-            "6:                           @ \n\t"                       \
-            "subs        r5, r5, #1       @ \n\t"                       \
-            "vld1.32     d0[0], [r0]!     @ \n\t"                       \
-            "vmul.f32    d0, d0, d28      @ \n\t"                       \
-            "vst1.32     d0[0], [r2]!     @ \n\t"                       \
-            "bne         6b               @ \n\t"                       \
-            "7:                           @ "                           \
-            :                                                           \
-            :"r" (y), "r"(caddr),"r"(n)                                 \
-            : "cc", "r0", "r2", "r5", "memory",                         \
-              "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14",    \
-              "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",           \
-              "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",     \
-              "d28", "d29"                                              \
-            );                                                          \
-    }
-
-#define THFloatVector_mul(y, x, n) {                                    \
-        __asm__ __volatile__ (                                          \
-            "mov         r0, %0           @ \n\t"                       \
-            "mov         r1, %1           @ \n\t"                       \
-            "mov         r2, r0           @ \n\t"                       \
-            "lsrs        r4, %2, #3       @ \n\t"                       \
-            "beq         3f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "1:                           @ \n\t"                       \
-            "vmul.f32    q12, q8, q0      @ \n\t"                       \
-            "vmul.f32    q13, q9, q1      @ \n\t"                       \
-            "subs        r4, r4, #1       @ \n\t"                       \
-            "beq         2f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
-            "b           1b               @ \n\t"                       \
-            "2:                           @ \n\t"                       \
-            "vst1.32     {d24-d27}, [r2]! @ \n\t"                       \
-            "3:                           @ \n\t"                       \
-            "ands        r4, %2, #7       @ \n\t"                       \
-            "beq         5f               @ \n\t"                       \
-            "4:                           @ \n\t"                       \
-            "subs        r4, r4, #1       @ \n\t"                       \
-            "vld1.32     {d16[0]}, [r1]!  @ \n\t"                       \
-            "vld1.32     {d0[0]}, [r0]!   @ \n\t"                       \
-            "vmul.f32    q12, q8, q0      @ \n\t"                       \
-            "vst1.32     {d24[0]}, [r2]!  @ \n\t"                       \
-            "bne         4b               @ \n\t"                       \
-            "5:                           @ "                           \
-            :                                                           \
-            :"r" (y),"r" (x),"r"(n)                                     \
-            : "cc", "r0", "r1", "r2", "r4", "memory",                   \
-              "q0", "q1", "q8", "q9", "q12", "q13",                     \
-              "d0", "d1", "d2", "d3",                                   \
-              "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"    \
-            );                                                          \
-    }
-#define THFloatVector_add(y, x, c, n) {                                 \
-        float ctemp = c;                                                \
-        float * caddr = &ctemp;                                         \
-        __asm__ __volatile__ (                                          \
-            "mov         r0, %0           @ \n\t"                       \
-            "mov         r1, %1           @ \n\t"                       \
-            "mov         r2, r0           @ \n\t"                       \
-            "ldr         r5, [%2]         @ \n\t"                       \
-            "vdup.32     q14, r5          @ \n\t"                       \
-            "lsrs        r5, %3, #4       @ \n\t"                       \
-            "beq         3f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vld1.32     {d20-d23}, [r1]! @ \n\t"                       \
-            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
-            "1:                           @ \n\t"                       \
-            "vmla.f32    q0, q8, q14      @ \n\t"                       \
-            "vmla.f32    q1, q9, q14      @ \n\t"                       \
-            "vmla.f32    q2, q10, q14     @ \n\t"                       \
-            "vmla.f32    q3, q11, q14     @ \n\t"                       \
-            "subs        r5, r5, #1       @ \n\t"                       \
-            "beq         2f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d20-d23}, [r1]! @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
-            "vld1.32     {d4-d7}, [r0]!   @ \n\t"                       \
-            "b           1b               @ \n\t"                       \
-            "2:                           @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "vst1.32     {d4-d7}, [r2]!   @ \n\t"                       \
-            "3:                           @ \n\t"                       \
-            "lsrs        r5, %3, #3       @ \n\t"                       \
-            "ands        r5, #1           @ \n\t"                       \
-            "beq         4f               @ \n\t"                       \
-            "vld1.32     {d16-d19}, [r1]! @ \n\t"                       \
-            "vld1.32     {d0-d3}, [r0]!   @ \n\t"                       \
-            "vmla.f32    q0, q8, q14      @ \n\t"                       \
-            "vmla.f32    q1, q9, q14      @ \n\t"                       \
-            "vst1.32     {d0-d3}, [r2]!   @ \n\t"                       \
-            "4:                           @ \n\t"                       \
-            "ands        r5, %3, #7       @ \n\t"                       \
-            "beq         6f               @ \n\t"                       \
-            "5:                           @ \n\t"                       \
-            "subs        r5, r5, #1       @ \n\t"                       \
-            "vld1.32     {d16[0]}, [r1]!  @ \n\t"                       \
-            "vld1.32     {d0[0]}, [r0]!   @ \n\t"                       \
-            "vmla.f32    d0, d16, d28     @ \n\t"                       \
-            "vst1.32     d0[0], [r2]!     @ \n\t"                       \
-            "bne         5b               @ \n\t"                       \
-            "6:                           @ "                           \
-            :                                                           \
-            :"r" (y),"r" (x), "r"(caddr),"r"(n)                         \
-            : "cc", "r0", "r1", "r2", "r5", "memory",                   \
-              "q0", "q1", "q2", "q3", "q14",                            \
-              "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",           \
-              "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29" \
-            );                                                          \
-    }
-
-static inline void THDoubleVector_fill(double *x, const double c, const long n) {
-  long i = 0;
-
-  for(; i < n-4; i += 4)
-  {
-    x[i] = c;
-    x[i+1] = c;
-    x[i+2] = c;
-    x[i+3] = c;
-  }
-
-  for(; i < n; i++)
-    x[i] = c;
-}
-
-static inline void THDoubleVector_add(double *y, const double *x, const double c, const long n)
-{
-  long i = 0;
-
-  for(;i < n-4; i += 4)
-  {
-    y[i] += c * x[i];
-    y[i+1] += c * x[i+1];
-    y[i+2] += c * x[i+2];
-    y[i+3] += c * x[i+3];
-  }
-
-  for(; i < n; i++)
-    y[i] += c * x[i];
-}
-
-static inline void THDoubleVector_diff(double *z, const double *x, const double *y, const long n)
-{
-  long i = 0;
-
-  for(; i < n-4; i += 4)
-  {
-    z[i] = x[i] - y[i];
-    z[i+1] = x[i+1] - y[i+1];
-    z[i+2] = x[i+2] - y[i+2];
-    z[i+3] = x[i+3] - y[i+3];
-  }
-
-  for(; i < n; i++)
-    z[i] = x[i] - y[i];
-}
-
-static inline void THDoubleVector_scale(double *y, const double c, const long n)
-{
-  long i = 0;
-
-  for(; i < n-4; i +=4)
-  {
-    y[i] *= c;
-    y[i+1] *= c;
-    y[i+2] *= c;
-    y[i+3] *= c;
-  }
-
-  for(; i < n; i++)
-    y[i] *= c;
-}
-
-static inline void THDoubleVector_mul(double *y, const double *x, const long n)
-{
-  long i = 0;
-
-  for(; i < n-4; i += 4)
-  {
-    y[i] *= x[i];
-    y[i+1] *= x[i+1];
-    y[i+2] *= x[i+2];
-    y[i+3] *= x[i+3];
-  }
-
-  for(; i < n; i++)
-    y[i] *= x[i];
-}
-
-
-#else
-
-/* If SSE2 not defined, then generate plain C operators */
-#include "generic/THVector.c"
-#include "THGenerateFloatTypes.h"
-
-#endif
-
-/* For non-float types, generate plain C operators */
-#include "generic/THVector.c"
-#include "THGenerateIntTypes.h"
-
-#endif
+#endif // TH_VECTOR_INC
diff --git a/generic/THVector.h b/generic/THVector.h
new file mode 100644
index 0000000..09067e5
--- /dev/null
+++ b/generic/THVector.h
@@ -0,0 +1,14 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THVector.h"
+#else
+
+TH_API void THVector_(fill)(real *x, const real c, const long n);
+TH_API void THVector_(add)(real *y, const real *x, const real c, const long n);
+TH_API void THVector_(diff)(real *z, const real *x, const real *y, const long n);
+TH_API void THVector_(scale)(real *y, const real c, const long n);
+TH_API void THVector_(mul)(real *y, const real *x, const long n);
+
+/* Initialize the dispatch pointers */
+TH_API void THVector_(vectorDispatchInit)();
+
+#endif
diff --git a/generic/THVector.c b/generic/THVectorDefault.c
similarity index 66%
rename from generic/THVector.c
rename to generic/THVectorDefault.c
index 6c8a96b..d51be03 100644
--- a/generic/THVector.c
+++ b/generic/THVectorDefault.c
@@ -1,8 +1,8 @@
 #ifndef TH_GENERIC_FILE
-#define TH_GENERIC_FILE "generic/THVector.c"
+#define TH_GENERIC_FILE "generic/THVectorDefault.c"
 #else
 
-static TH_INLINE void THVector_(fill)(real *x, const real c, const long n) {
+void THVector_(fill_DEFAULT)(real *x, const real c, const long n) {
   long i = 0;
 
   for(; i < n-4; i += 4)
@@ -17,7 +17,7 @@
     x[i] = c;
 }
 
-static TH_INLINE void THVector_(add)(real *y, const real *x, const real c, const long n)
+void THVector_(add_DEFAULT)(real *y, const real *x, const real c, const long n)
 {
   long i = 0;
 
@@ -33,7 +33,7 @@
     y[i] += c * x[i];
 }
 
-static TH_INLINE void THVector_(diff)(real *z, const real *x, const real *y, const long n)
+void THVector_(diff_DEFAULT)(real *z, const real *x, const real *y, const long n)
 {
   long i = 0;
 
@@ -49,7 +49,7 @@
     z[i] = x[i] - y[i];
 }
 
-static TH_INLINE void THVector_(scale)(real *y, const real c, const long n)
+void THVector_(scale_DEFAULT)(real *y, const real c, const long n)
 {
   long i = 0;
 
@@ -65,7 +65,7 @@
     y[i] *= c;
 }
 
-static TH_INLINE void THVector_(mul)(real *y, const real *x, const long n)
+void THVector_(mul_DEFAULT)(real *y, const real *x, const long n)
 {
   long i = 0;
 
diff --git a/generic/THVectorDispatch.c b/generic/THVectorDispatch.c
new file mode 100644
index 0000000..f16bcda
--- /dev/null
+++ b/generic/THVectorDispatch.c
@@ -0,0 +1,140 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THVectorDispatch.c"
+#else
+
+/* For now there are only SIMD implementations for FLOAT and DOUBLE.
+ * Hopefully in the future this can be made totally generic (e.g, there are SIMD implementations
+ * for a lot of functions */
+/* Each function with multiple implementations has:
+ * 1. A DISPATCHPTR which will be initialized to point to the best available implementation for the host
+ * 2. A DISPATCHTABLE which holds pointers to each implementation of a function, and a value indicating
+ *    which SIMD extension a given implementation uses
+ * 3. A dispatch stub, which is what is actually called by clients, that simply wraps the dispatch pointer.
+ */
+
+static void (*THVector_(fill_DISPATCHPTR))(real *, const real, const long) = &THVector_(fill_DEFAULT);
+static FunctionDescription THVector_(fill_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(fill_NEON), SIMDExtension_NEON),
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(fill_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+  FUNCTION_IMPL(THVector_(fill_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(fill)(real *x, const real c, const long n) {
+  THVector_(fill_DISPATCHPTR)(x, c, n);
+}
+
+
+static void (*THVector_(add_DISPATCHPTR))(real *, const real *, const real, const long) = &THVector_(add_DEFAULT);
+static FunctionDescription THVector_(add_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(add_NEON), SIMDExtension_NEON),
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(add_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(add_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(add)(real *y, const real *x, const real c, const long n) {
+  THVector_(add_DISPATCHPTR)(y, x, c, n);
+}
+
+
+static void (*THVector_(diff_DISPATCHPTR))(real *, const real *, const real *, const long) = &THVector_(diff_DEFAULT);
+static FunctionDescription THVector_(diff_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(diff_NEON), SIMDExtension_NEON),
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(diff_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(diff_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(diff)(real *z, const real *x, const real *y, const long n) {
+  THVector_(diff_DISPATCHPTR)(z, x, y, n);
+}
+
+
+static void (*THVector_(scale_DISPATCHPTR))(real *, const real, const long) = &THVector_(scale_DEFAULT);
+static FunctionDescription THVector_(scale_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(scale_NEON), SIMDExtension_NEON),
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(scale_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(scale_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(scale)(real *y, const real c, const long n) {
+  THVector_(scale_DISPATCHPTR)(y, c, n);
+}
+
+
+static void (*THVector_(mul_DISPATCHPTR))(real *, const real *, const long) = &THVector_(mul_DEFAULT);
+static FunctionDescription THVector_(mul_DISPATCHTABLE)[] = {
+  #if defined(__NEON__)
+    #if defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(mul_NEON), SIMDExtension_NEON),
+    #endif
+  #endif
+
+  #if defined(USE_SSE2) || defined(USE_SSE3) || defined(USE_SSSE3) \
+          || defined(USE_SSE4_1) || defined(USE_SSE4_2)
+    #if defined(TH_REAL_IS_DOUBLE) || defined(TH_REAL_IS_FLOAT)
+      FUNCTION_IMPL(THVector_(mul_SSE), SIMDExtension_SSE),
+    #endif
+  #endif
+
+  FUNCTION_IMPL(THVector_(mul_DEFAULT), SIMDExtension_DEFAULT)
+};
+void THVector_(mul)(real *y, const real *x, const long n) {
+  THVector_(mul_DISPATCHPTR)(y, x, n);
+}
+
+/* This needs to be called in order to initialize the dispatch pointers at runtime.
+ * This function simply checks what SIMD extensions are available, and then walks the dispatch table
+ * to choose the best function.
+ * NOTE: As implemented, it will initialize the dispatch pointer to the first supported function.
+ *       This means that in the dispatch tables, implementations supporting more recent extensions
+ *       need to come first
+ */
+void THVector_(vectorDispatchInit)()
+{
+  uint32_t hostSimdExts = detectHostSIMDExtensions();
+  INIT_DISPATCH_PTR(fill);
+  INIT_DISPATCH_PTR(add);
+  INIT_DISPATCH_PTR(diff);
+  INIT_DISPATCH_PTR(scale);
+  INIT_DISPATCH_PTR(mul);
+}
+
+#endif
diff --git a/generic/simd/simd.h b/generic/simd/simd.h
new file mode 100644
index 0000000..e4660b1
--- /dev/null
+++ b/generic/simd/simd.h
@@ -0,0 +1,91 @@
+#ifndef TH_SIMD_INC
+#define TH_SIMD_INC
+
+#include <stdint.h>
+
+// Can be found on Intel ISA Reference for CPUID
+#define CPUID_AVX2_BIT 0x20       // Bit 5 of EBX for EAX=0x7
+#define CPUID_AVX_BIT  0x10000000 // Bit 28 of ECX for EAX=0x1
+#define CPUID_SSE_BIT  0x2000000  // bit 25 of EDX for EAX=0x1
+
+// Helper macros for initialization
+#define FUNCTION_IMPL(NAME, EXT) \
+    { .function=(void *)NAME,    \
+      .supportedSimdExt=EXT      \
+    }
+
+#define INIT_DISPATCH_PTR(OP)    \
+  do {                           \
+    uint32_t i;                  \
+    for (i = 0; i < sizeof(THVector_(OP ## _DISPATCHTABLE)) / sizeof(FunctionDescription); ++i) { \
+      THVector_(OP ## _DISPATCHPTR) = THVector_(OP ## _DISPATCHTABLE)[i].function;                     \
+      if (THVector_(OP ## _DISPATCHTABLE)[i].supportedSimdExt & hostSimdExts) {                       \
+        break;                                                                                     \
+      }                                                                                            \
+    }                                                                                              \
+  } while(0)
+
+
+typedef struct FunctionDescription
+{
+  void *function;
+  uint32_t supportedSimdExt;
+} FunctionDescription;
+
+
+enum SIMDExtensions
+{
+#if defined(__NEON__)
+  SIMDExtension_NEON    = 0x1,
+#else
+  SIMDExtension_AVX2    = 0x1,
+  SIMDExtension_AVX     = 0x2,
+  SIMDExtension_SSE     = 0x4,
+#endif
+  SIMDExtension_DEFAULT = 0x0
+};
+
+#if defined(__NEON__)
+
+static inline uint32_t detectHostSIMDExtensions()
+{
+  return SIMDExtension_NEON;
+}
+
+#else // x86
+
+static inline void cpuid(uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+  uint32_t a = *eax, b, c = *ecx, d;
+  asm volatile ( "cpuid\n\t"
+                 : "+a"(a), "=b"(b), "+c"(c), "=d"(d) );
+  *eax = a;
+  *ebx = b;
+  *ecx = c;
+  *edx = d;
+}
+
+static inline uint32_t detectHostSIMDExtensions()
+{
+  uint32_t eax, ebx, ecx, edx;
+  uint32_t hostSimdExts = 0x0;
+
+  // Check for AVX2. Requires separate CPUID (leaf 7, subleaf 0 in ECX)
+  eax = 0x7; ecx = 0x0;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if (ebx & CPUID_AVX2_BIT)
+    hostSimdExts |= SIMDExtension_AVX2;
+
+  eax = 0x1;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if (ecx & CPUID_AVX_BIT)
+    hostSimdExts |= SIMDExtension_AVX;
+  if (edx & CPUID_SSE_BIT)
+    hostSimdExts |= SIMDExtension_SSE;
+
+  return hostSimdExts;
+}
+
+#endif // end x86 SIMD extension detection code
+
+#endif
diff --git a/vector/NEON.c b/vector/NEON.c
new file mode 100644
index 0000000..9d65550
--- /dev/null
+++ b/vector/NEON.c
@@ -0,0 +1,252 @@
+static void THFloatVector_fill_NEON(float *x, const float c, const long n) {
+  float ctemp = c;
+  float * caddr = &ctemp;
+  __asm__ __volatile__ (
+      "mov         r0, %0           @ \n\t"
+      "ldr         r4, [%1]         @ \n\t"
+      "vdup.32     q12, r4          @ \n\t"
+      "vdup.32     q13, r4          @ \n\t"
+      "lsrs        r4, %2, #3       @ \n\t"
+      "beq         3f               @ \n\t"
+      "1:                           @ \n\t"
+      "vst1.32     {d24-d27}, [r0]! @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "bne         1b               @ \n\t"
+      "3:                           @ \n\t"
+      "ands        r4, %2, #7       @ \n\t"
+      "beq         5f               @ \n\t"
+      "4:                           @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "vst1.32     {d24[0]}, [r0]!  @ \n\t"
+      "bne         4b               @ \n\t"
+      "5:                           @ "
+      :
+      :"r" (x), "r"(caddr),"r"(n)
+      : "cc", "r0", "r4",  "memory",
+        "q12",
+        "d24", "d25", "d26", "d27"
+      );
+}
+
+
+static void THFloatVector_diff_NEON(float *z, const float *x, const float *y, const long n) {
+  __asm__ __volatile__ (
+      "mov         r0, %2           @ \n\t"
+      "mov         r1, %1           @ \n\t"
+      "mov         r2, %0           @ \n\t"
+      "lsrs        r4, %3, #3       @ \n\t"
+      "beq         3f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "1:                           @ \n\t"
+      "vsub.f32    q12, q8, q0      @ \n\t"
+      "vsub.f32    q13, q9, q1      @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "beq         2f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vst1.32     {d24-d27}, [r2]! @ \n\t"
+      "b           1b               @ \n\t"
+      "2:                           @ \n\t"
+      "vst1.32     {d24-d27}, [r2]! @ \n\t"
+      "3:                           @ \n\t"
+      "ands        r4, %3, #7       @ \n\t"
+      "beq         5f               @ \n\t"
+      "4:                           @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "vld1.32     {d16[0]}, [r1]!  @ \n\t"
+      "vld1.32     {d0[0]}, [r0]!   @ \n\t"
+      "vsub.f32    d24, d16, d0     @ \n\t"
+      "vst1.32     {d24[0]}, [r2]!  @ \n\t"
+      "bne         4b               @ \n\t"
+      "5:                           @ "
+      :
+      :"r" (z), "r" (x),"r" (y), "r"(n)
+      : "cc", "r0", "r1", "r2", "r4", "memory",
+        "q0", "q1", "q8", "q9", "q12", "q13",
+        "d0", "d1", "d2", "d3",
+        "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
+      );
+}
+
+
+static void THFloatVector_scale_NEON(float *y, const float c, const long n) {
+  float ctemp = c;
+  float * caddr = &ctemp;
+  __asm__ __volatile__ (
+      "mov         r0, %0           @ \n\t"
+      "mov         r2, r0           @ \n\t"
+      "ldr         r5, [%1]         @ \n\t"
+      "vdup.32     q14, r5          @ \n\t"
+      "lsrs        r5, %2, #5       @ \n\t"
+      "beq         3f               @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vld1.32     {d4-d7}, [r0]!   @ \n\t"
+      "vld1.32     {d8-d11}, [r0]!  @ \n\t"
+      "vld1.32     {d12-d15}, [r0]! @ \n\t"
+      "1:                           @ \n\t"
+      "vmul.f32    q0, q0, q14      @ \n\t"
+      "vmul.f32    q1, q1, q14      @ \n\t"
+      "vmul.f32    q2, q2, q14      @ \n\t"
+      "vmul.f32    q3, q3, q14      @ \n\t"
+      "vmul.f32    q4, q4, q14      @ \n\t"
+      "vmul.f32    q5, q5, q14      @ \n\t"
+      "vmul.f32    q6, q6, q14      @ \n\t"
+      "vmul.f32    q7, q7, q14      @ \n\t"
+      "subs        r5, r5, #1       @ \n\t"
+      "beq         2f               @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vst1.32     {d4-d7}, [r2]!   @ \n\t"
+      "vld1.32     {d4-d7}, [r0]!   @ \n\t"
+      "vst1.32     {d8-d11}, [r2]!  @ \n\t"
+      "vld1.32     {d8-d11}, [r0]!  @ \n\t"
+      "vst1.32     {d12-d15}, [r2]! @ \n\t"
+      "vld1.32     {d12-d15}, [r0]! @ \n\t"
+      "b           1b               @ \n\t"
+      "2:                           @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "vst1.32     {d4-d7}, [r2]!   @ \n\t"
+      "vst1.32     {d8-d11}, [r2]!  @ \n\t"
+      "vst1.32     {d12-d15}, [r2]! @ \n\t"
+      "3:                           @ \n\t"
+      "lsrs        r5, %2, #4       @ \n\t"
+      "ands        r5, r5, #1       @ \n\t"
+      "beq         4f               @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vld1.32     {d4-d7}, [r0]!   @ \n\t"
+      "vmul.f32    q0, q0, q14      @ \n\t"
+      "vmul.f32    q1, q1, q14      @ \n\t"
+      "vmul.f32    q2, q2, q14      @ \n\t"
+      "vmul.f32    q3, q3, q14      @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "vst1.32     {d4-d7}, [r2]!   @ \n\t"
+      "4:                           @ \n\t"
+      "lsrs        r5, %2, #3       @ \n\t"
+      "ands        r5, r5, #1       @ \n\t"
+      "beq         5f               @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vmul.f32    q0, q0, q14      @ \n\t"
+      "vmul.f32    q1, q1, q14      @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "5:                           @ \n\t"
+      "ands        r5, %2, #7       @ \n\t"
+      "beq         7f               @ \n\t"
+      "6:                           @ \n\t"
+      "subs        r5, r5, #1       @ \n\t"
+      "vld1.32     d0[0], [r0]!     @ \n\t"
+      "vmul.f32    d0, d0, d28      @ \n\t"
+      "vst1.32     d0[0], [r2]!     @ \n\t"
+      "bne         6b               @ \n\t"
+      "7:                           @ "
+      :
+      :"r" (y), "r"(caddr),"r"(n)
+      : "cc", "r0", "r2", "r5", "memory",
+        "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14",
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+        "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+        "d28", "d29"
+      );
+
+}
+
+static void THFloatVector_mul_NEON(float *y, const float *x, const long n) {
+  __asm__ __volatile__ (
+      "mov         r0, %0           @ \n\t"
+      "mov         r1, %1           @ \n\t"
+      "mov         r2, r0           @ \n\t"
+      "lsrs        r4, %2, #3       @ \n\t"
+      "beq         3f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "1:                           @ \n\t"
+      "vmul.f32    q12, q8, q0      @ \n\t"
+      "vmul.f32    q13, q9, q1      @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "beq         2f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vst1.32     {d24-d27}, [r2]! @ \n\t"
+      "b           1b               @ \n\t"
+      "2:                           @ \n\t"
+      "vst1.32     {d24-d27}, [r2]! @ \n\t"
+      "3:                           @ \n\t"
+      "ands        r4, %2, #7       @ \n\t"
+      "beq         5f               @ \n\t"
+      "4:                           @ \n\t"
+      "subs        r4, r4, #1       @ \n\t"
+      "vld1.32     {d16[0]}, [r1]!  @ \n\t"
+      "vld1.32     {d0[0]}, [r0]!   @ \n\t"
+      "vmul.f32    q12, q8, q0      @ \n\t"
+      "vst1.32     {d24[0]}, [r2]!  @ \n\t"
+      "bne         4b               @ \n\t"
+      "5:                           @ "
+      :
+      :"r" (y),"r" (x),"r"(n)
+      : "cc", "r0", "r1", "r2", "r4", "memory",
+        "q0", "q1", "q8", "q9", "q12", "q13",
+        "d0", "d1", "d2", "d3",
+        "d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27"
+      );
+}
+
+static void THFloatVector_add_NEON(float *y, const float *x, const float c, const long n) {
+  float ctemp = c;
+  float * caddr = &ctemp;
+  __asm__ __volatile__ (
+      "mov         r0, %0           @ \n\t"
+      "mov         r1, %1           @ \n\t"
+      "mov         r2, r0           @ \n\t"
+      "ldr         r5, [%2]         @ \n\t"
+      "vdup.32     q14, r5          @ \n\t"
+      "lsrs        r5, %3, #4       @ \n\t"
+      "beq         3f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vld1.32     {d20-d23}, [r1]! @ \n\t"
+      "vld1.32     {d4-d7}, [r0]!   @ \n\t"
+      "1:                           @ \n\t"
+      "vmla.f32    q0, q8, q14      @ \n\t"
+      "vmla.f32    q1, q9, q14      @ \n\t"
+      "vmla.f32    q2, q10, q14     @ \n\t"
+      "vmla.f32    q3, q11, q14     @ \n\t"
+      "subs        r5, r5, #1       @ \n\t"
+      "beq         2f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d20-d23}, [r1]! @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vst1.32     {d4-d7}, [r2]!   @ \n\t"
+      "vld1.32     {d4-d7}, [r0]!   @ \n\t"
+      "b           1b               @ \n\t"
+      "2:                           @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "vst1.32     {d4-d7}, [r2]!   @ \n\t"
+      "3:                           @ \n\t"
+      "lsrs        r5, %3, #3       @ \n\t"
+      "ands        r5, #1           @ \n\t"
+      "beq         4f               @ \n\t"
+      "vld1.32     {d16-d19}, [r1]! @ \n\t"
+      "vld1.32     {d0-d3}, [r0]!   @ \n\t"
+      "vmla.f32    q0, q8, q14      @ \n\t"
+      "vmla.f32    q1, q9, q14      @ \n\t"
+      "vst1.32     {d0-d3}, [r2]!   @ \n\t"
+      "4:                           @ \n\t"
+      "ands        r5, %3, #7       @ \n\t"
+      "beq         6f               @ \n\t"
+      "5:                           @ \n\t"
+      "subs        r5, r5, #1       @ \n\t"
+      "vld1.32     {d16[0]}, [r1]!  @ \n\t"
+      "vld1.32     {d0[0]}, [r0]!   @ \n\t"
+      "vmla.f32    d0, d16, d28     @ \n\t"
+      "vst1.32     d0[0], [r2]!     @ \n\t"
+      "bne         5b               @ \n\t"
+      "6:                           @ "
+      :
+      :"r" (y),"r" (x), "r"(caddr),"r"(n)
+      : "cc", "r0", "r1", "r2", "r5", "memory",
+        "q0", "q1", "q2", "q3", "q14",
+        "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+        "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29"
+      );
+}
diff --git a/vector/SSE.c b/vector/SSE.c
new file mode 100644
index 0000000..f909907
--- /dev/null
+++ b/vector/SSE.c
@@ -0,0 +1,213 @@
+#include <x86intrin.h>
+
+
+static void THDoubleVector_fill_SSE(double *x, const double c, const long n) {
+  long i;
+  long off;
+  __m128d XMM0 = _mm_set1_pd(c);
+  for (i=0; i<=((n)-8); i+=8) {
+    _mm_storeu_pd((x)+i  , XMM0);
+    _mm_storeu_pd((x)+i+2, XMM0);
+    _mm_storeu_pd((x)+i+4, XMM0);
+    _mm_storeu_pd((x)+i+6, XMM0);
+  }
+  off = (n) - ((n)%8);
+  for (i=0; i<((n)%8); i++) {
+    x[off+i] = c;
+  }
+}
+
+
+static void THDoubleVector_add_SSE(double *y, const double *x, const double c, const long n) {
+  long i = 0;
+  __m128d XMM7 = _mm_set1_pd(c);
+  __m128d XMM0,XMM2;
+  for (; i<=((n)-2); i+=2) {
+    XMM0 = _mm_loadu_pd((x)+i);
+    XMM2 = _mm_loadu_pd((y)+i);
+    XMM0 = _mm_mul_pd(XMM0, XMM7);
+    XMM2 = _mm_add_pd(XMM2, XMM0);
+    _mm_storeu_pd((y)+i  , XMM2);
+  }
+  for (; i<(n); i++) {
+    y[i] += c * x[i];
+  }
+}
+
+
+static void THDoubleVector_diff_SSE(double *z, const double *x, const double *y, const long n) {
+  long i;
+  for (i=0; i<=((n)-8); i+=8) {
+    __m128d XMM0 = _mm_loadu_pd((x)+i  );
+    __m128d XMM1 = _mm_loadu_pd((x)+i+2);
+    __m128d XMM2 = _mm_loadu_pd((x)+i+4);
+    __m128d XMM3 = _mm_loadu_pd((x)+i+6);
+    __m128d XMM4 = _mm_loadu_pd((y)+i  );
+    __m128d XMM5 = _mm_loadu_pd((y)+i+2);
+    __m128d XMM6 = _mm_loadu_pd((y)+i+4);
+    __m128d XMM7 = _mm_loadu_pd((y)+i+6);
+    XMM0 = _mm_sub_pd(XMM0, XMM4);
+    XMM1 = _mm_sub_pd(XMM1, XMM5);
+    XMM2 = _mm_sub_pd(XMM2, XMM6);
+    XMM3 = _mm_sub_pd(XMM3, XMM7);
+    _mm_storeu_pd((z)+i  , XMM0);
+    _mm_storeu_pd((z)+i+2, XMM1);
+    _mm_storeu_pd((z)+i+4, XMM2);
+    _mm_storeu_pd((z)+i+6, XMM3);
+  }
+  long off = (n) - ((n)%8);
+  for (i=0; i<((n)%8); i++) {
+    z[off+i] = x[off+i] - y[off+i];
+  }
+}
+
+
+static void THDoubleVector_scale_SSE(double *y, const double c, const long n) {
+  long i;
+  __m128d XMM7 = _mm_set1_pd(c);
+  for (i=0; i<=((n)-4); i+=4) {
+    __m128d XMM0 = _mm_loadu_pd((y)+i  );
+    __m128d XMM1 = _mm_loadu_pd((y)+i+2);
+    XMM0 = _mm_mul_pd(XMM0, XMM7);
+    XMM1 = _mm_mul_pd(XMM1, XMM7);
+    _mm_storeu_pd((y)+i  , XMM0);
+    _mm_storeu_pd((y)+i+2, XMM1);
+  }
+  long off = (n) - ((n)%4);
+  for (i=0; i<((n)%4); i++) {
+    y[off+i] *= c;
+  }
+}
+
+
+static void THDoubleVector_mul_SSE(double *y, const double *x, const long n) {
+  long i;
+  for (i=0; i<=((n)-8); i+=8) {
+    __m128d XMM0 = _mm_loadu_pd((x)+i  );
+    __m128d XMM1 = _mm_loadu_pd((x)+i+2);
+    __m128d XMM2 = _mm_loadu_pd((x)+i+4);
+    __m128d XMM3 = _mm_loadu_pd((x)+i+6);
+    __m128d XMM4 = _mm_loadu_pd((y)+i  );
+    __m128d XMM5 = _mm_loadu_pd((y)+i+2);
+    __m128d XMM6 = _mm_loadu_pd((y)+i+4);
+    __m128d XMM7 = _mm_loadu_pd((y)+i+6);
+    XMM4 = _mm_mul_pd(XMM4, XMM0);
+    XMM5 = _mm_mul_pd(XMM5, XMM1);
+    XMM6 = _mm_mul_pd(XMM6, XMM2);
+    XMM7 = _mm_mul_pd(XMM7, XMM3);
+    _mm_storeu_pd((y)+i  , XMM4);
+    _mm_storeu_pd((y)+i+2, XMM5);
+    _mm_storeu_pd((y)+i+4, XMM6);
+    _mm_storeu_pd((y)+i+6, XMM7);
+  }
+  long off = (n) - ((n)%8);
+  for (i=0; i<((n)%8); i++) {
+    y[off+i] *= x[off+i];
+  }
+}
+
+
+static void THFloatVector_fill_SSE(float *x, const float c, const long n) {
+  long i;
+  __m128 XMM0 = _mm_set_ps1(c);
+  long off;
+  for (i=0; i<=((n)-16); i+=16) {
+    _mm_storeu_ps((x)+i  ,  XMM0);
+    _mm_storeu_ps((x)+i+4,  XMM0);
+    _mm_storeu_ps((x)+i+8,  XMM0);
+    _mm_storeu_ps((x)+i+12, XMM0);
+  }
+  off = (n) - ((n)%16);
+  for (i=0; i<((n)%16); i++) {
+    x[off+i] = c;
+  }
+}
+
+
+static void THFloatVector_add_SSE(float *y, const float *x, const float c, const long n) {
+  long i = 0;
+  __m128 XMM7 = _mm_set_ps1(c);
+  __m128 XMM0,XMM2;
+  for (; i<=((n)-4); i+=4) {
+    XMM0 = _mm_loadu_ps((x)+i);
+    XMM2 = _mm_loadu_ps((y)+i);
+    XMM0 = _mm_mul_ps(XMM0, XMM7);
+    XMM2 = _mm_add_ps(XMM2, XMM0);
+    _mm_storeu_ps((y)+i  , XMM2);
+  }
+  for (; i<(n); i++) {
+    y[i] += c * x[i];
+  }
+}
+
+
+static void THFloatVector_diff_SSE(float *z, const float *x, const float *y, const long n) {
+  long i;
+  for (i=0; i<=((n)-16); i+=16) {
+    __m128 XMM0 = _mm_loadu_ps((x)+i   );
+    __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);
+    __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);
+    __m128 XMM3 = _mm_loadu_ps((x)+i+12);
+    __m128 XMM4 = _mm_loadu_ps((y)+i   );
+    __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);
+    __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);
+    __m128 XMM7 = _mm_loadu_ps((y)+i+12);
+    XMM0 = _mm_sub_ps(XMM0, XMM4);
+    XMM1 = _mm_sub_ps(XMM1, XMM5);
+    XMM2 = _mm_sub_ps(XMM2, XMM6);
+    XMM3 = _mm_sub_ps(XMM3, XMM7);
+    _mm_storeu_ps((z)+i   , XMM0);
+    _mm_storeu_ps((z)+i+ 4, XMM1);
+    _mm_storeu_ps((z)+i+ 8, XMM2);
+    _mm_storeu_ps((z)+i+12, XMM3);
+  }
+  long off = (n) - ((n)%16);
+  for (i=0; i<((n)%16); i++) {
+    z[off+i] = x[off+i] - y[off+i];
+  }
+}
+
+
+static void THFloatVector_scale_SSE(float *y, const float c, const long n) {
+  long i;
+  __m128 XMM7 = _mm_set_ps1(c);
+  for (i=0; i<=((n)-8); i+=8) {
+    __m128 XMM0 = _mm_loadu_ps((y)+i  );
+    __m128 XMM1 = _mm_loadu_ps((y)+i+4);
+    XMM0 = _mm_mul_ps(XMM0, XMM7);
+    XMM1 = _mm_mul_ps(XMM1, XMM7);
+    _mm_storeu_ps((y)+i  , XMM0);
+    _mm_storeu_ps((y)+i+4, XMM1);
+  }
+  long off = (n) - ((n)%8);
+  for (i=0; i<((n)%8); i++) {
+    y[off+i] *= c;
+  }
+}
+
+
+static void THFloatVector_mul_SSE(float *y, const float *x, const long n) {
+  long i;
+  for (i=0; i<=((n)-16); i+=16) {
+    __m128 XMM0 = _mm_loadu_ps((x)+i   );
+    __m128 XMM1 = _mm_loadu_ps((x)+i+ 4);
+    __m128 XMM2 = _mm_loadu_ps((x)+i+ 8);
+    __m128 XMM3 = _mm_loadu_ps((x)+i+12);
+    __m128 XMM4 = _mm_loadu_ps((y)+i   );
+    __m128 XMM5 = _mm_loadu_ps((y)+i+ 4);
+    __m128 XMM6 = _mm_loadu_ps((y)+i+ 8);
+    __m128 XMM7 = _mm_loadu_ps((y)+i+12);
+    XMM4 = _mm_mul_ps(XMM4, XMM0);
+    XMM5 = _mm_mul_ps(XMM5, XMM1);
+    XMM6 = _mm_mul_ps(XMM6, XMM2);
+    XMM7 = _mm_mul_ps(XMM7, XMM3);
+    _mm_storeu_ps((y)+i   , XMM4);
+    _mm_storeu_ps((y)+i+ 4, XMM5);
+    _mm_storeu_ps((y)+i+ 8, XMM6);
+    _mm_storeu_ps((y)+i+12, XMM7);
+  }
+  long off = (n) - ((n)%16);
+  for (i=0; i<((n)%16); i++) {
+    y[off+i] *= x[off+i];
+  }
+}