#ifndef TH_VECTOR_INC
#define TH_VECTOR_INC
#include "THGeneral.h"
#define THVector_(NAME) TH_CONCAT_4(TH,Real,Vector_,NAME)
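/* Vectorized kernels for the element-wise vector operations (fill, add, diff,
 * scale, mul).  THVector_(NAME) expands to the per-type name, e.g.
 * THDoubleVector_add.  When an SSE level is enabled, the float/double kernels
 * below use SSE intrinsics; on ARM with __NEON__, the float kernels use inline
 * assembly; otherwise the generic C implementations are generated. */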
#if defined USE_SSE2 || defined USE_SSE3 || defined USE_SSSE3 \
|| defined USE_SSE4_1 || defined USE_SSE4_2
#ifdef USE_SSE2
#include <emmintrin.h>
#endif
#ifdef USE_SSE3
#include <pmmintrin.h>
#endif
#ifdef USE_SSSE3
#include <tmmintrin.h>
#endif
#if defined (USE_SSE4_2) || defined (USE_SSE4_1)
#include <smmintrin.h>
#endif
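/* SSE2 double-precision kernels.  Each macro runs a SIMD main loop over
 * __m128d registers (two doubles each) with unaligned loads/stores, then a
 * scalar loop over the remaining elements. */
/* THDoubleVector_fill: x[i] = c, eight doubles (four stores) per iteration. */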
#define THDoubleVector_fill(x, c, n) { \
long i; \
long off; \
__m128d XMM0 = _mm_set1_pd(c); \
for (i=0; i<=((n)-8); i+=8) { \
_mm_storeu_pd((x)+i , XMM0); \
_mm_storeu_pd((x)+i+2, XMM0); \
_mm_storeu_pd((x)+i+4, XMM0); \
_mm_storeu_pd((x)+i+6, XMM0); \
} \
off = (n) - ((n)%8); \
for (i=0; i<((n)%8); i++) { \
x[off+i] = c; \
} \
}
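/* THDoubleVector_add: y[i] += c * x[i] (axpy), two doubles per iteration. */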
#define THDoubleVector_add(y, x, c, n) { \
long i = 0; \
__m128d XMM7 = _mm_set1_pd(c); \
__m128d XMM0,XMM2; \
for (; i<=((n)-2); i+=2) { \
XMM0 = _mm_loadu_pd((x)+i); \
XMM2 = _mm_loadu_pd((y)+i); \
XMM0 = _mm_mul_pd(XMM0, XMM7); \
XMM2 = _mm_add_pd(XMM2, XMM0); \
_mm_storeu_pd((y)+i , XMM2); \
} \
for (; i<(n); i++) { \
y[i] += c * x[i]; \
} \
}
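/* THDoubleVector_diff: z[i] = x[i] - y[i], eight doubles per iteration. */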
#define THDoubleVector_diff(z, x, y, n) { \
long i; \
for (i=0; i<=((n)-8); i+=8) { \
__m128d XMM0 = _mm_loadu_pd((x)+i ); \
__m128d XMM1 = _mm_loadu_pd((x)+i+2); \
__m128d XMM2 = _mm_loadu_pd((x)+i+4); \
__m128d XMM3 = _mm_loadu_pd((x)+i+6); \
__m128d XMM4 = _mm_loadu_pd((y)+i ); \
__m128d XMM5 = _mm_loadu_pd((y)+i+2); \
__m128d XMM6 = _mm_loadu_pd((y)+i+4); \
__m128d XMM7 = _mm_loadu_pd((y)+i+6); \
XMM0 = _mm_sub_pd(XMM0, XMM4); \
XMM1 = _mm_sub_pd(XMM1, XMM5); \
XMM2 = _mm_sub_pd(XMM2, XMM6); \
XMM3 = _mm_sub_pd(XMM3, XMM7); \
_mm_storeu_pd((z)+i , XMM0); \
_mm_storeu_pd((z)+i+2, XMM1); \
_mm_storeu_pd((z)+i+4, XMM2); \
_mm_storeu_pd((z)+i+6, XMM3); \
} \
long off = (n) - ((n)%8); \
for (i=0; i<((n)%8); i++) { \
z[off+i] = x[off+i] - y[off+i]; \
} \
}
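/* THDoubleVector_scale: y[i] *= c, four doubles per iteration. */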
#define THDoubleVector_scale(y, c, n) { \
long i; \
__m128d XMM7 = _mm_set1_pd(c); \
for (i=0; i<=((n)-4); i+=4) { \
__m128d XMM0 = _mm_loadu_pd((y)+i ); \
__m128d XMM1 = _mm_loadu_pd((y)+i+2); \
XMM0 = _mm_mul_pd(XMM0, XMM7); \
XMM1 = _mm_mul_pd(XMM1, XMM7); \
_mm_storeu_pd((y)+i , XMM0); \
_mm_storeu_pd((y)+i+2, XMM1); \
} \
long off = (n) - ((n)%4); \
for (i=0; i<((n)%4); i++) { \
y[off+i] *= c; \
} \
}
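/* THDoubleVector_mul: y[i] *= x[i], eight doubles per iteration. */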
#define THDoubleVector_mul(y, x, n) { \
long i; \
for (i=0; i<=((n)-8); i+=8) { \
__m128d XMM0 = _mm_loadu_pd((x)+i ); \
__m128d XMM1 = _mm_loadu_pd((x)+i+2); \
__m128d XMM2 = _mm_loadu_pd((x)+i+4); \
__m128d XMM3 = _mm_loadu_pd((x)+i+6); \
__m128d XMM4 = _mm_loadu_pd((y)+i ); \
__m128d XMM5 = _mm_loadu_pd((y)+i+2); \
__m128d XMM6 = _mm_loadu_pd((y)+i+4); \
__m128d XMM7 = _mm_loadu_pd((y)+i+6); \
XMM4 = _mm_mul_pd(XMM4, XMM0); \
XMM5 = _mm_mul_pd(XMM5, XMM1); \
XMM6 = _mm_mul_pd(XMM6, XMM2); \
XMM7 = _mm_mul_pd(XMM7, XMM3); \
_mm_storeu_pd((y)+i , XMM4); \
_mm_storeu_pd((y)+i+2, XMM5); \
_mm_storeu_pd((y)+i+4, XMM6); \
_mm_storeu_pd((y)+i+6, XMM7); \
} \
long off = (n) - ((n)%8); \
for (i=0; i<((n)%8); i++) { \
y[off+i] *= x[off+i]; \
} \
}
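/* Single-precision variants of the same kernels: four floats per __m128
 * register, so the unroll widths double (fill/diff/mul: 16, scale: 8,
 * add: 4 elements per iteration). */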
#define THFloatVector_fill(x, c, n) { \
long i; \
__m128 XMM0 = _mm_set_ps1(c); \
long off; \
for (i=0; i<=((n)-16); i+=16) { \
_mm_storeu_ps((x)+i , XMM0); \
_mm_storeu_ps((x)+i+4, XMM0); \
_mm_storeu_ps((x)+i+8, XMM0); \
_mm_storeu_ps((x)+i+12, XMM0); \
} \
off = (n) - ((n)%16); \
for (i=0; i<((n)%16); i++) { \
x[off+i] = c; \
} \
}
#define THFloatVector_add(y, x, c, n) { \
long i = 0; \
__m128 XMM7 = _mm_set_ps1(c); \
__m128 XMM0,XMM2; \
for (; i<=((n)-4); i+=4) { \
XMM0 = _mm_loadu_ps((x)+i); \
XMM2 = _mm_loadu_ps((y)+i); \
XMM0 = _mm_mul_ps(XMM0, XMM7); \
XMM2 = _mm_add_ps(XMM2, XMM0); \
_mm_storeu_ps((y)+i , XMM2); \
} \
for (; i<(n); i++) { \
y[i] += c * x[i]; \
} \
}
#define THFloatVector_diff(z, x, y, n) { \
long i; \
for (i=0; i<=((n)-16); i+=16) { \
__m128 XMM0 = _mm_loadu_ps((x)+i ); \
__m128 XMM1 = _mm_loadu_ps((x)+i+ 4); \
__m128 XMM2 = _mm_loadu_ps((x)+i+ 8); \
__m128 XMM3 = _mm_loadu_ps((x)+i+12); \
__m128 XMM4 = _mm_loadu_ps((y)+i ); \
__m128 XMM5 = _mm_loadu_ps((y)+i+ 4); \
__m128 XMM6 = _mm_loadu_ps((y)+i+ 8); \
__m128 XMM7 = _mm_loadu_ps((y)+i+12); \
XMM0 = _mm_sub_ps(XMM0, XMM4); \
XMM1 = _mm_sub_ps(XMM1, XMM5); \
XMM2 = _mm_sub_ps(XMM2, XMM6); \
XMM3 = _mm_sub_ps(XMM3, XMM7); \
_mm_storeu_ps((z)+i , XMM0); \
_mm_storeu_ps((z)+i+ 4, XMM1); \
_mm_storeu_ps((z)+i+ 8, XMM2); \
_mm_storeu_ps((z)+i+12, XMM3); \
} \
long off = (n) - ((n)%16); \
for (i=0; i<((n)%16); i++) { \
z[off+i] = x[off+i] - y[off+i]; \
} \
}
#define THFloatVector_scale(y, c, n) { \
long i; \
__m128 XMM7 = _mm_set_ps1(c); \
for (i=0; i<=((n)-8); i+=8) { \
__m128 XMM0 = _mm_loadu_ps((y)+i ); \
__m128 XMM1 = _mm_loadu_ps((y)+i+4); \
XMM0 = _mm_mul_ps(XMM0, XMM7); \
XMM1 = _mm_mul_ps(XMM1, XMM7); \
_mm_storeu_ps((y)+i , XMM0); \
_mm_storeu_ps((y)+i+4, XMM1); \
} \
long off = (n) - ((n)%8); \
for (i=0; i<((n)%8); i++) { \
y[off+i] *= c; \
} \
}
#define THFloatVector_mul(y, x, n) { \
long i; \
for (i=0; i<=((n)-16); i+=16) { \
__m128 XMM0 = _mm_loadu_ps((x)+i ); \
__m128 XMM1 = _mm_loadu_ps((x)+i+ 4); \
__m128 XMM2 = _mm_loadu_ps((x)+i+ 8); \
__m128 XMM3 = _mm_loadu_ps((x)+i+12); \
__m128 XMM4 = _mm_loadu_ps((y)+i ); \
__m128 XMM5 = _mm_loadu_ps((y)+i+ 4); \
__m128 XMM6 = _mm_loadu_ps((y)+i+ 8); \
__m128 XMM7 = _mm_loadu_ps((y)+i+12); \
XMM4 = _mm_mul_ps(XMM4, XMM0); \
XMM5 = _mm_mul_ps(XMM5, XMM1); \
XMM6 = _mm_mul_ps(XMM6, XMM2); \
XMM7 = _mm_mul_ps(XMM7, XMM3); \
_mm_storeu_ps((y)+i , XMM4); \
_mm_storeu_ps((y)+i+ 4, XMM5); \
_mm_storeu_ps((y)+i+ 8, XMM6); \
_mm_storeu_ps((y)+i+12, XMM7); \
} \
long off = (n) - ((n)%16); \
for (i=0; i<((n)%16); i++) { \
y[off+i] *= x[off+i]; \
} \
}
#elif defined __NEON__
/* ARM NEON inline-assembly routines for single-precision (float) operations */
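/* THFloatVector_fill: x[i] = c.  The constant is broadcast into q12/q13 with
 * vdup, the main loop stores eight floats per iteration, and a one-element
 * tail loop handles n % 8. */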
#define THFloatVector_fill(x, c, n) { \
float ctemp = c; \
float * caddr = &ctemp; \
__asm__ __volatile__ ( \
"mov r0, %0 @ \n\t" \
"ldr r4, [%1] @ \n\t" \
"vdup.32 q12, r4 @ \n\t" \
"vdup.32 q13, r4 @ \n\t" \
"lsrs r4, %2, #3 @ \n\t" \
"beq 3f @ \n\t" \
"1: @ \n\t" \
"vst1.32 {d24-d27}, [r0]! @ \n\t" \
"subs r4, r4, #1 @ \n\t" \
"bne 1b @ \n\t" \
"3: @ \n\t" \
"ands r4, %2, #7 @ \n\t" \
"beq 5f @ \n\t" \
"4: @ \n\t" \
"subs r4, r4, #1 @ \n\t" \
"vst1.32 {d24[0]}, [r0]! @ \n\t" \
"bne 4b @ \n\t" \
"5: @ " \
: \
:"r" (x), "r"(caddr),"r"(n) \
: "cc", "r0", "r4", "memory", \
"q12", \
"d24", "d25", "d26", "d27" \
); \
}
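/* THFloatVector_diff: z[i] = x[i] - y[i].  Eight floats per iteration, with
 * the loads for the next iteration issued before the current stores
 * (software pipelining); a one-element tail loop handles n % 8. */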
#define THFloatVector_diff(z, x, y, n) { \
__asm__ __volatile__ ( \
"mov r0, %2 @ \n\t" \
"mov r1, %1 @ \n\t" \
"mov r2, %0 @ \n\t" \
"lsrs r4, %3, #3 @ \n\t" \
"beq 3f @ \n\t" \
"vld1.32 {d16-d19}, [r1]! @ \n\t" \
"vld1.32 {d0-d3}, [r0]! @ \n\t" \
"1: @ \n\t" \
"vsub.f32 q12, q8, q0 @ \n\t" \
"vsub.f32 q13, q9, q1 @ \n\t" \
"subs r4, r4, #1 @ \n\t" \
"beq 2f @ \n\t" \
"vld1.32 {d16-d19}, [r1]! @ \n\t" \
"vld1.32 {d0-d3}, [r0]! @ \n\t" \
"vst1.32 {d24-d27}, [r2]! @ \n\t" \
"b 1b @ \n\t" \
"2: @ \n\t" \
"vst1.32 {d24-d27}, [r2]! @ \n\t" \
"3: @ \n\t" \
"ands r4, %3, #7 @ \n\t" \
"beq 5f @ \n\t" \
"4: @ \n\t" \
"subs r4, r4, #1 @ \n\t" \
"vld1.32 {d16[0]}, [r1]! @ \n\t" \
"vld1.32 {d0[0]}, [r0]! @ \n\t" \
"vsub.f32 d24, d16, d0 @ \n\t" \
"vst1.32 {d24[0]}, [r2]! @ \n\t" \
"bne 4b @ \n\t" \
"5: @ " \
: \
:"r" (z), "r" (x),"r" (y), "r"(n) \
: "cc", "r0", "r1", "r2", "r4", "memory", \
"q0", "q1", "q8", "q9", "q12", "q13", \
"d0", "d1", "d2", "d3", \
"d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27" \
); \
}
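/* THFloatVector_scale: y[i] *= c.  The pipelined main loop handles 32 floats
 * per iteration, followed by optional blocks of 16 and 8 floats, then a
 * one-element tail loop for n % 8. */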
#define THFloatVector_scale(y, c, n) { \
float ctemp = c; \
float * caddr = &ctemp; \
__asm__ __volatile__ ( \
"mov r0, %0 @ \n\t" \
"mov r2, r0 @ \n\t" \
"ldr r5, [%1] @ \n\t" \
"vdup.32 q14, r5 @ \n\t" \
"lsrs r5, %2, #5 @ \n\t" \
"beq 3f @ \n\t" \
"vld1.32 {d0-d3}, [r0]! @ \n\t" \
"vld1.32 {d4-d7}, [r0]! @ \n\t" \
"vld1.32 {d8-d11}, [r0]! @ \n\t" \
"vld1.32 {d12-d15}, [r0]! @ \n\t" \
"1: @ \n\t" \
"vmul.f32 q0, q0, q14 @ \n\t" \
"vmul.f32 q1, q1, q14 @ \n\t" \
"vmul.f32 q2, q2, q14 @ \n\t" \
"vmul.f32 q3, q3, q14 @ \n\t" \
"vmul.f32 q4, q4, q14 @ \n\t" \
"vmul.f32 q5, q5, q14 @ \n\t" \
"vmul.f32 q6, q6, q14 @ \n\t" \
"vmul.f32 q7, q7, q14 @ \n\t" \
"subs r5, r5, #1 @ \n\t" \
"beq 2f @ \n\t" \
"vst1.32 {d0-d3}, [r2]! @ \n\t" \
"vld1.32 {d0-d3}, [r0]! @ \n\t" \
"vst1.32 {d4-d7}, [r2]! @ \n\t" \
"vld1.32 {d4-d7}, [r0]! @ \n\t" \
"vst1.32 {d8-d11}, [r2]! @ \n\t" \
"vld1.32 {d8-d11}, [r0]! @ \n\t" \
"vst1.32 {d12-d15}, [r2]! @ \n\t" \
"vld1.32 {d12-d15}, [r0]! @ \n\t" \
"b 1b @ \n\t" \
"2: @ \n\t" \
"vst1.32 {d0-d3}, [r2]! @ \n\t" \
"vst1.32 {d4-d7}, [r2]! @ \n\t" \
"vst1.32 {d8-d11}, [r2]! @ \n\t" \
"vst1.32 {d12-d15}, [r2]! @ \n\t" \
"3: @ \n\t" \
"lsrs r5, %2, #4 @ \n\t" \
"ands r5, r5, #1 @ \n\t" \
"beq 4f @ \n\t" \
"vld1.32 {d0-d3}, [r0]! @ \n\t" \
"vld1.32 {d4-d7}, [r0]! @ \n\t" \
"vmul.f32 q0, q0, q14 @ \n\t" \
"vmul.f32 q1, q1, q14 @ \n\t" \
"vmul.f32 q2, q2, q14 @ \n\t" \
"vmul.f32 q3, q3, q14 @ \n\t" \
"vst1.32 {d0-d3}, [r2]! @ \n\t" \
"vst1.32 {d4-d7}, [r2]! @ \n\t" \
"4: @ \n\t" \
"lsrs r5, %2, #3 @ \n\t" \
"ands r5, r5, #1 @ \n\t" \
"beq 5f @ \n\t" \
"vld1.32 {d0-d3}, [r0]! @ \n\t" \
"vmul.f32 q0, q0, q14 @ \n\t" \
"vmul.f32 q1, q1, q14 @ \n\t" \
"vst1.32 {d0-d3}, [r2]! @ \n\t" \
"5: @ \n\t" \
"ands r5, %2, #7 @ \n\t" \
"beq 7f @ \n\t" \
"6: @ \n\t" \
"subs r5, r5, #1 @ \n\t" \
"vld1.32 d0[0], [r0]! @ \n\t" \
"vmul.f32 d0, d0, d28 @ \n\t" \
"vst1.32 d0[0], [r2]! @ \n\t" \
"bne 6b @ \n\t" \
"7: @ " \
: \
:"r" (y), "r"(caddr),"r"(n) \
: "cc", "r0", "r2", "r5", "memory", \
"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q14", \
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", \
"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", \
"d28", "d29" \
); \
}
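/* THFloatVector_mul: y[i] *= x[i].  Eight floats per pipelined iteration,
 * then a one-element tail loop for n % 8. */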
#define THFloatVector_mul(y, x, n) { \
__asm__ __volatile__ ( \
"mov r0, %0 @ \n\t" \
"mov r1, %1 @ \n\t" \
"mov r2, r0 @ \n\t" \
"lsrs r4, %2, #3 @ \n\t" \
"beq 3f @ \n\t" \
"vld1.32 {d16-d19}, [r1]! @ \n\t" \
"vld1.32 {d0-d3}, [r0]! @ \n\t" \
"1: @ \n\t" \
"vmul.f32 q12, q8, q0 @ \n\t" \
"vmul.f32 q13, q9, q1 @ \n\t" \
"subs r4, r4, #1 @ \n\t" \
"beq 2f @ \n\t" \
"vld1.32 {d16-d19}, [r1]! @ \n\t" \
"vld1.32 {d0-d3}, [r0]! @ \n\t" \
"vst1.32 {d24-d27}, [r2]! @ \n\t" \
"b 1b @ \n\t" \
"2: @ \n\t" \
"vst1.32 {d24-d27}, [r2]! @ \n\t" \
"3: @ \n\t" \
"ands r4, %2, #7 @ \n\t" \
"beq 5f @ \n\t" \
"4: @ \n\t" \
"subs r4, r4, #1 @ \n\t" \
"vld1.32 {d16[0]}, [r1]! @ \n\t" \
"vld1.32 {d0[0]}, [r0]! @ \n\t" \
"vmul.f32 q12, q8, q0 @ \n\t" \
"vst1.32 {d24[0]}, [r2]! @ \n\t" \
"bne 4b @ \n\t" \
"5: @ " \
: \
:"r" (y),"r" (x),"r"(n) \
: "cc", "r0", "r1", "r2", "r4", "memory", \
"q0", "q1", "q8", "q9", "q12", "q13", \
"d0", "d1", "d2", "d3", \
"d16", "d17", "d18", "d19", "d24", "d25", "d26", "d27" \
); \
}
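/* THFloatVector_add: y[i] += c * x[i] using vmla (multiply-accumulate).
 * Sixteen floats per pipelined iteration, an optional block of eight, then a
 * one-element tail loop for n % 8. */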
#define THFloatVector_add(y, x, c, n) { \
float ctemp = c; \
float * caddr = &ctemp; \
__asm__ __volatile__ ( \
"mov r0, %0 @ \n\t" \
"mov r1, %1 @ \n\t" \
"mov r2, r0 @ \n\t" \
"ldr r5, [%2] @ \n\t" \
"vdup.32 q14, r5 @ \n\t" \
"lsrs r5, %3, #4 @ \n\t" \
"beq 3f @ \n\t" \
"vld1.32 {d16-d19}, [r1]! @ \n\t" \
"vld1.32 {d0-d3}, [r0]! @ \n\t" \
"vld1.32 {d20-d23}, [r1]! @ \n\t" \
"vld1.32 {d4-d7}, [r0]! @ \n\t" \
"1: @ \n\t" \
"vmla.f32 q0, q8, q14 @ \n\t" \
"vmla.f32 q1, q9, q14 @ \n\t" \
"vmla.f32 q2, q10, q14 @ \n\t" \
"vmla.f32 q3, q11, q14 @ \n\t" \
"subs r5, r5, #1 @ \n\t" \
"beq 2f @ \n\t" \
"vld1.32 {d16-d19}, [r1]! @ \n\t" \
"vld1.32 {d20-d23}, [r1]! @ \n\t" \
"vst1.32 {d0-d3}, [r2]! @ \n\t" \
"vld1.32 {d0-d3}, [r0]! @ \n\t" \
"vst1.32 {d4-d7}, [r2]! @ \n\t" \
"vld1.32 {d4-d7}, [r0]! @ \n\t" \
"b 1b @ \n\t" \
"2: @ \n\t" \
"vst1.32 {d0-d3}, [r2]! @ \n\t" \
"vst1.32 {d4-d7}, [r2]! @ \n\t" \
"3: @ \n\t" \
"lsrs r5, %3, #3 @ \n\t" \
"ands r5, #1 @ \n\t" \
"beq 4f @ \n\t" \
"vld1.32 {d16-d19}, [r1]! @ \n\t" \
"vld1.32 {d0-d3}, [r0]! @ \n\t" \
"vmla.f32 q0, q8, q14 @ \n\t" \
"vmla.f32 q1, q9, q14 @ \n\t" \
"vst1.32 {d0-d3}, [r2]! @ \n\t" \
"4: @ \n\t" \
"ands r5, %3, #7 @ \n\t" \
"beq 6f @ \n\t" \
"5: @ \n\t" \
"subs r5, r5, #1 @ \n\t" \
"vld1.32 {d16[0]}, [r1]! @ \n\t" \
"vld1.32 {d0[0]}, [r0]! @ \n\t" \
"vmla.f32 d0, d16, d28 @ \n\t" \
"vst1.32 d0[0], [r2]! @ \n\t" \
"bne 5b @ \n\t" \
"6: @ " \
: \
:"r" (y),"r" (x), "r"(caddr),"r"(n) \
: "cc", "r0", "r1", "r2", "r5", "memory", \
"q0", "q1", "q2", "q3", "q14", \
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", \
"d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d28", "d29" \
); \
}
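/* The NEON assembly above covers float only; double operations fall back to
 * plain C loops unrolled by four. */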
static inline void THDoubleVector_fill(double *x, const double c, const long n) {
long i = 0;
for(; i < n-4; i += 4)
{
x[i] = c;
x[i+1] = c;
x[i+2] = c;
x[i+3] = c;
}
for(; i < n; i++)
x[i] = c;
}
static inline void THDoubleVector_add(double *y, const double *x, const double c, const long n)
{
long i = 0;
for(;i < n-4; i += 4)
{
y[i] += c * x[i];
y[i+1] += c * x[i+1];
y[i+2] += c * x[i+2];
y[i+3] += c * x[i+3];
}
for(; i < n; i++)
y[i] += c * x[i];
}
static inline void THDoubleVector_diff(double *z, const double *x, const double *y, const long n)
{
long i = 0;
for(; i < n-4; i += 4)
{
z[i] = x[i] - y[i];
z[i+1] = x[i+1] - y[i+1];
z[i+2] = x[i+2] - y[i+2];
z[i+3] = x[i+3] - y[i+3];
}
for(; i < n; i++)
z[i] = x[i] - y[i];
}
static inline void THDoubleVector_scale(double *y, const double c, const long n)
{
long i = 0;
for(; i < n-4; i +=4)
{
y[i] *= c;
y[i+1] *= c;
y[i+2] *= c;
y[i+3] *= c;
}
for(; i < n; i++)
y[i] *= c;
}
static inline void THDoubleVector_mul(double *y, const double *x, const long n)
{
long i = 0;
for(; i < n-4; i += 4)
{
y[i] *= x[i];
y[i+1] *= x[i+1];
y[i+2] *= x[i+2];
y[i+3] *= x[i+3];
}
for(; i < n; i++)
y[i] *= x[i];
}
#else
/* If neither SSE nor NEON is available, generate plain C operators for float and double */
#include "generic/THVector.c"
#include "THGenerateFloatTypes.h"
#endif
/* For the integer types, always generate plain C operators */
#include "generic/THVector.c"
#include "THGenerateIntTypes.h"
#endif
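/* Example usage (illustrative sketch; the buffer names and length below are
 * hypothetical, not part of this header):
 *
 *   float y[256], x[256];
 *   THFloatVector_fill(y, 0.0f, 256);     y[i] = 0
 *   THFloatVector_fill(x, 2.0f, 256);     x[i] = 2
 *   THFloatVector_add(y, x, 0.5f, 256);   y[i] += 0.5f * x[i]
 *   THFloatVector_scale(y, 3.0f, 256);    y[i] *= 3
 *   THFloatVector_mul(y, x, 256);         y[i] *= x[i]
 */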