/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

#ifndef PF_SIMD_MACROS_H
#define PF_SIMD_MACROS_H

#include <assert.h>
#include <stdio.h>  /* printf(), used by Vvalidate_simd below */
#include <stdlib.h> /* malloc()/free(), used by Valigned_malloc/Valigned_free below */
#include <string.h>
#include <stdint.h>

/*
 * SIMD reference material:
 *
 * general SIMD introduction:
 * https://www.linuxjournal.com/content/introduction-gcc-compiler-intrinsics-vector-processing
 *
 * SSE 1:
 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
 *
 * ARM NEON:
 * https://developer.arm.com/architectures/instruction-sets/simd-isas/neon/intrinsics
 *
 * Altivec:
 * https://www.nxp.com/docs/en/reference-manual/ALTIVECPIM.pdf
 * https://gcc.gnu.org/onlinedocs/gcc-4.9.2/gcc/PowerPC-AltiVec_002fVSX-Built-in-Functions.html
 */


/*
  Altivec support macros
*/
#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
# include <altivec.h> /* vec_* intrinsics; also enables the "vector" keyword with GCC */
typedef vector float v4sf;
# define SIMD_SZ 4
# define VREQUIRES_ALIGN 1 /* not certain this is strictly required, but kept to be safe */
# define VZERO() ((vector float) vec_splat_u8(0))
# define VMUL(a,b) vec_madd(a,b, VZERO())
# define VADD(a,b) vec_add(a,b)
# define VMADD(a,b,c) vec_madd(a,b,c)
# define VSUB(a,b) vec_sub(a,b)
static inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
# define LD_PS1(p) ld_ps1(&p)
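/* VLOAD_UNALIGNED / VLOAD_ALIGNED counterparts for Altivec (the SSE and NEON branches
   below define them). A sketch: vec_ld is the classic Altivec aligned load; the
   unaligned helper below is a portable memcpy-based assumption, not a tuned permute
   sequence. */
static inline v4sf pf_vld_unaligned(const float *p) { v4sf v; memcpy(&v, p, sizeof v); return v; }
# define VLOAD_UNALIGNED(ptr) pf_vld_unaligned((const float *)(ptr))
# define VLOAD_ALIGNED(ptr) vec_ld(0, (const float *)(ptr))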
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) {                                                         \
    vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27);   \
    vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__;          \
  }
# define VTRANSPOSE4(x0,x1,x2,x3) {     \
    v4sf y0 = vec_mergeh(x0, x2);       \
    v4sf y1 = vec_mergel(x0, x2);       \
    v4sf y2 = vec_mergeh(x1, x3);       \
    v4sf y3 = vec_mergel(x1, x3);       \
    x0 = vec_mergeh(y0, y2);            \
    x1 = vec_mergel(y0, y2);            \
    x2 = vec_mergeh(y1, y3);            \
    x3 = vec_mergel(y1, y3);            \
  }
# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
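/* VREV_S / VREV_C counterparts for Altivec (the SSE branch below defines them); a sketch
   using the same permute-literal style as VSWAPHL above -- an untested assumption. */
# define VREV_S(a) vec_perm(a, a, (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3))
# define VREV_C(a) vec_perm(a, a, (vector unsigned char)(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7))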
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  SSE1 support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))

#include <xmmintrin.h>
typedef __m128 v4sf;
/* 4 floats per SIMD vector -- this is pretty much hardcoded in the preprocess/finalize
 * functions anyway, so you will have to do some work if you want to enable AVX with its
 * 256-bit vectors. */
# define SIMD_SZ 4
# define VREQUIRES_ALIGN 1
# define VZERO() _mm_setzero_ps()
# define VMUL(a,b) _mm_mul_ps(a,b)
# define VADD(a,b) _mm_add_ps(a,b)
# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
# define VSUB(a,b) _mm_sub_ps(a,b)
# define LD_PS1(p) _mm_set1_ps(p)
# define VLOAD_UNALIGNED(ptr) _mm_loadu_ps(ptr)
# define VLOAD_ALIGNED(ptr) _mm_load_ps(ptr)
# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
/* VSWAPHL(a,b): [b0 b1 a2 a3] -- low half from b, high half from a */
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
/* reverse/flip all floats: [a b c d] -> [d c b a] */
# define VREV_S(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(0,1,2,3))
/* reverse/flip the complex pairs: [ar ai br bi] -> [br bi ar ai] */
# define VREV_C(a) _mm_shuffle_ps(a, a, _MM_SHUFFLE(1,0,3,2))
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0xF) == 0)

/*
  ARM NEON support macros
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__))
# include <arm_neon.h>
typedef float32x4_t v4sf;
# define SIMD_SZ 4
# define VREQUIRES_ALIGN 0 /* usually no alignment required */
# define VZERO() vdupq_n_f32(0)
# define VMUL(a,b) vmulq_f32(a,b)
# define VADD(a,b) vaddq_f32(a,b)
# define VMADD(a,b,c) vmlaq_f32(c,a,b)
# define VSUB(a,b) vsubq_f32(a,b)
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define VTRANSPOSE4(x0,x1,x2,x3) {                                     \
    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
  }
// marginally faster version (32-bit ARMv7 NEON inline assembly only, not AArch64):
//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
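/* VREV_S / VREV_C counterparts for NEON (a sketch matching the SSE definitions above:
   vrev64_f32 reverses each 64-bit half, vextq_f32(a,a,2) rotates the vector by two lanes) */
# define VREV_S(a) vcombine_f32(vrev64_f32(vget_high_f32(a)), vrev64_f32(vget_low_f32(a)))
# define VREV_C(a) vextq_f32(a, a, 2)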
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#else
# if !defined(PFFFT_SIMD_DISABLE)
#  warning "building with SIMD disabled!"
#  define PFFFT_SIMD_DISABLE // fall back to scalar code
# endif
#endif

// fallback mode: when no SIMD variant (SSE/Altivec/NEON) is available, use scalar code instead
#ifdef PFFFT_SIMD_DISABLE
typedef float v4sf;
# define SIMD_SZ 1
# define VREQUIRES_ALIGN 0
# define VZERO() 0.f
# define VMUL(a,b) ((a)*(b))
# define VADD(a,b) ((a)+(b))
# define VMADD(a,b,c) ((a)*(b)+(c))
# define VSUB(a,b) ((a)-(b))
# define LD_PS1(p) (p)
# define VLOAD_UNALIGNED(ptr) (*(ptr))
# define VLOAD_ALIGNED(ptr) (*(ptr))
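/* VREV_S / VREV_C for the scalar fallback: with a single lane there is nothing to
   reverse, so both are the identity (matching the SSE definitions above) */
# define VREV_S(a) (a)
# define VREV_C(a) (a)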
# define VALIGNED(ptr) ((((uintptr_t)(ptr)) & 0x3) == 0)
#endif

// shortcuts for complex multiplications: (ar,ai) <- (ar + i*ai)*(br + i*bi), i.e.
// ar' = ar*br - ai*bi and ai' = ai*br + ar*bi; the CONJ variant multiplies by the
// conjugate of (br + i*bi): ar' = ar*br + ai*bi and ai' = ai*br - ar*bi
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
#ifndef SVMUL
// multiply a scalar by a vector (splat the scalar into all lanes, then multiply)
#define SVMUL(f,v) VMUL(LD_PS1(f),v)
#endif

typedef union v4sf_union {
  v4sf v;
  float f[SIMD_SZ];
} v4sf_union;
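
/* A minimal usage sketch (the function name is illustrative, not part of the API):
   multiply two complex arrays stored in split format (separate real/imaginary planes),
   (ar,ai) *= (br,bi), processing SIMD_SZ complex values per iteration. Assumes n is a
   multiple of SIMD_SZ. */
static void pf_example_cplxmul_arrays(float *ar, float *ai, const float *br, const float *bi, int n) {
  int k;
  for (k = 0; k < n; k += SIMD_SZ) {
    v4sf_union xr, xi, yr, yi;
    memcpy(xr.f, ar+k, sizeof xr.f); memcpy(xi.f, ai+k, sizeof xi.f);
    memcpy(yr.f, br+k, sizeof yr.f); memcpy(yi.f, bi+k, sizeof yi.f);
    VCPLXMUL(xr.v, xi.v, yr.v, yi.v); /* (xr,xi) <- (xr,xi)*(yr,yi) */
    memcpy(ar+k, xr.f, sizeof xr.f); memcpy(ai+k, xi.f, sizeof xi.f);
  }
}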

#if !defined(PFFFT_SIMD_DISABLE)

#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))

/* detect bugs with the vector support macros */
static void Vvalidate_simd(void) {
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0; u = a1; t.v = VZERO();
  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v, a0.v);
  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v, a2.v, t.v, u.v);
  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v, a2.v, t.v, u.v);
  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);

  t.v = LD_PS1(f[15]);
  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
#endif //!PFFFT_SIMD_DISABLE
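
/* A suggested (hypothetical) way to wire the self-test in: call it once from your
   program's startup code in debug builds, e.g.
     #ifndef NDEBUG
     Vvalidate_simd();
     #endif
   It prints each macro's result and asserts the expected lane values. */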

/* SSE and co like 16-byte-aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with 64-byte alignment, buffers are also aligned to typical cache lines

static
void *Valigned_malloc(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  /* round p0 up to the next alignment boundary, leaving at least sizeof(void*) bytes
     before p (malloc's own alignment guarantees this) in which to stash p0 */
  p = (void *) (((uintptr_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((uintptr_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
}

static
void Valigned_free(void *p) {
  if (p) free(*((void **) p - 1));
}
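
/* Usage sketch (illustrative, not part of the API): allocate a SIMD-friendly float
   buffer, check the alignment the SIMD branches assume, and release it with the
   matching free. */
static void pf_example_aligned_alloc(void) {
  float *buf = (float *) Valigned_malloc(1024 * sizeof(float));
  if (buf) {
    assert(VALIGNED(buf)); /* 64-byte alignment satisfies the 16-byte SIMD requirement */
    buf[0] = 1.f;
    Valigned_free(buf);    /* must be released with Valigned_free, not free() */
  }
}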


#endif /* PF_SIMD_MACROS_H */