diff options
author | Chris Robinson <[email protected]> | 2023-10-06 21:06:03 -0700 |
---|---|---|
committer | Chris Robinson <[email protected]> | 2023-10-06 21:06:03 -0700 |
commit | 393790de91b7ab81c75f7ebff7874a3c92dc6bbf (patch) | |
tree | dee36b217188a35209d399c13fdcab470334d4b5 | |
parent | 1614fccd9fd893e104dcca2c92b83b2a7bfaa0c7 (diff) |
Add a generic GCC vector extension fallback for pffft
Also combine multiple allocations into one.
-rw-r--r-- | common/pffft.cpp | 93 |
1 files changed, 80 insertions, 13 deletions
diff --git a/common/pffft.cpp b/common/pffft.cpp index d42f7baf..146afef5 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -170,6 +170,63 @@ typedef float32x4_t v4sf; #define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) #define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0x3) == 0) +/* + * Generic GCC vector macros + */ +#elif defined(__GNUC__) + +using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float; +#define SIMD_SZ 4 +#define VZERO() v4sf{0,0,0,0} +#define VMUL(a,b) ((a) * (b)) +#define VADD(a,b) ((a) + (b)) +#define VMADD(a,b,c) ((a)*(b) + (c)) +#define VSUB(a,b) ((a) - (b)) +#define SVMUL(f,v) ((f) * (v)) + +constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } +#define LD_PS1 ld_ps1 + +[[gnu::always_inline]] inline v4sf unpacklo(v4sf a, v4sf b) noexcept +{ return v4sf{a[0], b[0], a[1], b[1]}; } +[[gnu::always_inline]] inline v4sf unpackhi(v4sf a, v4sf b) noexcept +{ return v4sf{a[2], b[2], a[3], b[3]}; } + +[[gnu::always_inline]] inline void interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp__{unpacklo(in1, in2)}; + out2 = unpackhi(in1, in2); + out1 = tmp__; +} +#define INTERLEAVE2 interleave2 + +[[gnu::always_inline]] inline void uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept +{ + v4sf tmp__{in1[0], in1[2], in2[0], in2[2]}; + out2 = v4sf{in1[1], in1[3], in2[1], in2[3]}; + out1 = tmp__; +} +#define UNINTERLEAVE2 uninterleave2 + +[[gnu::always_inline]] inline void vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept +{ + v4sf tmp0 = unpacklo(x0, x1); + v4sf tmp2 = unpacklo(x2, x3); + v4sf tmp1 = unpackhi(x0, x1); + v4sf tmp3 = unpackhi(x2, x3); + x0 = v4sf{tmp0[0], tmp0[1], tmp2[0], tmp2[1]}; + x1 = v4sf{tmp0[2], tmp0[3], tmp2[2], tmp2[3]}; + x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]}; + x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]}; +} +#define VTRANSPOSE4 vtranspose4 + +[[gnu::always_inline]] inline v4sf vswaphl(v4sf a, v4sf b) noexcept +{ return v4sf{b[0], b[1], a[2], a[3]}; } +#define VSWAPHL vswaphl + +#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0xF) == 0) + #else #warning "building with simd disabled !\n"; @@ -192,8 +249,8 @@ typedef float v4sf; #endif // shortcuts for complex multiplcations -#define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } while(0) -#define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } while(0) +#define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMADD(ai,br,tmp); } while(0) +#define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VMADD(ai,bi,ar); ai=VSUB(VMUL(ai,br),tmp); } while(0) #ifndef SVMUL // multiply a scalar with a vector #define SVMUL(f,v) VMUL(LD_PS1(f),v) @@ -1272,28 +1329,38 @@ struct PFFFT_Setup { int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) int ifac[15]; pffft_transform_t transform; - v4sf *data; // allocated room for twiddle coefs float *e; // points into 'data' , N/4*3 elements float *twiddle; // points into 'data', N/4 elements + + alignas(MALLOC_V4SF_ALIGNMENT) v4sf data[1]; }; PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) { - PFFFT_Setup *s = new PFFFT_Setup{}; + assert(transform == PFFFT_REAL || transform == PFFFT_COMPLEX); + assert(N > 0); /* unfortunately, the fft size must be a multiple of 16 for complex FFTs * and 32 for real FFTs -- a lot of stuff would need to be rewritten to * handle other cases (or maybe just switch to a scalar fft, I don't know..) */ - if(transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); } - if(transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); } - //assert((N % 32) == 0); + if(transform == PFFFT_REAL) + assert((N%(2*SIMD_SZ*SIMD_SZ)) == 0); + else + assert((N%(SIMD_SZ*SIMD_SZ)) == 0); + + const unsigned int Ncvec = static_cast<unsigned>(transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; + size_t storelen{offsetof(PFFFT_Setup, data[0]) + (2u*Ncvec * sizeof(v4sf))}; + + void *store{al_calloc(MALLOC_V4SF_ALIGNMENT, storelen)}; + if(!store) return nullptr; + + PFFFT_Setup *s = ::new(store) PFFFT_Setup{}; s->N = N; s->transform = transform; /* nb of complex simd vectors */ - s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; - s->data = static_cast<v4sf*>(pffft_aligned_malloc(2u*static_cast<unsigned>(s->Ncvec) * sizeof(v4sf))); - s->e = reinterpret_cast<float*>(s->data); - s->twiddle = reinterpret_cast<float*>(s->data + (2u*static_cast<unsigned>(s->Ncvec)*(SIMD_SZ-1))/SIMD_SZ); + s->Ncvec = static_cast<int>(Ncvec); + s->e = reinterpret_cast<float*>(&s->data[0]); + s->twiddle = reinterpret_cast<float*>(&s->data[2u*Ncvec*(SIMD_SZ-1)/SIMD_SZ]); if(transform == PFFFT_REAL) { @@ -1343,8 +1410,8 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) void pffft_destroy_setup(PFFFT_Setup *s) { - pffft_aligned_free(s->data); - delete s; + std::destroy_at(s); + al_free(s); } #if !defined(PFFFT_SIMD_DISABLE) |