 common/pffft.cpp | 310 +++++++++++++++++++++++++---------------------------
 1 file changed, 149 insertions(+), 161 deletions(-)
diff --git a/common/pffft.cpp b/common/pffft.cpp
index 0c2dd940..06ae66ec 100644
--- a/common/pffft.cpp
+++ b/common/pffft.cpp
@@ -57,14 +57,19 @@
#include "pffft.h"
+#include <array>
#include <assert.h>
#include <cmath>
+#include <cstring>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
+#include <vector>
+#include "albit.h"
#include "almalloc.h"
#include "alnumbers.h"
+#include "vector.h"
#if defined(__GNUC__)
#define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
@@ -1328,10 +1332,9 @@ struct PFFFT_Setup {
int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
int ifac[15];
pffft_transform_t transform;
- float *e; // points into 'data' , N/4*3 elements
- float *twiddle; // points into 'data', N/4 elements
- alignas(MALLOC_V4SF_ALIGNMENT) v4sf data[1];
+ float *twiddle; // N/4 elements
+ alignas(MALLOC_V4SF_ALIGNMENT) v4sf e[1]; // N/4*3 elements
};
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
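The struct change above folds the `e` table into the object's trailing storage, C flexible-array-member style, so the setup header, `e`, and `twiddle` all live in one aligned allocation. A minimal sketch of that pattern, with illustrative names and a hypothetical 16-byte alignment (not code from this file):

    #include <cstddef>
    #include <new>

    struct Setup
    {
        int Ncvec;
        float *twiddle;         // points into the trailing block
        alignas(16) float e[1]; // trailing storage; real length chosen at runtime
    };

    Setup *make_setup(std::size_t num_floats)
    {
        // One allocation covers the header plus the variable-length tables.
        const std::size_t len{offsetof(Setup, e) + num_floats*sizeof(float)};
        void *store{::operator new(len, std::align_val_t{16})};
        return ::new(store) Setup{};
    }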
@@ -1348,7 +1351,7 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
assert((N%(SIMD_SZ*SIMD_SZ)) == 0);
const unsigned int Ncvec = static_cast<unsigned>(transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
- size_t storelen{offsetof(PFFFT_Setup, data[0]) + (2u*Ncvec * sizeof(v4sf))};
+ size_t storelen{offsetof(PFFFT_Setup, e[0]) + (2u*Ncvec * sizeof(v4sf))};
void *store{al_calloc(MALLOC_V4SF_ALIGNMENT, storelen)};
if(!store) return nullptr;
@@ -1358,39 +1361,28 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
s->transform = transform;
/* nb of complex simd vectors */
s->Ncvec = static_cast<int>(Ncvec);
- s->e = reinterpret_cast<float*>(&s->data[0]);
- s->twiddle = reinterpret_cast<float*>(&s->data[2u*Ncvec*(SIMD_SZ-1)/SIMD_SZ]);
+ s->twiddle = reinterpret_cast<float*>(&s->e[2u*Ncvec*(SIMD_SZ-1)/SIMD_SZ]);
- if(transform == PFFFT_REAL)
+ if constexpr(SIMD_SZ > 1)
{
+ al::vector<float,16> e(2u*Ncvec*(SIMD_SZ-1));
for(int k=0; k < s->Ncvec; ++k)
{
- int i = k/SIMD_SZ;
- int j = k%SIMD_SZ;
- for(int m=0; m < SIMD_SZ-1; ++m)
+ size_t i{static_cast<size_t>(k) / SIMD_SZ};
+ size_t j{static_cast<size_t>(k) % SIMD_SZ};
+ for(size_t m{0};m < SIMD_SZ-1;++m)
{
- const double A = -2.0*al::numbers::pi*(m+1)*k / N;
- s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = static_cast<float>(std::cos(A));
- s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = static_cast<float>(std::sin(A));
+ const double A = -2.0*al::numbers::pi*static_cast<double>(m+1)*k / N;
+ e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = static_cast<float>(std::cos(A));
+ e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = static_cast<float>(std::sin(A));
}
}
- rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
+ std::memcpy(s->e, e.data(), e.size()*sizeof(float));
}
+ if(transform == PFFFT_REAL)
+ rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
else
- {
- for(int k=0; k < s->Ncvec; ++k)
- {
- int i = k/SIMD_SZ;
- int j = k%SIMD_SZ;
- for(int m=0; m < SIMD_SZ-1; ++m)
- {
- const double A = -2.0*al::numbers::pi*(m+1)*k / N;
- s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = static_cast<float>(std::cos(A));
- s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = static_cast<float>(std::sin(A));
- }
- }
cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
- }
/* check that N is decomposable with allowed prime factors */
int m = 1;
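For reference, each pair written to e[] above is the cosine/sine of a twiddle factor w = exp(-2*pi*i*(m+1)*k/N), stored interleaved per SIMD lane. A scalar sketch of one such factor using std::polar (C++20 std::numbers assumed; illustrative, not part of pffft):

    #include <complex>
    #include <numbers>

    // The cos/sin pair stored at e[(2*(i*3 + m) + 0/1)*SIMD_SZ + j] is the
    // real/imaginary part of this value for spectrum index k, sub-factor m.
    std::complex<double> twiddle(int N, int k, int m)
    {
        return std::polar(1.0, -2.0*std::numbers::pi*(m + 1)*k/N);
    }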
@@ -1455,13 +1447,14 @@ static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride)
void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction)
{
- const int N = setup->N, Ncvec = setup->Ncvec;
- const v4sf *vin = reinterpret_cast<const v4sf*>(in);
- v4sf *vout = reinterpret_cast<v4sf*>(out);
assert(in != out);
+
+ const int N{setup->N}, Ncvec{setup->Ncvec};
+ const v4sf *vin{reinterpret_cast<const v4sf*>(in)};
+ v4sf *vout{reinterpret_cast<v4sf*>(out)};
if(setup->transform == PFFFT_REAL)
{
- const int dk = N/32;
+ const int dk{N/32};
if(direction == PFFFT_FORWARD)
{
for(int k=0; k < dk; ++k)
@@ -1469,8 +1462,8 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
}
- reversed_copy(dk, vin+2, 8, reinterpret_cast<v4sf*>(out + N/2));
- reversed_copy(dk, vin+6, 8, reinterpret_cast<v4sf*>(out + N));
+ reversed_copy(dk, vin+2, 8, vout + N/SIMD_SZ/2);
+ reversed_copy(dk, vin+6, 8, vout + N/SIMD_SZ);
}
else
{
@@ -1479,8 +1472,8 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
}
- unreversed_copy(dk, reinterpret_cast<const v4sf*>(in + N/4), reinterpret_cast<v4sf*>(out + N - 6*SIMD_SZ), -8);
- unreversed_copy(dk, reinterpret_cast<const v4sf*>(in + 3*N/4), reinterpret_cast<v4sf*>(out + N - 2*SIMD_SZ), -8);
+ unreversed_copy(dk, vin + N/SIMD_SZ/4, vout + N/SIMD_SZ - 6, -8);
+ unreversed_copy(dk, vin + 3*N/SIMD_SZ/4, vout + N/SIMD_SZ - 2, -8);
}
}
else
@@ -1504,31 +1497,29 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
}
}
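As a usage note, pffft_zreorder converts between the internal SIMD-interleaved spectrum layout handled above and the canonical order. A typical forward sequence (buffers assumed allocated and SIMD-aligned elsewhere; pffft_transform_ordered wraps both steps):

    // Unordered forward transform, then reorder for bin-by-bin access.
    pffft_transform(setup, input, freq, work, PFFFT_FORWARD);
    pffft_zreorder(setup, freq, spectrum, PFFFT_FORWARD);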
-void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
+void pffft_cplx_finalize(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
{
- const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
- v4sf r0, i0, r1, i1, r2, i2, r3, i3;
- v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
assert(in != out);
+
+ const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks
for(int k=0; k < dk; ++k)
{
- r0 = in[8*k+0]; i0 = in[8*k+1];
- r1 = in[8*k+2]; i1 = in[8*k+3];
- r2 = in[8*k+4]; i2 = in[8*k+5];
- r3 = in[8*k+6]; i3 = in[8*k+7];
+ v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
+ v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
+ v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
+ v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
VTRANSPOSE4(r0,r1,r2,r3);
VTRANSPOSE4(i0,i1,i2,i3);
VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
- sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
- sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
- si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
- si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
+ v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0, r2)};
+ v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r1, r3)};
+ v4sf si0{VADD(i0,i2)}, di0{VSUB(i0, i2)};
+ v4sf si1{VADD(i1,i3)}, di1{VSUB(i1, i3)};
- /*
- * transformation for each column is:
+ /* transformation for each column is:
*
* [1 1 1 1 0 0 0 0] [r0]
* [1 0 -1 0 0 -1 0 1] [r1]
@@ -1550,23 +1541,22 @@ void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
}
}
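The matrix comment above is the standard radix-4 combine; in complex form, with x1..x3 already multiplied by their twiddles, it is y0 = (x0+x2)+(x1+x3), y1 = (x0-x2)-i(x1-x3), y2 = (x0+x2)-(x1+x3), y3 = (x0-x2)+i(x1-x3). A scalar model for one column (illustrative only):

    #include <complex>
    using cfloat = std::complex<float>;

    // One column of the 4-point combine; inputs x1..x3 are pre-twiddled.
    void radix4_combine(cfloat x0, cfloat x1, cfloat x2, cfloat x3, cfloat out[4])
    {
        const cfloat s02{x0 + x2}, d02{x0 - x2};
        const cfloat s13{x1 + x3}, d13{x1 - x3};
        const cfloat j{0.0f, 1.0f};
        out[0] = s02 + s13;
        out[1] = d02 - j*d13;
        out[2] = s02 - s13;
        out[3] = d02 + j*d13;
    }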
-void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
+void pffft_cplx_preprocess(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
{
- const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
- v4sf r0, i0, r1, i1, r2, i2, r3, i3;
- v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
assert(in != out);
+
+ const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks
for(int k=0; k < dk; ++k)
{
- r0 = in[8*k+0]; i0 = in[8*k+1];
- r1 = in[8*k+2]; i1 = in[8*k+3];
- r2 = in[8*k+4]; i2 = in[8*k+5];
- r3 = in[8*k+6]; i3 = in[8*k+7];
+ v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
+ v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
+ v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
+ v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
- sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
- sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
- si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
- si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
+ v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0, r2)};
+ v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r1, r3)};
+ v4sf si0{VADD(i0,i2)}, di0{VSUB(i0, i2)};
+ v4sf si1{VADD(i1,i3)}, di1{VSUB(i1, i3)};
r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
@@ -1645,20 +1635,18 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *
static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
{
- static constexpr float s = al::numbers::sqrt2_v<float>/2.0f;
- const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
- /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
-
- v4sf_union cr, ci, *uout = reinterpret_cast<v4sf_union*>(out);
- v4sf save = in[7], zero=VZERO();
- float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
+ static constexpr float s{al::numbers::sqrt2_v<float>/2.0f};
- cr.v = in[0]; ci.v = in[Ncvec*2-1];
assert(in != out);
+ const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks
+ /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
+
+ const v4sf zero{VZERO()};
+ const auto cr = al::bit_cast<std::array<float,SIMD_SZ>>(in[0]);
+ const auto ci = al::bit_cast<std::array<float,SIMD_SZ>>(in[Ncvec*2-1]);
pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
- /*
- * [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
+ /* [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
*
* [Xr(1)] ] [1 1 1 1 0 0 0 0]
* [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
@@ -1670,29 +1658,26 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o
* [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
*/
- xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
- xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
- xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2;
- xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2;
- xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1;
- xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1;
- xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3;
- xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3;
-
- for(int k=1; k < dk; ++k)
- {
- v4sf save_next = in[8*k+7];
- pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1, e + k*6, out + k*8);
- save = save_next;
- }
+ auto *uout = reinterpret_cast<v4sf_union*>(out);
+ const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; uout[0].f[0] = xr0;
+ const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; uout[1].f[0] = xi0;
+ const float xr2{(cr[0]-cr[2])}; uout[4].f[0] = xr2;
+ const float xi2{(cr[3]-cr[1])}; uout[5].f[0] = xi2;
+ const float xr1{ ci[0] + s*(ci[1]-ci[3])}; uout[2].f[0] = xr1;
+ const float xi1{-ci[2] - s*(ci[1]+ci[3])}; uout[3].f[0] = xi1;
+ const float xr3{ ci[0] - s*(ci[1]-ci[3])}; uout[6].f[0] = xr3;
+ const float xi3{ ci[2] - s*(ci[1]+ci[3])}; uout[7].f[0] = xi3;
+
+ for(int k{1};k < dk;++k)
+ pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8);
}
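The al::bit_cast conversions above replace union-based punning for reading SIMD lanes. The idiom, sketched with C++20 std::bit_cast (assuming v4sf holds four packed floats, as on the x86/NEON paths):

    #include <array>
    #include <bit>

    // Read lane 0 of a 4-float SIMD vector without union type punning.
    float first_lane(v4sf v)
    {
        return std::bit_cast<std::array<float, 4>>(v)[0];
    }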
static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out,
int first)
{
v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
- /*
- * transformation for each column is:
+
+ /* transformation for each column is:
*
* [1 1 1 1 0 0 0 0] [r0]
* [1 0 0 -1 0 -1 -1 0] [r1]
@@ -1741,22 +1726,22 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf
static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
{
static constexpr float s = al::numbers::sqrt2_v<float>;
- const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
- /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
- v4sf_union Xr, Xi, *uout = reinterpret_cast<v4sf_union*>(out);
- float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
assert(in != out);
- for(int k=0; k < 4; ++k)
+ const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks
+ /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
+
+ std::array<float,SIMD_SZ> Xr, Xi;
+ for(size_t k{0};k < 4;++k)
{
- Xr.f[k] = reinterpret_cast<const float*>(in)[8*k];
- Xi.f[k] = reinterpret_cast<const float*>(in)[8*k+4];
+ /* TODO: Use _mm_cvtss_f32 or equivalent. */
+ Xr[k] = al::bit_cast<std::array<float,SIMD_SZ>>(in[2*k])[0];
+ Xi[k] = al::bit_cast<std::array<float,SIMD_SZ>>(in[2*k + 1])[0];
}
pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
- /*
- * [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
+ /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
*
* [cr0] [1 0 2 0 1 0 0 0]
* [cr1] [1 0 0 0 -1 0 -2 0]
@@ -1767,57 +1752,60 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf
* [ci2] [0 0 0 0 0 -2 0 2]
* [ci3] [0 -s 0 s 0 -s 0 -s]
*/
- for(int k=1; k < dk; ++k)
+ for(int k{1};k < dk;++k)
pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
- cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
- cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
- cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
- cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
- ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0;
- ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
- ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2;
- ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
+ /* TODO: Use _mm_set_ps or equivalent. */
+ auto *uout = reinterpret_cast<v4sf_union*>(out);
+ const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]}; uout[0].f[0] = cr0;
+ const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]}; uout[0].f[1] = cr1;
+ const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]}; uout[0].f[2] = cr2;
+ const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]}; uout[0].f[3] = cr3;
+ const float ci0{ 2*(Xr[1]+Xr[3])}; uout[2*Ncvec-1].f[0] = ci0;
+ const float ci1{ s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; uout[2*Ncvec-1].f[1] = ci1;
+ const float ci2{ 2*(Xi[3]-Xi[1])}; uout[2*Ncvec-1].f[2] = ci2;
+ const float ci3{-s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; uout[2*Ncvec-1].f[3] = ci3;
}
void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput,
v4sf *scratch, pffft_direction_t direction, int ordered)
{
- const int Ncvec = setup->Ncvec;
- const int nf_odd = (setup->ifac[1] & 1);
-
- // temporary buffer is allocated on the stack if the scratch pointer is NULL
assert(scratch != nullptr);
+ assert(VALIGNED(finput) && VALIGNED(foutput) && VALIGNED(scratch));
- const v4sf *vinput = reinterpret_cast<const v4sf*>(finput);
- v4sf *voutput = reinterpret_cast<v4sf*>(foutput);
- v4sf *buff[2] = { voutput, scratch };
- int ib = (nf_odd ^ ordered ? 1 : 0);
+ const int Ncvec{setup->Ncvec};
+ const int nf_odd{setup->ifac[1] & 1};
- assert(VALIGNED(finput) && VALIGNED(foutput));
+ auto *vinput = reinterpret_cast<const v4sf*>(finput);
+ auto *voutput = reinterpret_cast<v4sf*>(foutput);
+ assert(voutput != scratch);
- //assert(finput != foutput);
+ v4sf *buff[2]{voutput, scratch};
+ int ib{(nf_odd ^ ordered) ? 1 : 0};
if(direction == PFFFT_FORWARD)
{
+ /* Swap the initial work buffer for forward FFTs, which helps avoid an
+ * extra copy for output.
+ */
ib = !ib;
if(setup->transform == PFFFT_REAL)
{
- ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
- pffft_real_finalize(Ncvec, buff[ib], buff[!ib], reinterpret_cast<v4sf*>(setup->e));
+ ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
+ pffft_real_finalize(Ncvec, buff[ib], buff[!ib], setup->e);
}
else
{
- v4sf *tmp = buff[ib];
+ v4sf *tmp{buff[ib]};
for(int k=0; k < Ncvec; ++k)
- {
UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
- }
- ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, &setup->ifac[0], -1.0f) == buff[0] ? 0 : 1);
- pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], reinterpret_cast<v4sf*>(setup->e));
+
+ ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
+ pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e);
}
if(ordered)
- pffft_zreorder(setup, reinterpret_cast<float*>(buff[!ib]), reinterpret_cast<float*>(buff[ib]), PFFFT_FORWARD);
+ pffft_zreorder(setup, reinterpret_cast<float*>(buff[!ib]),
+ reinterpret_cast<float*>(buff[ib]), PFFFT_FORWARD);
else
ib = !ib;
}
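The buff[]/ib bookkeeping above is a two-buffer ping-pong: each FFT pass may end in either of its two destination buffers, and comparing the returned pointer against buff[1] keeps ib pointing at the buffer that now holds the data, so the final result lands in the caller's output without an extra copy. Schematically (hypothetical fft_pass, not the real rfftf1_ps signature):

    v4sf *buff[2]{voutput, scratch};
    int ib{initial_parity};                          // from nf_odd and ordered
    v4sf *res{fft_pass(input, buff[ib], buff[!ib])}; // returns one of the two
    ib = (res == buff[1]);                           // 0 or 1: where the data is now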
@@ -1828,23 +1816,22 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo
if(ordered)
{
- pffft_zreorder(setup, reinterpret_cast<const float*>(vinput), reinterpret_cast<float*>(buff[ib]), PFFFT_BACKWARD);
+ pffft_zreorder(setup, reinterpret_cast<const float*>(vinput),
+ reinterpret_cast<float*>(buff[ib]), PFFFT_BACKWARD);
vinput = buff[ib];
ib = !ib;
}
if(setup->transform == PFFFT_REAL)
{
- pffft_real_preprocess(Ncvec, vinput, buff[ib], reinterpret_cast<v4sf*>(setup->e));
- ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+ pffft_real_preprocess(Ncvec, vinput, buff[ib], setup->e);
+ ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac) == buff[1]);
}
else
{
- pffft_cplx_preprocess(Ncvec, vinput, buff[ib], reinterpret_cast<v4sf*>(setup->e));
- ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0], +1.0f) == buff[0] ? 0 : 1);
- for(int k=0; k < Ncvec; ++k)
- {
+ pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e);
+ ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
+ for(int k{0};k < Ncvec;++k)
INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
- }
}
}
@@ -1852,14 +1839,13 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo
{
/* extra copy required -- this situation should only happen when finput == foutput */
assert(finput==foutput);
- for(int k=0; k < Ncvec; ++k)
+ for(int k{0};k < Ncvec;++k)
{
- v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
+ v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
voutput[2*k] = a; voutput[2*k+1] = b;
}
ib = !ib;
}
- assert(buff[ib] == voutput);
}
void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
@@ -1867,10 +1853,10 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
{
assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
- const int Ncvec = s->Ncvec;
- const v4sf *RESTRICT va = reinterpret_cast<const v4sf*>(a);
- const v4sf *RESTRICT vb = reinterpret_cast<const v4sf*>(b);
- v4sf *RESTRICT vab = reinterpret_cast<v4sf*>(ab);
+ const int Ncvec{s->Ncvec};
+ const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
+ const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
+ v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
#ifdef __arm__
__builtin_prefetch(va);
@@ -1891,18 +1877,19 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
#endif
#ifndef ZCONVOLVE_USING_INLINE_ASM
- const v4sf vscal = LD_PS1(scaling);
+ const v4sf vscal{LD_PS1(scaling)};
#endif
- float ar1 = reinterpret_cast<const v4sf_union*>(va)[0].f[0];
- float ai1 = reinterpret_cast<const v4sf_union*>(va)[1].f[0];
- float br1 = reinterpret_cast<const v4sf_union*>(vb)[0].f[0];
- float bi1 = reinterpret_cast<const v4sf_union*>(vb)[1].f[0];
- float abr1 = reinterpret_cast<v4sf_union*>(vab)[0].f[0];
- float abi1 = reinterpret_cast<v4sf_union*>(vab)[1].f[0];
+ /* TODO: Use _mm_cvtss_f32 or equivalent. */
+ float ar1{reinterpret_cast<const v4sf_union*>(va)[0].f[0]};
+ float ai1{reinterpret_cast<const v4sf_union*>(va)[1].f[0]};
+ float br1{reinterpret_cast<const v4sf_union*>(vb)[0].f[0]};
+ float bi1{reinterpret_cast<const v4sf_union*>(vb)[1].f[0]};
+ float abr1{reinterpret_cast<v4sf_union*>(vab)[0].f[0]};
+ float abi1{reinterpret_cast<v4sf_union*>(vab)[1].f[0]};
#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2 (at least on Ubuntu), so this is restricted to GCC
- const float *a_ = a, *b_ = b; float *ab_ = ab;
- int N = Ncvec;
+ const float *a_{a}, *b_{b}; float *ab_{ab};
+ int N{Ncvec};
asm volatile("mov r8, %2 \n"
"vdup.f32 q15, %4 \n"
"1: \n"
@@ -1941,9 +1928,8 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
for(int i=0; i < Ncvec; i += 2)
{
- v4sf ar4, ai4, br4, bi4;
- ar4 = va[2*i+0]; ai4 = va[2*i+1];
- br4 = vb[2*i+0]; bi4 = vb[2*i+1];
+ v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
+ v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
VCPLXMUL(ar4, ai4, br4, bi4);
vab[2*i+0] = VMADD(ar4, vscal, vab[2*i+0]);
vab[2*i+1] = VMADD(ai4, vscal, vab[2*i+1]);
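The loop above is a vectorized complex multiply-accumulate; per complex element it computes ab += scaling * (a*b). A scalar model (one element, illustrative):

    // ab += scale * (a * b), for complex values given as (re, im) pairs.
    void cmul_acc(float ar, float ai, float br, float bi,
                  float scale, float &abr, float &abi)
    {
        const float tr{ar*br - ai*bi}; // Re(a*b)
        const float ti{ar*bi + ai*br}; // Im(a*b)
        abr += tr*scale;
        abi += ti*scale;
    }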
@@ -2000,22 +1986,24 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out,
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output,
float *scratch, pffft_direction_t direction, int ordered)
{
- const int Ncvec = setup->Ncvec;
- const int nf_odd = (setup->ifac[1] & 1);
+ const int Ncvec{setup->Ncvec};
+ const int nf_odd{setup->ifac[1] & 1};
assert(scratch != nullptr);
+ /* z-domain data for complex transforms is already ordered without SIMD. */
if(setup->transform == PFFFT_COMPLEX)
- ordered = 0; // it is always ordered.
- int ib = (nf_odd ^ ordered ? 1 : 0);
- float *buff[2] = { output, scratch };
+ ordered = 0;
+
+ float *buff[2]{output, scratch};
+ int ib{(nf_odd ^ ordered) ? 1 : 0};
if(direction == PFFFT_FORWARD)
{
if(setup->transform == PFFFT_REAL)
- ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+ ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
else
- ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0], -1.0f) == buff[0] ? 0 : 1);
+ ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
if(ordered)
{
pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD);
@@ -2034,9 +2022,9 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo
ib = !ib;
}
if(setup->transform == PFFFT_REAL)
- ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+ ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
else
- ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0], +1.0f) == buff[0] ? 0 : 1);
+ ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
}
if(buff[ib] != output)
{
@@ -2049,7 +2037,6 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo
}
ib = !ib;
}
- assert(buff[ib] == output);
}
#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
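Finally, an end-to-end sketch of the public API these internals implement, assuming the stock pffft entry points (a real forward transform of 512 samples; error handling omitted):

    #include "pffft.h"

    void example()
    {
        PFFFT_Setup *setup{pffft_new_setup(512, PFFFT_REAL)};
        auto *in{static_cast<float*>(pffft_aligned_malloc(512*sizeof(float)))};
        auto *out{static_cast<float*>(pffft_aligned_malloc(512*sizeof(float)))};
        auto *work{static_cast<float*>(pffft_aligned_malloc(512*sizeof(float)))};

        /* ... fill 'in' with samples ... */
        pffft_transform_ordered(setup, in, out, work, PFFFT_FORWARD);

        pffft_aligned_free(work);
        pffft_aligned_free(out);
        pffft_aligned_free(in);
        pffft_destroy_setup(setup);
    }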