 common/pffft.cpp | 309 ++++++++++++++++++-----------------
 1 file changed, 148 insertions(+), 161 deletions(-)
diff --git a/common/pffft.cpp b/common/pffft.cpp
index 0c2dd940..06ae66ec 100644
--- a/common/pffft.cpp
+++ b/common/pffft.cpp
@@ -57,14 +57,18 @@
 #include "pffft.h"
 
+#include <array>
 #include <assert.h>
 #include <cmath>
 #include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <vector>
 
+#include "albit.h"
 #include "almalloc.h"
 #include "alnumbers.h"
+#include "vector.h"
 
 #if defined(__GNUC__)
 #define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
@@ -1328,10 +1332,9 @@ struct PFFFT_Setup {
     int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
     int ifac[15];
     pffft_transform_t transform;
-    float *e; // points into 'data' , N/4*3 elements
-    float *twiddle; // points into 'data', N/4 elements
-    alignas(MALLOC_V4SF_ALIGNMENT) v4sf data[1];
+    float *twiddle; // N/4 elements
+    alignas(MALLOC_V4SF_ALIGNMENT) v4sf e[1]; // N/4*3 elements
 };
 
 PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
@@ -1348,7 +1351,7 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
     assert((N%(SIMD_SZ*SIMD_SZ)) == 0);
     const unsigned int Ncvec = static_cast<unsigned>(transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
 
-    size_t storelen{offsetof(PFFFT_Setup, data[0]) + (2u*Ncvec * sizeof(v4sf))};
+    size_t storelen{offsetof(PFFFT_Setup, e[0]) + (2u*Ncvec * sizeof(v4sf))};
 
     void *store{al_calloc(MALLOC_V4SF_ALIGNMENT, storelen)};
     if(!store) return nullptr;
@@ -1358,39 +1361,28 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
     s->transform = transform;
     /* nb of complex simd vectors */
     s->Ncvec = static_cast<int>(Ncvec);
-    s->e = reinterpret_cast<float*>(&s->data[0]);
-    s->twiddle = reinterpret_cast<float*>(&s->data[2u*Ncvec*(SIMD_SZ-1)/SIMD_SZ]);
+    s->twiddle = reinterpret_cast<float*>(&s->e[2u*Ncvec*(SIMD_SZ-1)/SIMD_SZ]);
 
-    if(transform == PFFFT_REAL)
+    if constexpr(SIMD_SZ > 1)
     {
+        al::vector<float,16> e(2u*Ncvec*(SIMD_SZ-1));
         for(int k=0; k < s->Ncvec; ++k)
         {
-            int i = k/SIMD_SZ;
-            int j = k%SIMD_SZ;
-            for(int m=0; m < SIMD_SZ-1; ++m)
+            size_t i{static_cast<size_t>(k) / SIMD_SZ};
+            size_t j{static_cast<size_t>(k) % SIMD_SZ};
+            for(size_t m{0};m < SIMD_SZ-1;++m)
             {
-                const double A = -2.0*al::numbers::pi*(m+1)*k / N;
-                s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = static_cast<float>(std::cos(A));
-                s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = static_cast<float>(std::sin(A));
+                const double A = -2.0*al::numbers::pi*static_cast<double>(m+1)*k / N;
+                e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = static_cast<float>(std::cos(A));
+                e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = static_cast<float>(std::sin(A));
             }
         }
-        rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
+        std::memcpy(s->e, e.data(), e.size()*sizeof(float));
     }
+    if(transform == PFFFT_REAL)
+        rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
     else
-    {
-        for(int k=0; k < s->Ncvec; ++k)
-        {
-            int i = k/SIMD_SZ;
-            int j = k%SIMD_SZ;
-            for(int m=0; m < SIMD_SZ-1; ++m)
-            {
-                const double A = -2.0*al::numbers::pi*(m+1)*k / N;
-                s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = static_cast<float>(std::cos(A));
-                s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = static_cast<float>(std::sin(A));
-            }
-        }
         cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
-    }
 
     /* check that N is decomposable with allowed prime factors */
     int m = 1;
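Note: the twiddle table is now computed once into a temporary al::vector and copied into the e[] array that trails the setup struct, instead of being written through pointers into a shared data[] blob; the old code also duplicated the same loop in both the real and complex branches. A standalone model of the layout being filled, assuming SIMD_SZ == 4 and using std::numbers::pi in place of the project's al::numbers::pi (make_e_table is an illustrative name, not part of pffft):

    // Builds the interleaved cos/sin table: for each group i of four bins,
    // row 2*(i*3+m) holds cos of the (m+1)-th twiddle and row 2*(i*3+m)+1
    // holds sin, with lane j carrying bin k = i*4 + j.
    #include <cmath>
    #include <cstddef>
    #include <numbers>
    #include <vector>

    std::vector<float> make_e_table(const std::size_t Ncvec, const std::size_t N)
    {
        constexpr std::size_t simd_sz{4};
        std::vector<float> e(2*Ncvec*(simd_sz-1));
        for(std::size_t k{0};k < Ncvec;++k)
        {
            const std::size_t i{k / simd_sz}, j{k % simd_sz};
            for(std::size_t m{0};m < simd_sz-1;++m)
            {
                const double A{-2.0*std::numbers::pi*static_cast<double>((m+1)*k)
                    / static_cast<double>(N)};
                e[(2*(i*3 + m) + 0)*simd_sz + j] = static_cast<float>(std::cos(A));
                e[(2*(i*3 + m) + 1)*simd_sz + j] = static_cast<float>(std::sin(A));
            }
        }
        return e;
    }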
@@ -1455,13 +1447,14 @@ static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride)
 
 void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction)
 {
-    const int N = setup->N, Ncvec = setup->Ncvec;
-    const v4sf *vin = reinterpret_cast<const v4sf*>(in);
-    v4sf *vout = reinterpret_cast<v4sf*>(out);
     assert(in != out);
+
+    const int N{setup->N}, Ncvec{setup->Ncvec};
+    const v4sf *vin{reinterpret_cast<const v4sf*>(in)};
+    v4sf *vout{reinterpret_cast<v4sf*>(out)};
     if(setup->transform == PFFFT_REAL)
     {
-        const int dk = N/32;
+        const int dk{N/32};
         if(direction == PFFFT_FORWARD)
         {
             for(int k=0; k < dk; ++k)
@@ -1469,8 +1462,8 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
                 INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
                 INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
             }
-            reversed_copy(dk, vin+2, 8, reinterpret_cast<v4sf*>(out + N/2));
-            reversed_copy(dk, vin+6, 8, reinterpret_cast<v4sf*>(out + N));
+            reversed_copy(dk, vin+2, 8, vout + N/SIMD_SZ/2);
+            reversed_copy(dk, vin+6, 8, vout + N/SIMD_SZ);
         }
         else
         {
@@ -1479,8 +1472,8 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
                 UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
                 UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
             }
-            unreversed_copy(dk, reinterpret_cast<const v4sf*>(in + N/4), reinterpret_cast<v4sf*>(out + N - 6*SIMD_SZ), -8);
-            unreversed_copy(dk, reinterpret_cast<const v4sf*>(in + 3*N/4), reinterpret_cast<v4sf*>(out + N - 2*SIMD_SZ), -8);
+            unreversed_copy(dk, vin + N/SIMD_SZ/4, vout + N/SIMD_SZ - 6, -8);
+            unreversed_copy(dk, vin + 3*N/SIMD_SZ/4, vout + N/SIMD_SZ - 2, -8);
         }
     }
     else
@@ -1504,31 +1497,29 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
     }
 }
 
-void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
+void pffft_cplx_finalize(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
 {
-    const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
-    v4sf r0, i0, r1, i1, r2, i2, r3, i3;
-    v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
     assert(in != out);
+
+    const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks
     for(int k=0; k < dk; ++k)
     {
-        r0 = in[8*k+0]; i0 = in[8*k+1];
-        r1 = in[8*k+2]; i1 = in[8*k+3];
-        r2 = in[8*k+4]; i2 = in[8*k+5];
-        r3 = in[8*k+6]; i3 = in[8*k+7];
+        v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
+        v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
+        v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
+        v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
         VTRANSPOSE4(r0,r1,r2,r3);
         VTRANSPOSE4(i0,i1,i2,i3);
         VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
         VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
         VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
 
-        sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
-        sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
-        si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
-        si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
+        v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0, r2)};
+        v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r1, r3)};
+        v4sf si0{VADD(i0,i2)}, di0{VSUB(i0, i2)};
+        v4sf si1{VADD(i1,i3)}, di1{VSUB(i1, i3)};
 
-        /*
-         * transformation for each column is:
+        /* transformation for each column is:
          *
         * [1 1 1 1 0 0 0 0] [r0]
         * [1 0 -1 0 0 -1 0 1] [r1]
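Note: pffft_cplx_finalize transposes each 4x4 block of lanes and applies the twiddles in place via VCPLXMUL. A scalar model of that macro, assuming it performs the usual in-place complex product (the real macro operates on whole v4sf registers):

    // Scalar stand-in for VCPLXMUL: (ar + i*ai) *= (br + i*bi), in place.
    inline void cplx_mul(float &ar, float &ai, const float br, const float bi)
    {
        const float tmp{ar*bi + ai*br}; // new imaginary part, from the old ar/ai
        ar = ar*br - ai*bi;             // new real part
        ai = tmp;
    }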
@@ -1550,23 +1541,22 @@ void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
     }
 }
 
-void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
+void pffft_cplx_preprocess(const int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
 {
-    const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
-    v4sf r0, i0, r1, i1, r2, i2, r3, i3;
-    v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
     assert(in != out);
+
+    const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks
     for(int k=0; k < dk; ++k)
     {
-        r0 = in[8*k+0]; i0 = in[8*k+1];
-        r1 = in[8*k+2]; i1 = in[8*k+3];
-        r2 = in[8*k+4]; i2 = in[8*k+5];
-        r3 = in[8*k+6]; i3 = in[8*k+7];
+        v4sf r0{in[8*k+0]}, i0{in[8*k+1]};
+        v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
+        v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
+        v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
 
-        sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
-        sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
-        si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
-        si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
+        v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0, r2)};
+        v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r1, r3)};
+        v4sf si0{VADD(i0,i2)}, di0{VSUB(i0, i2)};
+        v4sf si1{VADD(i1,i3)}, di1{VSUB(i1, i3)};
 
         r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
         r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
@@ -1645,20 +1635,18 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *
 static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
 {
-    static constexpr float s = al::numbers::sqrt2_v<float>/2.0f;
-    const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
-    /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
-
-    v4sf_union cr, ci, *uout = reinterpret_cast<v4sf_union*>(out);
-    v4sf save = in[7], zero=VZERO();
-    float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
+    static constexpr float s{al::numbers::sqrt2_v<float>/2.0f};
 
-    cr.v = in[0]; ci.v = in[Ncvec*2-1];
     assert(in != out);
+    const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks
+    /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
+
+    const v4sf zero{VZERO()};
+    const auto cr = al::bit_cast<std::array<float,SIMD_SZ>>(in[0]);
+    const auto ci = al::bit_cast<std::array<float,SIMD_SZ>>(in[Ncvec*2-1]);
     pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
 
-    /*
-     * [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
+    /* [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
      *
      * [Xr(1)] ] [1 1 1 1 0 0 0 0]
      * [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
@@ -1670,29 +1658,26 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o
      * [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
      */
 
-    xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
-    xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
-    xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2;
-    xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2;
-    xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1;
-    xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1;
-    xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3;
-    xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3;
-
-    for(int k=1; k < dk; ++k)
-    {
-        v4sf save_next = in[8*k+7];
-        pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1, e + k*6, out + k*8);
-        save = save_next;
-    }
+    auto *uout = reinterpret_cast<v4sf_union*>(out);
+    const float xr0{(cr[0]+cr[2]) + (cr[1]+cr[3])}; uout[0].f[0] = xr0;
+    const float xi0{(cr[0]+cr[2]) - (cr[1]+cr[3])}; uout[1].f[0] = xi0;
+    const float xr2{(cr[0]-cr[2])}; uout[4].f[0] = xr2;
+    const float xi2{(cr[3]-cr[1])}; uout[5].f[0] = xi2;
+    const float xr1{ ci[0] + s*(ci[1]-ci[3])}; uout[2].f[0] = xr1;
+    const float xi1{-ci[2] - s*(ci[1]+ci[3])}; uout[3].f[0] = xi1;
+    const float xr3{ ci[0] - s*(ci[1]-ci[3])}; uout[6].f[0] = xr3;
+    const float xi3{ ci[2] - s*(ci[1]+ci[3])}; uout[7].f[0] = xi3;
+
+    for(int k{1};k < dk;++k)
+        pffft_real_finalize_4x4(&in[8*k-1], &in[8*k+0], in + 8*k+1, e + k*6, out + k*8);
 }
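Note: reading cr/ci through al::bit_cast to a std::array replaces the old v4sf_union loads, which type-punned through a union member, and the carried `save` variable is gone since the loop can read the previous block's last vector directly as &in[8*k-1]. A sketch of the lane-extraction idea with C++20's std::bit_cast, using a plain 4-float struct as a stand-in for v4sf:

    #include <array>
    #include <bit>

    struct v4 { float f[4]; }; // stand-in for v4sf

    inline std::array<float,4> lanes_of(const v4 &v)
    {
        // Well-defined bit reinterpretation: both types are trivially
        // copyable and the same size, so this is just a 16-byte load.
        return std::bit_cast<std::array<float,4>>(v);
    }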
 
 static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out, int first)
 {
     v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
-    /*
-     * transformation for each column is:
+
+    /* transformation for each column is:
      *
      * [1 1 1 1 0 0 0 0] [r0]
      * [1 0 0 -1 0 -1 -1 0] [r1]
@@ -1741,22 +1726,22 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf
 static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
 {
     static constexpr float s = al::numbers::sqrt2_v<float>;
-    const int dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
-    /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
 
-    v4sf_union Xr, Xi, *uout = reinterpret_cast<v4sf_union*>(out);
-    float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
     assert(in != out);
-    for(int k=0; k < 4; ++k)
+    const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks
+    /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
+
+    std::array<float,SIMD_SZ> Xr, Xi;
+    for(size_t k{0};k < 4;++k)
     {
-        Xr.f[k] = reinterpret_cast<const float*>(in)[8*k];
-        Xi.f[k] = reinterpret_cast<const float*>(in)[8*k+4];
+        /* TODO: Use _mm_cvtss_f32 or equivalent. */
+        Xr[k] = al::bit_cast<std::array<float,SIMD_SZ>>(in[4*k])[0];
+        Xi[k] = al::bit_cast<std::array<float,SIMD_SZ>>(in[4*k + 1])[0];
     }
 
     pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
 
-    /*
-     * [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
+    /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
      *
      * [cr0] [1 0 2 0 1 0 0 0]
      * [cr1] [1 0 0 0 -1 0 -2 0]
@@ -1767,57 +1752,60 @@
      * [ci2] [0 0 0 0 0 -2 0 2]
      * [ci3] [0 -s 0 s 0 -s 0 -s]
      */
-    for(int k=1; k < dk; ++k)
+    for(int k{1};k < dk;++k)
         pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
 
-    cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
-    cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
-    cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
-    cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
-    ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0;
-    ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
-    ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2;
-    ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
+    /* TODO: Use _mm_set_ps or equivalent. */
+    auto *uout = reinterpret_cast<v4sf_union*>(out);
+    const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]}; uout[0].f[0] = cr0;
+    const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]}; uout[0].f[1] = cr1;
+    const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]}; uout[0].f[2] = cr2;
+    const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]}; uout[0].f[3] = cr3;
+    const float ci0{ 2*(Xr[1]+Xr[3])}; uout[2*Ncvec-1].f[0] = ci0;
+    const float ci1{ s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; uout[2*Ncvec-1].f[1] = ci1;
+    const float ci2{ 2*(Xi[3]-Xi[1])}; uout[2*Ncvec-1].f[2] = ci2;
+    const float ci3{-s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])}; uout[2*Ncvec-1].f[3] = ci3;
 }
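Note: the new TODO suggests composing these first/last output vectors directly instead of storing scalar-by-scalar through a v4sf_union. For the SSE build that would presumably be _mm_set_ps, along these lines (sketch only, not from the codebase):

    #include <xmmintrin.h>

    // _mm_set_ps takes lanes highest-first, so f0 lands in lane 0, the
    // element uout[0].f[0] addresses in the code above.
    inline __m128 from_lanes(const float f0, const float f1, const float f2, const float f3)
    { return _mm_set_ps(f3, f2, f1, f0); }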
 
 void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch, pffft_direction_t direction, int ordered)
 {
-    const int Ncvec = setup->Ncvec;
-    const int nf_odd = (setup->ifac[1] & 1);
-
-    // temporary buffer is allocated on the stack if the scratch pointer is NULL
     assert(scratch != nullptr);
+    assert(VALIGNED(finput) && VALIGNED(foutput) && VALIGNED(scratch));
 
-    const v4sf *vinput = reinterpret_cast<const v4sf*>(finput);
-    v4sf *voutput = reinterpret_cast<v4sf*>(foutput);
-    v4sf *buff[2] = { voutput, scratch };
-    int ib = (nf_odd ^ ordered ? 1 : 0);
+    const int Ncvec{setup->Ncvec};
+    const int nf_odd{setup->ifac[1] & 1};
 
-    assert(VALIGNED(finput) && VALIGNED(foutput));
+    auto *vinput = reinterpret_cast<const v4sf*>(finput);
+    auto *voutput = reinterpret_cast<v4sf*>(foutput);
+    assert(voutput != scratch);
 
-    //assert(finput != foutput);
+    v4sf *buff[2]{voutput, scratch};
+    int ib{(nf_odd ^ ordered) ? 1 : 0};
 
     if(direction == PFFFT_FORWARD)
     {
+        /* Swap the initial work buffer for forward FFTs, which helps avoid an
+         * extra copy for output.
+         */
         ib = !ib;
         if(setup->transform == PFFFT_REAL)
         {
-            ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
-            pffft_real_finalize(Ncvec, buff[ib], buff[!ib], reinterpret_cast<v4sf*>(setup->e));
+            ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
+            pffft_real_finalize(Ncvec, buff[ib], buff[!ib], setup->e);
         }
         else
         {
-            v4sf *tmp = buff[ib];
+            v4sf *tmp{buff[ib]};
             for(int k=0; k < Ncvec; ++k)
-            {
                 UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
-            }
-            ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, &setup->ifac[0], -1.0f) == buff[0] ? 0 : 1);
-            pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], reinterpret_cast<v4sf*>(setup->e));
+
+            ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
+            pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e);
         }
         if(ordered)
-            pffft_zreorder(setup, reinterpret_cast<float*>(buff[!ib]), reinterpret_cast<float*>(buff[ib]), PFFFT_FORWARD);
+            pffft_zreorder(setup, reinterpret_cast<float*>(buff[!ib]),
+                reinterpret_cast<float*>(buff[ib]), PFFFT_FORWARD);
         else
             ib = !ib;
     }
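Note: the `== buff[1]` comparisons replace the old `== buff[0] ? 0 : 1` ternaries. The fftpack stages return whichever work buffer received the last pass, and comparing that pointer against buff[1] yields the index directly. A toy model of the bookkeeping (pass() is hypothetical, standing in for rfftf1_ps/cfftf1_ps):

    #include <cstdio>

    // A fake FFT stage: writes one of the two work buffers and, like the
    // fftpack helpers, returns the buffer that holds the result.
    static float *pass(const float *in, float *work0, float *work1, const int nstages)
    {
        float *dst{(nstages & 1) ? work1 : work0};
        dst[0] = in[0]*2.0f; // placeholder for real work
        return dst;
    }

    int main()
    {
        float output[4]{}, scratch[4]{}, input[4]{1.0f, 2.0f, 3.0f, 4.0f};
        float *buff[2]{output, scratch};
        const int ib{pass(input, buff[0], buff[1], 3) == buff[1]};
        std::printf("result is in %s\n", ib ? "scratch" : "output");
        return 0;
    }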
@@ -1828,23 +1816,22 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo
 
         if(ordered)
         {
-            pffft_zreorder(setup, reinterpret_cast<const float*>(vinput), reinterpret_cast<float*>(buff[ib]), PFFFT_BACKWARD);
+            pffft_zreorder(setup, reinterpret_cast<const float*>(vinput),
+                reinterpret_cast<float*>(buff[ib]), PFFFT_BACKWARD);
             vinput = buff[ib];
             ib = !ib;
         }
         if(setup->transform == PFFFT_REAL)
         {
-            pffft_real_preprocess(Ncvec, vinput, buff[ib], reinterpret_cast<v4sf*>(setup->e));
-            ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+            pffft_real_preprocess(Ncvec, vinput, buff[ib], setup->e);
+            ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac) == buff[1]);
         }
         else
         {
-            pffft_cplx_preprocess(Ncvec, vinput, buff[ib], reinterpret_cast<v4sf*>(setup->e));
-            ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, &setup->ifac[0], +1.0f) == buff[0] ? 0 : 1);
-            for(int k=0; k < Ncvec; ++k)
-            {
+            pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e);
+            ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
+            for(int k{0};k < Ncvec;++k)
                 INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
-            }
         }
     }
@@ -1852,14 +1839,13 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo
     {
         /* extra copy required -- this situation should only happen when finput == foutput */
         assert(finput==foutput);
-        for(int k=0; k < Ncvec; ++k)
+        for(int k{0};k < Ncvec;++k)
         {
-            v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
+            v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
             voutput[2*k] = a;
             voutput[2*k+1] = b;
         }
         ib = !ib;
     }
-    assert(buff[ib] == voutput);
 }
 
 void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
@@ -1867,10 +1853,10 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
 {
     assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
 
-    const int Ncvec = s->Ncvec;
-    const v4sf *RESTRICT va = reinterpret_cast<const v4sf*>(a);
-    const v4sf *RESTRICT vb = reinterpret_cast<const v4sf*>(b);
-    v4sf *RESTRICT vab = reinterpret_cast<v4sf*>(ab);
+    const int Ncvec{s->Ncvec};
+    const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
+    const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
+    v4sf *RESTRICT vab{reinterpret_cast<v4sf*>(ab)};
 
 #ifdef __arm__
     __builtin_prefetch(va);
@@ -1891,18 +1877,19 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
 #endif
 
 #ifndef ZCONVOLVE_USING_INLINE_ASM
-    const v4sf vscal = LD_PS1(scaling);
+    const v4sf vscal{LD_PS1(scaling)};
 #endif
-    float ar1 = reinterpret_cast<const v4sf_union*>(va)[0].f[0];
-    float ai1 = reinterpret_cast<const v4sf_union*>(va)[1].f[0];
-    float br1 = reinterpret_cast<const v4sf_union*>(vb)[0].f[0];
-    float bi1 = reinterpret_cast<const v4sf_union*>(vb)[1].f[0];
-    float abr1 = reinterpret_cast<v4sf_union*>(vab)[0].f[0];
-    float abi1 = reinterpret_cast<v4sf_union*>(vab)[1].f[0];
+    /* TODO: Use _mm_cvtss_f32 or equivalent. */
+    float ar1{reinterpret_cast<const v4sf_union*>(va)[0].f[0]};
+    float ai1{reinterpret_cast<const v4sf_union*>(va)[1].f[0]};
+    float br1{reinterpret_cast<const v4sf_union*>(vb)[0].f[0]};
+    float bi1{reinterpret_cast<const v4sf_union*>(vb)[1].f[0]};
+    float abr1{reinterpret_cast<v4sf_union*>(vab)[0].f[0]};
+    float abi1{reinterpret_cast<v4sf_union*>(vab)[1].f[0]};
 
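Note: for real transforms the DC and Nyquist bins sit in lane 0 of the first two vectors and must be multiplied as plain reals, which is why they are pulled out here before the vector loop runs. The operation as a whole is a scaled frequency-domain multiply-accumulate; a scalar reference under that reading (pffft's bin packing ignored):

    #include <complex>
    #include <cstddef>
    #include <vector>

    // ab += a*b*scaling over complex spectra, the scalar equivalent of the
    // VCPLXMUL/VMADD loop below.
    void zconvolve_scalar(const std::vector<std::complex<float>> &a,
        const std::vector<std::complex<float>> &b,
        std::vector<std::complex<float>> &ab, const float scaling)
    {
        for(std::size_t i{0};i < ab.size();++i)
            ab[i] += a[i]*b[i]*scaling;
    }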
#ifdef ZCONVOLVE_USING_INLINE_ASM
     // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc
-    const float *a_ = a, *b_ = b; float *ab_ = ab;
-    int N = Ncvec;
+    const float *a_{a}, *b_{b}; float *ab_{ab};
+    int N{Ncvec};
     asm volatile("mov r8, %2 \n"
         "vdup.f32 q15, %4 \n"
         "1: \n"
@@ -1941,9 +1928,8 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
 
     for(int i=0; i < Ncvec; i += 2)
     {
-        v4sf ar4, ai4, br4, bi4;
-        ar4 = va[2*i+0]; ai4 = va[2*i+1];
-        br4 = vb[2*i+0]; bi4 = vb[2*i+1];
+        v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
+        v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
         VCPLXMUL(ar4, ai4, br4, bi4);
         vab[2*i+0] = VMADD(ar4, vscal, vab[2*i+0]);
         vab[2*i+1] = VMADD(ai4, vscal, vab[2*i+1]);
@@ -2000,22 +1986,24 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out,
 void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output,
     float *scratch, pffft_direction_t direction, int ordered)
 {
-    const int Ncvec = setup->Ncvec;
-    const int nf_odd = (setup->ifac[1] & 1);
+    const int Ncvec{setup->Ncvec};
+    const int nf_odd{setup->ifac[1] & 1};
 
     assert(scratch != nullptr);
 
+    /* z-domain data for complex transforms is already ordered without SIMD. */
     if(setup->transform == PFFFT_COMPLEX)
-        ordered = 0; // it is always ordered.
-    int ib = (nf_odd ^ ordered ? 1 : 0);
-    float *buff[2] = { output, scratch };
+        ordered = 0;
+
+    float *buff[2]{output, scratch};
+    int ib{(nf_odd ^ ordered) ? 1 : 0};
 
     if(direction == PFFFT_FORWARD)
     {
         if(setup->transform == PFFFT_REAL)
-            ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+            ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
         else
-            ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0], -1.0f) == buff[0] ? 0 : 1);
+            ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
         if(ordered)
         {
             pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD);
@@ -2034,9 +2022,9 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo
             ib = !ib;
         }
         if(setup->transform == PFFFT_REAL)
-            ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+            ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac) == buff[1]);
         else
-            ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, &setup->ifac[0], +1.0f) == buff[0] ? 0 : 1);
+            ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
     }
     if(buff[ib] != output)
     {
@@ -2049,7 +2037,6 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo
         }
         ib = !ib;
     }
-    assert(buff[ib] == output);
 }
 
 #define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
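Note: for context, typical use of the API this file implements, sketched with stock pffft entry points (pffft_aligned_malloc and pffft_transform_ordered exist in upstream pffft; this copy's internal transform asserts a non-null scratch buffer, so one is always passed). pffft's transforms are unnormalized, so forward then backward yields the input scaled by N:

    #include "pffft.h"

    void example()
    {
        constexpr int N{512}; // must satisfy the factorization asserts above
        PFFFT_Setup *setup{pffft_new_setup(N, PFFFT_REAL)};
        auto *in = static_cast<float*>(pffft_aligned_malloc(N*sizeof(float)));
        auto *out = static_cast<float*>(pffft_aligned_malloc(N*sizeof(float)));
        auto *work = static_cast<float*>(pffft_aligned_malloc(N*sizeof(float)));
        /* ... fill in[] with samples ... */
        pffft_transform_ordered(setup, in, out, work, PFFFT_FORWARD);
        pffft_transform_ordered(setup, out, in, work, PFFFT_BACKWARD);
        pffft_aligned_free(work);
        pffft_aligned_free(out);
        pffft_aligned_free(in);
        pffft_destroy_setup(setup);
    }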