author     Chris Robinson <[email protected]>   2023-10-09 01:29:14 -0700
committer  Chris Robinson <[email protected]>   2023-10-09 01:29:14 -0700
commit     60ed9ec8bad22cc904ff0dec9b6d7dfe3c704e56 (patch)
tree       60c38d0a4f0f6648c20a7f66611af9f1a451fbf8 /common
parent     9cbf4d99231bf495a23cb78be504bd9ffd29eadd (diff)
Cleanup PFFFT
Make stylization more consistent.
Remove SVMUL (every backend simulated it with an LD_PS1 on the scalar).
Avoid calling LD_PS1 on the same value in a loop.
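Those last two points describe the same pattern: a scalar-times-vector multiply always expands to a broadcast (LD_PS1) followed by a vector multiply (VMUL), so the broadcast should happen once, outside the loop. Below is a minimal sketch of the idea using this file's SSE definitions; the scale_old/scale_new helpers are hypothetical and not part of the patch:

    #include <xmmintrin.h>

    typedef __m128 v4sf;
    #define LD_PS1(p) _mm_set1_ps(p)   /* broadcast one float to all four lanes */
    #define VMUL(a,b) _mm_mul_ps(a,b)
    /* The removed macro: every backend defined it as broadcast + multiply anyway. */
    #define SVMUL(f,v) VMUL(LD_PS1(f),v)

    /* Before: SVMUL re-broadcasts the scalar on every iteration. */
    void scale_old(v4sf *x, int n, float s)
    {
        for(int i = 0;i < n;++i)
            x[i] = SVMUL(s, x[i]);
    }

    /* After: broadcast once with LD_PS1, then plain VMUL inside the loop. */
    void scale_new(v4sf *x, int n, float s)
    {
        const v4sf vs{LD_PS1(s)};
        for(int i = 0;i < n;++i)
            x[i] = VMUL(vs, x[i]);
    }

Compilers can often hoist the broadcast themselves, but writing it explicitly keeps the generated code predictable across the SSE, NEON, and generic-vector backends.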
Diffstat (limited to 'common')
-rw-r--r--  common/pffft.cpp  1175
1 file changed, 604 insertions, 571 deletions
diff --git a/common/pffft.cpp b/common/pffft.cpp index 883e44f0..8eb5a19b 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -68,6 +68,7 @@ #include "albit.h" #include "almalloc.h" #include "alnumbers.h" +#include "opthelpers.h" #include "vector.h" #if defined(__GNUC__) @@ -94,7 +95,7 @@ * vectors should be limited to these macros */ -// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code +/* Define PFFFT_SIMD_DISABLE if you want to use scalar code instead of SIMD code */ //#define PFFFT_SIMD_DISABLE #ifndef PFFFT_SIMD_DISABLE @@ -147,18 +148,18 @@ inline v4sf vset4(float a, float b, float c, float d) #include <xmmintrin.h> typedef __m128 v4sf; #define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. -#define VZERO() _mm_setzero_ps() -#define VMUL(a,b) _mm_mul_ps(a,b) -#define VADD(a,b) _mm_add_ps(a,b) +#define VZERO _mm_setzero_ps +#define VMUL _mm_mul_ps +#define VADD _mm_add_ps #define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) -#define VSUB(a,b) _mm_sub_ps(a,b) -#define LD_PS1(p) _mm_set1_ps(p) +#define VSUB _mm_sub_ps +#define LD_PS1 _mm_set1_ps #define VSET4 _mm_setr_ps #define VINSERT0(v, a) _mm_move_ss((v), _mm_set_ss(a)) #define VEXTRACT0 _mm_cvtss_f32 #define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } while(0) #define UNINTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } while(0) -#define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) +#define VTRANSPOSE4 _MM_TRANSPOSE4_PS #define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) #define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0xF) == 0) @@ -171,11 +172,11 @@ typedef __m128 v4sf; typedef float32x4_t v4sf; #define SIMD_SZ 4 #define VZERO() vdupq_n_f32(0) -#define VMUL(a,b) vmulq_f32(a,b) -#define VADD(a,b) vaddq_f32(a,b) +#define VMUL vmulq_f32 +#define VADD vaddq_f32 #define VMADD(a,b,c) vmlaq_f32(c,a,b) -#define VSUB(a,b) vsubq_f32(a,b) -#define LD_PS1(p) vld1q_dup_f32(&(p)) +#define VSUB vsubq_f32 +#define LD_PS1 vdupq_n_f32 inline v4sf vset4(float a, float b, float c, float d) { float32x4_t ret{vmovq_n_f32(a)}; @@ -213,7 +214,6 @@ using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float; #define VADD(a,b) ((a) + (b)) #define VMADD(a,b,c) ((a)*(b) + (c)) #define VSUB(a,b) ((a) - (b)) -#define SVMUL(f,v) ((f) * (v)) constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; } #define LD_PS1 ld_ps1 @@ -287,10 +287,6 @@ typedef float v4sf; // shortcuts for complex multiplications #define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMADD(ai,br,tmp); } while(0) #define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VMADD(ai,bi,ar); ai=VSUB(VMUL(ai,br),tmp); } while(0) -#ifndef SVMUL -// multiply a scalar with a vector -#define SVMUL(f,v) VMUL(LD_PS1(f),v) -#endif #if !defined(PFFFT_SIMD_DISABLE) @@ -309,8 +305,8 @@ void validate_pffft_simd() std::memcpy(&a2_v, f+8, 4*sizeof(float)); std::memcpy(&a3_v, f+12, 4*sizeof(float)); - t_v = a0_v; u_v = a1_v; t_v = VZERO(); - t_f = al::bit_cast<float4>(t_v); + t_v = a0_v; u_v = a1_v; + t_v = VZERO(); t_f = al::bit_cast<float4>(t_v); printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); 
assertv4(t, 0, 0, 0, 0); t_v = VADD(a1_v, a2_v); t_f = al::bit_cast<float4>(t_v); printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 12, 14, 16, 18); @@ -357,12 +353,13 @@ int pffft_simd_size() { return SIMD_SZ; } /* passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 */ -static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) +static NEVER_INLINE(void) passf2_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, + const float *wa1, const float fsign) { - const int l1ido = l1*ido; + const int l1ido{l1*ido}; if(ido <= 2) { - for(int k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) + for(int k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido) { ch[0] = VADD(cc[0], cc[ido+0]); ch[l1ido] = VSUB(cc[0], cc[ido+0]); @@ -372,13 +369,14 @@ static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c } else { - for(int k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) + const v4sf vsign{LD_PS1(fsign)}; + for(int k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido) { - for(int i=0; i<ido-1; i+=2) + for(int i{0};i < ido-1;i += 2) { - v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]); - v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]); - v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1])); + v4sf tr2{VSUB(cc[i+0], cc[i+ido+0])}; + v4sf ti2{VSUB(cc[i+1], cc[i+ido+1])}; + v4sf wr{LD_PS1(wa1[i])}, wi{VMUL(vsign, LD_PS1(wa1[i+1]))}; ch[i] = VADD(cc[i+0], cc[i+ido+0]); ch[i+1] = VADD(cc[i+1], cc[i+ido+1]); VCPLXMUL(tr2, ti2, wr, wi); @@ -392,30 +390,31 @@ static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c /* passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3 */ -static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, - const float *wa2, float fsign) +static NEVER_INLINE(void) passf3_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, + const float *wa1, const float *wa2, const float fsign) { - static constexpr float taur = -0.5f; - const float taui = 0.866025403784439f*fsign; - const int l1ido = l1*ido; assert(ido > 2); - for(int k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) + + const v4sf vtaur{LD_PS1(-0.5f)}; + const v4sf vtaui{LD_PS1(0.866025403784439f*fsign)}; + const int l1ido{l1*ido}; + for(int k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido) { - for(int i=0; i<ido-1; i+=2) + for(int i{0};i < ido-1;i += 2) { - v4sf tr2 = VADD(cc[i+ido], cc[i+2*ido]); - v4sf cr2 = VADD(cc[i], SVMUL(taur,tr2)); - ch[i] = VADD(cc[i], tr2); - v4sf ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]); - v4sf ci2 = VADD(cc[i +1], SVMUL(taur,ti2)); - ch[i+1] = VADD(cc[i+1], ti2); - v4sf cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido])); - v4sf ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1])); - v4sf dr2 = VSUB(cr2, ci3); - v4sf dr3 = VADD(cr2, ci3); - v4sf di2 = VADD(ci2, cr3); - v4sf di3 = VSUB(ci2, cr3); - float wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; + v4sf tr2{VADD(cc[i+ido], cc[i+2*ido])}; + v4sf cr2{VADD(cc[i], VMUL(vtaur,tr2))}; + ch[i] = VADD(cc[i], tr2); + v4sf ti2{VADD(cc[i+ido+1], cc[i+2*ido+1])}; + v4sf ci2{VADD(cc[i +1], VMUL(vtaur,ti2))}; + ch[i+1] = VADD(cc[i+1], ti2); + v4sf cr3{VMUL(vtaui, VSUB(cc[i+ido], cc[i+2*ido]))}; + v4sf ci3{VMUL(vtaui, VSUB(cc[i+ido+1], cc[i+2*ido+1]))}; + v4sf dr2{VSUB(cr2, ci3)}; + v4sf dr3{VADD(cr2, ci3)}; + v4sf di2{VADD(ci2, cr3)}; + v4sf di3{VSUB(ci2, cr3)}; + float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]}; 
VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); ch[i+l1ido] = dr2; ch[i+l1ido + 1] = di2; @@ -426,23 +425,24 @@ static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c } } /* passf3 */ -static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, - const float *wa2, const float *wa3, float fsign) +static NEVER_INLINE(void) passf4_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, + const float *wa1, const float *wa2, const float *wa3, const float fsign) { - /* isign == -1 for forward transform and +1 for backward transform */ - const int l1ido = l1*ido; + /* fsign == -1 for forward transform and +1 for backward transform */ + const v4sf vsign{LD_PS1(fsign)}; + const int l1ido{l1*ido}; if(ido == 2) { - for(int k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) + for(int k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido) { - v4sf tr1 = VSUB(cc[0], cc[2*ido + 0]); - v4sf tr2 = VADD(cc[0], cc[2*ido + 0]); - v4sf ti1 = VSUB(cc[1], cc[2*ido + 1]); - v4sf ti2 = VADD(cc[1], cc[2*ido + 1]); - v4sf ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign)); - v4sf tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign)); - v4sf tr3 = VADD(cc[ido + 0], cc[3*ido + 0]); - v4sf ti3 = VADD(cc[ido + 1], cc[3*ido + 1]); + v4sf tr1{VSUB(cc[0], cc[2*ido + 0])}; + v4sf tr2{VADD(cc[0], cc[2*ido + 0])}; + v4sf ti1{VSUB(cc[1], cc[2*ido + 1])}; + v4sf ti2{VADD(cc[1], cc[2*ido + 1])}; + v4sf ti4{VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), vsign)}; + v4sf tr4{VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), vsign)}; + v4sf tr3{VADD(cc[ido + 0], cc[3*ido + 0])}; + v4sf ti3{VADD(cc[ido + 1], cc[3*ido + 1])}; ch[0*l1ido + 0] = VADD(tr2, tr3); ch[0*l1ido + 1] = VADD(ti2, ti3); @@ -456,36 +456,36 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c } else { - for(int k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) + for(int k{0};k < l1ido;k += ido, ch+=ido, cc += 4*ido) { - for(int i=0; i<ido-1; i+=2) + for(int i{0};i < ido-1;i+=2) { - v4sf tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]); - v4sf tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]); - v4sf ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]); - v4sf ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]); - v4sf tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign)); - v4sf ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign)); - v4sf tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]); - v4sf ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]); + v4sf tr1{VSUB(cc[i + 0], cc[i + 2*ido + 0])}; + v4sf tr2{VADD(cc[i + 0], cc[i + 2*ido + 0])}; + v4sf ti1{VSUB(cc[i + 1], cc[i + 2*ido + 1])}; + v4sf ti2{VADD(cc[i + 1], cc[i + 2*ido + 1])}; + v4sf tr4{VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), vsign)}; + v4sf ti4{VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), vsign)}; + v4sf tr3{VADD(cc[i + ido + 0], cc[i + 3*ido + 0])}; + v4sf ti3{VADD(cc[i + ido + 1], cc[i + 3*ido + 1])}; ch[i] = VADD(tr2, tr3); - v4sf cr3 = VSUB(tr2, tr3); + v4sf cr3{VSUB(tr2, tr3)}; ch[i + 1] = VADD(ti2, ti3); - v4sf ci3 = VSUB(ti2, ti3); + v4sf ci3{VSUB(ti2, ti3)}; - v4sf cr2 = VADD(tr1, tr4); - v4sf cr4 = VSUB(tr1, tr4); - v4sf ci2 = VADD(ti1, ti4); - v4sf ci4 = VSUB(ti1, ti4); - float wr1=wa1[i], wi1=fsign*wa1[i+1]; + v4sf cr2{VADD(tr1, tr4)}; + v4sf cr4{VSUB(tr1, tr4)}; + v4sf ci2{VADD(ti1, ti4)}; + v4sf ci4{VSUB(ti1, ti4)}; + float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}; VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1)); - float wr2=wa2[i], wi2=fsign*wa2[i+1]; + float wr2{wa2[i]}, wi2{fsign*wa2[i+1]}; ch[i + l1ido] = cr2; 
ch[i + l1ido + 1] = ci2; VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2)); - float wr3=wa3[i], wi3=fsign*wa3[i+1]; + float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}; ch[i + 2*l1ido] = cr3; ch[i + 2*l1ido + 1] = ci3; @@ -500,50 +500,50 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c /* * passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5 */ -static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, - const float *wa2, const float *wa3, const float *wa4, float fsign) +static NEVER_INLINE(void) passf5_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, + const float *wa1, const float *wa2, const float *wa3, const float *wa4, const float fsign) { - static constexpr float tr11 = 0.309016994374947f; - static constexpr float tr12 = -0.809016994374947f; - const float ti11 = 0.951056516295154f*fsign; - const float ti12 = 0.587785252292473f*fsign; + const v4sf vtr11{LD_PS1(0.309016994374947f)}; + const v4sf vtr12{LD_PS1(-0.809016994374947f)}; + const v4sf vti11{LD_PS1(0.951056516295154f*fsign)}; + const v4sf vti12{LD_PS1(0.587785252292473f*fsign)}; #define cc_ref(a_1,a_2) cc[(a_2-1)*ido + (a_1) + 1] #define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + (a_1) + 1] assert(ido > 2); - for(int k = 0; k < l1; ++k, cc += 5*ido, ch += ido) + for(int k{0};k < l1;++k, cc += 5*ido, ch += ido) { - for(int i = 0; i < ido-1; i += 2) + for(int i{0};i < ido-1;i += 2) { - v4sf ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5)); - v4sf ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5)); - v4sf ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4)); - v4sf ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4)); - v4sf tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); - v4sf tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); - v4sf tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); - v4sf tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); + v4sf ti5{VSUB(cc_ref(i , 2), cc_ref(i , 5))}; + v4sf ti2{VADD(cc_ref(i , 2), cc_ref(i , 5))}; + v4sf ti4{VSUB(cc_ref(i , 3), cc_ref(i , 4))}; + v4sf ti3{VADD(cc_ref(i , 3), cc_ref(i , 4))}; + v4sf tr5{VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5))}; + v4sf tr2{VADD(cc_ref(i-1, 2), cc_ref(i-1, 5))}; + v4sf tr4{VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4))}; + v4sf tr3{VADD(cc_ref(i-1, 3), cc_ref(i-1, 4))}; ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); - v4sf cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); - v4sf ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); - v4sf cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); - v4sf ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); - v4sf cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); - v4sf ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); - v4sf cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); - v4sf ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); - v4sf dr3 = VSUB(cr3, ci4); - v4sf dr4 = VADD(cr3, ci4); - v4sf di3 = VADD(ci3, cr4); - v4sf di4 = VSUB(ci3, cr4); - v4sf dr5 = VADD(cr2, ci5); - v4sf dr2 = VSUB(cr2, ci5); - v4sf di5 = VSUB(ci2, cr5); - v4sf di2 = VADD(ci2, cr5); - float wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; - float wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1]; + v4sf cr2{VADD(cc_ref(i-1, 1), VADD(VMUL(vtr11, tr2),VMUL(vtr12, tr3)))}; + v4sf ci2{VADD(cc_ref(i , 1), VADD(VMUL(vtr11, ti2),VMUL(vtr12, ti3)))}; + v4sf cr3{VADD(cc_ref(i-1, 1), VADD(VMUL(vtr12, tr2),VMUL(vtr11, tr3)))}; + v4sf ci3{VADD(cc_ref(i , 1), VADD(VMUL(vtr12, ti2),VMUL(vtr11, ti3)))}; + v4sf 
cr5{VADD(VMUL(vti11, tr5), VMUL(vti12, tr4))}; + v4sf ci5{VADD(VMUL(vti11, ti5), VMUL(vti12, ti4))}; + v4sf cr4{VSUB(VMUL(vti12, tr5), VMUL(vti11, tr4))}; + v4sf ci4{VSUB(VMUL(vti12, ti5), VMUL(vti11, ti4))}; + v4sf dr3{VSUB(cr3, ci4)}; + v4sf dr4{VADD(cr3, ci4)}; + v4sf di3{VADD(ci3, cr4)}; + v4sf di4{VSUB(ci3, cr4)}; + v4sf dr5{VADD(cr2, ci5)}; + v4sf dr2{VSUB(cr2, ci5)}; + v4sf di5{VSUB(ci2, cr5)}; + v4sf di2{VADD(ci2, cr5)}; + float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]}; + float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]}; VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); ch_ref(i - 1, 2) = dr2; ch_ref(i, 2) = di2; @@ -562,15 +562,13 @@ static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c #undef cc_ref } -static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, - const float *wa1) +static NEVER_INLINE(void) radf2_ps(const int ido, const int l1, const v4sf *RESTRICT cc, + v4sf *RESTRICT ch, const float *wa1) { - static constexpr float minus_one = -1.f; - const int l1ido = l1*ido; - - for(int k=0; k < l1ido; k += ido) + const int l1ido{l1*ido}; + for(int k{0};k < l1ido;k += ido) { - v4sf a = cc[k], b = cc[k + l1ido]; + v4sf a{cc[k]}, b{cc[k + l1ido]}; ch[2*k] = VADD(a, b); ch[2*(k+ido)-1] = VSUB(a, b); } @@ -578,12 +576,12 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s return; if(ido != 2) { - for(int k=0; k < l1ido; k += ido) + for(int k{0};k < l1ido;k += ido) { - for(int i=2; i<ido; i+=2) + for(int i{2};i < ido;i += 2) { - v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido]; - v4sf br = cc[i - 1 + k], bi = cc[i + k]; + v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]}; + v4sf br{cc[i - 1 + k]}, bi{cc[i + k]}; VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); ch[i + 2*k] = VADD(bi, ti2); ch[2*(k+ido) - i] = VSUB(ti2, bi); @@ -594,41 +592,42 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s if((ido&1) == 1) return; } - for(int k=0; k < l1ido; k += ido) + const v4sf minus_one{LD_PS1(-1.0f)}; + for(int k{0};k < l1ido;k += ido) { - ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]); + ch[2*k + ido] = VMUL(minus_one, cc[ido-1 + k + l1ido]); ch[2*k + ido-1] = cc[k + ido-1]; } } /* radf2 */ -static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) +static NEVER_INLINE(void) radb2_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch, + const float *wa1) { - static constexpr float minus_two=-2; - const int l1ido = l1*ido; - for(int k=0; k < l1ido; k += ido) + const int l1ido{l1*ido}; + for(int k{0};k < l1ido;k += ido) { - v4sf a = cc[2*k]; - v4sf b = cc[2*(k+ido) - 1]; + v4sf a{cc[2*k]}; + v4sf b{cc[2*(k+ido) - 1]}; ch[k] = VADD(a, b); - ch[k + l1ido] =VSUB(a, b); + ch[k + l1ido] = VSUB(a, b); } if(ido < 2) return; if(ido != 2) { - for(int k = 0; k < l1ido; k += ido) + for(int k{0};k < l1ido;k += ido) { - for(int i = 2; i < ido; i += 2) + for(int i{2};i < ido;i += 2) { - v4sf a = cc[i-1 + 2*k]; - v4sf b = cc[2*(k + ido) - i - 1]; - v4sf c = cc[i+0 + 2*k]; - v4sf d = cc[2*(k + ido) - i + 0]; + v4sf a{cc[i-1 + 2*k]}; + v4sf b{cc[2*(k + ido) - i - 1]}; + v4sf c{cc[i+0 + 2*k]}; + v4sf d{cc[2*(k + ido) - i + 0]}; ch[i-1 + k] = VADD(a, b); - v4sf tr2 = VSUB(a, b); + v4sf tr2{VSUB(a, b)}; ch[i+0 + k] = VSUB(c, d); - v4sf ti2 = VADD(c, d); + v4sf ti2{VADD(c, d)}; VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); ch[i-1 + k + l1ido] = tr2; 
ch[i+0 + k + l1ido] = ti2; @@ -637,54 +636,55 @@ static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, co if((ido&1) == 1) return; } - for(int k = 0; k < l1ido; k += ido) + const v4sf minus_two{LD_PS1(-2.0f)}; + for(int k{0};k < l1ido;k += ido) { - v4sf a = cc[2*k + ido-1]; - v4sf b = cc[2*k + ido]; + v4sf a{cc[2*k + ido-1]}; + v4sf b{cc[2*k + ido]}; ch[k + ido-1] = VADD(a,a); - ch[k + ido-1 + l1ido] = SVMUL(minus_two, b); + ch[k + ido-1 + l1ido] = VMUL(minus_two, b); } } /* radb2 */ -static void radf3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, - const float *wa2) +static void radf3_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2) { - static constexpr float taur = -0.5f; - static constexpr float taui = 0.866025403784439f; - for(int k=0; k<l1; k++) + const v4sf vtaur{LD_PS1(-0.5f)}; + const v4sf vtaui{LD_PS1(0.866025403784439f)}; + for(int k{0};k < l1;++k) { - v4sf cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]); + v4sf cr2{VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])}; ch[3*k*ido] = VADD(cc[k*ido], cr2); - ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido])); - ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2)); + ch[(3*k+2)*ido] = VMUL(vtaui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido])); + ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], VMUL(vtaur, cr2)); } if(ido == 1) return; - for(int k=0; k<l1; k++) + for(int k{0};k < l1;++k) { - for(int i=2; i<ido; i+=2) + for(int i{2};i < ido;i += 2) { - const int ic = ido - i; - v4sf wr1 = LD_PS1(wa1[i - 2]); - v4sf wi1 = LD_PS1(wa1[i - 1]); - v4sf dr2 = cc[i - 1 + (k + l1)*ido]; - v4sf di2 = cc[i + (k + l1)*ido]; + const int ic{ido - i}; + v4sf wr1{LD_PS1(wa1[i - 2])}; + v4sf wi1{LD_PS1(wa1[i - 1])}; + v4sf dr2{cc[i - 1 + (k + l1)*ido]}; + v4sf di2{cc[i + (k + l1)*ido]}; VCPLXMULCONJ(dr2, di2, wr1, wi1); - v4sf wr2 = LD_PS1(wa2[i - 2]); - v4sf wi2 = LD_PS1(wa2[i - 1]); - v4sf dr3 = cc[i - 1 + (k + l1*2)*ido]; - v4sf di3 = cc[i + (k + l1*2)*ido]; + v4sf wr2{LD_PS1(wa2[i - 2])}; + v4sf wi2{LD_PS1(wa2[i - 1])}; + v4sf dr3{cc[i - 1 + (k + l1*2)*ido]}; + v4sf di3{cc[i + (k + l1*2)*ido]}; VCPLXMULCONJ(dr3, di3, wr2, wi2); - v4sf cr2 = VADD(dr2, dr3); - v4sf ci2 = VADD(di2, di3); + v4sf cr2{VADD(dr2, dr3)}; + v4sf ci2{VADD(di2, di3)}; ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2); ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2); - v4sf tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2)); - v4sf ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2)); - v4sf tr3 = SVMUL(taui, VSUB(di2, di3)); - v4sf ti3 = SVMUL(taui, VSUB(dr3, dr2)); + v4sf tr2{VADD(cc[i - 1 + k*ido], VMUL(vtaur, cr2))}; + v4sf ti2{VADD(cc[i + k*ido], VMUL(vtaur, ci2))}; + v4sf tr3{VMUL(vtaui, VSUB(di2, di3))}; + v4sf ti3{VMUL(vtaui, VSUB(dr3, dr2))}; ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3); ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3); ch[i + (3*k + 2)*ido] = VADD(ti2, ti3); @@ -697,39 +697,42 @@ static void radf3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, const float *wa2) { - static constexpr float taur = -0.5f; - static constexpr float taui = 0.866025403784439f; - static constexpr float taui_2 = taui*2.0f; + static constexpr float taur{-0.5f}; + static constexpr float taui{0.866025403784439f}; + static constexpr float taui_2{taui*2.0f}; - for(int k=0; k<l1; k++) + const v4sf vtaur{LD_PS1(taur)}; + const v4sf 
vtaui_2{LD_PS1(taui_2)}; + for(int k{0};k < l1;++k) { v4sf tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2); - v4sf cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]); + v4sf cr2 = VMADD(vtaur, tr2, cc[3*k*ido]); ch[k*ido] = VADD(cc[3*k*ido], tr2); - v4sf ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]); + v4sf ci3 = VMUL(vtaui_2, cc[(3*k + 2)*ido]); ch[(k + l1)*ido] = VSUB(cr2, ci3); ch[(k + 2*l1)*ido] = VADD(cr2, ci3); } if(ido == 1) return; - for(int k=0; k<l1; k++) + const v4sf vtaui{LD_PS1(taui)}; + for(int k{0};k < l1;++k) { - for(int i=2; i<ido; i+=2) + for(int i{2};i < ido;i += 2) { - const int ic = ido - i; - v4sf tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]); - v4sf cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]); + const int ic{ido - i}; + v4sf tr2{VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])}; + v4sf cr2{VMADD(vtaur, tr2, cc[i - 1 + 3*k*ido])}; ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2); - v4sf ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]); - v4sf ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]); + v4sf ti2{VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido])}; + v4sf ci2{VMADD(vtaur, ti2, cc[i + 3*k*ido])}; ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2); - v4sf cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])); - v4sf ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido])); - v4sf dr2 = VSUB(cr2, ci3); - v4sf dr3 = VADD(cr2, ci3); - v4sf di2 = VADD(ci2, cr3); - v4sf di3 = VSUB(ci2, cr3); + v4sf cr3{VMUL(vtaui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]))}; + v4sf ci3{VMUL(vtaui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]))}; + v4sf dr2{VSUB(cr2, ci3)}; + v4sf dr3{VADD(cr2, ci3)}; + v4sf di2{VADD(ci2, cr3)}; + v4sf di3{VSUB(ci2, cr3)}; VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); ch[i - 1 + (k + l1)*ido] = dr2; ch[i + (k + l1)*ido] = di2; @@ -743,18 +746,17 @@ static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3) { - static constexpr float minus_hsqt2 = al::numbers::sqrt2_v<float> * -0.5f; - const int l1ido = l1*ido; + const int l1ido{l1*ido}; { const v4sf *RESTRICT cc_ = cc, *RESTRICT cc_end = cc + l1ido; v4sf *RESTRICT ch_ = ch; while(cc != cc_end) { // this loop represents between 25% and 40% of total radf4_ps cost ! 
- v4sf a0 = cc[0], a1 = cc[l1ido]; - v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido]; - v4sf tr1 = VADD(a1, a3); - v4sf tr2 = VADD(a0, a2); + v4sf a0{cc[0]}, a1{cc[l1ido]}; + v4sf a2{cc[2*l1ido]}, a3{cc[3*l1ido]}; + v4sf tr1{VADD(a1, a3)}; + v4sf tr2{VADD(a0, a2)}; ch[2*ido-1] = VSUB(a0, a2); ch[2*ido ] = VSUB(a3, a1); ch[0 ] = VADD(tr1, tr2); @@ -768,47 +770,45 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s return; if(ido != 2) { - for(int k = 0; k < l1ido; k += ido) + for(int k{0};k < l1ido;k += ido) { - const v4sf *RESTRICT pc = cc + 1 + k; - for(int i=2; i<ido; i += 2, pc += 2) + const v4sf *RESTRICT pc{cc + 1 + k}; + for(int i{2};i < ido;i += 2, pc += 2) { - const int ic = ido - i; - v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4; - v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4; - - cr2 = pc[1*l1ido+0]; - ci2 = pc[1*l1ido+1]; - wr=LD_PS1(wa1[i - 2]); - wi=LD_PS1(wa1[i - 1]); + const int ic{ido - i}; + + v4sf cr2{pc[1*l1ido+0]}; + v4sf ci2{pc[1*l1ido+1]}; + v4sf wr{LD_PS1(wa1[i - 2])}; + v4sf wi{LD_PS1(wa1[i - 1])}; VCPLXMULCONJ(cr2,ci2,wr,wi); - cr3 = pc[2*l1ido+0]; - ci3 = pc[2*l1ido+1]; + v4sf cr3{pc[2*l1ido+0]}; + v4sf ci3{pc[2*l1ido+1]}; wr = LD_PS1(wa2[i-2]); wi = LD_PS1(wa2[i-1]); VCPLXMULCONJ(cr3, ci3, wr, wi); - cr4 = pc[3*l1ido]; - ci4 = pc[3*l1ido+1]; + v4sf cr4{pc[3*l1ido]}; + v4sf ci4{pc[3*l1ido+1]}; wr = LD_PS1(wa3[i-2]); wi = LD_PS1(wa3[i-1]); VCPLXMULCONJ(cr4, ci4, wr, wi); /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */ - tr1 = VADD(cr2,cr4); - tr4 = VSUB(cr4,cr2); - tr2 = VADD(pc[0],cr3); - tr3 = VSUB(pc[0],cr3); + v4sf tr1{VADD(cr2,cr4)}; + v4sf tr4{VSUB(cr4,cr2)}; + v4sf tr2{VADD(pc[0],cr3)}; + v4sf tr3{VSUB(pc[0],cr3)}; ch[i - 1 + 4*k] = VADD(tr1,tr2); ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed - ti1 = VADD(ci2,ci4); - ti4 = VSUB(ci2,ci4); + v4sf ti1{VADD(ci2,ci4)}; + v4sf ti4{VSUB(ci2,ci4)}; ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3); ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4 - ti2 = VADD(pc[1],ci3); - ti3 = VSUB(pc[1],ci3); + v4sf ti2{VADD(pc[1],ci3)}; + v4sf ti3{VSUB(pc[1],ci3)}; ch[i + 4*k] = VADD(ti1, ti2); ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2); ch[i + 4*k + 2*ido] = VADD(tr4, ti3); @@ -818,12 +818,13 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s if((ido&1) == 1) return; } - for(int k=0; k<l1ido; k += ido) + const v4sf minus_hsqt2{LD_PS1(al::numbers::sqrt2_v<float> * -0.5f)}; + for(int k{0};k < l1ido;k += ido) { - v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido]; - v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido]; - v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b)); - v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a)); + v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]}; + v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]}; + v4sf ti1{VMUL(minus_hsqt2, VADD(a, b))}; + v4sf tr1{VMUL(minus_hsqt2, VSUB(b, a))}; ch[ido-1 + 4*k] = VADD(tr1, c); ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1); ch[4*k + 1*ido] = VSUB(ti1, d); @@ -832,23 +833,23 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s } /* radf4 */ -static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf *RESTRICT ch, - const float *RESTRICT wa1, const float *RESTRICT wa2, const float *RESTRICT wa3) +static NEVER_INLINE(void) radb4_ps(const int ido, const int l1, const v4sf * RESTRICT cc, + v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2, + const float *RESTRICT wa3) 
{ - static constexpr float minus_sqrt2 = -1.414213562373095f; - static constexpr float two = 2.f; - const int l1ido = l1*ido; + const v4sf two{LD_PS1(2.0f)}; + const int l1ido{l1*ido}; { - const v4sf *RESTRICT cc_ = cc, *RESTRICT ch_end = ch + l1ido; - v4sf *ch_ = ch; + const v4sf *RESTRICT cc_{cc}, *RESTRICT ch_end{ch + l1ido}; + v4sf *ch_{ch}; while(ch != ch_end) { - v4sf a = cc[0], b = cc[4*ido-1]; - v4sf c = cc[2*ido], d = cc[2*ido-1]; - v4sf tr3 = SVMUL(two,d); - v4sf tr2 = VADD(a,b); - v4sf tr1 = VSUB(a,b); - v4sf tr4 = SVMUL(two,c); + v4sf a{cc[0]}, b{cc[4*ido-1]}; + v4sf c{cc[2*ido]}, d{cc[2*ido-1]}; + v4sf tr3{VMUL(two,d)}; + v4sf tr2{VADD(a,b)}; + v4sf tr1{VSUB(a,b)}; + v4sf tr4{VMUL(two,c)}; ch[0*l1ido] = VADD(tr2, tr3); ch[2*l1ido] = VSUB(tr2, tr3); ch[1*l1ido] = VSUB(tr1, tr4); @@ -862,31 +863,31 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4 return; if(ido != 2) { - for(int k = 0; k < l1ido; k += ido) + for(int k{0};k < l1ido;k += ido) { - const v4sf *RESTRICT pc = cc - 1 + 4*k; - v4sf *RESTRICT ph = ch + k + 1; - for(int i = 2; i < ido; i += 2) + const v4sf *RESTRICT pc{cc - 1 + 4*k}; + v4sf *RESTRICT ph{ch + k + 1}; + for(int i{2};i < ido;i += 2) { - v4sf tr1 = VSUB(pc[i], pc[4*ido - i]); - v4sf tr2 = VADD(pc[i], pc[4*ido - i]); - v4sf ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]); - v4sf tr3 = VADD(pc[2*ido + i], pc[2*ido - i]); + v4sf tr1{VSUB(pc[i], pc[4*ido - i])}; + v4sf tr2{VADD(pc[i], pc[4*ido - i])}; + v4sf ti4{VSUB(pc[2*ido + i], pc[2*ido - i])}; + v4sf tr3{VADD(pc[2*ido + i], pc[2*ido - i])}; ph[0] = VADD(tr2, tr3); - v4sf cr3 = VSUB(tr2, tr3); + v4sf cr3{VSUB(tr2, tr3)}; - v4sf ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]); - v4sf tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]); - v4sf cr2 = VSUB(tr1, tr4); - v4sf cr4 = VADD(tr1, tr4); + v4sf ti3{VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1])}; + v4sf tr4{VADD(pc[2*ido + i + 1], pc[2*ido - i + 1])}; + v4sf cr2{VSUB(tr1, tr4)}; + v4sf cr4{VADD(tr1, tr4)}; - v4sf ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]); - v4sf ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]); + v4sf ti1{VADD(pc[i + 1], pc[4*ido - i + 1])}; + v4sf ti2{VSUB(pc[i + 1], pc[4*ido - i + 1])}; ph[1] = VADD(ti2, ti3); ph += l1ido; - v4sf ci3 = VSUB(ti2, ti3); - v4sf ci2 = VADD(ti1, ti4); - v4sf ci4 = VSUB(ti1, ti4); + v4sf ci3{VSUB(ti2, ti3)}; + v4sf ci2{VADD(ti1, ti4)}; + v4sf ci4{VSUB(ti1, ti4)}; VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); ph[0] = cr2; ph[1] = ci2; ph += l1ido; @@ -901,92 +902,93 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4 if((ido&1) == 1) return; } - for(int k=0; k < l1ido; k+=ido) + const v4sf minus_sqrt2{LD_PS1(-1.414213562373095f)}; + for(int k{0};k < l1ido;k += ido) { - const int i0 = 4*k + ido; - v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1]; - v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0]; - v4sf tr1 = VSUB(c,d); - v4sf tr2 = VADD(c,d); - v4sf ti1 = VADD(b,a); - v4sf ti2 = VSUB(b,a); + const int i0{4*k + ido}; + v4sf c{cc[i0-1]}, d{cc[i0 + 2*ido-1]}; + v4sf a{cc[i0+0]}, b{cc[i0 + 2*ido+0]}; + v4sf tr1{VSUB(c,d)}; + v4sf tr2{VADD(c,d)}; + v4sf ti1{VADD(b,a)}; + v4sf ti2{VSUB(b,a)}; ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2); - ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1)); + ch[ido-1 + k + 1*l1ido] = VMUL(minus_sqrt2, VSUB(ti1, tr1)); ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2); - ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1)); + ch[ido-1 + k + 3*l1ido] = VMUL(minus_sqrt2, VADD(ti1, tr1)); } } /* radb4 */ -static void radf5_ps(int ido, int l1, 
const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, - const float *wa2, const float *wa3, const float *wa4) +static void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2, const float *wa3, const float *wa4) { - static constexpr float tr11 = 0.309016994374947f; - static constexpr float ti11 = 0.951056516295154f; - static constexpr float tr12 = -0.809016994374947f; - static constexpr float ti12 = 0.587785252292473f; + const v4sf tr11{LD_PS1(0.309016994374947f)}; + const v4sf ti11{LD_PS1(0.951056516295154f)}; + const v4sf tr12{LD_PS1(-0.809016994374947f)}; + const v4sf ti12{LD_PS1(0.587785252292473f)}; #define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1] #define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1] /* Parameter adjustments */ - const int ch_offset = 1 + ido * 6; + const int ch_offset{1 + ido * 6}; ch -= ch_offset; - const int cc_offset = 1 + ido * (1 + l1); + const int cc_offset{1 + ido * (1 + l1)}; cc -= cc_offset; /* Function Body */ - for(int k = 1; k <= l1; ++k) + for(int k{1};k <= l1;++k) { - v4sf cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2)); - v4sf ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2)); - v4sf cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3)); - v4sf ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3)); + v4sf cr2{VADD(cc_ref(1, k, 5), cc_ref(1, k, 2))}; + v4sf ci5{VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2))}; + v4sf cr3{VADD(cc_ref(1, k, 4), cc_ref(1, k, 3))}; + v4sf ci4{VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3))}; ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3)); - ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3))); - ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4)); - ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3))); - ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4)); + ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(VMUL(tr11, cr2), VMUL(tr12, cr3))); + ch_ref(1, 3, k) = VADD(VMUL(ti11, ci5), VMUL(ti12, ci4)); + ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(VMUL(tr12, cr2), VMUL(tr11, cr3))); + ch_ref(1, 5, k) = VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4)); //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4); } if(ido == 1) return; - const int idp2 = ido + 2; - for(int k = 1; k <= l1; ++k) + const int idp2{ido + 2}; + for(int k{1};k <= l1;++k) { - for(int i = 3; i <= ido; i += 2) + for(int i{3};i <= ido;i += 2) { - const int ic = idp2 - i; - v4sf dr2 = LD_PS1(wa1[i-3]); - v4sf di2 = LD_PS1(wa1[i-2]); - v4sf dr3 = LD_PS1(wa2[i-3]); - v4sf di3 = LD_PS1(wa2[i-2]); - v4sf dr4 = LD_PS1(wa3[i-3]); - v4sf di4 = LD_PS1(wa3[i-2]); - v4sf dr5 = LD_PS1(wa4[i-3]); - v4sf di5 = LD_PS1(wa4[i-2]); + const int ic{idp2 - i}; + v4sf dr2{LD_PS1(wa1[i-3])}; + v4sf di2{LD_PS1(wa1[i-2])}; + v4sf dr3{LD_PS1(wa2[i-3])}; + v4sf di3{LD_PS1(wa2[i-2])}; + v4sf dr4{LD_PS1(wa3[i-3])}; + v4sf di4{LD_PS1(wa3[i-2])}; + v4sf dr5{LD_PS1(wa4[i-3])}; + v4sf di5{LD_PS1(wa4[i-2])}; VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2)); VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3)); VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4)); VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5)); - v4sf cr2 = VADD(dr2, dr5); - v4sf ci5 = VSUB(dr5, dr2); - v4sf cr5 = VSUB(di2, di5); - v4sf ci2 = VADD(di2, di5); - v4sf cr3 = VADD(dr3, dr4); - v4sf ci4 = VSUB(dr4, dr3); - v4sf cr4 = VSUB(di3, di4); - v4sf ci3 = VADD(di3, di4); + v4sf cr2{VADD(dr2, dr5)}; + v4sf ci5{VSUB(dr5, dr2)}; + v4sf cr5{VSUB(di2, 
di5)}; + v4sf ci2{VADD(di2, di5)}; + v4sf cr3{VADD(dr3, dr4)}; + v4sf ci4{VSUB(dr4, dr3)}; + v4sf cr4{VSUB(di3, di4)}; + v4sf ci3{VADD(di3, di4)}; ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3)); - ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));// - v4sf tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3))); - v4sf ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));// - v4sf tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3))); - v4sf ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));// - v4sf tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4)); - v4sf ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4)); - v4sf tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4)); - v4sf ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4)); + ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3)); + v4sf tr2{VADD(cc_ref(i - 1, k, 1), VADD(VMUL(tr11, cr2), VMUL(tr12, cr3)))}; + v4sf ti2{VSUB(cc_ref(i, k, 1), VADD(VMUL(tr11, ci2), VMUL(tr12, ci3)))}; + v4sf tr3{VADD(cc_ref(i - 1, k, 1), VADD(VMUL(tr12, cr2), VMUL(tr11, cr3)))}; + v4sf ti3{VSUB(cc_ref(i, k, 1), VADD(VMUL(tr12, ci2), VMUL(tr11, ci3)))}; + v4sf tr5{VADD(VMUL(ti11, cr5), VMUL(ti12, cr4))}; + v4sf ti5{VADD(VMUL(ti11, ci5), VMUL(ti12, ci4))}; + v4sf tr4{VSUB(VMUL(ti12, cr5), VMUL(ti11, cr4))}; + v4sf ti4{VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4))}; ch_ref(i - 1, 3, k) = VSUB(tr2, tr5); ch_ref(ic - 1, 2, k) = VADD(tr2, tr5); ch_ref(i, 3, k) = VADD(ti2, ti5); @@ -1001,35 +1003,35 @@ static void radf5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch #undef ch_ref } /* radf5 */ -static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1, - const float *wa2, const float *wa3, const float *wa4) +static void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, + const float *wa1, const float *wa2, const float *wa3, const float *wa4) { - static constexpr float tr11 = 0.309016994374947f; - static constexpr float ti11 = 0.951056516295154f; - static constexpr float tr12 = -0.809016994374947f; - static constexpr float ti12 = 0.587785252292473f; + const v4sf tr11{LD_PS1(0.309016994374947f)}; + const v4sf ti11{LD_PS1(0.951056516295154f)}; + const v4sf tr12{LD_PS1(-0.809016994374947f)}; + const v4sf ti12{LD_PS1(0.587785252292473f)}; #define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1] #define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] /* Parameter adjustments */ - const int ch_offset = 1 + ido * (1 + l1); + const int ch_offset{1 + ido*(1 + l1)}; ch -= ch_offset; - const int cc_offset = 1 + ido * 6; + const int cc_offset{1 + ido*6}; cc -= cc_offset; /* Function Body */ - for(int k = 1; k <= l1; ++k) + for(int k{1};k <= l1;++k) { - v4sf ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k)); - v4sf ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k)); - v4sf tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k)); - v4sf tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k)); + v4sf ti5{VADD(cc_ref(1, 3, k), cc_ref(1, 3, k))}; + v4sf ti4{VADD(cc_ref(1, 5, k), cc_ref(1, 5, k))}; + v4sf tr2{VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k))}; + v4sf tr3{VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k))}; ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3)); - v4sf cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3))); - v4sf cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3))); - v4sf ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); - v4sf ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + v4sf 
cr2{VADD(cc_ref(1, 1, k), VADD(VMUL(tr11, tr2), VMUL(tr12, tr3)))}; + v4sf cr3{VADD(cc_ref(1, 1, k), VADD(VMUL(tr12, tr2), VMUL(tr11, tr3)))}; + v4sf ci5{VADD(VMUL(ti11, ti5), VMUL(ti12, ti4))}; + v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))}; ch_ref(1, k, 2) = VSUB(cr2, ci5); ch_ref(1, k, 3) = VSUB(cr3, ci4); ch_ref(1, k, 4) = VADD(cr3, ci4); @@ -1038,38 +1040,38 @@ static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch if(ido == 1) return; - const int idp2 = ido + 2; - for(int k = 1; k <= l1; ++k) + const int idp2{ido + 2}; + for(int k{1};k <= l1;++k) { - for(int i = 3; i <= ido; i += 2) + for(int i{3};i <= ido;i += 2) { - const int ic = idp2 - i; - v4sf ti5 = VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k)); - v4sf ti2 = VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k)); - v4sf ti4 = VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k)); - v4sf ti3 = VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k)); - v4sf tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k)); - v4sf tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k)); - v4sf tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k)); - v4sf tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k)); + const int ic{idp2 - i}; + v4sf ti5{VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k))}; + v4sf ti2{VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k))}; + v4sf ti4{VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k))}; + v4sf ti3{VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k))}; + v4sf tr5{VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))}; + v4sf tr2{VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))}; + v4sf tr4{VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))}; + v4sf tr3{VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))}; ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3)); ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3)); - v4sf cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3))); - v4sf ci2 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3))); - v4sf cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3))); - v4sf ci3 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3))); - v4sf cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); - v4sf ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); - v4sf cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); - v4sf ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); - v4sf dr3 = VSUB(cr3, ci4); - v4sf dr4 = VADD(cr3, ci4); - v4sf di3 = VADD(ci3, cr4); - v4sf di4 = VSUB(ci3, cr4); - v4sf dr5 = VADD(cr2, ci5); - v4sf dr2 = VSUB(cr2, ci5); - v4sf di5 = VSUB(ci2, cr5); - v4sf di2 = VADD(ci2, cr5); + v4sf cr2{VADD(cc_ref(i-1, 1, k), VADD(VMUL(tr11, tr2), VMUL(tr12, tr3)))}; + v4sf ci2{VADD(cc_ref(i , 1, k), VADD(VMUL(tr11, ti2), VMUL(tr12, ti3)))}; + v4sf cr3{VADD(cc_ref(i-1, 1, k), VADD(VMUL(tr12, tr2), VMUL(tr11, tr3)))}; + v4sf ci3{VADD(cc_ref(i , 1, k), VADD(VMUL(tr12, ti2), VMUL(tr11, ti3)))}; + v4sf cr5{VADD(VMUL(ti11, tr5), VMUL(ti12, tr4))}; + v4sf ci5{VADD(VMUL(ti11, ti5), VMUL(ti12, ti4))}; + v4sf cr4{VSUB(VMUL(ti12, tr5), VMUL(ti11, tr4))}; + v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))}; + v4sf dr3{VSUB(cr3, ci4)}; + v4sf dr4{VADD(cr3, ci4)}; + v4sf di3{VADD(ci3, cr4)}; + v4sf di4{VSUB(ci3, cr4)}; + v4sf dr5{VADD(cr2, ci5)}; + v4sf dr2{VSUB(cr2, ci5)}; + v4sf di5{VSUB(ci2, cr5)}; + v4sf di2{VADD(ci2, cr5)}; VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2])); VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2])); VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2])); @@ -1085,45 +1087,52 @@ static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch #undef ch_ref } /* radb5 */ -static 
NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, - const float *wa, const int *ifac) +static NEVER_INLINE(v4sf *) rfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, + v4sf *work2, const float *wa, const int *ifac) { - const v4sf *in = input_readonly; - v4sf *out = (in == work2 ? work1 : work2); - const int nf = ifac[1]; - int l2 = n; - int iw = n-1; - assert(in != out && work1 != work2); - for(int k1 = 1; k1 <= nf; ++k1) + assert(work1 != work2); + + const v4sf *in{input_readonly}; + v4sf *out{in == work2 ? work1 : work2}; + const int nf{ifac[1]}; + int l2{n}; + int iw{n-1}; + for(int k1{1};k1 <= nf;++k1) { - int kh = nf - k1; - int ip = ifac[kh + 2]; - int l1 = l2 / ip; - int ido = n / l2; + int kh{nf - k1}; + int ip{ifac[kh + 2]}; + int l1{l2 / ip}; + int ido{n / l2}; iw -= (ip - 1)*ido; - switch (ip) + switch(ip) { - case 5: { - int ix2 = iw + ido; - int ix3 = ix2 + ido; - int ix4 = ix3 + ido; + case 5: + { + int ix2{iw + ido}; + int ix3{ix2 + ido}; + int ix4{ix3 + ido}; radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); - } break; - case 4: { - int ix2 = iw + ido; - int ix3 = ix2 + ido; + } + break; + case 4: + { + int ix2{iw + ido}; + int ix3{ix2 + ido}; radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]); - } break; - case 3: { - int ix2 = iw + ido; + } + break; + case 3: + { + int ix2{iw + ido}; radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]); - } break; - case 2: - radf2_ps(ido, l1, in, out, &wa[iw]); - break; - default: - assert(0); - break; + } + break; + case 2: + radf2_ps(ido, l1, in, out, &wa[iw]); + break; + default: + assert(0); + break; } l2 = l1; if(out == work2) @@ -1140,43 +1149,50 @@ static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *w return const_cast<v4sf*>(in); /* this is in fact the output .. */ } /* rfftf1 */ -static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, - const float *wa, const int *ifac) +static NEVER_INLINE(v4sf *) rfftb1_ps(const int n, const v4sf *input_readonly, v4sf *work1, + v4sf *work2, const float *wa, const int *ifac) { - const v4sf *in = input_readonly; - v4sf *out = (in == work2 ? work1 : work2); - const int nf = ifac[1]; - int l1 = 1; - int iw = 0; - assert(in != out); - for(int k1=1; k1<=nf; k1++) + assert(work1 != work2); + + const v4sf *in{input_readonly}; + v4sf *out{in == work2 ? 
work1 : work2}; + const int nf{ifac[1]}; + int l1{1}; + int iw{0}; + for(int k1{1};k1 <= nf;++k1) { - int ip = ifac[k1 + 1]; - int l2 = ip*l1; - int ido = n / l2; + int ip{ifac[k1 + 1]}; + int l2{ip*l1}; + int ido{n / l2}; switch(ip) { - case 5: { - int ix2 = iw + ido; - int ix3 = ix2 + ido; - int ix4 = ix3 + ido; + case 5: + { + int ix2{iw + ido}; + int ix3{ix2 + ido}; + int ix4{ix3 + ido}; radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]); - } break; - case 4: { - int ix2 = iw + ido; - int ix3 = ix2 + ido; + } + break; + case 4: + { + int ix2{iw + ido}; + int ix3{ix2 + ido}; radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]); - } break; - case 3: { - int ix2 = iw + ido; + } + break; + case 3: + { + int ix2{iw + ido}; radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]); - } break; - case 2: - radb2_ps(ido, l1, in, out, &wa[iw]); - break; - default: - assert(0); - break; + } + break; + case 2: + radb2_ps(ido, l1, in, out, &wa[iw]); + break; + default: + assert(0); + break; } l1 = l2; iw += (ip - 1)*ido; @@ -1195,32 +1211,30 @@ static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *w return const_cast<v4sf*>(in); /* this is in fact the output .. */ } -static int decompose(int n, int *ifac, const int *ntryh) +static int decompose(const int n, int *ifac, const int *ntryh) { - int nl = n, nf = 0; - for(int j=0; ntryh[j]; ++j) + int nl{n}, nf{0}; + for(int j{0};ntryh[j];++j) { - const int ntry = ntryh[j]; + const int ntry{ntryh[j]}; while(nl != 1) { - int nq = nl / ntry; - int nr = nl - ntry*nq; - if(nr == 0) + const int nq{nl / ntry}; + const int nr{nl - ntry*nq}; + if(nr != 0) + break; + + ifac[2+nf++] = ntry; + nl = nq; + if(ntry == 2 && nf != 1) { - ifac[2+nf++] = ntry; - nl = nq; - if(ntry == 2 && nf != 1) + for(int i{2};i <= nf;++i) { - for(int i = 2; i <= nf; ++i) - { - int ib = nf - i + 2; - ifac[ib + 1] = ifac[ib]; - } - ifac[2] = 2; + int ib{nf - i + 2}; + ifac[ib + 1] = ifac[ib]; } + ifac[2] = 2; } - else - break; } } ifac[0] = n; @@ -1230,28 +1244,28 @@ static int decompose(int n, int *ifac, const int *ntryh) -static void rffti1_ps(int n, float *wa, int *ifac) +static void rffti1_ps(const int n, float *wa, int *ifac) { - static constexpr int ntryh[] = { 4,2,3,5,0 }; - - const int nf = decompose(n,ifac,ntryh); - const double argh = 2.0*al::numbers::pi / n; - int is = 0; - int nfm1 = nf - 1; - int l1 = 1; - for(int k1 = 1; k1 <= nfm1; k1++) + static constexpr int ntryh[]{4,2,3,5,0}; + + const int nf{decompose(n, ifac, ntryh)}; + const double argh{2.0*al::numbers::pi / n}; + int is{0}; + int nfm1{nf - 1}; + int l1{1}; + for(int k1{1};k1 <= nfm1;++k1) { - int ip = ifac[k1 + 1]; - int ld = 0; - int l2 = l1*ip; - int ido = n / l2; - int ipm = ip - 1; - for(int j = 1; j <= ipm; ++j) + const int ip{ifac[k1 + 1]}; + const int l2{l1*ip}; + const int ido{n / l2}; + const int ipm{ip - 1}; + int ld{0}; + for(int j{1};j <= ipm;++j) { - int i = is, fi=0; + int i{is}, fi{0}; ld += l1; - double argld = ld*argh; - for(int ii = 3; ii <= ido; ii += 2) + double argld{ld*argh}; + for(int ii{3};ii <= ido;ii += 2) { i += 2; fi += 1; @@ -1264,25 +1278,25 @@ static void rffti1_ps(int n, float *wa, int *ifac) } } /* rffti1 */ -void cffti1_ps(int n, float *wa, int *ifac) +void cffti1_ps(const int n, float *wa, int *ifac) { - static constexpr int ntryh[] = { 5,3,4,2,0 }; + static constexpr int ntryh[]{5,3,4,2,0}; - const int nf = decompose(n,ifac,ntryh); - const double argh = 2.0*al::numbers::pi / n; - int i = 1; - int l1 = 1; - for(int k1=1; k1<=nf; k1++) + const int 
nf{decompose(n, ifac, ntryh)}; + const double argh{2.0*al::numbers::pi / n}; + int i{1}; + int l1{1}; + for(int k1{1};k1 <= nf;++k1) { - int ip = ifac[k1+1]; - int ld = 0; - int l2 = l1*ip; - int ido = n / l2; - int idot = ido + ido + 2; - int ipm = ip - 1; - for(int j=1; j<=ipm; j++) + const int ip{ifac[k1+1]}; + const int l2{l1*ip}; + const int ido{n / l2}; + const int idot{ido + ido + 2}; + const int ipm{ip - 1}; + int ld{0}; + for(int j{1};j <= ipm;++j) { - int i1 = i, fi = 0; + int i1{i}, fi{0}; wa[i-1] = 1; wa[i] = 0; ld += l1; @@ -1305,43 +1319,49 @@ void cffti1_ps(int n, float *wa, int *ifac) } /* cffti1 */ -v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, - const int *ifac, float fsign) +v4sf *cfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, + const int *ifac, const float fsign) { - const v4sf *in = input_readonly; - v4sf *out = (in == work2 ? work1 : work2); - const int nf = ifac[1]; - int l1 = 1; - int iw = 0; - assert(in != out && work1 != work2); - for(int k1=2; k1<=nf+1; k1++) + assert(work1 != work2); + + const v4sf *in{input_readonly}; + v4sf *out{in == work2 ? work1 : work2}; + const int nf{ifac[1]}; + int l1{1}, iw{0}; + for(int k1{2};k1 <= nf+1;++k1) { - int ip = ifac[k1]; - int l2 = ip*l1; - int ido = n / l2; - int idot = ido + ido; + const int ip{ifac[k1]}; + const int l2{ip*l1}; + const int ido{n / l2}; + const int idot{ido + ido}; switch(ip) { - case 5: { - int ix2 = iw + idot; - int ix3 = ix2 + idot; - int ix4 = ix3 + idot; + case 5: + { + int ix2{iw + idot}; + int ix3{ix2 + idot}; + int ix4{ix3 + idot}; passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], fsign); - } break; - case 4: { - int ix2 = iw + idot; - int ix3 = ix2 + idot; + } + break; + case 4: + { + int ix2{iw + idot}; + int ix3{ix2 + idot}; passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], fsign); - } break; - case 2: - passf2_ps(idot, l1, in, out, &wa[iw], fsign); - break; - case 3: { - int ix2 = iw + idot; + } + break; + case 3: + { + int ix2{iw + idot}; passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], fsign); - } break; - default: - assert(0); + } + break; + case 2: + passf2_ps(idot, l1, in, out, &wa[iw], fsign); + break; + default: + assert(0); } l1 = l2; iw += (ip - 1)*idot; @@ -1362,8 +1382,8 @@ v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, con struct PFFFT_Setup { - int N; - int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) + int N; + int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) int ifac[15]; pffft_transform_t transform; @@ -1384,13 +1404,13 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) else assert((N%(SIMD_SZ*SIMD_SZ)) == 0); - const unsigned int Ncvec = static_cast<unsigned>(transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; - size_t storelen{offsetof(PFFFT_Setup, e[0]) + (2u*Ncvec * sizeof(v4sf))}; + const auto Ncvec = static_cast<unsigned>(transform == PFFFT_REAL ? 
N/2 : N)/SIMD_SZ; + const size_t storelen{offsetof(PFFFT_Setup, e[0]) + (2u*Ncvec * sizeof(v4sf))}; void *store{al_calloc(MALLOC_V4SF_ALIGNMENT, storelen)}; if(!store) return nullptr; - PFFFT_Setup *s = ::new(store) PFFFT_Setup{}; + PFFFT_Setup *s{::new(store) PFFFT_Setup{}}; s->N = N; s->transform = transform; /* nb of complex simd vectors */ @@ -1400,10 +1420,10 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) if constexpr(SIMD_SZ > 1) { al::vector<float,16> e(2u*Ncvec*(SIMD_SZ-1)); - for(int k=0; k < s->Ncvec; ++k) + for(int k{0};k < s->Ncvec;++k) { - size_t i{static_cast<size_t>(k) / SIMD_SZ}; - size_t j{static_cast<size_t>(k) % SIMD_SZ}; + const size_t i{static_cast<size_t>(k) / SIMD_SZ}; + const size_t j{static_cast<size_t>(k) % SIMD_SZ}; for(size_t m{0};m < SIMD_SZ-1;++m) { const double A = -2.0*al::numbers::pi*static_cast<double>(m+1)*k / N; @@ -1419,8 +1439,8 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); /* check that N is decomposable with allowed prime factors */ - int m = 1; - for(int k=0; k < s->ifac[1]; ++k) + int m{1}; + for(int k{0};k < s->ifac[1];++k) m *= s->ifac[2+k]; if(m != N/SIMD_SZ) @@ -1442,17 +1462,18 @@ void pffft_destroy_setup(PFFFT_Setup *s) #if !defined(PFFFT_SIMD_DISABLE) /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ -static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) +static void reversed_copy(const int N, const v4sf *in, const int in_stride, v4sf *out) { v4sf g0, g1; INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride; *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] - for(int k=1; k < N; ++k) + for(int k{1};k < N;++k) { v4sf h0, h1; - INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride; + INTERLEAVE2(in[0], in[1], h0, h1); + in += in_stride; *--out = VSWAPHL(g1, h0); *--out = VSWAPHL(h0, h1); g1 = h1; @@ -1460,20 +1481,20 @@ static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) *--out = VSWAPHL(g1, g0); } -static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) +static void unreversed_copy(const int N, const v4sf *in, v4sf *out, const int out_stride) { - v4sf g0, g1, h0, h1; - g0 = g1 = in[0]; ++in; - for(int k=1; k < N; ++k) + v4sf g0{in[0]}, g1{g0}; + ++in; + for(int k{1};k < N;++k) { - h0 = *in++; h1 = *in++; + v4sf h0{*in++}; v4sf h1{*in++}; g1 = VSWAPHL(g1, h0); h0 = VSWAPHL(h0, h1); UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride; g1 = h1; } - h0 = *in++; h1 = g0; + v4sf h0{*in++}, h1{g0}; g1 = VSWAPHL(g1, h0); h0 = VSWAPHL(h0, h1); UNINTERLEAVE2(h0, g1, out[0], out[1]); @@ -1491,7 +1512,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc const int dk{N/32}; if(direction == PFFFT_FORWARD) { - for(int k=0; k < dk; ++k) + for(int k{0};k < dk;++k) { INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); @@ -1613,10 +1634,10 @@ void pffft_cplx_preprocess(const int Ncvec, const v4sf *in, v4sf *out, const v4s static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, const v4sf *e, v4sf *out) { - v4sf r0, i0, r1, i1, r2, i2, r3, i3; - v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; - r0 = *in0; i0 = *in1; - r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; + v4sf r0{*in0}, i0{*in1}; + v4sf r1{*in++}; v4sf i1{*in++}; + v4sf r2{*in++}; v4sf i2{*in++}; + v4sf r3{*in++}; 
v4sf i3{*in++}; VTRANSPOSE4(r0,r1,r2,r3); VTRANSPOSE4(i0,i1,i2,i3); @@ -1643,10 +1664,10 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf * //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; - sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); - sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); - si0 = VADD(i0,i2); di0 = VSUB(i0,i2); - si1 = VADD(i1,i3); di1 = VSUB(i3,i1); + v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0,r2)}; + v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r3,r1)}; + v4sf si0{VADD(i0,i2)}, di0{VSUB(i0,i2)}; + v4sf si1{VADD(i1,i3)}, di1{VSUB(i3,i1)}; r0 = VADD(sr0, sr1); r3 = VSUB(sr0, sr1); @@ -1667,7 +1688,8 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf * *out++ = i3; } -static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +static NEVER_INLINE(void) pffft_real_finalize(const int Ncvec, const v4sf *in, v4sf *out, + const v4sf *e) { static constexpr float s{al::numbers::sqrt2_v<float>/2.0f}; @@ -1706,9 +1728,10 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o } static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out, - int first) + const bool first) { - v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; + v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]}; + v4sf r2{in[4]}, i2{in[5]}, r3{in[6]}, i3{in[7]}; /* transformation for each column is: * @@ -1722,10 +1745,10 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf * [0 1 -1 0 1 0 0 1] [i3] */ - v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); - v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2); - v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); - v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2); + v4sf sr0{VADD(r0,r3)}, dr0{VSUB(r0,r3)}; + v4sf sr1{VADD(r1,r2)}, dr1{VSUB(r1,r2)}; + v4sf si0{VADD(i0,i3)}, di0{VSUB(i0,i3)}; + v4sf si1{VADD(i1,i2)}, di1{VSUB(i1,i2)}; r0 = VADD(sr0, sr1); r2 = VSUB(sr0, sr1); @@ -1756,9 +1779,10 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *out++ = i3; } -static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) +static NEVER_INLINE(void) pffft_real_preprocess(const int Ncvec, const v4sf *in, v4sf *out, + const v4sf *e) { - static constexpr float s = al::numbers::sqrt2_v<float>; + static constexpr float sqrt2{al::numbers::sqrt2_v<float>}; assert(in != out); const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks @@ -1771,7 +1795,7 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf Xi[k] = VEXTRACT0(in[4*k + 1]); } - pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values + pffft_real_preprocess_4x4(in, e, out+1, true); // will write only 6 values /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] * @@ -1785,34 +1809,30 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf * [ci3] [0 -s 0 s 0 -s 0 -s] */ for(int k{1};k < dk;++k) - pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0); + pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, false); const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]}; const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]}; const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]}; const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]}; out[0] = VSET4(cr0, cr1, cr2, cr3); - const float ci0{ 2*(Xr[1]+Xr[3])}; - const float 
@@ -1771,7 +1795,7 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf
        Xi[k] = VEXTRACT0(in[4*k + 1]);
    }

-    pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
+    pffft_real_preprocess_4x4(in, e, out+1, true); // will write only 6 values

    /* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
     *
@@ -1785,34 +1809,30 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf
     * [ci3] [0 -s 0 s 0 -s 0 -s]
     */
    for(int k{1};k < dk;++k)
-        pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
+        pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, false);

    const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]};
    const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]};
    const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]};
    const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]};
    out[0] = VSET4(cr0, cr1, cr2, cr3);

-    const float ci0{ 2*(Xr[1]+Xr[3])};
-    const float ci1{ s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])};
-    const float ci2{ 2*(Xi[3]-Xi[1])};
-    const float ci3{-s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])};
+    const float ci0{ 2*(Xr[1]+Xr[3])};
+    const float ci1{ sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
+    const float ci2{ 2*(Xi[3]-Xi[1])};
+    const float ci3{-sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
    out[2*Ncvec-1] = VSET4(ci0, ci1, ci2, ci3);
}

-void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput,
-    v4sf *scratch, pffft_direction_t direction, int ordered)
+void pffft_transform_internal(PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput,
+    v4sf *scratch, const pffft_direction_t direction, const bool ordered)
{
    assert(scratch != nullptr);
-    assert(VALIGNED(finput) && VALIGNED(foutput) && VALIGNED(scratch));
+    assert(voutput != scratch);

    const int Ncvec{setup->Ncvec};
    const int nf_odd{setup->ifac[1] & 1};

-    auto *vinput = reinterpret_cast<const v4sf*>(finput);
-    auto *voutput = reinterpret_cast<v4sf*>(foutput);
-    assert(voutput != scratch);
-
    v4sf *buff[2]{voutput, scratch};
    int ib{(nf_odd ^ ordered) ? 1 : 0};

    if(direction == PFFFT_FORWARD)
@@ -1870,21 +1890,18 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo
    if(buff[ib] != voutput)
    {
        /* extra copy required -- this situation should only happen when finput == foutput */
-        assert(finput==foutput);
+        assert(vinput==voutput);
        for(int k{0};k < Ncvec;++k)
        {
            v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
            voutput[2*k] = a; voutput[2*k+1] = b;
        }
-        ib = !ib;
    }
}

void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
    float scaling)
{
-    assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
-
    const int Ncvec{s->Ncvec};
    const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
    const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
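
(Aside: for orientation before the convolution kernel below. pffft_zconvolve_accumulate multiplies two spectra bin-by-bin and accumulates the scaled product into ab. A scalar sketch of the per-bin math on ordinary ordered complex data; the real routine instead works on the unordered z-domain layout, four bins per vector, and special-cases the packed first elements read via VEXTRACT0 below:)

    #include <complex>
    #include <cstddef>

    // ab[k] += a[k]*b[k]*scaling, for every complex bin k (illustrative only)
    void zconvolve_scalar(const std::complex<float> *a, const std::complex<float> *b,
        std::complex<float> *ab, std::size_t nbins, float scaling)
    {
        for(std::size_t k{0};k < nbins;++k)
            ab[k] += a[k]*b[k]*scaling;
    }
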
@@ -1911,12 +1928,12 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
#ifndef ZCONVOLVE_USING_INLINE_ASM
    const v4sf vscal{LD_PS1(scaling)};
#endif
-    float ar1{VEXTRACT0(va[0])};
-    float ai1{VEXTRACT0(va[1])};
-    float br1{VEXTRACT0(vb[0])};
-    float bi1{VEXTRACT0(vb[1])};
-    float abr1{VEXTRACT0(vab[0])};
-    float abi1{VEXTRACT0(vab[1])};
+    const float ar1{VEXTRACT0(va[0])};
+    const float ai1{VEXTRACT0(va[1])};
+    const float br1{VEXTRACT0(vb[0])};
+    const float bi1{VEXTRACT0(vb[1])};
+    const float abr1{VEXTRACT0(vab[0])};
+    const float abi1{VEXTRACT0(vab[1])};

#ifdef ZCONVOLVE_USING_INLINE_ASM
    // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc
    const float *a_{a}, *b_{b}; float *ab_{ab};
@@ -1957,7 +1974,7 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
#else
    // default routine, works fine for non-arm cpus with current compilers

-    for(int i=0; i < Ncvec; i += 2)
+    for(int i{0};i < Ncvec;i += 2)
    {
        v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
        v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
@@ -1980,6 +1997,22 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
}

+void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
+{
+    assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work));
+    pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
+        reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
+        reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, false);
+}
+
+void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
+{
+    assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work));
+    pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
+        reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
+        reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, true);
+}
+
#else // defined(PFFFT_SIMD_DISABLE)

// standard routine using scalar floats, without SIMD stuff.
@@ -1988,25 +2021,25 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out,
    pffft_direction_t direction)
{
-    const int N = setup->N;
+    const int N{setup->N};
    if(setup->transform == PFFFT_COMPLEX)
    {
-        for(int k=0; k < 2*N; ++k)
+        for(int k{0};k < 2*N;++k)
            out[k] = in[k];
        return;
    }
    else if(direction == PFFFT_FORWARD)
    {
-        float x_N = in[N-1];
-        for(int k=N-1; k > 1; --k)
+        float x_N{in[N-1]};
+        for(int k{N-1};k > 1;--k)
            out[k] = in[k-1];
        out[0] = in[0];
        out[1] = x_N;
    }
    else
    {
-        float x_N = in[1];
-        for(int k=1; k < N-1; ++k)
+        float x_N{in[1]};
+        for(int k{1};k < N-1;++k)
            out[k] = in[k+1];
        out[0] = in[0];
        out[N-1] = x_N;
@@ -2015,7 +2048,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out,

#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output,
-    float *scratch, pffft_direction_t direction, int ordered)
+    float *scratch, const pffft_direction_t direction, bool ordered)
{
    const int Ncvec{setup->Ncvec};
    const int nf_odd{setup->ifac[1] & 1};
@@ -2061,12 +2094,11 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo
    {
        // extra copy required -- this situation should happens only when finput == foutput
        assert(input==output);
-        for(int k=0; k < Ncvec; ++k)
+        for(int k{0};k < Ncvec;++k)
        {
-            float a = buff[ib][2*k], b = buff[ib][2*k+1];
+            float a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
            output[2*k] = a; output[2*k+1] = b;
        }
-        ib = !ib;
    }
}
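
(Aside: a worked example of the scalar forward reorder above, assuming the FFTPACK-style half-complex layout the scalar transform produces, with the Nyquist term last:)

    // N = 8, real transform, PFFFT_FORWARD:
    //   in  = [r0, r1, i1, r2, i2, r3, i3, r4]    (r4 = Nyquist bin)
    //   x_N = in[7] = r4
    //   out[k] = in[k-1] for k = 7..2; out[0] = in[0]; out[1] = x_N
    //   out = [r0, r4, r1, i1, r2, i2, r3, i3]
    // i.e. the ordered layout packs the DC and Nyquist reals into the first two slots.
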
@@ -2093,14 +2125,15 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const flo
    }
}

-#endif // defined(PFFFT_SIMD_DISABLE)

void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
-    pffft_transform_internal(setup, input, output, reinterpret_cast<v4sf*>(work), direction, 0);
+    pffft_transform_internal(setup, input, output, work, direction, false);
}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
-    pffft_transform_internal(setup, input, output, reinterpret_cast<v4sf*>(work), direction, 1);
+    pffft_transform_internal(setup, input, output, work, direction, true);
}
+
+#endif // defined(PFFFT_SIMD_DISABLE)
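
(Aside: a minimal usage sketch of the entry points this commit reshuffles. Illustrative only; the buffer sizes and the std::aligned_alloc choice are assumptions, but the 16-byte alignment matches the VALIGNED asserts and al::assume_aligned<16> casts above, and N must decompose into factors of 2, 3 and 5 per the ifac check in pffft_new_setup:)

    #include <cstdlib>
    #include "pffft.h"

    int main()
    {
        constexpr int N{1024}; // 2^10: passes the allowed-prime-factor check

        // pffft wants 16-byte-aligned buffers; a real transform of N samples
        // uses N floats each of input, output, and work space.
        auto *in   = static_cast<float*>(std::aligned_alloc(16, sizeof(float)*N));
        auto *out  = static_cast<float*>(std::aligned_alloc(16, sizeof(float)*N));
        auto *work = static_cast<float*>(std::aligned_alloc(16, sizeof(float)*N));

        PFFFT_Setup *setup{pffft_new_setup(N, PFFFT_REAL)};
        if(setup && in && out && work)
        {
            for(int i{0};i < N;++i) in[i] = 0.0f; // fill with real input samples
            pffft_transform_ordered(setup, in, out, work, PFFFT_FORWARD);
            // out now holds the ordered spectrum; PFFFT_BACKWARD inverts it,
            // unnormalized (scale by 1.0f/N to round-trip).
            pffft_destroy_setup(setup);
        }
        std::free(work); std::free(out); std::free(in);
        return 0;
    }
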