path: root/common
author    Chris Robinson <[email protected]>    2023-10-09 01:29:14 -0700
committer Chris Robinson <[email protected]>    2023-10-09 01:29:14 -0700
commit    60ed9ec8bad22cc904ff0dec9b6d7dfe3c704e56 (patch)
tree      60c38d0a4f0f6648c20a7f66611af9f1a451fbf8 /common
parent    9cbf4d99231bf495a23cb78be504bd9ffd29eadd (diff)
Cleanup PFFFT
Make stylization more consistent. Remove SVMUL (the SIMD implementations all simulated it with an LD_PS1 on the scalar). Avoid calling LD_PS1 on the same value in a loop.
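
In practice the SVMUL removal and the LD_PS1 hoisting are two halves of the same fix. A minimal sketch of the pattern (the scalar name and loop shape here are illustrative stand-ins, not lines from the patch):

    /* Before: the SIMD paths expanded SVMUL(f,v) to VMUL(LD_PS1(f), v),
     * re-broadcasting the scalar on every iteration. */
    for(int i{0};i < n;++i)
        out[i] = SVMUL(taur, in[i]);

    /* After: broadcast once into a vector constant outside the loop,
     * then use the plain VMUL. */
    const v4sf vtaur{LD_PS1(taur)};
    for(int i{0};i < n;++i)
        out[i] = VMUL(vtaur, in[i]);

The passf*/radf*/radb* hunks below apply exactly this hoisting to vsign, vtaur, vtaui, minus_one, minus_two, minus_hsqt2 and the like.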
Diffstat (limited to 'common')
-rw-r--r-- common/pffft.cpp | 1175
1 file changed, 604 insertions(+), 571 deletions(-)
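
The "stylization" part is mostly a move to brace initialization and a tighter for-loop spelling, visible throughout the hunks below, e.g. (taken from the passf2_ps hunk):

    -            v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
    +            v4sf tr2{VSUB(cc[i+0], cc[i+ido+0])};

    -        for(int k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido)
    +        for(int k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)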
diff --git a/common/pffft.cpp b/common/pffft.cpp
index 883e44f0..8eb5a19b 100644
--- a/common/pffft.cpp
+++ b/common/pffft.cpp
@@ -68,6 +68,7 @@
#include "albit.h"
#include "almalloc.h"
#include "alnumbers.h"
+#include "opthelpers.h"
#include "vector.h"
#if defined(__GNUC__)
@@ -94,7 +95,7 @@
* vectors should be limited to these macros
*/
-// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
+/* Define PFFFT_SIMD_DISABLE if you want to use scalar code instead of SIMD code */
//#define PFFFT_SIMD_DISABLE
#ifndef PFFFT_SIMD_DISABLE
@@ -147,18 +148,18 @@ inline v4sf vset4(float a, float b, float c, float d)
#include <xmmintrin.h>
typedef __m128 v4sf;
#define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
-#define VZERO() _mm_setzero_ps()
-#define VMUL(a,b) _mm_mul_ps(a,b)
-#define VADD(a,b) _mm_add_ps(a,b)
+#define VZERO _mm_setzero_ps
+#define VMUL _mm_mul_ps
+#define VADD _mm_add_ps
#define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
-#define VSUB(a,b) _mm_sub_ps(a,b)
-#define LD_PS1(p) _mm_set1_ps(p)
+#define VSUB _mm_sub_ps
+#define LD_PS1 _mm_set1_ps
#define VSET4 _mm_setr_ps
#define VINSERT0(v, a) _mm_move_ss((v), _mm_set_ss(a))
#define VEXTRACT0 _mm_cvtss_f32
#define INTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } while(0)
#define UNINTERLEAVE2(in1, in2, out1, out2) do { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } while(0)
-#define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
+#define VTRANSPOSE4 _MM_TRANSPOSE4_PS
#define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0xF) == 0)
@@ -171,11 +172,11 @@ typedef __m128 v4sf;
typedef float32x4_t v4sf;
#define SIMD_SZ 4
#define VZERO() vdupq_n_f32(0)
-#define VMUL(a,b) vmulq_f32(a,b)
-#define VADD(a,b) vaddq_f32(a,b)
+#define VMUL vmulq_f32
+#define VADD vaddq_f32
#define VMADD(a,b,c) vmlaq_f32(c,a,b)
-#define VSUB(a,b) vsubq_f32(a,b)
-#define LD_PS1(p) vld1q_dup_f32(&(p))
+#define VSUB vsubq_f32
+#define LD_PS1 vdupq_n_f32
inline v4sf vset4(float a, float b, float c, float d)
{
float32x4_t ret{vmovq_n_f32(a)};
@@ -213,7 +214,6 @@ using v4sf [[gnu::vector_size(16), gnu::aligned(16)]] = float;
#define VADD(a,b) ((a) + (b))
#define VMADD(a,b,c) ((a)*(b) + (c))
#define VSUB(a,b) ((a) - (b))
-#define SVMUL(f,v) ((f) * (v))
constexpr v4sf ld_ps1(float a) noexcept { return v4sf{a, a, a, a}; }
#define LD_PS1 ld_ps1
@@ -287,10 +287,6 @@ typedef float v4sf;
// shortcuts for complex multiplications
#define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMADD(ai,br,tmp); } while(0)
#define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VMADD(ai,bi,ar); ai=VSUB(VMUL(ai,br),tmp); } while(0)
-#ifndef SVMUL
-// multiply a scalar with a vector
-#define SVMUL(f,v) VMUL(LD_PS1(f),v)
-#endif
#if !defined(PFFFT_SIMD_DISABLE)
@@ -309,8 +305,8 @@ void validate_pffft_simd()
std::memcpy(&a2_v, f+8, 4*sizeof(float));
std::memcpy(&a3_v, f+12, 4*sizeof(float));
- t_v = a0_v; u_v = a1_v; t_v = VZERO();
- t_f = al::bit_cast<float4>(t_v);
+ t_v = a0_v; u_v = a1_v;
+ t_v = VZERO(); t_f = al::bit_cast<float4>(t_v);
printf("VZERO=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 0, 0, 0, 0);
t_v = VADD(a1_v, a2_v); t_f = al::bit_cast<float4>(t_v);
printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 12, 14, 16, 18);
@@ -357,12 +353,13 @@ int pffft_simd_size() { return SIMD_SZ; }
/*
passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2
*/
-static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign)
+static NEVER_INLINE(void) passf2_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch,
+ const float *wa1, const float fsign)
{
- const int l1ido = l1*ido;
+ const int l1ido{l1*ido};
if(ido <= 2)
{
- for(int k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido)
+ for(int k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
{
ch[0] = VADD(cc[0], cc[ido+0]);
ch[l1ido] = VSUB(cc[0], cc[ido+0]);
@@ -372,13 +369,14 @@ static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c
}
else
{
- for(int k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido)
+ const v4sf vsign{LD_PS1(fsign)};
+ for(int k{0};k < l1ido;k += ido, ch += ido, cc += 2*ido)
{
- for(int i=0; i<ido-1; i+=2)
+ for(int i{0};i < ido-1;i += 2)
{
- v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
- v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
- v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
+ v4sf tr2{VSUB(cc[i+0], cc[i+ido+0])};
+ v4sf ti2{VSUB(cc[i+1], cc[i+ido+1])};
+ v4sf wr{LD_PS1(wa1[i])}, wi{VMUL(vsign, LD_PS1(wa1[i+1]))};
ch[i] = VADD(cc[i+0], cc[i+ido+0]);
ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
VCPLXMUL(tr2, ti2, wr, wi);
@@ -392,30 +390,31 @@ static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c
/*
passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3
*/
-static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1,
- const float *wa2, float fsign)
+static NEVER_INLINE(void) passf3_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch,
+ const float *wa1, const float *wa2, const float fsign)
{
- static constexpr float taur = -0.5f;
- const float taui = 0.866025403784439f*fsign;
- const int l1ido = l1*ido;
assert(ido > 2);
- for(int k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido)
+
+ const v4sf vtaur{LD_PS1(-0.5f)};
+ const v4sf vtaui{LD_PS1(0.866025403784439f*fsign)};
+ const int l1ido{l1*ido};
+ for(int k{0};k < l1ido;k += ido, cc += 3*ido, ch +=ido)
{
- for(int i=0; i<ido-1; i+=2)
+ for(int i{0};i < ido-1;i += 2)
{
- v4sf tr2 = VADD(cc[i+ido], cc[i+2*ido]);
- v4sf cr2 = VADD(cc[i], SVMUL(taur,tr2));
- ch[i] = VADD(cc[i], tr2);
- v4sf ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
- v4sf ci2 = VADD(cc[i +1], SVMUL(taur,ti2));
- ch[i+1] = VADD(cc[i+1], ti2);
- v4sf cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
- v4sf ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
- v4sf dr2 = VSUB(cr2, ci3);
- v4sf dr3 = VADD(cr2, ci3);
- v4sf di2 = VADD(ci2, cr3);
- v4sf di3 = VSUB(ci2, cr3);
- float wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
+ v4sf tr2{VADD(cc[i+ido], cc[i+2*ido])};
+ v4sf cr2{VADD(cc[i], VMUL(vtaur,tr2))};
+ ch[i] = VADD(cc[i], tr2);
+ v4sf ti2{VADD(cc[i+ido+1], cc[i+2*ido+1])};
+ v4sf ci2{VADD(cc[i +1], VMUL(vtaur,ti2))};
+ ch[i+1] = VADD(cc[i+1], ti2);
+ v4sf cr3{VMUL(vtaui, VSUB(cc[i+ido], cc[i+2*ido]))};
+ v4sf ci3{VMUL(vtaui, VSUB(cc[i+ido+1], cc[i+2*ido+1]))};
+ v4sf dr2{VSUB(cr2, ci3)};
+ v4sf dr3{VADD(cr2, ci3)};
+ v4sf di2{VADD(ci2, cr3)};
+ v4sf di3{VSUB(ci2, cr3)};
+ float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
ch[i+l1ido] = dr2;
ch[i+l1ido + 1] = di2;
@@ -426,23 +425,24 @@ static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c
}
} /* passf3 */
-static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1,
- const float *wa2, const float *wa3, float fsign)
+static NEVER_INLINE(void) passf4_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch,
+ const float *wa1, const float *wa2, const float *wa3, const float fsign)
{
- /* isign == -1 for forward transform and +1 for backward transform */
- const int l1ido = l1*ido;
+ /* fsign == -1 for forward transform and +1 for backward transform */
+ const v4sf vsign{LD_PS1(fsign)};
+ const int l1ido{l1*ido};
if(ido == 2)
{
- for(int k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido)
+ for(int k{0};k < l1ido;k += ido, ch += ido, cc += 4*ido)
{
- v4sf tr1 = VSUB(cc[0], cc[2*ido + 0]);
- v4sf tr2 = VADD(cc[0], cc[2*ido + 0]);
- v4sf ti1 = VSUB(cc[1], cc[2*ido + 1]);
- v4sf ti2 = VADD(cc[1], cc[2*ido + 1]);
- v4sf ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
- v4sf tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
- v4sf tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
- v4sf ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
+ v4sf tr1{VSUB(cc[0], cc[2*ido + 0])};
+ v4sf tr2{VADD(cc[0], cc[2*ido + 0])};
+ v4sf ti1{VSUB(cc[1], cc[2*ido + 1])};
+ v4sf ti2{VADD(cc[1], cc[2*ido + 1])};
+ v4sf ti4{VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), vsign)};
+ v4sf tr4{VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), vsign)};
+ v4sf tr3{VADD(cc[ido + 0], cc[3*ido + 0])};
+ v4sf ti3{VADD(cc[ido + 1], cc[3*ido + 1])};
ch[0*l1ido + 0] = VADD(tr2, tr3);
ch[0*l1ido + 1] = VADD(ti2, ti3);
@@ -456,36 +456,36 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c
}
else
{
- for(int k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido)
+ for(int k{0};k < l1ido;k += ido, ch+=ido, cc += 4*ido)
{
- for(int i=0; i<ido-1; i+=2)
+ for(int i{0};i < ido-1;i+=2)
{
- v4sf tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
- v4sf tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
- v4sf ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
- v4sf ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
- v4sf tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
- v4sf ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
- v4sf tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
- v4sf ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
+ v4sf tr1{VSUB(cc[i + 0], cc[i + 2*ido + 0])};
+ v4sf tr2{VADD(cc[i + 0], cc[i + 2*ido + 0])};
+ v4sf ti1{VSUB(cc[i + 1], cc[i + 2*ido + 1])};
+ v4sf ti2{VADD(cc[i + 1], cc[i + 2*ido + 1])};
+ v4sf tr4{VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), vsign)};
+ v4sf ti4{VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), vsign)};
+ v4sf tr3{VADD(cc[i + ido + 0], cc[i + 3*ido + 0])};
+ v4sf ti3{VADD(cc[i + ido + 1], cc[i + 3*ido + 1])};
ch[i] = VADD(tr2, tr3);
- v4sf cr3 = VSUB(tr2, tr3);
+ v4sf cr3{VSUB(tr2, tr3)};
ch[i + 1] = VADD(ti2, ti3);
- v4sf ci3 = VSUB(ti2, ti3);
+ v4sf ci3{VSUB(ti2, ti3)};
- v4sf cr2 = VADD(tr1, tr4);
- v4sf cr4 = VSUB(tr1, tr4);
- v4sf ci2 = VADD(ti1, ti4);
- v4sf ci4 = VSUB(ti1, ti4);
- float wr1=wa1[i], wi1=fsign*wa1[i+1];
+ v4sf cr2{VADD(tr1, tr4)};
+ v4sf cr4{VSUB(tr1, tr4)};
+ v4sf ci2{VADD(ti1, ti4)};
+ v4sf ci4{VSUB(ti1, ti4)};
+ float wr1{wa1[i]}, wi1{fsign*wa1[i+1]};
VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
- float wr2=wa2[i], wi2=fsign*wa2[i+1];
+ float wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
ch[i + l1ido] = cr2;
ch[i + l1ido + 1] = ci2;
VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
- float wr3=wa3[i], wi3=fsign*wa3[i+1];
+ float wr3{wa3[i]}, wi3{fsign*wa3[i+1]};
ch[i + 2*l1ido] = cr3;
ch[i + 2*l1ido + 1] = ci3;
@@ -500,50 +500,50 @@ static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c
/*
* passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5
*/
-static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1,
- const float *wa2, const float *wa3, const float *wa4, float fsign)
+static NEVER_INLINE(void) passf5_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch,
+ const float *wa1, const float *wa2, const float *wa3, const float *wa4, const float fsign)
{
- static constexpr float tr11 = 0.309016994374947f;
- static constexpr float tr12 = -0.809016994374947f;
- const float ti11 = 0.951056516295154f*fsign;
- const float ti12 = 0.587785252292473f*fsign;
+ const v4sf vtr11{LD_PS1(0.309016994374947f)};
+ const v4sf vtr12{LD_PS1(-0.809016994374947f)};
+ const v4sf vti11{LD_PS1(0.951056516295154f*fsign)};
+ const v4sf vti12{LD_PS1(0.587785252292473f*fsign)};
#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + (a_1) + 1]
#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + (a_1) + 1]
assert(ido > 2);
- for(int k = 0; k < l1; ++k, cc += 5*ido, ch += ido)
+ for(int k{0};k < l1;++k, cc += 5*ido, ch += ido)
{
- for(int i = 0; i < ido-1; i += 2)
+ for(int i{0};i < ido-1;i += 2)
{
- v4sf ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5));
- v4sf ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5));
- v4sf ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4));
- v4sf ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4));
- v4sf tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
- v4sf tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
- v4sf tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
- v4sf tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
+ v4sf ti5{VSUB(cc_ref(i , 2), cc_ref(i , 5))};
+ v4sf ti2{VADD(cc_ref(i , 2), cc_ref(i , 5))};
+ v4sf ti4{VSUB(cc_ref(i , 3), cc_ref(i , 4))};
+ v4sf ti3{VADD(cc_ref(i , 3), cc_ref(i , 4))};
+ v4sf tr5{VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5))};
+ v4sf tr2{VADD(cc_ref(i-1, 2), cc_ref(i-1, 5))};
+ v4sf tr4{VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4))};
+ v4sf tr3{VADD(cc_ref(i-1, 3), cc_ref(i-1, 4))};
ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3));
- v4sf cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
- v4sf ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
- v4sf cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
- v4sf ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
- v4sf cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
- v4sf ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
- v4sf cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
- v4sf ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
- v4sf dr3 = VSUB(cr3, ci4);
- v4sf dr4 = VADD(cr3, ci4);
- v4sf di3 = VADD(ci3, cr4);
- v4sf di4 = VSUB(ci3, cr4);
- v4sf dr5 = VADD(cr2, ci5);
- v4sf dr2 = VSUB(cr2, ci5);
- v4sf di5 = VSUB(ci2, cr5);
- v4sf di2 = VADD(ci2, cr5);
- float wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
- float wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
+ v4sf cr2{VADD(cc_ref(i-1, 1), VADD(VMUL(vtr11, tr2),VMUL(vtr12, tr3)))};
+ v4sf ci2{VADD(cc_ref(i , 1), VADD(VMUL(vtr11, ti2),VMUL(vtr12, ti3)))};
+ v4sf cr3{VADD(cc_ref(i-1, 1), VADD(VMUL(vtr12, tr2),VMUL(vtr11, tr3)))};
+ v4sf ci3{VADD(cc_ref(i , 1), VADD(VMUL(vtr12, ti2),VMUL(vtr11, ti3)))};
+ v4sf cr5{VADD(VMUL(vti11, tr5), VMUL(vti12, tr4))};
+ v4sf ci5{VADD(VMUL(vti11, ti5), VMUL(vti12, ti4))};
+ v4sf cr4{VSUB(VMUL(vti12, tr5), VMUL(vti11, tr4))};
+ v4sf ci4{VSUB(VMUL(vti12, ti5), VMUL(vti11, ti4))};
+ v4sf dr3{VSUB(cr3, ci4)};
+ v4sf dr4{VADD(cr3, ci4)};
+ v4sf di3{VADD(ci3, cr4)};
+ v4sf di4{VSUB(ci3, cr4)};
+ v4sf dr5{VADD(cr2, ci5)};
+ v4sf dr2{VSUB(cr2, ci5)};
+ v4sf di5{VSUB(ci2, cr5)};
+ v4sf di2{VADD(ci2, cr5)};
+ float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
+ float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]};
VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
ch_ref(i - 1, 2) = dr2;
ch_ref(i, 2) = di2;
@@ -562,15 +562,13 @@ static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c
#undef cc_ref
}
-static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
- const float *wa1)
+static NEVER_INLINE(void) radf2_ps(const int ido, const int l1, const v4sf *RESTRICT cc,
+ v4sf *RESTRICT ch, const float *wa1)
{
- static constexpr float minus_one = -1.f;
- const int l1ido = l1*ido;
-
- for(int k=0; k < l1ido; k += ido)
+ const int l1ido{l1*ido};
+ for(int k{0};k < l1ido;k += ido)
{
- v4sf a = cc[k], b = cc[k + l1ido];
+ v4sf a{cc[k]}, b{cc[k + l1ido]};
ch[2*k] = VADD(a, b);
ch[2*(k+ido)-1] = VSUB(a, b);
}
@@ -578,12 +576,12 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s
return;
if(ido != 2)
{
- for(int k=0; k < l1ido; k += ido)
+ for(int k{0};k < l1ido;k += ido)
{
- for(int i=2; i<ido; i+=2)
+ for(int i{2};i < ido;i += 2)
{
- v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
- v4sf br = cc[i - 1 + k], bi = cc[i + k];
+ v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]};
+ v4sf br{cc[i - 1 + k]}, bi{cc[i + k]};
VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
ch[i + 2*k] = VADD(bi, ti2);
ch[2*(k+ido) - i] = VSUB(ti2, bi);
@@ -594,41 +592,42 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s
if((ido&1) == 1)
return;
}
- for(int k=0; k < l1ido; k += ido)
+ const v4sf minus_one{LD_PS1(-1.0f)};
+ for(int k{0};k < l1ido;k += ido)
{
- ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
+ ch[2*k + ido] = VMUL(minus_one, cc[ido-1 + k + l1ido]);
ch[2*k + ido-1] = cc[k + ido-1];
}
} /* radf2 */
-static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1)
+static NEVER_INLINE(void) radb2_ps(const int ido, const int l1, const v4sf *cc, v4sf *ch,
+ const float *wa1)
{
- static constexpr float minus_two=-2;
- const int l1ido = l1*ido;
- for(int k=0; k < l1ido; k += ido)
+ const int l1ido{l1*ido};
+ for(int k{0};k < l1ido;k += ido)
{
- v4sf a = cc[2*k];
- v4sf b = cc[2*(k+ido) - 1];
+ v4sf a{cc[2*k]};
+ v4sf b{cc[2*(k+ido) - 1]};
ch[k] = VADD(a, b);
- ch[k + l1ido] =VSUB(a, b);
+ ch[k + l1ido] = VSUB(a, b);
}
if(ido < 2)
return;
if(ido != 2)
{
- for(int k = 0; k < l1ido; k += ido)
+ for(int k{0};k < l1ido;k += ido)
{
- for(int i = 2; i < ido; i += 2)
+ for(int i{2};i < ido;i += 2)
{
- v4sf a = cc[i-1 + 2*k];
- v4sf b = cc[2*(k + ido) - i - 1];
- v4sf c = cc[i+0 + 2*k];
- v4sf d = cc[2*(k + ido) - i + 0];
+ v4sf a{cc[i-1 + 2*k]};
+ v4sf b{cc[2*(k + ido) - i - 1]};
+ v4sf c{cc[i+0 + 2*k]};
+ v4sf d{cc[2*(k + ido) - i + 0]};
ch[i-1 + k] = VADD(a, b);
- v4sf tr2 = VSUB(a, b);
+ v4sf tr2{VSUB(a, b)};
ch[i+0 + k] = VSUB(c, d);
- v4sf ti2 = VADD(c, d);
+ v4sf ti2{VADD(c, d)};
VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
ch[i-1 + k + l1ido] = tr2;
ch[i+0 + k + l1ido] = ti2;
@@ -637,54 +636,55 @@ static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, co
if((ido&1) == 1)
return;
}
- for(int k = 0; k < l1ido; k += ido)
+ const v4sf minus_two{LD_PS1(-2.0f)};
+ for(int k{0};k < l1ido;k += ido)
{
- v4sf a = cc[2*k + ido-1];
- v4sf b = cc[2*k + ido];
+ v4sf a{cc[2*k + ido-1]};
+ v4sf b{cc[2*k + ido]};
ch[k + ido-1] = VADD(a,a);
- ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
+ ch[k + ido-1 + l1ido] = VMUL(minus_two, b);
}
} /* radb2 */
-static void radf3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1,
- const float *wa2)
+static void radf3_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
+ const float *wa1, const float *wa2)
{
- static constexpr float taur = -0.5f;
- static constexpr float taui = 0.866025403784439f;
- for(int k=0; k<l1; k++)
+ const v4sf vtaur{LD_PS1(-0.5f)};
+ const v4sf vtaui{LD_PS1(0.866025403784439f)};
+ for(int k{0};k < l1;++k)
{
- v4sf cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
+ v4sf cr2{VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido])};
ch[3*k*ido] = VADD(cc[k*ido], cr2);
- ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
- ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
+ ch[(3*k+2)*ido] = VMUL(vtaui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
+ ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], VMUL(vtaur, cr2));
}
if(ido == 1)
return;
- for(int k=0; k<l1; k++)
+ for(int k{0};k < l1;++k)
{
- for(int i=2; i<ido; i+=2)
+ for(int i{2};i < ido;i += 2)
{
- const int ic = ido - i;
- v4sf wr1 = LD_PS1(wa1[i - 2]);
- v4sf wi1 = LD_PS1(wa1[i - 1]);
- v4sf dr2 = cc[i - 1 + (k + l1)*ido];
- v4sf di2 = cc[i + (k + l1)*ido];
+ const int ic{ido - i};
+ v4sf wr1{LD_PS1(wa1[i - 2])};
+ v4sf wi1{LD_PS1(wa1[i - 1])};
+ v4sf dr2{cc[i - 1 + (k + l1)*ido]};
+ v4sf di2{cc[i + (k + l1)*ido]};
VCPLXMULCONJ(dr2, di2, wr1, wi1);
- v4sf wr2 = LD_PS1(wa2[i - 2]);
- v4sf wi2 = LD_PS1(wa2[i - 1]);
- v4sf dr3 = cc[i - 1 + (k + l1*2)*ido];
- v4sf di3 = cc[i + (k + l1*2)*ido];
+ v4sf wr2{LD_PS1(wa2[i - 2])};
+ v4sf wi2{LD_PS1(wa2[i - 1])};
+ v4sf dr3{cc[i - 1 + (k + l1*2)*ido]};
+ v4sf di3{cc[i + (k + l1*2)*ido]};
VCPLXMULCONJ(dr3, di3, wr2, wi2);
- v4sf cr2 = VADD(dr2, dr3);
- v4sf ci2 = VADD(di2, di3);
+ v4sf cr2{VADD(dr2, dr3)};
+ v4sf ci2{VADD(di2, di3)};
ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
- v4sf tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
- v4sf ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
- v4sf tr3 = SVMUL(taui, VSUB(di2, di3));
- v4sf ti3 = SVMUL(taui, VSUB(dr3, dr2));
+ v4sf tr2{VADD(cc[i - 1 + k*ido], VMUL(vtaur, cr2))};
+ v4sf ti2{VADD(cc[i + k*ido], VMUL(vtaur, ci2))};
+ v4sf tr3{VMUL(vtaui, VSUB(di2, di3))};
+ v4sf ti3{VMUL(vtaui, VSUB(dr3, dr2))};
ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
@@ -697,39 +697,42 @@ static void radf3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch
static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1,
const float *wa2)
{
- static constexpr float taur = -0.5f;
- static constexpr float taui = 0.866025403784439f;
- static constexpr float taui_2 = taui*2.0f;
+ static constexpr float taur{-0.5f};
+ static constexpr float taui{0.866025403784439f};
+ static constexpr float taui_2{taui*2.0f};
- for(int k=0; k<l1; k++)
+ const v4sf vtaur{LD_PS1(taur)};
+ const v4sf vtaui_2{LD_PS1(taui_2)};
+ for(int k{0};k < l1;++k)
{
v4sf tr2 = cc[ido-1 + (3*k + 1)*ido];
tr2 = VADD(tr2,tr2);
- v4sf cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
+ v4sf cr2 = VMADD(vtaur, tr2, cc[3*k*ido]);
ch[k*ido] = VADD(cc[3*k*ido], tr2);
- v4sf ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
+ v4sf ci3 = VMUL(vtaui_2, cc[(3*k + 2)*ido]);
ch[(k + l1)*ido] = VSUB(cr2, ci3);
ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
}
if(ido == 1)
return;
- for(int k=0; k<l1; k++)
+ const v4sf vtaui{LD_PS1(taui)};
+ for(int k{0};k < l1;++k)
{
- for(int i=2; i<ido; i+=2)
+ for(int i{2};i < ido;i += 2)
{
- const int ic = ido - i;
- v4sf tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
- v4sf cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
+ const int ic{ido - i};
+ v4sf tr2{VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])};
+ v4sf cr2{VMADD(vtaur, tr2, cc[i - 1 + 3*k*ido])};
ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
- v4sf ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
- v4sf ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
+ v4sf ti2{VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido])};
+ v4sf ci2{VMADD(vtaur, ti2, cc[i + 3*k*ido])};
ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
- v4sf cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
- v4sf ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
- v4sf dr2 = VSUB(cr2, ci3);
- v4sf dr3 = VADD(cr2, ci3);
- v4sf di2 = VADD(ci2, cr3);
- v4sf di3 = VSUB(ci2, cr3);
+ v4sf cr3{VMUL(vtaui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]))};
+ v4sf ci3{VMUL(vtaui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]))};
+ v4sf dr2{VSUB(cr2, ci3)};
+ v4sf dr3{VADD(cr2, ci3)};
+ v4sf di2{VADD(ci2, cr3)};
+ v4sf di3{VSUB(ci2, cr3)};
VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
ch[i - 1 + (k + l1)*ido] = dr2;
ch[i + (k + l1)*ido] = di2;
@@ -743,18 +746,17 @@ static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch
static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
const float *RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
{
- static constexpr float minus_hsqt2 = al::numbers::sqrt2_v<float> * -0.5f;
- const int l1ido = l1*ido;
+ const int l1ido{l1*ido};
{
const v4sf *RESTRICT cc_ = cc, *RESTRICT cc_end = cc + l1ido;
v4sf *RESTRICT ch_ = ch;
while(cc != cc_end)
{
// this loop represents between 25% and 40% of total radf4_ps cost !
- v4sf a0 = cc[0], a1 = cc[l1ido];
- v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
- v4sf tr1 = VADD(a1, a3);
- v4sf tr2 = VADD(a0, a2);
+ v4sf a0{cc[0]}, a1{cc[l1ido]};
+ v4sf a2{cc[2*l1ido]}, a3{cc[3*l1ido]};
+ v4sf tr1{VADD(a1, a3)};
+ v4sf tr2{VADD(a0, a2)};
ch[2*ido-1] = VSUB(a0, a2);
ch[2*ido ] = VSUB(a3, a1);
ch[0 ] = VADD(tr1, tr2);
@@ -768,47 +770,45 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s
return;
if(ido != 2)
{
- for(int k = 0; k < l1ido; k += ido)
+ for(int k{0};k < l1ido;k += ido)
{
- const v4sf *RESTRICT pc = cc + 1 + k;
- for(int i=2; i<ido; i += 2, pc += 2)
+ const v4sf *RESTRICT pc{cc + 1 + k};
+ for(int i{2};i < ido;i += 2, pc += 2)
{
- const int ic = ido - i;
- v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
- v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
-
- cr2 = pc[1*l1ido+0];
- ci2 = pc[1*l1ido+1];
- wr=LD_PS1(wa1[i - 2]);
- wi=LD_PS1(wa1[i - 1]);
+ const int ic{ido - i};
+
+ v4sf cr2{pc[1*l1ido+0]};
+ v4sf ci2{pc[1*l1ido+1]};
+ v4sf wr{LD_PS1(wa1[i - 2])};
+ v4sf wi{LD_PS1(wa1[i - 1])};
VCPLXMULCONJ(cr2,ci2,wr,wi);
- cr3 = pc[2*l1ido+0];
- ci3 = pc[2*l1ido+1];
+ v4sf cr3{pc[2*l1ido+0]};
+ v4sf ci3{pc[2*l1ido+1]};
wr = LD_PS1(wa2[i-2]);
wi = LD_PS1(wa2[i-1]);
VCPLXMULCONJ(cr3, ci3, wr, wi);
- cr4 = pc[3*l1ido];
- ci4 = pc[3*l1ido+1];
+ v4sf cr4{pc[3*l1ido]};
+ v4sf ci4{pc[3*l1ido+1]};
wr = LD_PS1(wa3[i-2]);
wi = LD_PS1(wa3[i-1]);
VCPLXMULCONJ(cr4, ci4, wr, wi);
/* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
- tr1 = VADD(cr2,cr4);
- tr4 = VSUB(cr4,cr2);
- tr2 = VADD(pc[0],cr3);
- tr3 = VSUB(pc[0],cr3);
+ v4sf tr1{VADD(cr2,cr4)};
+ v4sf tr4{VSUB(cr4,cr2)};
+ v4sf tr2{VADD(pc[0],cr3)};
+ v4sf tr3{VSUB(pc[0],cr3)};
ch[i - 1 + 4*k] = VADD(tr1,tr2);
ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
- ti1 = VADD(ci2,ci4);
- ti4 = VSUB(ci2,ci4);
+ v4sf ti1{VADD(ci2,ci4)};
+ v4sf ti4{VSUB(ci2,ci4)};
ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
- ti2 = VADD(pc[1],ci3);
- ti3 = VSUB(pc[1],ci3);
+ v4sf ti2{VADD(pc[1],ci3)};
+ v4sf ti3{VSUB(pc[1],ci3)};
ch[i + 4*k] = VADD(ti1, ti2);
ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
@@ -818,12 +818,13 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s
if((ido&1) == 1)
return;
}
- for(int k=0; k<l1ido; k += ido)
+ const v4sf minus_hsqt2{LD_PS1(al::numbers::sqrt2_v<float> * -0.5f)};
+ for(int k{0};k < l1ido;k += ido)
{
- v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
- v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
- v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
- v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
+ v4sf a{cc[ido-1 + k + l1ido]}, b{cc[ido-1 + k + 3*l1ido]};
+ v4sf c{cc[ido-1 + k]}, d{cc[ido-1 + k + 2*l1ido]};
+ v4sf ti1{VMUL(minus_hsqt2, VADD(a, b))};
+ v4sf tr1{VMUL(minus_hsqt2, VSUB(b, a))};
ch[ido-1 + 4*k] = VADD(tr1, c);
ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
ch[4*k + 1*ido] = VSUB(ti1, d);
@@ -832,23 +833,23 @@ static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4s
} /* radf4 */
-static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf *RESTRICT ch,
- const float *RESTRICT wa1, const float *RESTRICT wa2, const float *RESTRICT wa3)
+static NEVER_INLINE(void) radb4_ps(const int ido, const int l1, const v4sf * RESTRICT cc,
+ v4sf *RESTRICT ch, const float *RESTRICT wa1, const float *RESTRICT wa2,
+ const float *RESTRICT wa3)
{
- static constexpr float minus_sqrt2 = -1.414213562373095f;
- static constexpr float two = 2.f;
- const int l1ido = l1*ido;
+ const v4sf two{LD_PS1(2.0f)};
+ const int l1ido{l1*ido};
{
- const v4sf *RESTRICT cc_ = cc, *RESTRICT ch_end = ch + l1ido;
- v4sf *ch_ = ch;
+ const v4sf *RESTRICT cc_{cc}, *RESTRICT ch_end{ch + l1ido};
+ v4sf *ch_{ch};
while(ch != ch_end)
{
- v4sf a = cc[0], b = cc[4*ido-1];
- v4sf c = cc[2*ido], d = cc[2*ido-1];
- v4sf tr3 = SVMUL(two,d);
- v4sf tr2 = VADD(a,b);
- v4sf tr1 = VSUB(a,b);
- v4sf tr4 = SVMUL(two,c);
+ v4sf a{cc[0]}, b{cc[4*ido-1]};
+ v4sf c{cc[2*ido]}, d{cc[2*ido-1]};
+ v4sf tr3{VMUL(two,d)};
+ v4sf tr2{VADD(a,b)};
+ v4sf tr1{VSUB(a,b)};
+ v4sf tr4{VMUL(two,c)};
ch[0*l1ido] = VADD(tr2, tr3);
ch[2*l1ido] = VSUB(tr2, tr3);
ch[1*l1ido] = VSUB(tr1, tr4);
@@ -862,31 +863,31 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
return;
if(ido != 2)
{
- for(int k = 0; k < l1ido; k += ido)
+ for(int k{0};k < l1ido;k += ido)
{
- const v4sf *RESTRICT pc = cc - 1 + 4*k;
- v4sf *RESTRICT ph = ch + k + 1;
- for(int i = 2; i < ido; i += 2)
+ const v4sf *RESTRICT pc{cc - 1 + 4*k};
+ v4sf *RESTRICT ph{ch + k + 1};
+ for(int i{2};i < ido;i += 2)
{
- v4sf tr1 = VSUB(pc[i], pc[4*ido - i]);
- v4sf tr2 = VADD(pc[i], pc[4*ido - i]);
- v4sf ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
- v4sf tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
+ v4sf tr1{VSUB(pc[i], pc[4*ido - i])};
+ v4sf tr2{VADD(pc[i], pc[4*ido - i])};
+ v4sf ti4{VSUB(pc[2*ido + i], pc[2*ido - i])};
+ v4sf tr3{VADD(pc[2*ido + i], pc[2*ido - i])};
ph[0] = VADD(tr2, tr3);
- v4sf cr3 = VSUB(tr2, tr3);
+ v4sf cr3{VSUB(tr2, tr3)};
- v4sf ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
- v4sf tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
- v4sf cr2 = VSUB(tr1, tr4);
- v4sf cr4 = VADD(tr1, tr4);
+ v4sf ti3{VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1])};
+ v4sf tr4{VADD(pc[2*ido + i + 1], pc[2*ido - i + 1])};
+ v4sf cr2{VSUB(tr1, tr4)};
+ v4sf cr4{VADD(tr1, tr4)};
- v4sf ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
- v4sf ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
+ v4sf ti1{VADD(pc[i + 1], pc[4*ido - i + 1])};
+ v4sf ti2{VSUB(pc[i + 1], pc[4*ido - i + 1])};
ph[1] = VADD(ti2, ti3); ph += l1ido;
- v4sf ci3 = VSUB(ti2, ti3);
- v4sf ci2 = VADD(ti1, ti4);
- v4sf ci4 = VSUB(ti1, ti4);
+ v4sf ci3{VSUB(ti2, ti3)};
+ v4sf ci2{VADD(ti1, ti4)};
+ v4sf ci4{VSUB(ti1, ti4)};
VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
ph[0] = cr2;
ph[1] = ci2; ph += l1ido;
@@ -901,92 +902,93 @@ static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
if((ido&1) == 1)
return;
}
- for(int k=0; k < l1ido; k+=ido)
+ const v4sf minus_sqrt2{LD_PS1(-1.414213562373095f)};
+ for(int k{0};k < l1ido;k += ido)
{
- const int i0 = 4*k + ido;
- v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
- v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
- v4sf tr1 = VSUB(c,d);
- v4sf tr2 = VADD(c,d);
- v4sf ti1 = VADD(b,a);
- v4sf ti2 = VSUB(b,a);
+ const int i0{4*k + ido};
+ v4sf c{cc[i0-1]}, d{cc[i0 + 2*ido-1]};
+ v4sf a{cc[i0+0]}, b{cc[i0 + 2*ido+0]};
+ v4sf tr1{VSUB(c,d)};
+ v4sf tr2{VADD(c,d)};
+ v4sf ti1{VADD(b,a)};
+ v4sf ti2{VSUB(b,a)};
ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
- ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
+ ch[ido-1 + k + 1*l1ido] = VMUL(minus_sqrt2, VSUB(ti1, tr1));
ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
- ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
+ ch[ido-1 + k + 3*l1ido] = VMUL(minus_sqrt2, VADD(ti1, tr1));
}
} /* radb4 */
-static void radf5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1,
- const float *wa2, const float *wa3, const float *wa4)
+static void radf5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
+ const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
- static constexpr float tr11 = 0.309016994374947f;
- static constexpr float ti11 = 0.951056516295154f;
- static constexpr float tr12 = -0.809016994374947f;
- static constexpr float ti12 = 0.587785252292473f;
+ const v4sf tr11{LD_PS1(0.309016994374947f)};
+ const v4sf ti11{LD_PS1(0.951056516295154f)};
+ const v4sf tr12{LD_PS1(-0.809016994374947f)};
+ const v4sf ti12{LD_PS1(0.587785252292473f)};
#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
/* Parameter adjustments */
- const int ch_offset = 1 + ido * 6;
+ const int ch_offset{1 + ido * 6};
ch -= ch_offset;
- const int cc_offset = 1 + ido * (1 + l1);
+ const int cc_offset{1 + ido * (1 + l1)};
cc -= cc_offset;
/* Function Body */
- for(int k = 1; k <= l1; ++k)
+ for(int k{1};k <= l1;++k)
{
- v4sf cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
- v4sf ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
- v4sf cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
- v4sf ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
+ v4sf cr2{VADD(cc_ref(1, k, 5), cc_ref(1, k, 2))};
+ v4sf ci5{VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2))};
+ v4sf cr3{VADD(cc_ref(1, k, 4), cc_ref(1, k, 3))};
+ v4sf ci4{VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3))};
ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
- ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
- ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
- ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
- ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
+ ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(VMUL(tr11, cr2), VMUL(tr12, cr3)));
+ ch_ref(1, 3, k) = VADD(VMUL(ti11, ci5), VMUL(ti12, ci4));
+ ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(VMUL(tr12, cr2), VMUL(tr11, cr3)));
+ ch_ref(1, 5, k) = VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4));
//printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
}
if(ido == 1)
return;
- const int idp2 = ido + 2;
- for(int k = 1; k <= l1; ++k)
+ const int idp2{ido + 2};
+ for(int k{1};k <= l1;++k)
{
- for(int i = 3; i <= ido; i += 2)
+ for(int i{3};i <= ido;i += 2)
{
- const int ic = idp2 - i;
- v4sf dr2 = LD_PS1(wa1[i-3]);
- v4sf di2 = LD_PS1(wa1[i-2]);
- v4sf dr3 = LD_PS1(wa2[i-3]);
- v4sf di3 = LD_PS1(wa2[i-2]);
- v4sf dr4 = LD_PS1(wa3[i-3]);
- v4sf di4 = LD_PS1(wa3[i-2]);
- v4sf dr5 = LD_PS1(wa4[i-3]);
- v4sf di5 = LD_PS1(wa4[i-2]);
+ const int ic{idp2 - i};
+ v4sf dr2{LD_PS1(wa1[i-3])};
+ v4sf di2{LD_PS1(wa1[i-2])};
+ v4sf dr3{LD_PS1(wa2[i-3])};
+ v4sf di3{LD_PS1(wa2[i-2])};
+ v4sf dr4{LD_PS1(wa3[i-3])};
+ v4sf di4{LD_PS1(wa3[i-2])};
+ v4sf dr5{LD_PS1(wa4[i-3])};
+ v4sf di5{LD_PS1(wa4[i-2])};
VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
- v4sf cr2 = VADD(dr2, dr5);
- v4sf ci5 = VSUB(dr5, dr2);
- v4sf cr5 = VSUB(di2, di5);
- v4sf ci2 = VADD(di2, di5);
- v4sf cr3 = VADD(dr3, dr4);
- v4sf ci4 = VSUB(dr4, dr3);
- v4sf cr4 = VSUB(di3, di4);
- v4sf ci3 = VADD(di3, di4);
+ v4sf cr2{VADD(dr2, dr5)};
+ v4sf ci5{VSUB(dr5, dr2)};
+ v4sf cr5{VSUB(di2, di5)};
+ v4sf ci2{VADD(di2, di5)};
+ v4sf cr3{VADD(dr3, dr4)};
+ v4sf ci4{VSUB(dr4, dr3)};
+ v4sf cr4{VSUB(di3, di4)};
+ v4sf ci3{VADD(di3, di4)};
ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
- ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
- v4sf tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
- v4sf ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
- v4sf tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
- v4sf ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
- v4sf tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
- v4sf ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
- v4sf tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
- v4sf ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
+ ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));
+ v4sf tr2{VADD(cc_ref(i - 1, k, 1), VADD(VMUL(tr11, cr2), VMUL(tr12, cr3)))};
+ v4sf ti2{VSUB(cc_ref(i, k, 1), VADD(VMUL(tr11, ci2), VMUL(tr12, ci3)))};
+ v4sf tr3{VADD(cc_ref(i - 1, k, 1), VADD(VMUL(tr12, cr2), VMUL(tr11, cr3)))};
+ v4sf ti3{VSUB(cc_ref(i, k, 1), VADD(VMUL(tr12, ci2), VMUL(tr11, ci3)))};
+ v4sf tr5{VADD(VMUL(ti11, cr5), VMUL(ti12, cr4))};
+ v4sf ti5{VADD(VMUL(ti11, ci5), VMUL(ti12, ci4))};
+ v4sf tr4{VSUB(VMUL(ti12, cr5), VMUL(ti11, cr4))};
+ v4sf ti4{VSUB(VMUL(ti12, ci5), VMUL(ti11, ci4))};
ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
ch_ref(i, 3, k) = VADD(ti2, ti5);
@@ -1001,35 +1003,35 @@ static void radf5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch
#undef ch_ref
} /* radf5 */
-static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, const float *wa1,
- const float *wa2, const float *wa3, const float *wa4)
+static void radb5_ps(const int ido, const int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
+ const float *wa1, const float *wa2, const float *wa3, const float *wa4)
{
- static constexpr float tr11 = 0.309016994374947f;
- static constexpr float ti11 = 0.951056516295154f;
- static constexpr float tr12 = -0.809016994374947f;
- static constexpr float ti12 = 0.587785252292473f;
+ const v4sf tr11{LD_PS1(0.309016994374947f)};
+ const v4sf ti11{LD_PS1(0.951056516295154f)};
+ const v4sf tr12{LD_PS1(-0.809016994374947f)};
+ const v4sf ti12{LD_PS1(0.587785252292473f)};
#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
/* Parameter adjustments */
- const int ch_offset = 1 + ido * (1 + l1);
+ const int ch_offset{1 + ido*(1 + l1)};
ch -= ch_offset;
- const int cc_offset = 1 + ido * 6;
+ const int cc_offset{1 + ido*6};
cc -= cc_offset;
/* Function Body */
- for(int k = 1; k <= l1; ++k)
+ for(int k{1};k <= l1;++k)
{
- v4sf ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
- v4sf ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
- v4sf tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
- v4sf tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
+ v4sf ti5{VADD(cc_ref(1, 3, k), cc_ref(1, 3, k))};
+ v4sf ti4{VADD(cc_ref(1, 5, k), cc_ref(1, 5, k))};
+ v4sf tr2{VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k))};
+ v4sf tr3{VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k))};
ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
- v4sf cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
- v4sf cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
- v4sf ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
- v4sf ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
+ v4sf cr2{VADD(cc_ref(1, 1, k), VADD(VMUL(tr11, tr2), VMUL(tr12, tr3)))};
+ v4sf cr3{VADD(cc_ref(1, 1, k), VADD(VMUL(tr12, tr2), VMUL(tr11, tr3)))};
+ v4sf ci5{VADD(VMUL(ti11, ti5), VMUL(ti12, ti4))};
+ v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))};
ch_ref(1, k, 2) = VSUB(cr2, ci5);
ch_ref(1, k, 3) = VSUB(cr3, ci4);
ch_ref(1, k, 4) = VADD(cr3, ci4);
@@ -1038,38 +1040,38 @@ static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch
if(ido == 1)
return;
- const int idp2 = ido + 2;
- for(int k = 1; k <= l1; ++k)
+ const int idp2{ido + 2};
+ for(int k{1};k <= l1;++k)
{
- for(int i = 3; i <= ido; i += 2)
+ for(int i{3};i <= ido;i += 2)
{
- const int ic = idp2 - i;
- v4sf ti5 = VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k));
- v4sf ti2 = VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k));
- v4sf ti4 = VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k));
- v4sf ti3 = VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k));
- v4sf tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
- v4sf tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
- v4sf tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
- v4sf tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
+ const int ic{idp2 - i};
+ v4sf ti5{VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k))};
+ v4sf ti2{VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k))};
+ v4sf ti4{VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k))};
+ v4sf ti3{VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k))};
+ v4sf tr5{VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
+ v4sf tr2{VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k))};
+ v4sf tr4{VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
+ v4sf tr3{VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k))};
ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
- v4sf cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
- v4sf ci2 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
- v4sf cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
- v4sf ci3 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
- v4sf cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
- v4sf ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
- v4sf cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
- v4sf ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
- v4sf dr3 = VSUB(cr3, ci4);
- v4sf dr4 = VADD(cr3, ci4);
- v4sf di3 = VADD(ci3, cr4);
- v4sf di4 = VSUB(ci3, cr4);
- v4sf dr5 = VADD(cr2, ci5);
- v4sf dr2 = VSUB(cr2, ci5);
- v4sf di5 = VSUB(ci2, cr5);
- v4sf di2 = VADD(ci2, cr5);
+ v4sf cr2{VADD(cc_ref(i-1, 1, k), VADD(VMUL(tr11, tr2), VMUL(tr12, tr3)))};
+ v4sf ci2{VADD(cc_ref(i , 1, k), VADD(VMUL(tr11, ti2), VMUL(tr12, ti3)))};
+ v4sf cr3{VADD(cc_ref(i-1, 1, k), VADD(VMUL(tr12, tr2), VMUL(tr11, tr3)))};
+ v4sf ci3{VADD(cc_ref(i , 1, k), VADD(VMUL(tr12, ti2), VMUL(tr11, ti3)))};
+ v4sf cr5{VADD(VMUL(ti11, tr5), VMUL(ti12, tr4))};
+ v4sf ci5{VADD(VMUL(ti11, ti5), VMUL(ti12, ti4))};
+ v4sf cr4{VSUB(VMUL(ti12, tr5), VMUL(ti11, tr4))};
+ v4sf ci4{VSUB(VMUL(ti12, ti5), VMUL(ti11, ti4))};
+ v4sf dr3{VSUB(cr3, ci4)};
+ v4sf dr4{VADD(cr3, ci4)};
+ v4sf di3{VADD(ci3, cr4)};
+ v4sf di4{VSUB(ci3, cr4)};
+ v4sf dr5{VADD(cr2, ci5)};
+ v4sf dr2{VSUB(cr2, ci5)};
+ v4sf di5{VSUB(ci2, cr5)};
+ v4sf di2{VADD(ci2, cr5)};
VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
@@ -1085,45 +1087,52 @@ static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch
#undef ch_ref
} /* radb5 */
-static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
- const float *wa, const int *ifac)
+static NEVER_INLINE(v4sf *) rfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1,
+ v4sf *work2, const float *wa, const int *ifac)
{
- const v4sf *in = input_readonly;
- v4sf *out = (in == work2 ? work1 : work2);
- const int nf = ifac[1];
- int l2 = n;
- int iw = n-1;
- assert(in != out && work1 != work2);
- for(int k1 = 1; k1 <= nf; ++k1)
+ assert(work1 != work2);
+
+ const v4sf *in{input_readonly};
+ v4sf *out{in == work2 ? work1 : work2};
+ const int nf{ifac[1]};
+ int l2{n};
+ int iw{n-1};
+ for(int k1{1};k1 <= nf;++k1)
{
- int kh = nf - k1;
- int ip = ifac[kh + 2];
- int l1 = l2 / ip;
- int ido = n / l2;
+ int kh{nf - k1};
+ int ip{ifac[kh + 2]};
+ int l1{l2 / ip};
+ int ido{n / l2};
iw -= (ip - 1)*ido;
- switch (ip)
+ switch(ip)
{
- case 5: {
- int ix2 = iw + ido;
- int ix3 = ix2 + ido;
- int ix4 = ix3 + ido;
+ case 5:
+ {
+ int ix2{iw + ido};
+ int ix3{ix2 + ido};
+ int ix4{ix3 + ido};
radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
- } break;
- case 4: {
- int ix2 = iw + ido;
- int ix3 = ix2 + ido;
+ }
+ break;
+ case 4:
+ {
+ int ix2{iw + ido};
+ int ix3{ix2 + ido};
radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
- } break;
- case 3: {
- int ix2 = iw + ido;
+ }
+ break;
+ case 3:
+ {
+ int ix2{iw + ido};
radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
- } break;
- case 2:
- radf2_ps(ido, l1, in, out, &wa[iw]);
- break;
- default:
- assert(0);
- break;
+ }
+ break;
+ case 2:
+ radf2_ps(ido, l1, in, out, &wa[iw]);
+ break;
+ default:
+ assert(0);
+ break;
}
l2 = l1;
if(out == work2)
@@ -1140,43 +1149,50 @@ static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *w
return const_cast<v4sf*>(in); /* this is in fact the output .. */
} /* rfftf1 */
-static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
- const float *wa, const int *ifac)
+static NEVER_INLINE(v4sf *) rfftb1_ps(const int n, const v4sf *input_readonly, v4sf *work1,
+ v4sf *work2, const float *wa, const int *ifac)
{
- const v4sf *in = input_readonly;
- v4sf *out = (in == work2 ? work1 : work2);
- const int nf = ifac[1];
- int l1 = 1;
- int iw = 0;
- assert(in != out);
- for(int k1=1; k1<=nf; k1++)
+ assert(work1 != work2);
+
+ const v4sf *in{input_readonly};
+ v4sf *out{in == work2 ? work1 : work2};
+ const int nf{ifac[1]};
+ int l1{1};
+ int iw{0};
+ for(int k1{1};k1 <= nf;++k1)
{
- int ip = ifac[k1 + 1];
- int l2 = ip*l1;
- int ido = n / l2;
+ int ip{ifac[k1 + 1]};
+ int l2{ip*l1};
+ int ido{n / l2};
switch(ip)
{
- case 5: {
- int ix2 = iw + ido;
- int ix3 = ix2 + ido;
- int ix4 = ix3 + ido;
+ case 5:
+ {
+ int ix2{iw + ido};
+ int ix3{ix2 + ido};
+ int ix4{ix3 + ido};
radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
- } break;
- case 4: {
- int ix2 = iw + ido;
- int ix3 = ix2 + ido;
+ }
+ break;
+ case 4:
+ {
+ int ix2{iw + ido};
+ int ix3{ix2 + ido};
radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
- } break;
- case 3: {
- int ix2 = iw + ido;
+ }
+ break;
+ case 3:
+ {
+ int ix2{iw + ido};
radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
- } break;
- case 2:
- radb2_ps(ido, l1, in, out, &wa[iw]);
- break;
- default:
- assert(0);
- break;
+ }
+ break;
+ case 2:
+ radb2_ps(ido, l1, in, out, &wa[iw]);
+ break;
+ default:
+ assert(0);
+ break;
}
l1 = l2;
iw += (ip - 1)*ido;
@@ -1195,32 +1211,30 @@ static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *w
return const_cast<v4sf*>(in); /* this is in fact the output .. */
}
-static int decompose(int n, int *ifac, const int *ntryh)
+static int decompose(const int n, int *ifac, const int *ntryh)
{
- int nl = n, nf = 0;
- for(int j=0; ntryh[j]; ++j)
+ int nl{n}, nf{0};
+ for(int j{0};ntryh[j];++j)
{
- const int ntry = ntryh[j];
+ const int ntry{ntryh[j]};
while(nl != 1)
{
- int nq = nl / ntry;
- int nr = nl - ntry*nq;
- if(nr == 0)
+ const int nq{nl / ntry};
+ const int nr{nl - ntry*nq};
+ if(nr != 0)
+ break;
+
+ ifac[2+nf++] = ntry;
+ nl = nq;
+ if(ntry == 2 && nf != 1)
{
- ifac[2+nf++] = ntry;
- nl = nq;
- if(ntry == 2 && nf != 1)
+ for(int i{2};i <= nf;++i)
{
- for(int i = 2; i <= nf; ++i)
- {
- int ib = nf - i + 2;
- ifac[ib + 1] = ifac[ib];
- }
- ifac[2] = 2;
+ int ib{nf - i + 2};
+ ifac[ib + 1] = ifac[ib];
}
+ ifac[2] = 2;
}
- else
- break;
}
}
ifac[0] = n;
@@ -1230,28 +1244,28 @@ static int decompose(int n, int *ifac, const int *ntryh)
-static void rffti1_ps(int n, float *wa, int *ifac)
+static void rffti1_ps(const int n, float *wa, int *ifac)
{
- static constexpr int ntryh[] = { 4,2,3,5,0 };
-
- const int nf = decompose(n,ifac,ntryh);
- const double argh = 2.0*al::numbers::pi / n;
- int is = 0;
- int nfm1 = nf - 1;
- int l1 = 1;
- for(int k1 = 1; k1 <= nfm1; k1++)
+ static constexpr int ntryh[]{4,2,3,5,0};
+
+ const int nf{decompose(n, ifac, ntryh)};
+ const double argh{2.0*al::numbers::pi / n};
+ int is{0};
+ int nfm1{nf - 1};
+ int l1{1};
+ for(int k1{1};k1 <= nfm1;++k1)
{
- int ip = ifac[k1 + 1];
- int ld = 0;
- int l2 = l1*ip;
- int ido = n / l2;
- int ipm = ip - 1;
- for(int j = 1; j <= ipm; ++j)
+ const int ip{ifac[k1 + 1]};
+ const int l2{l1*ip};
+ const int ido{n / l2};
+ const int ipm{ip - 1};
+ int ld{0};
+ for(int j{1};j <= ipm;++j)
{
- int i = is, fi=0;
+ int i{is}, fi{0};
ld += l1;
- double argld = ld*argh;
- for(int ii = 3; ii <= ido; ii += 2)
+ double argld{ld*argh};
+ for(int ii{3};ii <= ido;ii += 2)
{
i += 2;
fi += 1;
@@ -1264,25 +1278,25 @@ static void rffti1_ps(int n, float *wa, int *ifac)
}
} /* rffti1 */
-void cffti1_ps(int n, float *wa, int *ifac)
+void cffti1_ps(const int n, float *wa, int *ifac)
{
- static constexpr int ntryh[] = { 5,3,4,2,0 };
+ static constexpr int ntryh[]{5,3,4,2,0};
- const int nf = decompose(n,ifac,ntryh);
- const double argh = 2.0*al::numbers::pi / n;
- int i = 1;
- int l1 = 1;
- for(int k1=1; k1<=nf; k1++)
+ const int nf{decompose(n, ifac, ntryh)};
+ const double argh{2.0*al::numbers::pi / n};
+ int i{1};
+ int l1{1};
+ for(int k1{1};k1 <= nf;++k1)
{
- int ip = ifac[k1+1];
- int ld = 0;
- int l2 = l1*ip;
- int ido = n / l2;
- int idot = ido + ido + 2;
- int ipm = ip - 1;
- for(int j=1; j<=ipm; j++)
+ const int ip{ifac[k1+1]};
+ const int l2{l1*ip};
+ const int ido{n / l2};
+ const int idot{ido + ido + 2};
+ const int ipm{ip - 1};
+ int ld{0};
+ for(int j{1};j <= ipm;++j)
{
- int i1 = i, fi = 0;
+ int i1{i}, fi{0};
wa[i-1] = 1;
wa[i] = 0;
ld += l1;
@@ -1305,43 +1319,49 @@ void cffti1_ps(int n, float *wa, int *ifac)
} /* cffti1 */
-v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa,
- const int *ifac, float fsign)
+v4sf *cfftf1_ps(const int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa,
+ const int *ifac, const float fsign)
{
- const v4sf *in = input_readonly;
- v4sf *out = (in == work2 ? work1 : work2);
- const int nf = ifac[1];
- int l1 = 1;
- int iw = 0;
- assert(in != out && work1 != work2);
- for(int k1=2; k1<=nf+1; k1++)
+ assert(work1 != work2);
+
+ const v4sf *in{input_readonly};
+ v4sf *out{in == work2 ? work1 : work2};
+ const int nf{ifac[1]};
+ int l1{1}, iw{0};
+ for(int k1{2};k1 <= nf+1;++k1)
{
- int ip = ifac[k1];
- int l2 = ip*l1;
- int ido = n / l2;
- int idot = ido + ido;
+ const int ip{ifac[k1]};
+ const int l2{ip*l1};
+ const int ido{n / l2};
+ const int idot{ido + ido};
switch(ip)
{
- case 5: {
- int ix2 = iw + idot;
- int ix3 = ix2 + idot;
- int ix4 = ix3 + idot;
+ case 5:
+ {
+ int ix2{iw + idot};
+ int ix3{ix2 + idot};
+ int ix4{ix3 + idot};
passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], fsign);
- } break;
- case 4: {
- int ix2 = iw + idot;
- int ix3 = ix2 + idot;
+ }
+ break;
+ case 4:
+ {
+ int ix2{iw + idot};
+ int ix3{ix2 + idot};
passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], fsign);
- } break;
- case 2:
- passf2_ps(idot, l1, in, out, &wa[iw], fsign);
- break;
- case 3: {
- int ix2 = iw + idot;
+ }
+ break;
+ case 3:
+ {
+ int ix2{iw + idot};
passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], fsign);
- } break;
- default:
- assert(0);
+ }
+ break;
+ case 2:
+ passf2_ps(idot, l1, in, out, &wa[iw], fsign);
+ break;
+ default:
+ assert(0);
}
l1 = l2;
iw += (ip - 1)*idot;
@@ -1362,8 +1382,8 @@ v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, con
struct PFFFT_Setup {
- int N;
- int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
+ int N;
+ int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
int ifac[15];
pffft_transform_t transform;
@@ -1384,13 +1404,13 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
else
assert((N%(SIMD_SZ*SIMD_SZ)) == 0);
- const unsigned int Ncvec = static_cast<unsigned>(transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
- size_t storelen{offsetof(PFFFT_Setup, e[0]) + (2u*Ncvec * sizeof(v4sf))};
+ const auto Ncvec = static_cast<unsigned>(transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
+ const size_t storelen{offsetof(PFFFT_Setup, e[0]) + (2u*Ncvec * sizeof(v4sf))};
void *store{al_calloc(MALLOC_V4SF_ALIGNMENT, storelen)};
if(!store) return nullptr;
- PFFFT_Setup *s = ::new(store) PFFFT_Setup{};
+ PFFFT_Setup *s{::new(store) PFFFT_Setup{}};
s->N = N;
s->transform = transform;
/* nb of complex simd vectors */
@@ -1400,10 +1420,10 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
if constexpr(SIMD_SZ > 1)
{
al::vector<float,16> e(2u*Ncvec*(SIMD_SZ-1));
- for(int k=0; k < s->Ncvec; ++k)
+ for(int k{0};k < s->Ncvec;++k)
{
- size_t i{static_cast<size_t>(k) / SIMD_SZ};
- size_t j{static_cast<size_t>(k) % SIMD_SZ};
+ const size_t i{static_cast<size_t>(k) / SIMD_SZ};
+ const size_t j{static_cast<size_t>(k) % SIMD_SZ};
for(size_t m{0};m < SIMD_SZ-1;++m)
{
const double A = -2.0*al::numbers::pi*static_cast<double>(m+1)*k / N;
@@ -1419,8 +1439,8 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform)
cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
/* check that N is decomposable with allowed prime factors */
- int m = 1;
- for(int k=0; k < s->ifac[1]; ++k)
+ int m{1};
+ for(int k{0};k < s->ifac[1];++k)
m *= s->ifac[2+k];
if(m != N/SIMD_SZ)
@@ -1442,17 +1462,18 @@ void pffft_destroy_setup(PFFFT_Setup *s)
#if !defined(PFFFT_SIMD_DISABLE)
/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
-static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out)
+static void reversed_copy(const int N, const v4sf *in, const int in_stride, v4sf *out)
{
v4sf g0, g1;
INTERLEAVE2(in[0], in[1], g0, g1);
in += in_stride;
*--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
- for(int k=1; k < N; ++k)
+ for(int k{1};k < N;++k)
{
v4sf h0, h1;
- INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
+ INTERLEAVE2(in[0], in[1], h0, h1);
+ in += in_stride;
*--out = VSWAPHL(g1, h0);
*--out = VSWAPHL(h0, h1);
g1 = h1;
@@ -1460,20 +1481,20 @@ static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out)
*--out = VSWAPHL(g1, g0);
}
-static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride)
+static void unreversed_copy(const int N, const v4sf *in, v4sf *out, const int out_stride)
{
- v4sf g0, g1, h0, h1;
- g0 = g1 = in[0]; ++in;
- for(int k=1; k < N; ++k)
+ v4sf g0{in[0]}, g1{g0};
+ ++in;
+ for(int k{1};k < N;++k)
{
- h0 = *in++; h1 = *in++;
+ v4sf h0{*in++}; v4sf h1{*in++};
g1 = VSWAPHL(g1, h0);
h0 = VSWAPHL(h0, h1);
UNINTERLEAVE2(h0, g1, out[0], out[1]);
out += out_stride;
g1 = h1;
}
- h0 = *in++; h1 = g0;
+ v4sf h0{*in++}, h1{g0};
g1 = VSWAPHL(g1, h0);
h0 = VSWAPHL(h0, h1);
UNINTERLEAVE2(h0, g1, out[0], out[1]);
@@ -1491,7 +1512,7 @@ void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direc
const int dk{N/32};
if(direction == PFFFT_FORWARD)
{
- for(int k=0; k < dk; ++k)
+ for(int k{0};k < dk;++k)
{
INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
@@ -1613,10 +1634,10 @@ void pffft_cplx_preprocess(const int Ncvec, const v4sf *in, v4sf *out, const v4s
static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1,
const v4sf *in, const v4sf *e, v4sf *out)
{
- v4sf r0, i0, r1, i1, r2, i2, r3, i3;
- v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
- r0 = *in0; i0 = *in1;
- r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
+ v4sf r0{*in0}, i0{*in1};
+ v4sf r1{*in++}; v4sf i1{*in++};
+ v4sf r2{*in++}; v4sf i2{*in++};
+ v4sf r3{*in++}; v4sf i3{*in++};
VTRANSPOSE4(r0,r1,r2,r3);
VTRANSPOSE4(i0,i1,i2,i3);
@@ -1643,10 +1664,10 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *
//cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
//cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
- sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
- sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
- si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
- si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
+ v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0,r2)};
+ v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r3,r1)};
+ v4sf si0{VADD(i0,i2)}, di0{VSUB(i0,i2)};
+ v4sf si1{VADD(i1,i3)}, di1{VSUB(i3,i1)};
r0 = VADD(sr0, sr1);
r3 = VSUB(sr0, sr1);
@@ -1667,7 +1688,8 @@ static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *
*out++ = i3;
}
-static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
+static NEVER_INLINE(void) pffft_real_finalize(const int Ncvec, const v4sf *in, v4sf *out,
+ const v4sf *e)
{
static constexpr float s{al::numbers::sqrt2_v<float>/2.0f};
@@ -1706,9 +1728,10 @@ static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *o
}
static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4sf *out,
- int first)
+ const bool first)
{
- v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
+ v4sf r0{in[0]}, i0{in[1]}, r1{in[2]}, i1{in[3]};
+ v4sf r2{in[4]}, i2{in[5]}, r3{in[6]}, i3{in[7]};
/* transformation for each column is:
*
@@ -1722,10 +1745,10 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf
* [0 1 -1 0 1 0 0 1] [i3]
*/
- v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
- v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
- v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
- v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
+ v4sf sr0{VADD(r0,r3)}, dr0{VSUB(r0,r3)};
+ v4sf sr1{VADD(r1,r2)}, dr1{VSUB(r1,r2)};
+ v4sf si0{VADD(i0,i3)}, di0{VSUB(i0,i3)};
+ v4sf si1{VADD(i1,i2)}, di1{VSUB(i1,i2)};
r0 = VADD(sr0, sr1);
r2 = VSUB(sr0, sr1);
@@ -1756,9 +1779,10 @@ static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf
*out++ = i3;
}
-static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
+static NEVER_INLINE(void) pffft_real_preprocess(const int Ncvec, const v4sf *in, v4sf *out,
+ const v4sf *e)
{
- static constexpr float s = al::numbers::sqrt2_v<float>;
+ static constexpr float sqrt2{al::numbers::sqrt2_v<float>};
assert(in != out);
const int dk{Ncvec/SIMD_SZ}; // number of 4x4 matrix blocks
@@ -1771,7 +1795,7 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf
Xi[k] = VEXTRACT0(in[4*k + 1]);
}
- pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
+ pffft_real_preprocess_4x4(in, e, out+1, true); // will write only 6 values
/* [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
*
@@ -1785,34 +1809,30 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf
* [ci3] [0 -s 0 s 0 -s 0 -s]
*/
for(int k{1};k < dk;++k)
- pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
+ pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, false);
const float cr0{(Xr[0]+Xi[0]) + 2*Xr[2]};
const float cr1{(Xr[0]-Xi[0]) - 2*Xi[2]};
const float cr2{(Xr[0]+Xi[0]) - 2*Xr[2]};
const float cr3{(Xr[0]-Xi[0]) + 2*Xi[2]};
out[0] = VSET4(cr0, cr1, cr2, cr3);
- const float ci0{ 2*(Xr[1]+Xr[3])};
- const float ci1{ s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])};
- const float ci2{ 2*(Xi[3]-Xi[1])};
- const float ci3{-s*(Xr[1]-Xr[3]) - s*(Xi[1]+Xi[3])};
+ const float ci0{ 2*(Xr[1]+Xr[3])};
+ const float ci1{ sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
+ const float ci2{ 2*(Xi[3]-Xi[1])};
+ const float ci3{-sqrt2*(Xr[1]-Xr[3]) - sqrt2*(Xi[1]+Xi[3])};
out[2*Ncvec-1] = VSET4(ci0, ci1, ci2, ci3);
}
-void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput,
- v4sf *scratch, pffft_direction_t direction, int ordered)
+void pffft_transform_internal(PFFFT_Setup *setup, const v4sf *vinput, v4sf *voutput,
+ v4sf *scratch, const pffft_direction_t direction, const bool ordered)
{
assert(scratch != nullptr);
- assert(VALIGNED(finput) && VALIGNED(foutput) && VALIGNED(scratch));
+ assert(voutput != scratch);
const int Ncvec{setup->Ncvec};
const int nf_odd{setup->ifac[1] & 1};
- auto *vinput = reinterpret_cast<const v4sf*>(finput);
- auto *voutput = reinterpret_cast<v4sf*>(foutput);
- assert(voutput != scratch);
-
v4sf *buff[2]{voutput, scratch};
int ib{(nf_odd ^ ordered) ? 1 : 0};
if(direction == PFFFT_FORWARD)
@@ -1870,21 +1890,18 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo
if(buff[ib] != voutput)
{
        /* extra copy required -- this situation should only happen when vinput == voutput */
- assert(finput==foutput);
+ assert(vinput==voutput);
for(int k{0};k < Ncvec;++k)
{
v4sf a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
voutput[2*k] = a; voutput[2*k+1] = b;
}
- ib = !ib;
}
}
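
A note on the buffer selection above: each FFT pass ping-pongs between the two buffers, so which buffer holds the final result depends on the parity of the number of passes (tracked by nf_odd, with the ordered path adding one more step). The starting index ib is chosen so the last pass normally writes voutput directly, making the trailing copy the rare case. A minimal sketch of that parity argument, with hypothetical names:

// Hypothetical illustration: P ping-pong passes starting in buff[start]
// finish in buff[(start + P) & 1]. Picking start = P & 1 makes the result
// land in buff[0] (the output buffer) without a final copy.
#include <cassert>

constexpr int final_buffer(const int start, const int passes) noexcept
{ return (start + passes) & 1; }

int main()
{
    for(int passes{0};passes < 8;++passes)
        assert(final_buffer(passes & 1, passes) == 0);
}
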
void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab,
float scaling)
{
- assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
-
const int Ncvec{s->Ncvec};
const v4sf *RESTRICT va{reinterpret_cast<const v4sf*>(a)};
const v4sf *RESTRICT vb{reinterpret_cast<const v4sf*>(b)};
@@ -1911,12 +1928,12 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
#ifndef ZCONVOLVE_USING_INLINE_ASM
const v4sf vscal{LD_PS1(scaling)};
#endif
- float ar1{VEXTRACT0(va[0])};
- float ai1{VEXTRACT0(va[1])};
- float br1{VEXTRACT0(vb[0])};
- float bi1{VEXTRACT0(vb[1])};
- float abr1{VEXTRACT0(vab[0])};
- float abi1{VEXTRACT0(vab[1])};
+ const float ar1{VEXTRACT0(va[0])};
+ const float ai1{VEXTRACT0(va[1])};
+ const float br1{VEXTRACT0(vb[0])};
+ const float bi1{VEXTRACT0(vb[1])};
+ const float abr1{VEXTRACT0(vab[0])};
+ const float abi1{VEXTRACT0(vab[1])};
#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on Ubuntu, so this is restricted to GCC
const float *a_{a}, *b_{b}; float *ab_{ab};
@@ -1957,7 +1974,7 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
#else // default routine, which works fine for non-ARM CPUs with current compilers
- for(int i=0; i < Ncvec; i += 2)
+ for(int i{0};i < Ncvec;i += 2)
{
v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
@@ -1980,6 +1997,22 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
}
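
For reference, this routine computes a frequency-domain multiply-accumulate, ab += a*b*scaling, on the unordered z-domain data. The sketch below is a scalar restatement, not the library's code path; it assumes the interleaved layout used here (each block of 8 floats holds 4 real parts followed by 4 imaginary parts) and the PFFFT convention that, for real transforms, lane 0 of the first two vectors holds the purely real DC and Nyquist bins, which is why those scalars are extracted up front and patched back after the vector loop. The zconvolve_ref name is hypothetical.

// Hypothetical scalar equivalent, for illustration only.
void zconvolve_ref(const float *a, const float *b, float *ab, const int Ncvec,
    const float scaling)
{
    // Compute the purely-real DC and Nyquist products (lane 0 of the
    // first two vectors) before the complex multiply clobbers them.
    const float abr0{ab[0] + a[0]*b[0]*scaling};
    const float abi0{ab[4] + a[4]*b[4]*scaling};
    for(int k{0};k < Ncvec;++k)
    {
        for(int j{0};j < 4;++j)
        {
            const float ar{a[8*k + j]}, ai{a[8*k + 4 + j]};
            const float br{b[8*k + j]}, bi{b[8*k + 4 + j]};
            ab[8*k + j]     += (ar*br - ai*bi)*scaling;
            ab[8*k + 4 + j] += (ar*bi + ai*br)*scaling;
        }
    }
    ab[0] = abr0; // restore the real-only DC term
    ab[4] = abi0; // and the real-only Nyquist term
}
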
+void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
+{
+ assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work));
+ pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
+ reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
+ reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, false);
+}
+
+void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
+{
+ assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work));
+ pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
+ reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
+ reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, true);
+}
+
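
Since these wrappers assert and assume 16-byte alignment, callers must provide aligned buffers. A minimal usage sketch, assuming the standard pffft.h entry points (pffft_new_setup/pffft_destroy_setup) and the usual size constraint that N be a multiple of 32 for SIMD real transforms:

#include "pffft.h"

void forward_real_example()
{
    static constexpr int N{128}; // multiple of 32 for SIMD real transforms
    alignas(16) float input[N]{}, output[N]{}, work[N]{};

    PFFFT_Setup *setup{pffft_new_setup(N, PFFFT_REAL)};
    pffft_transform_ordered(setup, input, output, work, PFFFT_FORWARD);
    pffft_destroy_setup(setup);
}
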
#else // defined(PFFFT_SIMD_DISABLE)
// Standard routines using scalar floats, without SIMD.
@@ -1988,25 +2021,25 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b,
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out,
pffft_direction_t direction)
{
- const int N = setup->N;
+ const int N{setup->N};
if(setup->transform == PFFFT_COMPLEX)
{
- for(int k=0; k < 2*N; ++k)
+ for(int k{0};k < 2*N;++k)
out[k] = in[k];
return;
}
else if(direction == PFFFT_FORWARD)
{
- float x_N = in[N-1];
- for(int k=N-1; k > 1; --k)
+ float x_N{in[N-1]};
+ for(int k{N-1};k > 1;--k)
out[k] = in[k-1];
out[0] = in[0];
out[1] = x_N;
}
else
{
- float x_N = in[1];
- for(int k=1; k < N-1; ++k)
+ float x_N{in[1]};
+ for(int k{1};k < N-1;++k)
out[k] = in[k+1];
out[0] = in[0];
out[N-1] = x_N;
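
For concreteness, the forward permutation with N=8: assuming the internal FFTPACK-style layout [X0, Re X1, Im X1, Re X2, Im X2, Re X3, Im X3, X4] (purely real DC first and Nyquist last), the reorder produces the canonical [X0, X4, Re X1, Im X1, ...] packing, and the inverse branch undoes it. A small self-check of that permutation:

#include <array>
#include <cassert>

int main()
{
    constexpr int N{8};
    const std::array<float,N> in{0, 1, 2, 3, 4, 5, 6, 7};
    std::array<float,N> out{};
    const float x_N{in[N-1]};
    for(int k{N-1};k > 1;--k) out[k] = in[k-1];
    out[0] = in[0]; out[1] = x_N;
    // DC stays at index 0, the last slot (Nyquist) moves to index 1,
    // and the re/im pairs shift up by one.
    assert((out == std::array<float,N>{0, 7, 1, 2, 3, 4, 5, 6}));
}
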
@@ -2015,7 +2048,7 @@ void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out,
#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output,
- float *scratch, pffft_direction_t direction, int ordered)
+ float *scratch, const pffft_direction_t direction, bool ordered)
{
const int Ncvec{setup->Ncvec};
const int nf_odd{setup->ifac[1] & 1};
@@ -2061,12 +2094,11 @@ void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, flo
{
        // extra copy required -- this situation should only happen when input == output
assert(input==output);
- for(int k=0; k < Ncvec; ++k)
+ for(int k{0};k < Ncvec;++k)
{
- float a = buff[ib][2*k], b = buff[ib][2*k+1];
+ float a{buff[ib][2*k]}, b{buff[ib][2*k+1]};
output[2*k] = a; output[2*k+1] = b;
}
- ib = !ib;
}
}
@@ -2093,14 +2125,15 @@ void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const flo
}
}
-#endif // defined(PFFFT_SIMD_DISABLE)
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
- pffft_transform_internal(setup, input, output, reinterpret_cast<v4sf*>(work), direction, 0);
+ pffft_transform_internal(setup, input, output, work, direction, false);
}
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction)
{
- pffft_transform_internal(setup, input, output, reinterpret_cast<v4sf*>(work), direction, 1);
+ pffft_transform_internal(setup, input, output, work, direction, true);
}
+
+#endif // defined(PFFFT_SIMD_DISABLE)