author     Chris Robinson <[email protected]>    2023-10-16 09:01:41 -0700
committer  Chris Robinson <[email protected]>    2023-10-16 09:01:41 -0700
commit     a82c5373667aae8f9e87b9d87ef9d2dec625f2fb (patch)
tree       e746fa69447e1c818962840a3d4536fd670ec260
parent     50fce82c4043d989a6868b13a6930fa31b0cc420 (diff)
Replace some function-like macros with real functions
-rw-r--r--  common/pffft.cpp  253
1 file changed, 136 insertions, 117 deletions
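
The change applies one pattern throughout: each function-like macro becomes an always-inline function, so the helpers gain argument type checking, reference parameters instead of textual substitution, and a natural place for noexcept. A minimal sketch of that pattern, using the VCPLXMUL/vcplxmul pair as the example (the "before" macro body is approximate; v4sf, VMUL, VSUB, VMADD and ALWAYS_INLINE are the file's own SIMD wrappers, shown in the diff below):

    /* Before (approximate): a function-like macro, expanded textually at each use site. */
    #define VCPLXMUL(ar, ai, br, bi) do {             \
            v4sf tmp__{VMUL(ar, bi)};                 \
            (ar) = VSUB(VMUL(ar, br), VMUL(ai, bi));  \
            (ai) = VMADD(ai, br, tmp__);              \
        } while(0)

    /* After, as in the patch: a real function. The (ar, ai) pair is taken by reference
     * and updated in place, and the arguments are now type-checked. */
    ALWAYS_INLINE(void) vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
    {
        v4sf tmp{VMUL(ar, bi)};
        ar = VSUB(VMUL(ar, br), VMUL(ai, bi));
        ai = VMADD(ai, br, tmp);
    }

Call sites change only in spelling, e.g. VCPLXMUL(tr2, ti2, wr, wi) becomes vcplxmul(tr2, ti2, wr, wi).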
diff --git a/common/pffft.cpp b/common/pffft.cpp
index 80d4e9c7..5a6bb4db 100644
--- a/common/pffft.cpp
+++ b/common/pffft.cpp
@@ -116,7 +116,7 @@ typedef vector float v4sf;
#define VMADD vec_madd
#define VSUB vec_sub
#define LD_PS1 vec_splats
-inline v4sf vset4(float a, float b, float c, float d)
+ALWAYS_INLINE(v4sf) vset4(float a, float b, float c, float d) noexcept
{
/* Is there a more efficient way to do this? */
alignas(16) std::array<float,4> vals{{a, b, c, d}};
@@ -125,32 +125,33 @@ inline v4sf vset4(float a, float b, float c, float d)
#define VSET4 vset4
#define VINSERT0(v, a) vec_insert((a), (v), 0)
#define VEXTRACT0(v) vec_extract((v), 0)
+
ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
v4sf tmp{vec_mergeh(in1, in2)};
out2 = vec_mergel(in1, in2);
out1 = tmp;
}
-#define INTERLEAVE2 interleave2
ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
v4sf tmp{vec_perm(in1, in2, (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27))};
out2 = vec_perm(in1, in2, (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31));
out1 = tmp;
}
-#define UNINTERLEAVE2 uninterleave2
-#define VTRANSPOSE4(x0,x1,x2,x3) do { \
- v4sf y0 = vec_mergeh(x0, x2); \
- v4sf y1 = vec_mergel(x0, x2); \
- v4sf y2 = vec_mergeh(x1, x3); \
- v4sf y3 = vec_mergel(x1, x3); \
- x0 = vec_mergeh(y0, y2); \
- x1 = vec_mergel(y0, y2); \
- x2 = vec_mergeh(y1, y3); \
- x3 = vec_mergel(y1, y3); \
-} while(0)
+
+ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
+{
+ v4sf y0{vec_mergeh(x0, x2)};
+ v4sf y1{vec_mergel(x0, x2)};
+ v4sf y2{vec_mergeh(x1, x3)};
+ v4sf y3{vec_mergel(x1, x3)};
+ x0 = vec_mergeh(y0, y2);
+ x1 = vec_mergel(y0, y2);
+ x2 = vec_mergeh(y1, y3);
+ x3 = vec_mergel(y1, y3);
+}
+
#define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
-#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0xF) == 0)
/*
* SSE1 support macros
@@ -170,23 +171,24 @@ typedef __m128 v4sf;
#define VSET4 _mm_setr_ps
#define VINSERT0(v, a) _mm_move_ss((v), _mm_set_ss(a))
#define VEXTRACT0 _mm_cvtss_f32
+
ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
v4sf tmp{_mm_unpacklo_ps(in1, in2)};
out2 = _mm_unpackhi_ps(in1, in2);
out1 = tmp;
}
-#define INTERLEAVE2 interleave2
ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
v4sf tmp{_mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0))};
out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1));
out1 = tmp;
}
-#define UNINTERLEAVE2 uninterleave2
-#define VTRANSPOSE4 _MM_TRANSPOSE4_PS
+
+ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
+{ _MM_TRANSPOSE4_PS(x0, x1, x2, x3); }
+
#define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
-#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0xF) == 0)
/*
* ARM NEON support macros
@@ -213,19 +215,40 @@ ALWAYS_INLINE(v4sf) vset4(float a, float b, float c, float d) noexcept
#define VSET4 vset4
#define VINSERT0(v, a) vsetq_lane_f32((a), (v), 0)
#define VEXTRACT0(v) vgetq_lane_f32((v), 0)
-#define INTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0)
-#define UNINTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0)
-#define VTRANSPOSE4(x0,x1,x2,x3) do { \
- float32x4x2_t t0_ = vzipq_f32(x0, x2); \
- float32x4x2_t t1_ = vzipq_f32(x1, x3); \
- float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
- float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
- x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
-} while(0)
-// marginally faster version
-//#define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+
+ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
+{
+ float32x4x2_t tmp{vzipq_f32(in1, in2)};
+ out1 = tmp.val[0];
+ out2 = tmp.val[1];
+}
+ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
+{
+ float32x4x2_t tmp{vuzpq_f32(in1, in2)};
+ out1 = tmp.val[0];
+ out2 = tmp.val[1];
+}
+
+ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
+{
+ /* marginally faster version:
+ * asm("vtrn.32 %q0, %q1;\n"
+ * "vtrn.32 %q2, %q3\n
+ * "vswp %f0, %e2\n
+ * "vswp %f1, %e3"
+ * : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::);
+ */
+ float32x4x2_t t0_{vzipq_f32(x0, x2)};
+ float32x4x2_t t1_{vzipq_f32(x1, x3)};
+ float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
+ float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
+ x0 = u0_.val[0];
+ x1 = u0_.val[1];
+ x2 = u1_.val[0];
+ x3 = u1_.val[1];
+}
+
#define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
-#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0x3) == 0)
/*
* Generic GCC vector macros
@@ -255,19 +278,16 @@ ALWAYS_INLINE(v4sf) unpackhi(v4sf a, v4sf b) noexcept
ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
- v4sf tmp__{unpacklo(in1, in2)};
+ v4sf tmp{unpacklo(in1, in2)};
out2 = unpackhi(in1, in2);
- out1 = tmp__;
+ out1 = tmp;
}
-#define INTERLEAVE2 interleave2
-
ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
- v4sf tmp__{in1[0], in1[2], in2[0], in2[2]};
+ v4sf tmp{in1[0], in1[2], in2[0], in2[2]};
out2 = v4sf{in1[1], in1[3], in2[1], in2[3]};
- out1 = tmp__;
+ out1 = tmp;
}
-#define UNINTERLEAVE2 uninterleave2
ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
{
@@ -280,14 +300,11 @@ ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]};
x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]};
}
-#define VTRANSPOSE4 vtranspose4
ALWAYS_INLINE(v4sf) vswaphl(v4sf a, v4sf b) noexcept
{ return v4sf{b[0], b[1], a[2], a[3]}; }
#define VSWAPHL vswaphl
-#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0xF) == 0)
-
#else
#warning "building with simd disabled !\n";
@@ -306,9 +323,14 @@ typedef float v4sf;
#define VMADD(a,b,c) ((a)*(b)+(c))
#define VSUB(a,b) ((a)-(b))
#define LD_PS1(p) (p)
-#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0x3) == 0)
#endif
+inline bool valigned(const float *ptr) noexcept
+{
+ static constexpr uintptr_t alignmask{SIMD_SZ*4 - 1};
+ return (reinterpret_cast<uintptr_t>(ptr) & alignmask) == 0;
+}
+
// shortcuts for complex multiplications
ALWAYS_INLINE(void) vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
{
@@ -316,15 +338,12 @@ ALWAYS_INLINE(void) vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
ar = VSUB(VMUL(ar, br), VMUL(ai, bi));
ai = VMADD(ai, br, tmp);
}
-#define VCPLXMUL vcplxmul
-
ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
{
v4sf tmp{VMUL(ar, bi)};
ar = VMADD(ai, bi, VMUL(ar, br));
ai = VSUB(VMUL(ai, br), tmp);
}
-#define VCPLXMULCONJ vcplxmulconj
#if !defined(PFFFT_SIMD_DISABLE)
@@ -352,10 +371,10 @@ ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
t_v = VMADD(a1_v, a2_v,a0_v); t_f = al::bit_cast<float4>(t_v);
printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 32, 46, 62, 80);
- INTERLEAVE2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast<float4>(t_v); u_f = al::bit_cast<float4>(u_v);
+ interleave2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast<float4>(t_v); u_f = al::bit_cast<float4>(u_v);
printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
- UNINTERLEAVE2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast<float4>(t_v); u_f = al::bit_cast<float4>(u_v);
+ uninterleave2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast<float4>(t_v); u_f = al::bit_cast<float4>(u_v);
printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
@@ -365,7 +384,7 @@ ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
t_v = VSWAPHL(a1_v, a2_v); t_f = al::bit_cast<float4>(t_v);
printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
assertv4(t, 8, 9, 6, 7);
- VTRANSPOSE4(a0_v, a1_v, a2_v, a3_v);
+ vtranspose4(a0_v, a1_v, a2_v, a3_v);
a0_f = al::bit_cast<float4>(a0_v);
a1_f = al::bit_cast<float4>(a1_v);
a2_f = al::bit_cast<float4>(a2_v);
@@ -408,7 +427,7 @@ NEVER_INLINE(void) passf2_ps(const size_t ido, const size_t l1, const v4sf *cc,
v4sf wr{LD_PS1(wa1[i])}, wi{LD_PS1(wa1[i+1]*fsign)};
ch[i] = VADD(cc[i+0], cc[i+ido+0]);
ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
- VCPLXMUL(tr2, ti2, wr, wi);
+ vcplxmul(tr2, ti2, wr, wi);
ch[i+l1ido] = tr2;
ch[i+l1ido+1] = ti2;
}
@@ -444,10 +463,10 @@ NEVER_INLINE(void) passf3_ps(const size_t ido, const size_t l1, const v4sf *cc,
v4sf di2{VADD(ci2, cr3)};
v4sf di3{VSUB(ci2, cr3)};
float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
- VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
+ vcplxmul(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
ch[i+l1ido] = dr2;
ch[i+l1ido + 1] = di2;
- VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
+ vcplxmul(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
ch[i+2*l1ido] = dr3;
ch[i+2*l1ido+1] = di3;
}
@@ -508,17 +527,17 @@ NEVER_INLINE(void) passf4_ps(const size_t ido, const size_t l1, const v4sf *cc,
v4sf ci2{VADD(ti1, ti4)};
v4sf ci4{VSUB(ti1, ti4)};
float wr1{wa1[i]}, wi1{fsign*wa1[i+1]};
- VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
+ vcplxmul(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
float wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
ch[i + l1ido] = cr2;
ch[i + l1ido + 1] = ci2;
- VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
+ vcplxmul(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
float wr3{wa3[i]}, wi3{fsign*wa3[i+1]};
ch[i + 2*l1ido] = cr3;
ch[i + 2*l1ido + 1] = ci3;
- VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
+ vcplxmul(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
ch[i + 3*l1ido] = cr4;
ch[i + 3*l1ido + 1] = ci4;
}
@@ -573,16 +592,16 @@ NEVER_INLINE(void) passf5_ps(const size_t ido, const size_t l1, const v4sf *cc,
v4sf di2{VADD(ci2, cr5)};
float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]};
- VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
+ vcplxmul(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
ch_ref(i - 1, 2) = dr2;
ch_ref(i, 2) = di2;
- VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
+ vcplxmul(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
ch_ref(i - 1, 3) = dr3;
ch_ref(i, 3) = di3;
- VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
+ vcplxmul(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
ch_ref(i - 1, 4) = dr4;
ch_ref(i, 4) = di4;
- VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
+ vcplxmul(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
ch_ref(i - 1, 5) = dr5;
ch_ref(i, 5) = di5;
}
@@ -611,7 +630,7 @@ NEVER_INLINE(void) radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTR
{
v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]};
v4sf br{cc[i - 1 + k]}, bi{cc[i + k]};
- VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
+ vcplxmulconj(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
ch[i + 2*k] = VADD(bi, ti2);
ch[2*(k+ido) - i] = VSUB(ti2, bi);
ch[i - 1 + 2*k] = VADD(br, tr2);
@@ -657,7 +676,7 @@ NEVER_INLINE(void) radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v
v4sf tr2{VSUB(a, b)};
ch[i+0 + k] = VSUB(c, d);
v4sf ti2{VADD(c, d)};
- VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
+ vcplxmul(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
ch[i-1 + k + l1ido] = tr2;
ch[i+0 + k + l1ido] = ti2;
}
@@ -698,13 +717,13 @@ void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *
v4sf wi1{LD_PS1(wa1[i - 1])};
v4sf dr2{cc[i - 1 + (k + l1)*ido]};
v4sf di2{cc[i + (k + l1)*ido]};
- VCPLXMULCONJ(dr2, di2, wr1, wi1);
+ vcplxmulconj(dr2, di2, wr1, wi1);
v4sf wr2{LD_PS1(wa2[i - 2])};
v4sf wi2{LD_PS1(wa2[i - 1])};
v4sf dr3{cc[i - 1 + (k + l1*2)*ido]};
v4sf di3{cc[i + (k + l1*2)*ido]};
- VCPLXMULCONJ(dr3, di3, wr2, wi2);
+ vcplxmulconj(dr3, di3, wr2, wi2);
v4sf cr2{VADD(dr2, dr3)};
v4sf ci2{VADD(di2, di3)};
@@ -762,10 +781,10 @@ void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *
v4sf dr3{VADD(cr2, ci3)};
v4sf di2{VADD(ci2, cr3)};
v4sf di3{VSUB(ci2, cr3)};
- VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
+ vcplxmul(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
ch[i - 1 + (k + l1)*ido] = dr2;
ch[i + (k + l1)*ido] = di2;
- VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
+ vcplxmul(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
ch[i - 1 + (k + 2*l1)*ido] = dr3;
ch[i + (k + 2*l1)*ido] = di3;
}
@@ -811,19 +830,19 @@ NEVER_INLINE(void) radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTR
v4sf ci2{pc[1*l1ido+1]};
v4sf wr{LD_PS1(wa1[i - 2])};
v4sf wi{LD_PS1(wa1[i - 1])};
- VCPLXMULCONJ(cr2,ci2,wr,wi);
+ vcplxmulconj(cr2,ci2,wr,wi);
v4sf cr3{pc[2*l1ido+0]};
v4sf ci3{pc[2*l1ido+1]};
wr = LD_PS1(wa2[i-2]);
wi = LD_PS1(wa2[i-1]);
- VCPLXMULCONJ(cr3, ci3, wr, wi);
+ vcplxmulconj(cr3, ci3, wr, wi);
v4sf cr4{pc[3*l1ido]};
v4sf ci4{pc[3*l1ido+1]};
wr = LD_PS1(wa3[i-2]);
wi = LD_PS1(wa3[i-1]);
- VCPLXMULCONJ(cr4, ci4, wr, wi);
+ vcplxmulconj(cr4, ci4, wr, wi);
/* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
@@ -918,13 +937,13 @@ NEVER_INLINE(void) radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTR
v4sf ci3{VSUB(ti2, ti3)};
v4sf ci2{VADD(ti1, ti4)};
v4sf ci4{VSUB(ti1, ti4)};
- VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
+ vcplxmul(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
ph[0] = cr2;
ph[1] = ci2; ph += l1ido;
- VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
+ vcplxmul(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
ph[0] = cr3;
ph[1] = ci3; ph += l1ido;
- VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
+ vcplxmul(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
ph[0] = cr4;
ph[1] = ci4; ph = ph - 3*l1ido + 2;
}
@@ -997,10 +1016,10 @@ void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *
v4sf di4{LD_PS1(wa3[i-2])};
v4sf dr5{LD_PS1(wa4[i-3])};
v4sf di5{LD_PS1(wa4[i-2])};
- VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
- VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
- VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
- VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
+ vcplxmulconj(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
+ vcplxmulconj(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
+ vcplxmulconj(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
+ vcplxmulconj(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
v4sf cr2{VADD(dr2, dr5)};
v4sf ci5{VSUB(dr5, dr2)};
v4sf cr5{VSUB(di2, di5)};
@@ -1102,10 +1121,10 @@ void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *
v4sf dr2{VSUB(cr2, ci5)};
v4sf di5{VSUB(ci2, cr5)};
v4sf di2{VADD(ci2, cr5)};
- VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
- VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
- VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
- VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
+ vcplxmul(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
+ vcplxmul(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
+ vcplxmul(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
+ vcplxmul(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
@@ -1501,14 +1520,14 @@ namespace {
void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *out)
{
v4sf g0, g1;
- INTERLEAVE2(in[0], in[1], g0, g1);
+ interleave2(in[0], in[1], g0, g1);
in += in_stride;
*--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
for(size_t k{1};k < N;++k)
{
v4sf h0, h1;
- INTERLEAVE2(in[0], in[1], h0, h1);
+ interleave2(in[0], in[1], h0, h1);
in += in_stride;
*--out = VSWAPHL(g1, h0);
*--out = VSWAPHL(h0, h1);
@@ -1526,14 +1545,14 @@ void unreversed_copy(const size_t N, const v4sf *in, v4sf *out, const int out_st
v4sf h0{*in++}; v4sf h1{*in++};
g1 = VSWAPHL(g1, h0);
h0 = VSWAPHL(h0, h1);
- UNINTERLEAVE2(h0, g1, out[0], out[1]);
+ uninterleave2(h0, g1, out[0], out[1]);
out += out_stride;
g1 = h1;
}
v4sf h0{*in++}, h1{g0};
g1 = VSWAPHL(g1, h0);
h0 = VSWAPHL(h0, h1);
- UNINTERLEAVE2(h0, g1, out[0], out[1]);
+ uninterleave2(h0, g1, out[0], out[1]);
}
void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
@@ -1547,11 +1566,11 @@ void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *out, const v4
v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
- VTRANSPOSE4(r0,r1,r2,r3);
- VTRANSPOSE4(i0,i1,i2,i3);
- VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
- VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
- VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
+ vtranspose4(r0,r1,r2,r3);
+ vtranspose4(i0,i1,i2,i3);
+ vcplxmul(r1,i1,e[k*6+0],e[k*6+1]);
+ vcplxmul(r2,i2,e[k*6+2],e[k*6+3]);
+ vcplxmul(r3,i3,e[k*6+4],e[k*6+5]);
v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0, r2)};
v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r1, r3)};
@@ -1602,12 +1621,12 @@ void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *out, const
r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
- VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
- VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
- VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
+ vcplxmulconj(r1,i1,e[k*6+0],e[k*6+1]);
+ vcplxmulconj(r2,i2,e[k*6+2],e[k*6+3]);
+ vcplxmulconj(r3,i3,e[k*6+4],e[k*6+5]);
- VTRANSPOSE4(r0,r1,r2,r3);
- VTRANSPOSE4(i0,i1,i2,i3);
+ vtranspose4(r0,r1,r2,r3);
+ vtranspose4(i0,i1,i2,i3);
*out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
*out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
@@ -1622,8 +1641,8 @@ ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, co
v4sf r1{*in++}; v4sf i1{*in++};
v4sf r2{*in++}; v4sf i2{*in++};
v4sf r3{*in++}; v4sf i3{*in++};
- VTRANSPOSE4(r0,r1,r2,r3);
- VTRANSPOSE4(i0,i1,i2,i3);
+ vtranspose4(r0,r1,r2,r3);
+ vtranspose4(i0,i1,i2,i3);
/* transformation for each column is:
*
@@ -1640,9 +1659,9 @@ ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, co
//cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
//cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
- VCPLXMUL(r1,i1,e[0],e[1]);
- VCPLXMUL(r2,i2,e[2],e[3]);
- VCPLXMUL(r3,i3,e[4],e[5]);
+ vcplxmul(r1,i1,e[0],e[1]);
+ vcplxmul(r2,i2,e[2],e[3]);
+ vcplxmul(r3,i3,e[4],e[5]);
//cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
//cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
@@ -1741,12 +1760,12 @@ ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4s
i1 = VSUB(si0, dr1);
i3 = VADD(si0, dr1);
- VCPLXMULCONJ(r1,i1,e[0],e[1]);
- VCPLXMULCONJ(r2,i2,e[2],e[3]);
- VCPLXMULCONJ(r3,i3,e[4],e[5]);
+ vcplxmulconj(r1,i1,e[0],e[1]);
+ vcplxmulconj(r2,i2,e[2],e[3]);
+ vcplxmulconj(r3,i3,e[4],e[5]);
- VTRANSPOSE4(r0,r1,r2,r3);
- VTRANSPOSE4(i0,i1,i2,i3);
+ vtranspose4(r0,r1,r2,r3);
+ vtranspose4(i0,i1,i2,i3);
if(!first)
{
@@ -1831,7 +1850,7 @@ void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf
{
v4sf *tmp{buff[ib]};
for(size_t k=0; k < Ncvec; ++k)
- UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
+ uninterleave2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e);
@@ -1864,7 +1883,7 @@ void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf
pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e);
ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
for(size_t k{0};k < Ncvec;++k)
- INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
+ interleave2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
}
}
@@ -1897,8 +1916,8 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
{
for(size_t k{0};k < dk;++k)
{
- INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
- INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
+ interleave2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
+ interleave2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
}
reversed_copy(dk, vin+2, 8, vout + N/SIMD_SZ/2);
reversed_copy(dk, vin+6, 8, vout + N/SIMD_SZ);
@@ -1907,8 +1926,8 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
{
for(size_t k{0};k < dk;++k)
{
- UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
- UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
+ uninterleave2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
+ uninterleave2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
}
unreversed_copy(dk, vin + N/SIMD_SZ/4, vout + N/SIMD_SZ - 6, -8);
unreversed_copy(dk, vin + 3*N/SIMD_SZ/4, vout + N/SIMD_SZ - 2, -8);
@@ -1921,7 +1940,7 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
for(size_t k{0};k < Ncvec;++k)
{
size_t kk{(k/4) + (k%4)*(Ncvec/4)};
- INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
+ interleave2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
}
}
else
@@ -1929,7 +1948,7 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
for(size_t k{0};k < Ncvec;++k)
{
size_t kk{(k/4) + (k%4)*(Ncvec/4)};
- UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
+ uninterleave2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
}
}
}
@@ -2019,12 +2038,12 @@ void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, cons
{
v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
- VCPLXMUL(ar4, ai4, br4, bi4);
+ vcplxmul(ar4, ai4, br4, bi4);
vab[2*i+0] = VMADD(ar4, vscal, vab[2*i+0]);
vab[2*i+1] = VMADD(ai4, vscal, vab[2*i+1]);
ar4 = va[2*i+2]; ai4 = va[2*i+3];
br4 = vb[2*i+2]; bi4 = vb[2*i+3];
- VCPLXMUL(ar4, ai4, br4, bi4);
+ vcplxmul(ar4, ai4, br4, bi4);
vab[2*i+2] = VMADD(ar4, vscal, vab[2*i+2]);
vab[2*i+3] = VMADD(ai4, vscal, vab[2*i+3]);
}
@@ -2073,12 +2092,12 @@ void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const floa
{
v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
- VCPLXMUL(ar4, ai4, br4, bi4);
+ vcplxmul(ar4, ai4, br4, bi4);
vab[2*i+0] = VADD(ar4, vab[2*i+0]);
vab[2*i+1] = VADD(ai4, vab[2*i+1]);
ar4 = va[2*i+2]; ai4 = va[2*i+3];
br4 = vb[2*i+2]; bi4 = vb[2*i+3];
- VCPLXMUL(ar4, ai4, br4, bi4);
+ vcplxmul(ar4, ai4, br4, bi4);
vab[2*i+2] = VADD(ar4, vab[2*i+2]);
vab[2*i+3] = VADD(ai4, vab[2*i+3]);
}
@@ -2094,7 +2113,7 @@ void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const floa
void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
pffft_direction_t direction)
{
- assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work));
+ assert(valigned(input) && valigned(output) && valigned(work));
pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, false);
@@ -2103,7 +2122,7 @@ void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output
void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
float *work, pffft_direction_t direction)
{
- assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work));
+ assert(valigned(input) && valigned(output) && valigned(work));
pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, true);
@@ -2217,7 +2236,7 @@ void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, cons
{
float ar{a[2*i+0]}, ai{a[2*i+1]};
const float br{b[2*i+0]}, bi{b[2*i+1]};
- VCPLXMUL(ar, ai, br, bi);
+ vcplxmul(ar, ai, br, bi);
ab[2*i+0] += ar*scaling;
ab[2*i+1] += ai*scaling;
}
@@ -2238,7 +2257,7 @@ void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const floa
{
float ar{a[2*i+0]}, ai{a[2*i+1]};
const float br{b[2*i+0]}, bi{b[2*i+1]};
- VCPLXMUL(ar, ai, br, bi);
+ vcplxmul(ar, ai, br, bi);
ab[2*i+0] += ar;
ab[2*i+1] += ai;
}
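
One detail worth noting from the hunks above: the per-ISA VALIGNED macros (mask 0xF on Altivec/SSE, 0x3 on NEON and the scalar path) are replaced by a single valigned() whose mask follows the vector width. A standalone sketch of how that mask works out, assuming SIMD_SZ is 4 on the SIMD paths and 1 on the scalar fallback (the real constant is defined elsewhere in the file):

    #include <cassert>
    #include <cstdint>

    /* Stand-in for the file's own constant: 4 floats per vector on SIMD builds. */
    constexpr std::uintptr_t SIMD_SZ{4};

    inline bool valigned(const float *ptr) noexcept
    {
        /* SIMD_SZ == 4 -> mask 0xF (16-byte vectors); SIMD_SZ == 1 -> mask 0x3 (plain float). */
        static constexpr std::uintptr_t alignmask{SIMD_SZ*4 - 1};
        return (reinterpret_cast<std::uintptr_t>(ptr) & alignmask) == 0;
    }

    int main()
    {
        alignas(16) float buf[8]{};
        assert(valigned(buf));      /* 16-byte aligned start passes */
        assert(!valigned(buf + 1)); /* 4 bytes in fails the 16-byte check */
    }

If SIMD_SZ is indeed 4 for the NEON build, this also tightens the old NEON check from 4-byte to full 16-byte alignment.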