author     Chris Robinson <[email protected]>    2023-10-16 09:01:41 -0700
committer  Chris Robinson <[email protected]>    2023-10-16 09:01:41 -0700
commit     a82c5373667aae8f9e87b9d87ef9d2dec625f2fb (patch)
tree       e746fa69447e1c818962840a3d4536fd670ec260
parent     50fce82c4043d989a6868b13a6930fa31b0cc420 (diff)
Replace some function-like macros with real functions
-rw-r--r--  common/pffft.cpp  253
1 file changed, 136 insertions, 117 deletions
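
The change applies one pattern throughout: each function-like macro becomes an always-inline function, so the helpers gain argument type checking, reference parameters instead of textual substitution, and a natural place for noexcept. A minimal sketch of that pattern, using the VCPLXMUL/vcplxmul pair as the example (the "before" macro body is approximate; v4sf, VMUL, VSUB, VMADD and ALWAYS_INLINE are the file's own SIMD wrappers, shown in the diff below):

    /* Before (approximate): a function-like macro, expanded textually at each use site. */
    #define VCPLXMUL(ar, ai, br, bi) do {             \
            v4sf tmp__{VMUL(ar, bi)};                 \
            (ar) = VSUB(VMUL(ar, br), VMUL(ai, bi));  \
            (ai) = VMADD(ai, br, tmp__);              \
        } while(0)

    /* After, as in the patch: a real function. The (ar, ai) pair is taken by reference
     * and updated in place, and the arguments are now type-checked. */
    ALWAYS_INLINE(void) vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
    {
        v4sf tmp{VMUL(ar, bi)};
        ar = VSUB(VMUL(ar, br), VMUL(ai, bi));
        ai = VMADD(ai, br, tmp);
    }

Call sites change only in spelling, e.g. VCPLXMUL(tr2, ti2, wr, wi) becomes vcplxmul(tr2, ti2, wr, wi).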
diff --git a/common/pffft.cpp b/common/pffft.cpp
index 80d4e9c7..5a6bb4db 100644
--- a/common/pffft.cpp
+++ b/common/pffft.cpp
@@ -116,7 +116,7 @@ typedef vector float v4sf;
#define VMADD vec_madd
#define VSUB vec_sub
#define LD_PS1 vec_splats
-inline v4sf vset4(float a, float b, float c, float d)
+ALWAYS_INLINE(v4sf) vset4(float a, float b, float c, float d) noexcept
{
/* Is there a more efficient way to do this? */
alignas(16) std::array<float,4> vals{{a, b, c, d}};
@@ -125,32 +125,33 @@ inline v4sf vset4(float a, float b, float c, float d)
#define VSET4 vset4
#define VINSERT0(v, a) vec_insert((a), (v), 0)
#define VEXTRACT0(v) vec_extract((v), 0)
+
ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
v4sf tmp{vec_mergeh(in1, in2)};
out2 = vec_mergel(in1, in2);
out1 = tmp;
}
-#define INTERLEAVE2 interleave2
ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
v4sf tmp{vec_perm(in1, in2, (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27))};
out2 = vec_perm(in1, in2, (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31));
out1 = tmp;
}
-#define UNINTERLEAVE2 uninterleave2
-#define VTRANSPOSE4(x0,x1,x2,x3) do { \
- v4sf y0 = vec_mergeh(x0, x2); \
- v4sf y1 = vec_mergel(x0, x2); \
- v4sf y2 = vec_mergeh(x1, x3); \
- v4sf y3 = vec_mergel(x1, x3); \
- x0 = vec_mergeh(y0, y2); \
- x1 = vec_mergel(y0, y2); \
- x2 = vec_mergeh(y1, y3); \
- x3 = vec_mergel(y1, y3); \
-} while(0)
+
+ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
+{
+ v4sf y0{vec_mergeh(x0, x2)};
+ v4sf y1{vec_mergel(x0, x2)};
+ v4sf y2{vec_mergeh(x1, x3)};
+ v4sf y3{vec_mergel(x1, x3)};
+ x0 = vec_mergeh(y0, y2);
+ x1 = vec_mergel(y0, y2);
+ x2 = vec_mergeh(y1, y3);
+ x3 = vec_mergel(y1, y3);
+}
+
#define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
-#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0xF) == 0)
/*
* SSE1 support macros
@@ -170,23 +171,24 @@ typedef __m128 v4sf;
#define VSET4 _mm_setr_ps
#define VINSERT0(v, a) _mm_move_ss((v), _mm_set_ss(a))
#define VEXTRACT0 _mm_cvtss_f32
+
ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
v4sf tmp{_mm_unpacklo_ps(in1, in2)};
out2 = _mm_unpackhi_ps(in1, in2);
out1 = tmp;
}
-#define INTERLEAVE2 interleave2
ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
v4sf tmp{_mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0))};
out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1));
out1 = tmp;
}
-#define UNINTERLEAVE2 uninterleave2
-#define VTRANSPOSE4 _MM_TRANSPOSE4_PS
+
+ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
+{ _MM_TRANSPOSE4_PS(x0, x1, x2, x3); }
+
#define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
-#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0xF) == 0)
/*
* ARM NEON support macros
@@ -213,19 +215,40 @@ ALWAYS_INLINE(v4sf) vset4(float a, float b, float c, float d) noexcept
#define VSET4 vset4
#define VINSERT0(v, a) vsetq_lane_f32((a), (v), 0)
#define VEXTRACT0(v) vgetq_lane_f32((v), 0)
-#define INTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0)
-#define UNINTERLEAVE2(in1, in2, out1, out2) do { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } while(0)
-#define VTRANSPOSE4(x0,x1,x2,x3) do { \
- float32x4x2_t t0_ = vzipq_f32(x0, x2); \
- float32x4x2_t t1_ = vzipq_f32(x1, x3); \
- float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
- float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
- x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
-} while(0)
-// marginally faster version
-//#define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+
+ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
+{
+ float32x4x2_t tmp{vzipq_f32(in1, in2)};
+ out1 = tmp.val[0];
+ out2 = tmp.val[1];
+}
+ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
+{
+ float32x4x2_t tmp{vuzpq_f32(in1, in2)};
+ out1 = tmp.val[0];
+ out2 = tmp.val[1];
+}
+
+ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
+{
+ /* marginally faster version:
+ * asm("vtrn.32 %q0, %q1;\n"
+ * "vtrn.32 %q2, %q3\n
+ * "vswp %f0, %e2\n
+ * "vswp %f1, %e3"
+ * : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::);
+ */
+ float32x4x2_t t0_{vzipq_f32(x0, x2)};
+ float32x4x2_t t1_{vzipq_f32(x1, x3)};
+ float32x4x2_t u0_{vzipq_f32(t0_.val[0], t1_.val[0])};
+ float32x4x2_t u1_{vzipq_f32(t0_.val[1], t1_.val[1])};
+ x0 = u0_.val[0];
+ x1 = u0_.val[1];
+ x2 = u1_.val[0];
+ x3 = u1_.val[1];
+}
+
#define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
-#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0x3) == 0)
/*
* Generic GCC vector macros
@@ -255,19 +278,16 @@ ALWAYS_INLINE(v4sf) unpackhi(v4sf a, v4sf b) noexcept
ALWAYS_INLINE(void) interleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
- v4sf tmp__{unpacklo(in1, in2)};
+ v4sf tmp{unpacklo(in1, in2)};
out2 = unpackhi(in1, in2);
- out1 = tmp__;
+ out1 = tmp;
}
-#define INTERLEAVE2 interleave2
-
ALWAYS_INLINE(void) uninterleave2(v4sf in1, v4sf in2, v4sf &out1, v4sf &out2) noexcept
{
- v4sf tmp__{in1[0], in1[2], in2[0], in2[2]};
+ v4sf tmp{in1[0], in1[2], in2[0], in2[2]};
out2 = v4sf{in1[1], in1[3], in2[1], in2[3]};
- out1 = tmp__;
+ out1 = tmp;
}
-#define UNINTERLEAVE2 uninterleave2
ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
{
@@ -280,14 +300,11 @@ ALWAYS_INLINE(void) vtranspose4(v4sf &x0, v4sf &x1, v4sf &x2, v4sf &x3) noexcept
x2 = v4sf{tmp1[0], tmp1[1], tmp3[0], tmp3[1]};
x3 = v4sf{tmp1[2], tmp1[3], tmp3[2], tmp3[3]};
}
-#define VTRANSPOSE4 vtranspose4
ALWAYS_INLINE(v4sf) vswaphl(v4sf a, v4sf b) noexcept
{ return v4sf{b[0], b[1], a[2], a[3]}; }
#define VSWAPHL vswaphl
-#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0xF) == 0)
-
#else
#warning "building with simd disabled !\n";
@@ -306,9 +323,14 @@ typedef float v4sf;
#define VMADD(a,b,c) ((a)*(b)+(c))
#define VSUB(a,b) ((a)-(b))
#define LD_PS1(p) (p)
-#define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0x3) == 0)
#endif
+inline bool valigned(const float *ptr) noexcept
+{
+ static constexpr uintptr_t alignmask{SIMD_SZ*4 - 1};
+ return (reinterpret_cast<uintptr_t>(ptr) & alignmask) == 0;
+}
+
// shortcuts for complex multiplications
ALWAYS_INLINE(void) vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
{
@@ -316,15 +338,12 @@ ALWAYS_INLINE(void) vcplxmul(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
ar = VSUB(VMUL(ar, br), VMUL(ai, bi));
ai = VMADD(ai, br, tmp);
}
-#define VCPLXMUL vcplxmul
-
ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
{
v4sf tmp{VMUL(ar, bi)};
ar = VMADD(ai, bi, VMUL(ar, br));
ai = VSUB(VMUL(ai, br), tmp);
}
-#define VCPLXMULCONJ vcplxmulconj
#if !defined(PFFFT_SIMD_DISABLE)
@@ -352,10 +371,10 @@ ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
t_v = VMADD(a1_v, a2_v,a0_v); t_f = al::bit_cast<float4>(t_v);
printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]); assertv4(t, 32, 46, 62, 80);
- INTERLEAVE2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast<float4>(t_v); u_f = al::bit_cast<float4>(u_v);
+ interleave2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast<float4>(t_v); u_f = al::bit_cast<float4>(u_v);
printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
- UNINTERLEAVE2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast<float4>(t_v); u_f = al::bit_cast<float4>(u_v);
+ uninterleave2(a1_v,a2_v,t_v,u_v); t_f = al::bit_cast<float4>(t_v); u_f = al::bit_cast<float4>(u_v);
printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3], u_f[0], u_f[1], u_f[2], u_f[3]);
assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
@@ -365,7 +384,7 @@ ALWAYS_INLINE(void) vcplxmulconj(v4sf &ar, v4sf &ai, v4sf br, v4sf bi) noexcept
t_v = VSWAPHL(a1_v, a2_v); t_f = al::bit_cast<float4>(t_v);
printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t_f[0], t_f[1], t_f[2], t_f[3]);
assertv4(t, 8, 9, 6, 7);
- VTRANSPOSE4(a0_v, a1_v, a2_v, a3_v);
+ vtranspose4(a0_v, a1_v, a2_v, a3_v);
a0_f = al::bit_cast<float4>(a0_v);
a1_f = al::bit_cast<float4>(a1_v);
a2_f = al::bit_cast<float4>(a2_v);
@@ -408,7 +427,7 @@ NEVER_INLINE(void) passf2_ps(const size_t ido, const size_t l1, const v4sf *cc,
v4sf wr{LD_PS1(wa1[i])}, wi{LD_PS1(wa1[i+1]*fsign)};
ch[i] = VADD(cc[i+0], cc[i+ido+0]);
ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
- VCPLXMUL(tr2, ti2, wr, wi);
+ vcplxmul(tr2, ti2, wr, wi);
ch[i+l1ido] = tr2;
ch[i+l1ido+1] = ti2;
}
@@ -444,10 +463,10 @@ NEVER_INLINE(void) passf3_ps(const size_t ido, const size_t l1, const v4sf *cc,
v4sf di2{VADD(ci2, cr3)};
v4sf di3{VSUB(ci2, cr3)};
float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
- VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
+ vcplxmul(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
ch[i+l1ido] = dr2;
ch[i+l1ido + 1] = di2;
- VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
+ vcplxmul(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
ch[i+2*l1ido] = dr3;
ch[i+2*l1ido+1] = di3;
}
@@ -508,17 +527,17 @@ NEVER_INLINE(void) passf4_ps(const size_t ido, const size_t l1, const v4sf *cc,
v4sf ci2{VADD(ti1, ti4)};
v4sf ci4{VSUB(ti1, ti4)};
float wr1{wa1[i]}, wi1{fsign*wa1[i+1]};
- VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
+ vcplxmul(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
float wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
ch[i + l1ido] = cr2;
ch[i + l1ido + 1] = ci2;
- VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
+ vcplxmul(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
float wr3{wa3[i]}, wi3{fsign*wa3[i+1]};
ch[i + 2*l1ido] = cr3;
ch[i + 2*l1ido + 1] = ci3;
- VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
+ vcplxmul(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
ch[i + 3*l1ido] = cr4;
ch[i + 3*l1ido + 1] = ci4;
}
@@ -573,16 +592,16 @@ NEVER_INLINE(void) passf5_ps(const size_t ido, const size_t l1, const v4sf *cc,
v4sf di2{VADD(ci2, cr5)};
float wr1{wa1[i]}, wi1{fsign*wa1[i+1]}, wr2{wa2[i]}, wi2{fsign*wa2[i+1]};
float wr3{wa3[i]}, wi3{fsign*wa3[i+1]}, wr4{wa4[i]}, wi4{fsign*wa4[i+1]};
- VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
+ vcplxmul(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
ch_ref(i - 1, 2) = dr2;
ch_ref(i, 2) = di2;
- VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
+ vcplxmul(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
ch_ref(i - 1, 3) = dr3;
ch_ref(i, 3) = di3;
- VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
+ vcplxmul(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
ch_ref(i - 1, 4) = dr4;
ch_ref(i, 4) = di4;
- VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
+ vcplxmul(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
ch_ref(i - 1, 5) = dr5;
ch_ref(i, 5) = di5;
}
@@ -611,7 +630,7 @@ NEVER_INLINE(void) radf2_ps(const size_t ido, const size_t l1, const v4sf *RESTR
{
v4sf tr2{cc[i - 1 + k + l1ido]}, ti2{cc[i + k + l1ido]};
v4sf br{cc[i - 1 + k]}, bi{cc[i + k]};
- VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
+ vcplxmulconj(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
ch[i + 2*k] = VADD(bi, ti2);
ch[2*(k+ido) - i] = VSUB(ti2, bi);
ch[i - 1 + 2*k] = VADD(br, tr2);
@@ -657,7 +676,7 @@ NEVER_INLINE(void) radb2_ps(const size_t ido, const size_t l1, const v4sf *cc, v
v4sf tr2{VSUB(a, b)};
ch[i+0 + k] = VSUB(c, d);
v4sf ti2{VADD(c, d)};
- VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
+ vcplxmul(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
ch[i-1 + k + l1ido] = tr2;
ch[i+0 + k + l1ido] = ti2;
}
@@ -698,13 +717,13 @@ void radf3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *
v4sf wi1{LD_PS1(wa1[i - 1])};
v4sf dr2{cc[i - 1 + (k + l1)*ido]};
v4sf di2{cc[i + (k + l1)*ido]};
- VCPLXMULCONJ(dr2, di2, wr1, wi1);
+ vcplxmulconj(dr2, di2, wr1, wi1);
v4sf wr2{LD_PS1(wa2[i - 2])};
v4sf wi2{LD_PS1(wa2[i - 1])};
v4sf dr3{cc[i - 1 + (k + l1*2)*ido]};
v4sf di3{cc[i + (k + l1*2)*ido]};
- VCPLXMULCONJ(dr3, di3, wr2, wi2);
+ vcplxmulconj(dr3, di3, wr2, wi2);
v4sf cr2{VADD(dr2, dr3)};
v4sf ci2{VADD(di2, di3)};
@@ -762,10 +781,10 @@ void radb3_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *
v4sf dr3{VADD(cr2, ci3)};
v4sf di2{VADD(ci2, cr3)};
v4sf di3{VSUB(ci2, cr3)};
- VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
+ vcplxmul(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
ch[i - 1 + (k + l1)*ido] = dr2;
ch[i + (k + l1)*ido] = di2;
- VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
+ vcplxmul(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
ch[i - 1 + (k + 2*l1)*ido] = dr3;
ch[i + (k + 2*l1)*ido] = di3;
}
@@ -811,19 +830,19 @@ NEVER_INLINE(void) radf4_ps(const size_t ido, const size_t l1, const v4sf *RESTR
v4sf ci2{pc[1*l1ido+1]};
v4sf wr{LD_PS1(wa1[i - 2])};
v4sf wi{LD_PS1(wa1[i - 1])};
- VCPLXMULCONJ(cr2,ci2,wr,wi);
+ vcplxmulconj(cr2,ci2,wr,wi);
v4sf cr3{pc[2*l1ido+0]};
v4sf ci3{pc[2*l1ido+1]};
wr = LD_PS1(wa2[i-2]);
wi = LD_PS1(wa2[i-1]);
- VCPLXMULCONJ(cr3, ci3, wr, wi);
+ vcplxmulconj(cr3, ci3, wr, wi);
v4sf cr4{pc[3*l1ido]};
v4sf ci4{pc[3*l1ido+1]};
wr = LD_PS1(wa3[i-2]);
wi = LD_PS1(wa3[i-1]);
- VCPLXMULCONJ(cr4, ci4, wr, wi);
+ vcplxmulconj(cr4, ci4, wr, wi);
/* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
@@ -918,13 +937,13 @@ NEVER_INLINE(void) radb4_ps(const size_t ido, const size_t l1, const v4sf *RESTR
v4sf ci3{VSUB(ti2, ti3)};
v4sf ci2{VADD(ti1, ti4)};
v4sf ci4{VSUB(ti1, ti4)};
- VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
+ vcplxmul(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
ph[0] = cr2;
ph[1] = ci2; ph += l1ido;
- VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
+ vcplxmul(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
ph[0] = cr3;
ph[1] = ci3; ph += l1ido;
- VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
+ vcplxmul(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
ph[0] = cr4;
ph[1] = ci4; ph = ph - 3*l1ido + 2;
}
@@ -997,10 +1016,10 @@ void radf5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *
v4sf di4{LD_PS1(wa3[i-2])};
v4sf dr5{LD_PS1(wa4[i-3])};
v4sf di5{LD_PS1(wa4[i-2])};
- VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
- VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
- VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
- VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
+ vcplxmulconj(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
+ vcplxmulconj(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
+ vcplxmulconj(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
+ vcplxmulconj(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
v4sf cr2{VADD(dr2, dr5)};
v4sf ci5{VSUB(dr5, dr2)};
v4sf cr5{VSUB(di2, di5)};
@@ -1102,10 +1121,10 @@ void radb5_ps(const size_t ido, const size_t l1, const v4sf *RESTRICT cc, v4sf *
v4sf dr2{VSUB(cr2, ci5)};
v4sf di5{VSUB(ci2, cr5)};
v4sf di2{VADD(ci2, cr5)};
- VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
- VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
- VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
- VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
+ vcplxmul(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
+ vcplxmul(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
+ vcplxmul(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
+ vcplxmul(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
@@ -1501,14 +1520,14 @@ namespace {
void reversed_copy(const size_t N, const v4sf *in, const int in_stride, v4sf *out)
{
v4sf g0, g1;
- INTERLEAVE2(in[0], in[1], g0, g1);
+ interleave2(in[0], in[1], g0, g1);
in += in_stride;
*--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
for(size_t k{1};k < N;++k)
{
v4sf h0, h1;
- INTERLEAVE2(in[0], in[1], h0, h1);
+ interleave2(in[0], in[1], h0, h1);
in += in_stride;
*--out = VSWAPHL(g1, h0);
*--out = VSWAPHL(h0, h1);
@@ -1526,14 +1545,14 @@ void unreversed_copy(const size_t N, const v4sf *in, v4sf *out, const int out_st
v4sf h0{*in++}; v4sf h1{*in++};
g1 = VSWAPHL(g1, h0);
h0 = VSWAPHL(h0, h1);
- UNINTERLEAVE2(h0, g1, out[0], out[1]);
+ uninterleave2(h0, g1, out[0], out[1]);
out += out_stride;
g1 = h1;
}
v4sf h0{*in++}, h1{g0};
g1 = VSWAPHL(g1, h0);
h0 = VSWAPHL(h0, h1);
- UNINTERLEAVE2(h0, g1, out[0], out[1]);
+ uninterleave2(h0, g1, out[0], out[1]);
}
void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *out, const v4sf *e)
@@ -1547,11 +1566,11 @@ void pffft_cplx_finalize(const size_t Ncvec, const v4sf *in, v4sf *out, const v4
v4sf r1{in[8*k+2]}, i1{in[8*k+3]};
v4sf r2{in[8*k+4]}, i2{in[8*k+5]};
v4sf r3{in[8*k+6]}, i3{in[8*k+7]};
- VTRANSPOSE4(r0,r1,r2,r3);
- VTRANSPOSE4(i0,i1,i2,i3);
- VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
- VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
- VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
+ vtranspose4(r0,r1,r2,r3);
+ vtranspose4(i0,i1,i2,i3);
+ vcplxmul(r1,i1,e[k*6+0],e[k*6+1]);
+ vcplxmul(r2,i2,e[k*6+2],e[k*6+3]);
+ vcplxmul(r3,i3,e[k*6+4],e[k*6+5]);
v4sf sr0{VADD(r0,r2)}, dr0{VSUB(r0, r2)};
v4sf sr1{VADD(r1,r3)}, dr1{VSUB(r1, r3)};
@@ -1602,12 +1621,12 @@ void pffft_cplx_preprocess(const size_t Ncvec, const v4sf *in, v4sf *out, const
r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
- VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
- VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
- VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
+ vcplxmulconj(r1,i1,e[k*6+0],e[k*6+1]);
+ vcplxmulconj(r2,i2,e[k*6+2],e[k*6+3]);
+ vcplxmulconj(r3,i3,e[k*6+4],e[k*6+5]);
- VTRANSPOSE4(r0,r1,r2,r3);
- VTRANSPOSE4(i0,i1,i2,i3);
+ vtranspose4(r0,r1,r2,r3);
+ vtranspose4(i0,i1,i2,i3);
*out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
*out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
@@ -1622,8 +1641,8 @@ ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, co
v4sf r1{*in++}; v4sf i1{*in++};
v4sf r2{*in++}; v4sf i2{*in++};
v4sf r3{*in++}; v4sf i3{*in++};
- VTRANSPOSE4(r0,r1,r2,r3);
- VTRANSPOSE4(i0,i1,i2,i3);
+ vtranspose4(r0,r1,r2,r3);
+ vtranspose4(i0,i1,i2,i3);
/* transformation for each column is:
*
@@ -1640,9 +1659,9 @@ ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, co
//cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
//cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
- VCPLXMUL(r1,i1,e[0],e[1]);
- VCPLXMUL(r2,i2,e[2],e[3]);
- VCPLXMUL(r3,i3,e[4],e[5]);
+ vcplxmul(r1,i1,e[0],e[1]);
+ vcplxmul(r2,i2,e[2],e[3]);
+ vcplxmul(r3,i3,e[4],e[5]);
//cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
//cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
@@ -1741,12 +1760,12 @@ ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, const v4sf *e, v4s
i1 = VSUB(si0, dr1);
i3 = VADD(si0, dr1);
- VCPLXMULCONJ(r1,i1,e[0],e[1]);
- VCPLXMULCONJ(r2,i2,e[2],e[3]);
- VCPLXMULCONJ(r3,i3,e[4],e[5]);
+ vcplxmulconj(r1,i1,e[0],e[1]);
+ vcplxmulconj(r2,i2,e[2],e[3]);
+ vcplxmulconj(r3,i3,e[4],e[5]);
- VTRANSPOSE4(r0,r1,r2,r3);
- VTRANSPOSE4(i0,i1,i2,i3);
+ vtranspose4(r0,r1,r2,r3);
+ vtranspose4(i0,i1,i2,i3);
if(!first)
{
@@ -1831,7 +1850,7 @@ void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf
{
v4sf *tmp{buff[ib]};
for(size_t k=0; k < Ncvec; ++k)
- UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
+ uninterleave2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], setup->twiddle, setup->ifac, -1.0f) == buff[1]);
pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], setup->e);
@@ -1864,7 +1883,7 @@ void pffft_transform_internal(const PFFFT_Setup *setup, const v4sf *vinput, v4sf
pffft_cplx_preprocess(Ncvec, vinput, buff[ib], setup->e);
ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1], setup->twiddle, setup->ifac, +1.0f) == buff[1]);
for(size_t k{0};k < Ncvec;++k)
- INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
+ interleave2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
}
}
@@ -1897,8 +1916,8 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
{
for(size_t k{0};k < dk;++k)
{
- INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
- INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
+ interleave2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
+ interleave2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
}
reversed_copy(dk, vin+2, 8, vout + N/SIMD_SZ/2);
reversed_copy(dk, vin+6, 8, vout + N/SIMD_SZ);
@@ -1907,8 +1926,8 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
{
for(size_t k{0};k < dk;++k)
{
- UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
- UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
+ uninterleave2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
+ uninterleave2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
}
unreversed_copy(dk, vin + N/SIMD_SZ/4, vout + N/SIMD_SZ - 6, -8);
unreversed_copy(dk, vin + 3*N/SIMD_SZ/4, vout + N/SIMD_SZ - 2, -8);
@@ -1921,7 +1940,7 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
for(size_t k{0};k < Ncvec;++k)
{
size_t kk{(k/4) + (k%4)*(Ncvec/4)};
- INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
+ interleave2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
}
}
else
@@ -1929,7 +1948,7 @@ void pffft_zreorder(const PFFFT_Setup *setup, const float *in, float *out,
for(size_t k{0};k < Ncvec;++k)
{
size_t kk{(k/4) + (k%4)*(Ncvec/4)};
- UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
+ uninterleave2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
}
}
}
@@ -2019,12 +2038,12 @@ void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, cons
{
v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
- VCPLXMUL(ar4, ai4, br4, bi4);
+ vcplxmul(ar4, ai4, br4, bi4);
vab[2*i+0] = VMADD(ar4, vscal, vab[2*i+0]);
vab[2*i+1] = VMADD(ai4, vscal, vab[2*i+1]);
ar4 = va[2*i+2]; ai4 = va[2*i+3];
br4 = vb[2*i+2]; bi4 = vb[2*i+3];
- VCPLXMUL(ar4, ai4, br4, bi4);
+ vcplxmul(ar4, ai4, br4, bi4);
vab[2*i+2] = VMADD(ar4, vscal, vab[2*i+2]);
vab[2*i+3] = VMADD(ai4, vscal, vab[2*i+3]);
}
@@ -2073,12 +2092,12 @@ void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const floa
{
v4sf ar4{va[2*i+0]}, ai4{va[2*i+1]};
v4sf br4{vb[2*i+0]}, bi4{vb[2*i+1]};
- VCPLXMUL(ar4, ai4, br4, bi4);
+ vcplxmul(ar4, ai4, br4, bi4);
vab[2*i+0] = VADD(ar4, vab[2*i+0]);
vab[2*i+1] = VADD(ai4, vab[2*i+1]);
ar4 = va[2*i+2]; ai4 = va[2*i+3];
br4 = vb[2*i+2]; bi4 = vb[2*i+3];
- VCPLXMUL(ar4, ai4, br4, bi4);
+ vcplxmul(ar4, ai4, br4, bi4);
vab[2*i+2] = VADD(ar4, vab[2*i+2]);
vab[2*i+3] = VADD(ai4, vab[2*i+3]);
}
@@ -2094,7 +2113,7 @@ void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const floa
void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work,
pffft_direction_t direction)
{
- assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work));
+ assert(valigned(input) && valigned(output) && valigned(work));
pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, false);
@@ -2103,7 +2122,7 @@ void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output
void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output,
float *work, pffft_direction_t direction)
{
- assert(VALIGNED(input) && VALIGNED(output) && VALIGNED(work));
+ assert(valigned(input) && valigned(output) && valigned(work));
pffft_transform_internal(setup, reinterpret_cast<const v4sf*>(al::assume_aligned<16>(input)),
reinterpret_cast<v4sf*>(al::assume_aligned<16>(output)),
reinterpret_cast<v4sf*>(al::assume_aligned<16>(work)), direction, true);
@@ -2217,7 +2236,7 @@ void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *s, const float *a, cons
{
float ar{a[2*i+0]}, ai{a[2*i+1]};
const float br{b[2*i+0]}, bi{b[2*i+1]};
- VCPLXMUL(ar, ai, br, bi);
+ vcplxmul(ar, ai, br, bi);
ab[2*i+0] += ar*scaling;
ab[2*i+1] += ai*scaling;
}
@@ -2238,7 +2257,7 @@ void pffft_zconvolve_accumulate(const PFFFT_Setup *s, const float *a, const floa
{
float ar{a[2*i+0]}, ai{a[2*i+1]};
const float br{b[2*i+0]}, bi{b[2*i+1]};
- VCPLXMUL(ar, ai, br, bi);
+ vcplxmul(ar, ai, br, bi);
ab[2*i+0] += ar;
ab[2*i+1] += ai;
}
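
One detail worth noting from the hunks above: the per-ISA VALIGNED macros (mask 0xF on Altivec/SSE, 0x3 on NEON and the scalar path) are replaced by a single valigned() whose mask follows the vector width. A standalone sketch of how that mask works out, assuming SIMD_SZ is 4 on the SIMD paths and 1 on the scalar fallback (the real constant is defined elsewhere in the file):

    #include <cassert>
    #include <cstdint>

    /* Stand-in for the file's own constant: 4 floats per vector on SIMD builds. */
    constexpr std::uintptr_t SIMD_SZ{4};

    inline bool valigned(const float *ptr) noexcept
    {
        /* SIMD_SZ == 4 -> mask 0xF (16-byte vectors); SIMD_SZ == 1 -> mask 0x3 (plain float). */
        static constexpr std::uintptr_t alignmask{SIMD_SZ*4 - 1};
        return (reinterpret_cast<std::uintptr_t>(ptr) & alignmask) == 0;
    }

    int main()
    {
        alignas(16) float buf[8]{};
        assert(valigned(buf));      /* 16-byte aligned start passes */
        assert(!valigned(buf + 1)); /* 4 bytes in fails the 16-byte check */
    }

If SIMD_SZ is indeed 4 for the NEON build, this also tightens the old NEON check from 4-byte to full 16-byte alignment.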