aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Robinson <[email protected]>2021-02-18 14:58:03 -0800
committerChris Robinson <[email protected]>2021-02-18 14:58:03 -0800
commit28aaaab434d1b73eb794d593eff368815cb8b3ce (patch)
treeb6505125f629446d1045553bafd465c8088e72f9
parent89ddad8487ad0fdbd7ca91a4ae294ecdd0669bae (diff)
Store the all-pass FIR results more efficiently
-rw-r--r--core/uhjfilter.cpp37
1 files changed, 23 insertions, 14 deletions
diff --git a/core/uhjfilter.cpp b/core/uhjfilter.cpp
index 72506716..31acaefd 100644
--- a/core/uhjfilter.cpp
+++ b/core/uhjfilter.cpp
@@ -5,6 +5,7 @@
#ifdef HAVE_SSE_INTRINSICS
#include <xmmintrin.h>
+#include <emmintrin.h>
#elif defined(HAVE_NEON)
#include <arm_neon.h>
#endif
@@ -77,9 +78,9 @@ const PhaseShifterT PShift{};
void allpass_process(al::span<float> dst, const float *RESTRICT src)
{
#ifdef HAVE_SSE_INTRINSICS
- size_t pos{0};
if(size_t todo{dst.size()>>1})
{
+ auto *out = reinterpret_cast<__m64*>(dst.data());
do {
__m128 r04{_mm_setzero_ps()};
__m128 r14{_mm_setzero_ps()};
@@ -95,15 +96,13 @@ void allpass_process(al::span<float> dst, const float *RESTRICT src)
s = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(3, 1, 3, 1));
r14 = _mm_add_ps(r14, _mm_mul_ps(s, coeffs));
}
- r04 = _mm_add_ps(r04, _mm_shuffle_ps(r04, r04, _MM_SHUFFLE(0, 1, 2, 3)));
- r04 = _mm_add_ps(r04, _mm_movehl_ps(r04, r04));
- dst[pos++] += _mm_cvtss_f32(r04);
+ src += 2;
- r14 = _mm_add_ps(r14, _mm_shuffle_ps(r14, r14, _MM_SHUFFLE(0, 1, 2, 3)));
- r14 = _mm_add_ps(r14, _mm_movehl_ps(r14, r14));
- dst[pos++] += _mm_cvtss_f32(r14);
+ __m128 r4{_mm_add_ps(_mm_unpackhi_ps(r04, r14), _mm_unpacklo_ps(r04, r14))};
+ r4 = _mm_add_ps(r4, _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(r4), 8)));
- src += 2;
+ _mm_storel_pi(out, _mm_add_ps(_mm_loadl_pi(_mm_undefined_ps(), out), r4));
+ ++out;
} while(--todo);
}
if((dst.size()&1))
@@ -121,7 +120,7 @@ void allpass_process(al::span<float> dst, const float *RESTRICT src)
r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
- dst[pos] += _mm_cvtss_f32(r4);
+ dst.back() += _mm_cvtss_f32(r4);
}
#elif defined(HAVE_NEON)
@@ -148,6 +147,16 @@ void allpass_process(al::span<float> dst, const float *RESTRICT src)
ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3);
return ret;
};
+ auto unpacklo = [](float32x4_t a, float32x4_t b)
+ {
+ float32x2x2_t result{vzip_f32(vget_low_f32(a), vget_low_f32(b))};
+ return vcombine_f32(result.val[0], result.val[1]);
+ };
+ auto unpackhi = [](float32x4_t a, float32x4_t b)
+ {
+ float32x2x2_t result{vzip_f32(vget_high_f32(a), vget_high_f32(b))};
+ return vcombine_f32(result.val[0], result.val[1]);
+ };
do {
float32x4_t r04{vdupq_n_f32(0.0f)};
float32x4_t r14{vdupq_n_f32(0.0f)};
@@ -160,13 +169,13 @@ void allpass_process(al::span<float> dst, const float *RESTRICT src)
r04 = vmlaq_f32(r04, shuffle_2020(s0, s1), coeffs);
r14 = vmlaq_f32(r14, shuffle_3131(s0, s1), coeffs);
}
- r04 = vaddq_f32(r04, vrev64q_f32(r04));
- dst[pos++] += vget_lane_f32(vadd_f32(vget_low_f32(r04), vget_high_f32(r04)), 0);
+ src += 2;
- r14 = vaddq_f32(r14, vrev64q_f32(r14));
- dst[pos++] += vget_lane_f32(vadd_f32(vget_low_f32(r14), vget_high_f32(r14)), 0);
+ float32x4_t r4{vaddq_f32(unpackhi(r04, r14), unpacklo(r04, r14))};
+ float32x2_t r2{vadd_f32(vget_low_f32(r4), vget_high_f32(r4))};
- src += 2;
+ vst1_f32(&dst[pos], vadd_f32(vld1_f32(&dst[pos]), r2));
+ pos += 2;
} while(--todo);
}
if((dst.size()&1))