author | Chris Robinson <[email protected]> | 2020-12-31 13:01:17 -0800 |
---|---|---|
committer | Chris Robinson <[email protected]> | 2020-12-31 13:01:17 -0800 |
commit | f2b7a063ef49e2377c41dddae095d5c66b84bf9b (patch) | |
tree | d0e14e0603233ad2318a5b0188943c84bccba964 /core | |
parent | 9d354f721c39dc643399b36297c57ef809451f6f (diff) | |
Add NEON-enhanced FIR loops for convolution and UHJ
Diffstat (limited to 'core')
-rw-r--r-- | core/uhjfilter.cpp | 68 |
1 file changed, 68 insertions, 0 deletions
```diff
diff --git a/core/uhjfilter.cpp b/core/uhjfilter.cpp
index 7b0ff920..92f35901 100644
--- a/core/uhjfilter.cpp
+++ b/core/uhjfilter.cpp
@@ -5,6 +5,8 @@
 
 #ifdef HAVE_SSE_INTRINSICS
 #include <xmmintrin.h>
+#elif defined(HAVE_NEON)
+#include <arm_neon.h>
 #endif
 
 #include <algorithm>
@@ -122,6 +124,72 @@ void allpass_process(al::span<float> dst, const float *RESTRICT src)
         dst[pos] += _mm_cvtss_f32(r4);
     }
 
+#elif defined(HAVE_NEON)
+
+    size_t pos{0};
+    if(size_t todo{dst.size()>>1})
+    {
+        /* There doesn't seem to be NEON intrinsics to do this kind of stipple
+         * shuffling, so there's two custom methods for it.
+         */
+        auto shuffle_2020 = [](float32x4_t a, float32x4_t b)
+        {
+            float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))};
+            ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1);
+            ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2);
+            ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3);
+            return ret;
+        };
+        auto shuffle_3131 = [](float32x4_t a, float32x4_t b)
+        {
+            float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))};
+            ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1);
+            ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2);
+            ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3);
+            return ret;
+        };
+        do {
+            float32x4_t r04{vdupq_n_f32(0.0f)};
+            float32x4_t r14{vdupq_n_f32(0.0f)};
+            for(size_t j{0};j < PShift.Coeffs.size();j+=4)
+            {
+                const float32x4_t coeffs{vld1q_f32(&PShift.Coeffs[j])};
+                const float32x4_t s0{vld1q_f32(&src[j*2])};
+                const float32x4_t s1{vld1q_f32(&src[j*2 + 4])};
+
+                r04 = vmlaq_f32(r04, shuffle_2020(s0, s1), coeffs);
+                r14 = vmlaq_f32(r14, shuffle_3131(s0, s1), coeffs);
+            }
+            r04 = vaddq_f32(r04, vrev64q_f32(r04));
+            dst[pos++] += vget_lane_f32(vadd_f32(vget_low_f32(r04), vget_high_f32(r04)), 0);
+
+            r14 = vaddq_f32(r14, vrev64q_f32(r14));
+            dst[pos++] += vget_lane_f32(vadd_f32(vget_low_f32(r14), vget_high_f32(r14)), 0);
+
+            src += 2;
+        } while(--todo);
+    }
+    if((dst.size()&1))
+    {
+        auto load4 = [](float32_t a, float32_t b, float32_t c, float32_t d)
+        {
+            float32x4_t ret{vmovq_n_f32(a)};
+            ret = vsetq_lane_f32(b, ret, 1);
+            ret = vsetq_lane_f32(c, ret, 2);
+            ret = vsetq_lane_f32(d, ret, 3);
+            return ret;
+        };
+        float32x4_t r4{vdupq_n_f32(0.0f)};
+        for(size_t j{0};j < PShift.Coeffs.size();j+=4)
+        {
+            const float32x4_t coeffs{vld1q_f32(&PShift.Coeffs[j])};
+            const float32x4_t s{load4(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])};
+            r4 = vmlaq_f32(r4, s, coeffs);
+        }
+        r4 = vaddq_f32(r4, vrev64q_f32(r4));
+        dst[pos] += vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);
+    }
+
 #else
 
     for(float &output : dst)
```
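For reference, the NEON loop added above computes the same even-stride FIR as the SSE path visible in the hunk context: each output sample accumulates `PShift.Coeffs` against every other input sample, with `shuffle_2020`/`shuffle_3131` gathering the even- and odd-indexed lanes so two outputs are produced per iteration. Below is a minimal scalar sketch of that operation, mirroring the generic fallback at the end of the hunk; the function name `fir_stride2_accumulate`, the `std::span` (C++20) signature, and the free-standing form are illustrative assumptions, not part of the commit.

```cpp
#include <cstddef>
#include <span>

// Scalar model of the vectorized loop: dst[i] += sum_j coeffs[j] * src[i + 2*j].
// "fir_stride2_accumulate" is a hypothetical name used only for this sketch.
void fir_stride2_accumulate(std::span<float> dst, const float *src,
    std::span<const float> coeffs)
{
    for(float &output : dst)
    {
        float sum{0.0f};
        // The coefficients step across the input two samples at a time, which
        // is why the NEON code gathers even/odd lanes from s0/s1 before the
        // multiply-accumulate.
        for(std::size_t j{0};j < coeffs.size();++j)
            sum += coeffs[j] * src[j*2];
        output += sum;
        ++src;
    }
}
```

(As a side note, AArch64 also exposes `vuzp1q_f32`/`vuzp2q_f32` for this kind of even/odd de-interleave; the lane-by-lane construction used in the commit has the advantage of working on any NEON target.)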