diff options
author | Chris Robinson <[email protected]> | 2018-01-07 05:32:07 -0800 |
---|---|---|
committer | Chris Robinson <[email protected]> | 2018-01-07 05:32:07 -0800 |
commit | 4cc1c646466737ba411aa23ce4a6116936ada8c2 (patch) | |
tree | 796fc698eb910630ac5f398fe38f26aef2cd456c /Alc/mixer_neon.c | |
parent | 0e1fd34c89d8f09f68c2c243ceccd0dab4f7c6c0 (diff) |
Replace the sinc4 resampler with cubic
Turns out the C version of the cubic resampler is just slightly faster than
even the SSE3 version of the FIR4 resampler. This is likely due to not using a
64KB random-access lookup table along with unaligned loads, both offsetting the
gains from SSE.
Diffstat (limited to 'Alc/mixer_neon.c')
-rw-r--r-- | Alc/mixer_neon.c | 71 |
1 files changed, 0 insertions, 71 deletions
diff --git a/Alc/mixer_neon.c b/Alc/mixer_neon.c index b99dcf69..631e4f7c 100644 --- a/Alc/mixer_neon.c +++ b/Alc/mixer_neon.c @@ -66,77 +66,6 @@ const ALfloat *Resample_lerp_Neon(const InterpState* UNUSED(state), return dst; } -const ALfloat *Resample_fir4_Neon(const InterpState *state, - const ALfloat *restrict src, ALsizei frac, ALint increment, - ALfloat *restrict dst, ALsizei numsamples) -{ - const ALfloat (*restrict filter)[4] = ASSUME_ALIGNED(state->sinc4.filter, 16); - const int32x4_t increment4 = vdupq_n_s32(increment*4); - const int32x4_t fracMask4 = vdupq_n_s32(FRACTIONMASK); - alignas(16) ALint pos_[4]; - alignas(16) ALsizei frac_[4]; - int32x4_t pos4; - int32x4_t frac4; - ALsizei i; - - InitiatePositionArrays(frac, increment, frac_, pos_, 4); - - frac4 = vld1q_s32(frac_); - pos4 = vld1q_s32(pos_); - - --src; - for(i = 0;numsamples-i > 3;i += 4) - { - const float32x4_t val0 = vld1q_f32(&src[pos_[0]]); - const float32x4_t val1 = vld1q_f32(&src[pos_[1]]); - const float32x4_t val2 = vld1q_f32(&src[pos_[2]]); - const float32x4_t val3 = vld1q_f32(&src[pos_[3]]); - float32x4_t k0 = vld1q_f32(filter[frac_[0]]); - float32x4_t k1 = vld1q_f32(filter[frac_[1]]); - float32x4_t k2 = vld1q_f32(filter[frac_[2]]); - float32x4_t k3 = vld1q_f32(filter[frac_[3]]); - float32x4_t out; - - k0 = vmulq_f32(k0, val0); - k1 = vmulq_f32(k1, val1); - k2 = vmulq_f32(k2, val2); - k3 = vmulq_f32(k3, val3); - k0 = vcombine_f32(vpadd_f32(vget_low_f32(k0), vget_high_f32(k0)), - vpadd_f32(vget_low_f32(k1), vget_high_f32(k1))); - k2 = vcombine_f32(vpadd_f32(vget_low_f32(k2), vget_high_f32(k2)), - vpadd_f32(vget_low_f32(k3), vget_high_f32(k3))); - out = vcombine_f32(vpadd_f32(vget_low_f32(k0), vget_high_f32(k0)), - vpadd_f32(vget_low_f32(k2), vget_high_f32(k2))); - - vst1q_f32(&dst[i], out); - - frac4 = vaddq_s32(frac4, increment4); - pos4 = vaddq_s32(pos4, vshrq_n_s32(frac4, FRACTIONBITS)); - frac4 = vandq_s32(frac4, fracMask4); - - vst1q_s32(pos_, pos4); - vst1q_s32(frac_, frac4); - } - 
- if(i < numsamples) - { - /* NOTE: These four elements represent the position *after* the last - * four samples, so the lowest element is the next position to - * resample. - */ - ALint pos = pos_[0]; - frac = frac_[0]; - do { - dst[i] = resample_fir4(src[pos], src[pos+1], src[pos+2], src[pos+3], filter[frac]); - - frac += increment; - pos += frac>>FRACTIONBITS; - frac &= FRACTIONMASK; - } while(++i < numsamples); - } - return dst; -} - const ALfloat *Resample_bsinc_Neon(const InterpState *state, const ALfloat *restrict src, ALsizei frac, ALint increment, ALfloat *restrict dst, ALsizei dstlen) |