From d45dd9c668b2f4331492600d8ff99dc20c068664 Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Sun, 19 Feb 2017 14:36:06 -0800 Subject: Remove the sinc8 resampler option Perf shows less than 1 percent CPU difference from the higher-quality bsinc resampler, but sinc8 uses almost twice as much memory (a 128KB lookup table). --- Alc/mixer_neon.c | 83 +++----------------------------------------------------- 1 file changed, 4 insertions(+), 79 deletions(-) (limited to 'Alc/mixer_neon.c') diff --git a/Alc/mixer_neon.c b/Alc/mixer_neon.c index 543ff0f3..727c5c55 100644 --- a/Alc/mixer_neon.c +++ b/Alc/mixer_neon.c @@ -90,10 +90,10 @@ const ALfloat *Resample_fir4_32_Neon(const InterpState* UNUSED(state), const float32x4_t val1 = vld1q_f32(&src[pos_[1]]); const float32x4_t val2 = vld1q_f32(&src[pos_[2]]); const float32x4_t val3 = vld1q_f32(&src[pos_[3]]); - float32x4_t k0 = vld1q_f32(ResampleCoeffs.FIR4[frac_[0]]); - float32x4_t k1 = vld1q_f32(ResampleCoeffs.FIR4[frac_[1]]); - float32x4_t k2 = vld1q_f32(ResampleCoeffs.FIR4[frac_[2]]); - float32x4_t k3 = vld1q_f32(ResampleCoeffs.FIR4[frac_[3]]); + float32x4_t k0 = vld1q_f32(ResampleCoeffs_FIR4[frac_[0]]); + float32x4_t k1 = vld1q_f32(ResampleCoeffs_FIR4[frac_[1]]); + float32x4_t k2 = vld1q_f32(ResampleCoeffs_FIR4[frac_[2]]); + float32x4_t k3 = vld1q_f32(ResampleCoeffs_FIR4[frac_[3]]); float32x4_t out; k0 = vmulq_f32(k0, val0); @@ -136,81 +136,6 @@ const ALfloat *Resample_fir4_32_Neon(const InterpState* UNUSED(state), return dst; } -const ALfloat *Resample_fir8_32_Neon(const InterpState* UNUSED(state), - const ALfloat *restrict src, ALuint frac, ALint increment, - ALfloat *restrict dst, ALsizei numsamples) -{ - const int32x4_t increment4 = vdupq_n_s32(increment*4); - const uint32x4_t fracMask4 = vdupq_n_u32(FRACTIONMASK); - alignas(16) ALint pos_[4]; - alignas(16) ALuint frac_[4]; - int32x4_t pos4; - uint32x4_t frac4; - ALsizei i, j; - - InitiatePositionArrays(frac, increment, frac_, pos_, 4); - - frac4 = vld1q_u32(frac_); - 
pos4 = vld1q_s32(pos_); - - src -= 3; - for(i = 0;numsamples-i > 3;i += 4) - { - float32x4_t out[2]; - for(j = 0;j < 8;j+=4) - { - const float32x4_t val0 = vld1q_f32(&src[pos_[0]+j]); - const float32x4_t val1 = vld1q_f32(&src[pos_[1]+j]); - const float32x4_t val2 = vld1q_f32(&src[pos_[2]+j]); - const float32x4_t val3 = vld1q_f32(&src[pos_[3]+j]); - float32x4_t k0 = vld1q_f32(&ResampleCoeffs.FIR4[frac_[0]][j]); - float32x4_t k1 = vld1q_f32(&ResampleCoeffs.FIR4[frac_[1]][j]); - float32x4_t k2 = vld1q_f32(&ResampleCoeffs.FIR4[frac_[2]][j]); - float32x4_t k3 = vld1q_f32(&ResampleCoeffs.FIR4[frac_[3]][j]); - - k0 = vmulq_f32(k0, val0); - k1 = vmulq_f32(k1, val1); - k2 = vmulq_f32(k2, val2); - k3 = vmulq_f32(k3, val3); - k0 = vcombine_f32(vpadd_f32(vget_low_f32(k0), vget_high_f32(k0)), - vpadd_f32(vget_low_f32(k1), vget_high_f32(k1))); - k2 = vcombine_f32(vpadd_f32(vget_low_f32(k2), vget_high_f32(k2)), - vpadd_f32(vget_low_f32(k3), vget_high_f32(k3))); - out[j>>2] = vcombine_f32(vpadd_f32(vget_low_f32(k0), vget_high_f32(k0)), - vpadd_f32(vget_low_f32(k2), vget_high_f32(k2))); - } - - out[0] = vaddq_f32(out[0], out[1]); - vst1q_f32(&dst[i], out[0]); - - frac4 = vaddq_u32(frac4, (uint32x4_t)increment4); - pos4 = vaddq_s32(pos4, (int32x4_t)vshrq_n_u32(frac4, FRACTIONBITS)); - frac4 = vandq_u32(frac4, fracMask4); - - vst1q_s32(pos_, pos4); - vst1q_u32(frac_, frac4); - } - - if(i < numsamples) - { - /* NOTE: These four elements represent the position *after* the last - * four samples, so the lowest element is the next position to - * resample. 
- */ - ALint pos = pos_[0]; - frac = frac_[0]; - do { - dst[i] = resample_fir8(src[pos ], src[pos+1], src[pos+2], src[pos+3], - src[pos+4], src[pos+5], src[pos+6], src[pos+7], frac); - - frac += increment; - pos += frac>>FRACTIONBITS; - frac &= FRACTIONMASK; - } while(++i < numsamples); - } - return dst; -} - const ALfloat *Resample_bsinc32_Neon(const InterpState *state, const ALfloat *restrict src, ALuint frac, ALint increment, ALfloat *restrict dst, ALsizei dstlen) -- cgit v1.2.3