diff options
-rw-r--r-- | Alc/mixer_c.c | 8 | ||||
-rw-r--r-- | Alc/mixer_neon.c | 4 | ||||
-rw-r--r-- | Alc/mixer_sse.c | 42 |
3 files changed, 28 insertions, 26 deletions
diff --git a/Alc/mixer_c.c b/Alc/mixer_c.c index 15c603ca..1284371b 100644 --- a/Alc/mixer_c.c +++ b/Alc/mixer_c.c @@ -62,10 +62,10 @@ const ALfloat *Resample_bsinc32_C(const BsincState *state, const ALfloat *restri const ALfloat *fil, *scd, *phd, *spd; const ALfloat sf = state->sf; const ALsizei m = state->m; - const ALint l = state->l; - ALsizei j_s, j_f, pi, i; + ALsizei j_f, pi, i; ALfloat pf, r; + src += state->l; for(i = 0;i < dstlen;i++) { // Calculate the phase index and factor. @@ -81,9 +81,9 @@ const ALfloat *Resample_bsinc32_C(const BsincState *state, const ALfloat *restri // Apply the scale and phase interpolated filter. r = 0.0f; - for(j_f = 0,j_s = l;j_f < m;j_f++,j_s++) + for(j_f = 0;j_f < m;j_f++) r += (fil[j_f] + sf*scd[j_f] + pf*(phd[j_f] + sf*spd[j_f])) * - src[j_s]; + src[j_f]; dst[i] = r; frac += increment; diff --git a/Alc/mixer_neon.c b/Alc/mixer_neon.c index 75ad14eb..2875b321 100644 --- a/Alc/mixer_neon.c +++ b/Alc/mixer_neon.c @@ -248,8 +248,8 @@ const ALfloat *Resample_bsinc32_Neon(const BsincState *state, const ALfloat *res const float32x4_t pf4 = vdupq_n_f32(pf); for(j = 0;j < m;j+=4) { - float32x4_t f4 = vmlaq_f32(vld1q_f32(&fil[j]), sf4, vld1q_f32(&scd[j])); - f4 = vmlaq_f32(f4, + const float32x4_t f4 = vmlaq_f32(vmlaq_f32(vld1q_f32(&fil[j]), + sf4, vld1q_f32(&scd[j])), pf4, vmlaq_f32(vld1q_f32(&phd[j]), sf4, vld1q_f32(&spd[j]) ) diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c index 96228b47..25daf00b 100644 --- a/Alc/mixer_sse.c +++ b/Alc/mixer_sse.c @@ -12,18 +12,24 @@ #include "mixer_defs.h" +#ifdef __GNUC__ +#define ASSUME_ALIGNED(ptr, ...) __builtin_assume_aligned((ptr), __VA_ARGS__) +#else +#define ASSUME_ALIGNED(ptr, ...) (ptr) +#endif + const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *restrict src, ALuint frac, ALint increment, ALfloat *restrict dst, ALsizei dstlen) { const __m128 sf4 = _mm_set1_ps(state->sf); const ALsizei m = state->m; - const ALint l = state->l; const ALfloat *fil, *scd, *phd, *spd; - ALsizei j_s, pi, j_f, i; + ALsizei pi, i, j; ALfloat pf; __m128 r4; + src += state->l; for(i = 0;i < dstlen;i++) { // Calculate the phase index and factor. @@ -32,32 +38,28 @@ const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *rest pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF)); #undef FRAC_PHASE_BITDIFF - fil = state->coeffs[pi].filter; - scd = state->coeffs[pi].scDelta; - phd = state->coeffs[pi].phDelta; - spd = state->coeffs[pi].spDelta; + fil = ASSUME_ALIGNED(state->coeffs[pi].filter, 16); + scd = ASSUME_ALIGNED(state->coeffs[pi].scDelta, 16); + phd = ASSUME_ALIGNED(state->coeffs[pi].phDelta, 16); + spd = ASSUME_ALIGNED(state->coeffs[pi].spDelta, 16); // Apply the scale and phase interpolated filter. r4 = _mm_setzero_ps(); { const __m128 pf4 = _mm_set1_ps(pf); - for(j_f = 0,j_s = l;j_f < m;j_f+=4,j_s+=4) +#define LD4(x) _mm_load_ps(x) +#define ULD4(x) _mm_loadu_ps(x) +#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z)) + for(j = 0;j < m;j+=4) { - const __m128 f4 = _mm_add_ps( - _mm_add_ps( - _mm_load_ps(&fil[j_f]), - _mm_mul_ps(sf4, _mm_load_ps(&scd[j_f])) - ), - _mm_mul_ps( - pf4, - _mm_add_ps( - _mm_load_ps(&phd[j_f]), - _mm_mul_ps(sf4, _mm_load_ps(&spd[j_f])) - ) - ) + const __m128 f4 = MLA4(MLA4(LD4(&fil[j]), sf4, LD4(&scd[j])), + pf4, MLA4(LD4(&phd[j]), sf4, LD4(&spd[j])) ); - r4 = _mm_add_ps(r4, _mm_mul_ps(f4, _mm_loadu_ps(&src[j_s]))); + r4 = MLA4(r4, f4, ULD4(&src[j])); } +#undef MLA4 +#undef ULD4 +#undef LD4 } r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3))); r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); |