diff options
author | Chris Robinson <[email protected]> | 2017-08-30 11:30:19 -0700 |
---|---|---|
committer | Chris Robinson <[email protected]> | 2017-08-30 11:30:19 -0700 |
commit | cd15b1775e9c81283f998a874bda55821e79f3fc (patch) | |
tree | b9406ddd9bc4a83c0e7763f57294d91bee9def65 /Alc/mixer_sse.c | |
parent | 2916efee213d0d0c6fe67669bf666141e6149f27 (diff) |
Avoid some extraneous load calls
This likely doesn't change anything given a working optimizer, but it cleans up
the code some.
Diffstat (limited to 'Alc/mixer_sse.c')
-rw-r--r-- | Alc/mixer_sse.c | 26 |
1 files changed, 12 insertions, 14 deletions
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c index 5273a088..773a1c2f 100644 --- a/Alc/mixer_sse.c +++ b/Alc/mixer_sse.c @@ -19,8 +19,8 @@ const ALfloat *Resample_bsinc_SSE(const InterpState *state, const ALfloat *restr const ALfloat *const filter = state->bsinc.filter; const __m128 sf4 = _mm_set1_ps(state->bsinc.sf); const ALsizei m = state->bsinc.m; - const ALfloat *fil, *scd, *phd, *spd; - ALsizei pi, i, j; + const __m128 *fil, *scd, *phd, *spd; + ALsizei pi, i, j, offset; ALfloat pf; __m128 r4; @@ -33,30 +33,28 @@ const ALfloat *Resample_bsinc_SSE(const InterpState *state, const ALfloat *restr pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF)); #undef FRAC_PHASE_BITDIFF - fil = ASSUME_ALIGNED(filter + m*pi*4, 16); - scd = ASSUME_ALIGNED(fil + m, 16); - phd = ASSUME_ALIGNED(scd + m, 16); - spd = ASSUME_ALIGNED(phd + m, 16); + offset = m*pi*4; + fil = ASSUME_ALIGNED(filter + offset, 16); offset += m; + scd = ASSUME_ALIGNED(filter + offset, 16); offset += m; + phd = ASSUME_ALIGNED(filter + offset, 16); offset += m; + spd = ASSUME_ALIGNED(filter + offset, 16); // Apply the scale and phase interpolated filter. r4 = _mm_setzero_ps(); { const __m128 pf4 = _mm_set1_ps(pf); -#define LD4(x) _mm_load_ps(x) -#define ULD4(x) _mm_loadu_ps(x) #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z)) - for(j = 0;j < m;j+=4) + for(j = 0;j < m;j+=4,fil++,scd++,phd++,spd++) { /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */ - const __m128 f4 = MLA4(MLA4(LD4(&fil[j]), sf4, LD4(&scd[j])), - pf4, MLA4(LD4(&phd[j]), sf4, LD4(&spd[j])) + const __m128 f4 = MLA4( + MLA4(*fil, sf4, *scd), + pf4, MLA4(*phd, sf4, *spd) ); /* r += f*src */ - r4 = MLA4(r4, f4, ULD4(&src[j])); + r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j])); } #undef MLA4 -#undef ULD4 -#undef LD4 } r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3))); r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); |