aboutsummaryrefslogtreecommitdiffstats
path: root/Alc/mixer_sse.c
diff options
context:
space:
mode:
authorChris Robinson <[email protected]>2017-08-30 11:30:19 -0700
committerChris Robinson <[email protected]>2017-08-30 11:30:19 -0700
commitcd15b1775e9c81283f998a874bda55821e79f3fc (patch)
treeb9406ddd9bc4a83c0e7763f57294d91bee9def65 /Alc/mixer_sse.c
parent2916efee213d0d0c6fe67669bf666141e6149f27 (diff)
Avoid some extraneous load calls
This likely doesn't change anything given a working optimizer, but it cleans up the code some.
Diffstat (limited to 'Alc/mixer_sse.c')
-rw-r--r--Alc/mixer_sse.c26
1 files changed, 12 insertions, 14 deletions
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index 5273a088..773a1c2f 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -19,8 +19,8 @@ const ALfloat *Resample_bsinc_SSE(const InterpState *state, const ALfloat *restr
const ALfloat *const filter = state->bsinc.filter;
const __m128 sf4 = _mm_set1_ps(state->bsinc.sf);
const ALsizei m = state->bsinc.m;
- const ALfloat *fil, *scd, *phd, *spd;
- ALsizei pi, i, j;
+ const __m128 *fil, *scd, *phd, *spd;
+ ALsizei pi, i, j, offset;
ALfloat pf;
__m128 r4;
@@ -33,30 +33,28 @@ const ALfloat *Resample_bsinc_SSE(const InterpState *state, const ALfloat *restr
pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
#undef FRAC_PHASE_BITDIFF
- fil = ASSUME_ALIGNED(filter + m*pi*4, 16);
- scd = ASSUME_ALIGNED(fil + m, 16);
- phd = ASSUME_ALIGNED(scd + m, 16);
- spd = ASSUME_ALIGNED(phd + m, 16);
+ offset = m*pi*4;
+ fil = ASSUME_ALIGNED(filter + offset, 16); offset += m;
+ scd = ASSUME_ALIGNED(filter + offset, 16); offset += m;
+ phd = ASSUME_ALIGNED(filter + offset, 16); offset += m;
+ spd = ASSUME_ALIGNED(filter + offset, 16);
// Apply the scale and phase interpolated filter.
r4 = _mm_setzero_ps();
{
const __m128 pf4 = _mm_set1_ps(pf);
-#define LD4(x) _mm_load_ps(x)
-#define ULD4(x) _mm_loadu_ps(x)
#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
- for(j = 0;j < m;j+=4)
+ for(j = 0;j < m;j+=4,fil++,scd++,phd++,spd++)
{
/* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
- const __m128 f4 = MLA4(MLA4(LD4(&fil[j]), sf4, LD4(&scd[j])),
- pf4, MLA4(LD4(&phd[j]), sf4, LD4(&spd[j]))
+ const __m128 f4 = MLA4(
+ MLA4(*fil, sf4, *scd),
+ pf4, MLA4(*phd, sf4, *spd)
);
/* r += f*src */
- r4 = MLA4(r4, f4, ULD4(&src[j]));
+ r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
}
#undef MLA4
-#undef ULD4
-#undef LD4
}
r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));