about | summary | refs | log | tree | commit | diff | stats
path: root/Alc/mixer_sse.c
diff options
context:
space:
mode:
Diffstat (limited to 'Alc/mixer_sse.c')
-rw-r--r-- Alc/mixer_sse.c 42
1 files changed, 22 insertions, 20 deletions
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index 96228b47..25daf00b 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -12,18 +12,24 @@
#include "mixer_defs.h"
+#ifdef __GNUC__
+#define ASSUME_ALIGNED(ptr, ...) __builtin_assume_aligned((ptr), __VA_ARGS__)
+#else
+#define ASSUME_ALIGNED(ptr, ...) (ptr)
+#endif
+
const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *restrict src,
ALuint frac, ALint increment, ALfloat *restrict dst,
ALsizei dstlen)
{
const __m128 sf4 = _mm_set1_ps(state->sf);
const ALsizei m = state->m;
- const ALint l = state->l;
const ALfloat *fil, *scd, *phd, *spd;
- ALsizei j_s, pi, j_f, i;
+ ALsizei pi, i, j;
ALfloat pf;
__m128 r4;
+ src += state->l;
for(i = 0;i < dstlen;i++)
{
// Calculate the phase index and factor.
@@ -32,32 +38,28 @@ const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *rest
pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
#undef FRAC_PHASE_BITDIFF
- fil = state->coeffs[pi].filter;
- scd = state->coeffs[pi].scDelta;
- phd = state->coeffs[pi].phDelta;
- spd = state->coeffs[pi].spDelta;
+ fil = ASSUME_ALIGNED(state->coeffs[pi].filter, 16);
+ scd = ASSUME_ALIGNED(state->coeffs[pi].scDelta, 16);
+ phd = ASSUME_ALIGNED(state->coeffs[pi].phDelta, 16);
+ spd = ASSUME_ALIGNED(state->coeffs[pi].spDelta, 16);
// Apply the scale and phase interpolated filter.
r4 = _mm_setzero_ps();
{
const __m128 pf4 = _mm_set1_ps(pf);
- for(j_f = 0,j_s = l;j_f < m;j_f+=4,j_s+=4)
+#define LD4(x) _mm_load_ps(x)
+#define ULD4(x) _mm_loadu_ps(x)
+#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
+ for(j = 0;j < m;j+=4)
{
- const __m128 f4 = _mm_add_ps(
- _mm_add_ps(
- _mm_load_ps(&fil[j_f]),
- _mm_mul_ps(sf4, _mm_load_ps(&scd[j_f]))
- ),
- _mm_mul_ps(
- pf4,
- _mm_add_ps(
- _mm_load_ps(&phd[j_f]),
- _mm_mul_ps(sf4, _mm_load_ps(&spd[j_f]))
- )
- )
+ const __m128 f4 = MLA4(MLA4(LD4(&fil[j]), sf4, LD4(&scd[j])),
+ pf4, MLA4(LD4(&phd[j]), sf4, LD4(&spd[j]))
);
- r4 = _mm_add_ps(r4, _mm_mul_ps(f4, _mm_loadu_ps(&src[j_s])));
+ r4 = MLA4(r4, f4, ULD4(&src[j]));
}
+#undef MLA4
+#undef ULD4
+#undef LD4
}
r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));