aboutsummaryrefslogtreecommitdiffstats
path: root/Alc
diff options
context:
space:
mode:
Diffstat (limited to 'Alc')
-rw-r--r--Alc/mixer_c.c8
-rw-r--r--Alc/mixer_neon.c4
-rw-r--r--Alc/mixer_sse.c42
3 files changed, 28 insertions, 26 deletions
diff --git a/Alc/mixer_c.c b/Alc/mixer_c.c
index 15c603ca..1284371b 100644
--- a/Alc/mixer_c.c
+++ b/Alc/mixer_c.c
@@ -62,10 +62,10 @@ const ALfloat *Resample_bsinc32_C(const BsincState *state, const ALfloat *restri
const ALfloat *fil, *scd, *phd, *spd;
const ALfloat sf = state->sf;
const ALsizei m = state->m;
- const ALint l = state->l;
- ALsizei j_s, j_f, pi, i;
+ ALsizei j_f, pi, i;
ALfloat pf, r;
+ src += state->l;
for(i = 0;i < dstlen;i++)
{
// Calculate the phase index and factor.
@@ -81,9 +81,9 @@ const ALfloat *Resample_bsinc32_C(const BsincState *state, const ALfloat *restri
// Apply the scale and phase interpolated filter.
r = 0.0f;
- for(j_f = 0,j_s = l;j_f < m;j_f++,j_s++)
+ for(j_f = 0;j_f < m;j_f++)
r += (fil[j_f] + sf*scd[j_f] + pf*(phd[j_f] + sf*spd[j_f])) *
- src[j_s];
+ src[j_f];
dst[i] = r;
frac += increment;
diff --git a/Alc/mixer_neon.c b/Alc/mixer_neon.c
index 75ad14eb..2875b321 100644
--- a/Alc/mixer_neon.c
+++ b/Alc/mixer_neon.c
@@ -248,8 +248,8 @@ const ALfloat *Resample_bsinc32_Neon(const BsincState *state, const ALfloat *res
const float32x4_t pf4 = vdupq_n_f32(pf);
for(j = 0;j < m;j+=4)
{
- float32x4_t f4 = vmlaq_f32(vld1q_f32(&fil[j]), sf4, vld1q_f32(&scd[j]));
- f4 = vmlaq_f32(f4,
+ const float32x4_t f4 = vmlaq_f32(vmlaq_f32(vld1q_f32(&fil[j]),
+ sf4, vld1q_f32(&scd[j])),
pf4, vmlaq_f32(vld1q_f32(&phd[j]),
sf4, vld1q_f32(&spd[j])
)
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index 96228b47..25daf00b 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -12,18 +12,24 @@
#include "mixer_defs.h"
+#ifdef __GNUC__
+#define ASSUME_ALIGNED(ptr, ...) __builtin_assume_aligned((ptr), __VA_ARGS__)
+#else
+#define ASSUME_ALIGNED(ptr, ...) (ptr)
+#endif
+
const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *restrict src,
ALuint frac, ALint increment, ALfloat *restrict dst,
ALsizei dstlen)
{
const __m128 sf4 = _mm_set1_ps(state->sf);
const ALsizei m = state->m;
- const ALint l = state->l;
const ALfloat *fil, *scd, *phd, *spd;
- ALsizei j_s, pi, j_f, i;
+ ALsizei pi, i, j;
ALfloat pf;
__m128 r4;
+ src += state->l;
for(i = 0;i < dstlen;i++)
{
// Calculate the phase index and factor.
@@ -32,32 +38,28 @@ const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *rest
pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF));
#undef FRAC_PHASE_BITDIFF
- fil = state->coeffs[pi].filter;
- scd = state->coeffs[pi].scDelta;
- phd = state->coeffs[pi].phDelta;
- spd = state->coeffs[pi].spDelta;
+ fil = ASSUME_ALIGNED(state->coeffs[pi].filter, 16);
+ scd = ASSUME_ALIGNED(state->coeffs[pi].scDelta, 16);
+ phd = ASSUME_ALIGNED(state->coeffs[pi].phDelta, 16);
+ spd = ASSUME_ALIGNED(state->coeffs[pi].spDelta, 16);
// Apply the scale and phase interpolated filter.
r4 = _mm_setzero_ps();
{
const __m128 pf4 = _mm_set1_ps(pf);
- for(j_f = 0,j_s = l;j_f < m;j_f+=4,j_s+=4)
+#define LD4(x) _mm_load_ps(x)
+#define ULD4(x) _mm_loadu_ps(x)
+#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
+ for(j = 0;j < m;j+=4)
{
- const __m128 f4 = _mm_add_ps(
- _mm_add_ps(
- _mm_load_ps(&fil[j_f]),
- _mm_mul_ps(sf4, _mm_load_ps(&scd[j_f]))
- ),
- _mm_mul_ps(
- pf4,
- _mm_add_ps(
- _mm_load_ps(&phd[j_f]),
- _mm_mul_ps(sf4, _mm_load_ps(&spd[j_f]))
- )
- )
+ const __m128 f4 = MLA4(MLA4(LD4(&fil[j]), sf4, LD4(&scd[j])),
+ pf4, MLA4(LD4(&phd[j]), sf4, LD4(&spd[j]))
);
- r4 = _mm_add_ps(r4, _mm_mul_ps(f4, _mm_loadu_ps(&src[j_s])));
+ r4 = MLA4(r4, f4, ULD4(&src[j]));
}
+#undef MLA4
+#undef ULD4
+#undef LD4
}
r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));