Diffstat (limited to 'Alc/mixer_sse.c')
-rw-r--r--  Alc/mixer_sse.c  |  86
1 file changed, 86 insertions, 0 deletions
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index aff0152b..84884de3 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -58,6 +58,92 @@ void Resample_lerp32_SSE(const ALfloat *data, ALuint frac,
}
}
+void Resample_cubic32_SSE(const ALfloat *data, ALuint frac,
+ ALuint increment, ALuint NumChannels, ALfloat *RESTRICT OutBuffer,
+ ALuint BufferSize)
+{
+    /* Cubic (Catmull-Rom) interpolation mainly consists of a matrix4 *
+     * vector4 operation: the four input samples are multiplied through the
+     * coefficient matrix, the resulting polynomial coefficients are scaled
+     * by the powers of mu, and the four terms are summed for the final
+     * sample. */
+ static const __m128 matrix[4] = {
+        { -0.5f,  1.0f, -0.5f,  0.0f },
+        {  1.5f, -2.5f,  0.0f,  1.0f },
+        { -1.5f,  2.0f,  0.5f,  0.0f },
+        {  0.5f, -0.5f,  0.0f,  0.0f },
+ };
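+    /* Each matrix[i] holds one column of the Catmull-Rom basis matrix: the
+     * coefficients that input sample i (data[pos-1] through data[pos+2])
+     * contributes to the polynomial terms a3, a2, a1 and a0, stored in
+     * elements 0 through 3 respectively. */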
+ ALIGN(16) float value[4];
+ ALuint pos = 0;
+ ALuint i, j;
+
+    /* Process four output samples per iteration; writing the loop bound as
+     * i+3 < BufferSize+1 avoids unsigned wrap-around when BufferSize+1 is
+     * less than 4. */
+    for(i = 0;i+3 < BufferSize+1;i+=4)
+ {
+ __m128 result, final[4];
+
+ for(j = 0;j < 4;j++)
+ {
+ __m128 val4, s;
+ ALfloat mu;
+
+ val4 = _mm_set_ps(data[(pos-1)*NumChannels],
+ data[(pos )*NumChannels],
+ data[(pos+1)*NumChannels],
+ data[(pos+2)*NumChannels]);
+ mu = frac * (1.0f/FRACTIONONE);
+ s = _mm_set_ps(1.0f, mu, mu*mu, mu*mu*mu);
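+            /* _mm_set_ps stores its arguments high-to-low: element 0 of s is
+             * mu^3 and element 0 of val4 is data[pos+2], matching the order
+             * of the cubic terms produced in result. */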
+
+            /* result = matrix * val4, broadcasting each input sample across
+             * its column of the matrix */
+            result = _mm_mul_ps(_mm_shuffle_ps(val4, val4, _MM_SHUFFLE(3,3,3,3)), matrix[0]);
+            result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(val4, val4, _MM_SHUFFLE(2,2,2,2)), matrix[1]));
+            result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(val4, val4, _MM_SHUFFLE(1,1,1,1)), matrix[2]));
+            result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(val4, val4, _MM_SHUFFLE(0,0,0,0)), matrix[3]));
+
+ /* final[j] = result * { mu^0, mu^1, mu^2, mu^3 } */
+ final[j] = _mm_mul_ps(result, s);
+
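+            /* Step the fixed-point source position: the carry out of the
+             * fractional accumulator advances pos, and frac keeps only the
+             * low FRACTIONBITS bits for the next sample. */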
+ frac += increment;
+ pos += frac>>FRACTIONBITS;
+ frac &= FRACTIONMASK;
+ }
+ /* Transpose the final "matrix" so adding the rows will give the four
+ * samples. TODO: Is this faster than doing..
+ * _mm_store_ps(value, result);
+ * OutBuffer[i] = value[0] + value[1] + value[2] + value[3];
+ * ..for each sample?
+ */
+ _MM_TRANSPOSE4_PS(final[0], final[1], final[2], final[3]);
+ result = _mm_add_ps(_mm_add_ps(final[0], final[1]),
+ _mm_add_ps(final[2], final[3]));
+
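+        /* Aligned store: i is a multiple of four floats here, so this
+         * assumes the caller provides a 16-byte-aligned OutBuffer. */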
+ _mm_store_ps(&OutBuffer[i], result);
+ }
+ for(;i < BufferSize+1;i++)
+ {
+ __m128 val4, s, result;
+ ALfloat mu;
+
+ val4 = _mm_set_ps(data[(pos-1)*NumChannels],
+ data[(pos )*NumChannels],
+ data[(pos+1)*NumChannels],
+ data[(pos+2)*NumChannels]);
+ mu = frac * (1.0f/FRACTIONONE);
+ s = _mm_set_ps(1.0f, mu, mu*mu, mu*mu*mu);
+
+        /* result = matrix * val4, broadcasting each input sample across its
+         * column of the matrix */
+        result = _mm_mul_ps(_mm_shuffle_ps(val4, val4, _MM_SHUFFLE(3,3,3,3)), matrix[0]);
+        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(val4, val4, _MM_SHUFFLE(2,2,2,2)), matrix[1]));
+        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(val4, val4, _MM_SHUFFLE(1,1,1,1)), matrix[2]));
+        result = _mm_add_ps(result, _mm_mul_ps(_mm_shuffle_ps(val4, val4, _MM_SHUFFLE(0,0,0,0)), matrix[3]));
+
+ /* value = result * { mu^0, mu^1, mu^2, mu^3 } */
+ _mm_store_ps(value, _mm_mul_ps(result, s));
+
+ OutBuffer[i] = value[0] + value[1] + value[2] + value[3];
+
+ frac += increment;
+ pos += frac>>FRACTIONBITS;
+ frac &= FRACTIONMASK;
+ }
+}
static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2],
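
As a sanity check on the SIMD path, the polynomial the new function evaluates
can be written as a scalar Catmull-Rom helper like the sketch below (an
illustrative reference only, not part of this commit; cubic4 is a hypothetical
name, with v0..v3 standing for data[(pos-1)*NumChannels] through
data[(pos+2)*NumChannels] and mu for frac * (1.0f/FRACTIONONE)):

    static float cubic4(float v0, float v1, float v2, float v3, float mu)
    {
        /* Same coefficients as the matrix columns above. */
        const float a3 = -0.5f*v0 + 1.5f*v1 - 1.5f*v2 + 0.5f*v3;
        const float a2 =  1.0f*v0 - 2.5f*v1 + 2.0f*v2 - 0.5f*v3;
        const float a1 = -0.5f*v0 + 0.5f*v2;
        const float a0 =  v1;
        return ((a3*mu + a2)*mu + a1)*mu + a0;
    }

Each output sample of Resample_cubic32_SSE should match cubic4() for the same
inputs to within floating-point rounding, which makes a handy cross-check when
modifying the SSE code.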