path: root/Alc/mixer_sse.c
author    Chris Robinson <[email protected]>    2012-09-11 01:59:42 -0700
committer Chris Robinson <[email protected]>    2012-09-11 02:11:51 -0700
commit    7e81918f7b76135928f7e40eff77493ea98596d5 (patch)
tree      4c369af77374ac8022806b4495ada3b2d8971063 /Alc/mixer_sse.c
parent    1b840a3db80788813d44631357280520ee50e03f (diff)
Update HRTF code
This update allows for much more flexibility in the HRTF data. It also allows HRTF table file names to include "%r" as a placeholder for the device's playback rate (e.g. if you set hrtf-%r.mhr, it will try to use hrtf-44100.mhr or hrtf-48000.mhr depending on whether the device's output rate is 44100 or 48000). The makehrtf utility has also been updated to support more options and input file formats, as well as the new mhr format.
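For illustration, here is a minimal sketch (not the project's actual loader code; the function name and buffer handling are hypothetical) of how a "%r" marker in the configured file name could be expanded to the device's output rate:

#include <stdio.h>
#include <string.h>

/* Hypothetical helper: replace a "%r" marker in 'pattern' with the given
 * sample rate, writing the result to 'out'. Illustrative only. */
static void ExpandHrtfName(const char *pattern, unsigned int rate,
                           char *out, size_t outsize)
{
    const char *marker = strstr(pattern, "%r");
    if(marker != NULL)
    {
        /* Copy the text before "%r", insert the rate, then append the rest. */
        snprintf(out, outsize, "%.*s%u%s",
                 (int)(marker - pattern), pattern, rate, marker + 2);
    }
    else
        snprintf(out, outsize, "%s", pattern);
}

Calling ExpandHrtfName("hrtf-%r.mhr", 44100, name, sizeof(name)) would produce "hrtf-44100.mhr", matching the behavior described above.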
Diffstat (limited to 'Alc/mixer_sse.c')
-rw-r--r--  Alc/mixer_sse.c | 109
1 file changed, 86 insertions(+), 23 deletions(-)
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index 443209bc..27b550e0 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -14,53 +14,116 @@
static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2],
+ const ALuint IrSize,
ALfloat (*RESTRICT Coeffs)[2],
ALfloat (*RESTRICT CoeffStep)[2],
ALfloat left, ALfloat right)
{
const __m128 lrlr = { left, right, left, right };
+ __m128 coeffs, deltas, imp0, imp1;
__m128 vals = _mm_setzero_ps();
- __m128 coeffs, coeffstep;
- ALuint c;
- for(c = 0;c < HRIR_LENGTH;c += 2)
+ ALuint i;
+
+ if((Offset&1))
{
- const ALuint o0 = (Offset++)&HRIR_MASK;
- const ALuint o1 = (Offset++)&HRIR_MASK;
+ const ALuint o0 = Offset&HRIR_MASK;
+ const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
- coeffs = _mm_load_ps(&Coeffs[c][0]);
+ coeffs = _mm_load_ps(&Coeffs[0][0]);
+ deltas = _mm_load_ps(&CoeffStep[0][0]);
vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
- vals = _mm_loadh_pi(vals, (__m64*)&Values[o1][0]);
-
- vals = _mm_add_ps(vals, _mm_mul_ps(coeffs, lrlr));
+ imp0 = _mm_mul_ps(lrlr, coeffs);
+ coeffs = _mm_add_ps(coeffs, deltas);
+ vals = _mm_add_ps(imp0, vals);
+ _mm_store_ps(&Coeffs[0][0], coeffs);
_mm_storel_pi((__m64*)&Values[o0][0], vals);
- _mm_storeh_pi((__m64*)&Values[o1][0], vals);
-
- coeffstep = _mm_load_ps(&CoeffStep[c][0]);
- coeffs = _mm_add_ps(coeffs, coeffstep);
- _mm_store_ps(&Coeffs[c][0], coeffs);
+ for(i = 1;i < IrSize-1;i += 2)
+ {
+ const ALuint o2 = (Offset+i)&HRIR_MASK;
+
+ coeffs = _mm_load_ps(&Coeffs[i+1][0]);
+ deltas = _mm_load_ps(&CoeffStep[i+1][0]);
+ vals = _mm_load_ps(&Values[o2][0]);
+ imp1 = _mm_mul_ps(lrlr, coeffs);
+ coeffs = _mm_add_ps(coeffs, deltas);
+ imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
+ vals = _mm_add_ps(imp0, vals);
+ _mm_store_ps(&Coeffs[i+1][0], coeffs);
+ _mm_store_ps(&Values[o2][0], vals);
+ imp0 = imp1;
+ }
+ vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
+ imp0 = _mm_movehl_ps(imp0, imp0);
+ vals = _mm_add_ps(imp0, vals);
+ _mm_storel_pi((__m64*)&Values[o1][0], vals);
+ }
+ else
+ {
+ for(i = 0;i < IrSize;i += 2)
+ {
+ const ALuint o = (Offset + i)&HRIR_MASK;
+
+ coeffs = _mm_load_ps(&Coeffs[i][0]);
+ deltas = _mm_load_ps(&CoeffStep[i][0]);
+ vals = _mm_load_ps(&Values[o][0]);
+ imp0 = _mm_mul_ps(lrlr, coeffs);
+ coeffs = _mm_add_ps(coeffs, deltas);
+ vals = _mm_add_ps(imp0, vals);
+ _mm_store_ps(&Coeffs[i][0], coeffs);
+ _mm_store_ps(&Values[o][0], vals);
+ }
}
}
static __inline void ApplyCoeffs(ALuint Offset, ALfloat (*RESTRICT Values)[2],
+ const ALuint IrSize,
ALfloat (*RESTRICT Coeffs)[2],
ALfloat left, ALfloat right)
{
const __m128 lrlr = { left, right, left, right };
__m128 vals = _mm_setzero_ps();
__m128 coeffs;
- ALuint c;
- for(c = 0;c < HRIR_LENGTH;c += 2)
+ ALuint i;
+
+ if((Offset&1))
{
- const ALuint o0 = (Offset++)&HRIR_MASK;
- const ALuint o1 = (Offset++)&HRIR_MASK;
+ const ALuint o0 = Offset&HRIR_MASK;
+ const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
+ __m128 imp0, imp1;
- coeffs = _mm_load_ps(&Coeffs[c][0]);
+ coeffs = _mm_load_ps(&Coeffs[0][0]);
vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
- vals = _mm_loadh_pi(vals, (__m64*)&Values[o1][0]);
-
- vals = _mm_add_ps(vals, _mm_mul_ps(coeffs, lrlr));
+ imp0 = _mm_mul_ps(lrlr, coeffs);
+ vals = _mm_add_ps(imp0, vals);
_mm_storel_pi((__m64*)&Values[o0][0], vals);
- _mm_storeh_pi((__m64*)&Values[o1][0], vals);
+ for(i = 1;i < IrSize-1;i += 2)
+ {
+ const ALuint o2 = (Offset+i)&HRIR_MASK;
+
+ coeffs = _mm_load_ps(&Coeffs[i+1][0]);
+ vals = _mm_load_ps(&Values[o2][0]);
+ imp1 = _mm_mul_ps(lrlr, coeffs);
+ imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
+ vals = _mm_add_ps(imp0, vals);
+ _mm_store_ps(&Values[o2][0], vals);
+ imp0 = imp1;
+ }
+ vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
+ imp0 = _mm_movehl_ps(imp0, imp0);
+ vals = _mm_add_ps(imp0, vals);
+ _mm_storel_pi((__m64*)&Values[o1][0], vals);
+ }
+ else
+ {
+ for(i = 0;i < IrSize;i += 2)
+ {
+ const ALuint o = (Offset + i)&HRIR_MASK;
+
+ coeffs = _mm_load_ps(&Coeffs[i][0]);
+ vals = _mm_load_ps(&Values[o][0]);
+ vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs));
+ _mm_store_ps(&Values[o][0], vals);
+ }
}
}
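Both SSE routines accumulate a stereo HRIR contribution into the circular Values buffer, processing two (left, right) coefficient pairs per 128-bit load. When Offset is odd, the first and last pairs fall off the 16-byte alignment of Values, so that branch handles them with half-register loads/stores (_mm_loadl_pi/_mm_storel_pi) and realigns the intermediate products with _mm_shuffle_ps and _mm_movehl_ps; the even-Offset branch is a straight aligned loop. For reference, the per-sample operation being vectorized can be written as a plain scalar loop. This is an illustrative sketch using the same types and macros (ALuint, ALfloat, RESTRICT, HRIR_MASK) as the code above, not the project's actual C fallback:

/* Scalar sketch of the ApplyCoeffs operation: each wrapped position in the
 * circular Values buffer accumulates the (left, right) input scaled by the
 * corresponding HRIR coefficient pair. IrSize is the active impulse length. */
static void ApplyCoeffsScalar(ALuint Offset, ALfloat (*RESTRICT Values)[2],
                              const ALuint IrSize,
                              ALfloat (*RESTRICT Coeffs)[2],
                              ALfloat left, ALfloat right)
{
    ALuint i;
    for(i = 0;i < IrSize;i++)
    {
        const ALuint off = (Offset+i)&HRIR_MASK;
        Values[off][0] += Coeffs[i][0] * left;
        Values[off][1] += Coeffs[i][1] * right;
    }
}

ApplyCoeffsStep does the same accumulation while also advancing each coefficient pair by its CoeffStep delta after use, which is why the SSE version above interleaves _mm_add_ps of the deltas with the multiply-accumulate.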