author     Chris Robinson <[email protected]>    2014-11-23 10:49:54 -0800
committer  Chris Robinson <[email protected]>    2014-11-23 10:49:54 -0800
commit     45d6bb58a4293c5b1ab229cea86e0ef24a2a084b (patch)
tree       ec03ad6eac812ae209f8d973687afa5b99616133 /Alc/mixer_sse.c
parent     fc3608b381c0492674b4cfc1da0dcf5389ace722 (diff)
Partially revert "Use a different method for HRTF mixing"
The sound localization with virtual channel mixing was just too poor, so while
it's more costly to do per-source HRTF mixing, it's unavoidable if you want
good localization.
This is only partially reverted because keeping the virtual channels is still
beneficial, particularly for B-Format rendering and effect mixing, which would
otherwise skip HRTF processing. As before, the number of virtual channels can
potentially be customized, specifying more or fewer channels depending on the
system's needs.
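To make that trade-off concrete, below is a minimal, hypothetical sketch; it is not OpenAL Soft's mixer, and the function names, the fixed IR length, and the channel count are invented for illustration. Per-source HRTF mixing convolves each source with its own HRIR pair, so the cost grows with the source count but every source keeps its exact direction; virtual-channel mixing pans all sources onto a small fixed channel set first and applies one HRIR pair per channel, which is cheaper but blurs localization.

/* Hypothetical sketch (not OpenAL Soft code) of per-source HRTF mixing vs.
 * virtual-channel mixing. Output buffers are assumed zero-initialized. */
#include <stddef.h>
#include <string.h>

#define IR_LEN    32   /* illustrative HRIR length */
#define NUM_VCHAN 4    /* illustrative virtual channel count */

typedef struct { float left[IR_LEN], right[IR_LEN]; } Hrir;

/* Per-source path: one HRIR convolution per source. */
static void mix_hrtf_per_source(float *out_l, float *out_r, size_t frames,
                                const float *const *srcs, const Hrir *src_hrirs,
                                size_t num_srcs)
{
    for(size_t s = 0;s < num_srcs;s++)
        for(size_t i = 0;i < frames;i++)
            for(size_t t = 0;t < IR_LEN && t <= i;t++)
            {
                out_l[i] += srcs[s][i-t] * src_hrirs[s].left[t];
                out_r[i] += srcs[s][i-t] * src_hrirs[s].right[t];
            }
}

/* Virtual-channel path: pan sources onto NUM_VCHAN channels with plain gains,
 * then run one fixed HRIR per channel regardless of how many sources exist. */
static void mix_hrtf_virtual(float *out_l, float *out_r, size_t frames,
                             const float *const *srcs,
                             const float (*gains)[NUM_VCHAN], size_t num_srcs,
                             const Hrir chan_hrirs[NUM_VCHAN],
                             float *chan_buf /* NUM_VCHAN*frames scratch */)
{
    memset(chan_buf, 0, sizeof(float)*NUM_VCHAN*frames);
    for(size_t s = 0;s < num_srcs;s++)
        for(size_t c = 0;c < NUM_VCHAN;c++)
            for(size_t i = 0;i < frames;i++)
                chan_buf[c*frames + i] += srcs[s][i] * gains[s][c];

    for(size_t c = 0;c < NUM_VCHAN;c++)
        for(size_t i = 0;i < frames;i++)
            for(size_t t = 0;t < IR_LEN && t <= i;t++)
            {
                out_l[i] += chan_buf[c*frames + (i-t)] * chan_hrirs[c].left[t];
                out_r[i] += chan_buf[c*frames + (i-t)] * chan_hrirs[c].right[t];
            }
}

In the per-source path the convolution work is repeated for every source, while the virtual-channel path does a cheap gain mix per source and a fixed amount of HRIR filtering, which is why this commit keeps the virtual channels around for B-Format and effect output even though sources now get their own HRTF pass again.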
Diffstat (limited to 'Alc/mixer_sse.c')
-rw-r--r--   Alc/mixer_sse.c   62
1 file changed, 62 insertions, 0 deletions
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index d0dca40e..d86cf749 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -19,6 +19,68 @@
 #include "mixer_defs.h"
 
+static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2],
+                                   const ALuint IrSize,
+                                   ALfloat (*restrict Coeffs)[2],
+                                   const ALfloat (*restrict CoeffStep)[2],
+                                   ALfloat left, ALfloat right)
+{
+    const __m128 lrlr = _mm_setr_ps(left, right, left, right);
+    __m128 coeffs, deltas, imp0, imp1;
+    __m128 vals = _mm_setzero_ps();
+    ALuint i;
+
+    if((Offset&1))
+    {
+        const ALuint o0 = Offset&HRIR_MASK;
+        const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK;
+
+        coeffs = _mm_load_ps(&Coeffs[0][0]);
+        deltas = _mm_load_ps(&CoeffStep[0][0]);
+        vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]);
+        imp0 = _mm_mul_ps(lrlr, coeffs);
+        coeffs = _mm_add_ps(coeffs, deltas);
+        vals = _mm_add_ps(imp0, vals);
+        _mm_store_ps(&Coeffs[0][0], coeffs);
+        _mm_storel_pi((__m64*)&Values[o0][0], vals);
+        for(i = 1;i < IrSize-1;i += 2)
+        {
+            const ALuint o2 = (Offset+i)&HRIR_MASK;
+
+            coeffs = _mm_load_ps(&Coeffs[i+1][0]);
+            deltas = _mm_load_ps(&CoeffStep[i+1][0]);
+            vals = _mm_load_ps(&Values[o2][0]);
+            imp1 = _mm_mul_ps(lrlr, coeffs);
+            coeffs = _mm_add_ps(coeffs, deltas);
+            imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2));
+            vals = _mm_add_ps(imp0, vals);
+            _mm_store_ps(&Coeffs[i+1][0], coeffs);
+            _mm_store_ps(&Values[o2][0], vals);
+            imp0 = imp1;
+        }
+        vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]);
+        imp0 = _mm_movehl_ps(imp0, imp0);
+        vals = _mm_add_ps(imp0, vals);
+        _mm_storel_pi((__m64*)&Values[o1][0], vals);
+    }
+    else
+    {
+        for(i = 0;i < IrSize;i += 2)
+        {
+            const ALuint o = (Offset + i)&HRIR_MASK;
+
+            coeffs = _mm_load_ps(&Coeffs[i][0]);
+            deltas = _mm_load_ps(&CoeffStep[i][0]);
+            vals = _mm_load_ps(&Values[o][0]);
+            imp0 = _mm_mul_ps(lrlr, coeffs);
+            coeffs = _mm_add_ps(coeffs, deltas);
+            vals = _mm_add_ps(imp0, vals);
+            _mm_store_ps(&Coeffs[i][0], coeffs);
+            _mm_store_ps(&Values[o][0], vals);
+        }
+    }
+}
+
 static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2],
                                const ALuint IrSize,
                                ALfloat (*restrict Coeffs)[2],
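For readers less familiar with SSE intrinsics, the added ApplyCoeffsStep processes two stereo HRIR taps per iteration: each tap accumulates the current left/right sample scaled by its coefficient pair into the circular Values buffer, then steps the coefficients by CoeffStep so the filter interpolates toward its new target. The odd-Offset branch exists only to keep the 16-byte-aligned loads and stores correct when the circular buffer index starts on an odd entry. A hypothetical scalar equivalent, with stand-in typedefs and an illustrative HRIR_LENGTH (the real definitions live in OpenAL Soft's headers), might look like:

/* Hypothetical scalar equivalent of the SSE ApplyCoeffsStep above; shown only
 * as a reading aid, not as OpenAL Soft code. */
#include <stdint.h>

typedef uint32_t ALuint;
typedef float    ALfloat;
#define HRIR_LENGTH 32                 /* illustrative power-of-two HRIR size */
#define HRIR_MASK   (HRIR_LENGTH - 1)  /* wraps indices into the circular buffer */

static inline void ApplyCoeffsStepScalar(ALuint Offset, ALfloat (*restrict Values)[2],
                                         const ALuint IrSize,
                                         ALfloat (*restrict Coeffs)[2],
                                         const ALfloat (*restrict CoeffStep)[2],
                                         ALfloat left, ALfloat right)
{
    ALuint c;
    for(c = 0;c < IrSize;c++)
    {
        const ALuint off = (Offset+c)&HRIR_MASK;
        /* Accumulate the current sample scaled by this tap's left/right
         * coefficients into the circular delay line... */
        Values[off][0] += Coeffs[c][0] * left;
        Values[off][1] += Coeffs[c][1] * right;
        /* ...then advance the coefficients one step so the HRIR change is
         * interpolated over the course of the mix. */
        Coeffs[c][0] += CoeffStep[c][0];
        Coeffs[c][1] += CoeffStep[c][1];
    }
}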