4 files changed, 100 insertions, 76 deletions
diff --git a/Alc/mixer/hrtf_inc.c b/Alc/mixer/hrtf_inc.c
index d6bd8042..3ef22f24 100644
--- a/Alc/mixer/hrtf_inc.c
+++ b/Alc/mixer/hrtf_inc.c
@@ -22,8 +22,9 @@ void MixHrtf(ALfloat *restrict LeftOut, ALfloat *restrict RightOut,
 {
     const ALfloat (*Coeffs)[2] = ASSUME_ALIGNED(hrtfparams->Coeffs, 16);
     const ALsizei Delay[2] = { hrtfparams->Delay[0], hrtfparams->Delay[1] };
-    ALfloat gainstep = hrtfparams->GainStep;
-    ALfloat gain = hrtfparams->Gain;
+    const ALfloat gainstep = hrtfparams->GainStep;
+    const ALfloat gain = hrtfparams->Gain;
+    ALfloat g, stepcount = 0.0f;
     ALfloat left, right;
     ALsizei i;
 
@@ -35,8 +36,10 @@ void MixHrtf(ALfloat *restrict LeftOut, ALfloat *restrict RightOut,
     for(i = 0;i < BufferSize;i++)
     {
         hrtfstate->History[Offset&HRTF_HISTORY_MASK] = *(data++);
-        left = hrtfstate->History[(Offset-Delay[0])&HRTF_HISTORY_MASK]*gain;
-        right = hrtfstate->History[(Offset-Delay[1])&HRTF_HISTORY_MASK]*gain;
+
+        g = gain + gainstep*stepcount;
+        left = hrtfstate->History[(Offset-Delay[0])&HRTF_HISTORY_MASK]*g;
+        right = hrtfstate->History[(Offset-Delay[1])&HRTF_HISTORY_MASK]*g;
 
         hrtfstate->Values[(Offset+IrSize-1)&HRIR_MASK][0] = 0.0f;
         hrtfstate->Values[(Offset+IrSize-1)&HRIR_MASK][1] = 0.0f;
@@ -45,10 +48,10 @@ void MixHrtf(ALfloat *restrict LeftOut, ALfloat *restrict RightOut,
         *(LeftOut++)  += hrtfstate->Values[Offset&HRIR_MASK][0];
         *(RightOut++) += hrtfstate->Values[Offset&HRIR_MASK][1];
 
-        gain += gainstep;
+        stepcount += 1.0f;
         Offset++;
     }
-    hrtfparams->Gain = gain;
+    hrtfparams->Gain = gain + gainstep*stepcount;
 }
 
 void MixHrtfBlend(ALfloat *restrict LeftOut, ALfloat *restrict RightOut,
@@ -59,12 +62,13 @@ void MixHrtfBlend(ALfloat *restrict LeftOut, ALfloat *restrict RightOut,
 {
     const ALfloat (*OldCoeffs)[2] = ASSUME_ALIGNED(oldparams->Coeffs, 16);
     const ALsizei OldDelay[2] = { oldparams->Delay[0], oldparams->Delay[1] };
-    ALfloat oldGain = oldparams->Gain;
-    ALfloat oldGainStep = -oldGain / (ALfloat)BufferSize;
+    const ALfloat oldGain = oldparams->Gain;
+    const ALfloat oldGainStep = -oldGain / (ALfloat)BufferSize;
     const ALfloat (*NewCoeffs)[2] = ASSUME_ALIGNED(newparams->Coeffs, 16);
     const ALsizei NewDelay[2] = { newparams->Delay[0], newparams->Delay[1] };
-    ALfloat newGain = newparams->Gain;
-    ALfloat newGainStep = newparams->GainStep;
+    const ALfloat newGain = newparams->Gain;
+    const ALfloat newGainStep = newparams->GainStep;
+    ALfloat g, stepcount = 0.0f;
     ALfloat left, right;
     ALsizei i;
 
@@ -80,22 +84,23 @@ void MixHrtfBlend(ALfloat *restrict LeftOut, ALfloat *restrict RightOut,
 
         hrtfstate->History[Offset&HRTF_HISTORY_MASK] = *(data++);
 
-        left = hrtfstate->History[(Offset-OldDelay[0])&HRTF_HISTORY_MASK]*oldGain;
-        right = hrtfstate->History[(Offset-OldDelay[1])&HRTF_HISTORY_MASK]*oldGain;
+        g = oldGain + oldGainStep*stepcount;
+        left = hrtfstate->History[(Offset-OldDelay[0])&HRTF_HISTORY_MASK]*g;
+        right = hrtfstate->History[(Offset-OldDelay[1])&HRTF_HISTORY_MASK]*g;
         ApplyCoeffs(Offset, hrtfstate->Values, IrSize, OldCoeffs, left, right);
 
-        left = hrtfstate->History[(Offset-NewDelay[0])&HRTF_HISTORY_MASK]*newGain;
-        right = hrtfstate->History[(Offset-NewDelay[1])&HRTF_HISTORY_MASK]*newGain;
+        g = newGain + newGainStep*stepcount;
+        left = hrtfstate->History[(Offset-NewDelay[0])&HRTF_HISTORY_MASK]*g;
+        right = hrtfstate->History[(Offset-NewDelay[1])&HRTF_HISTORY_MASK]*g;
         ApplyCoeffs(Offset, hrtfstate->Values, IrSize, NewCoeffs, left, right);
 
         *(LeftOut++)  += hrtfstate->Values[Offset&HRIR_MASK][0];
         *(RightOut++) += hrtfstate->Values[Offset&HRIR_MASK][1];
 
-        oldGain += oldGainStep;
-        newGain += newGainStep;
+        stepcount += 1.0f;
         Offset++;
     }
-    newparams->Gain = newGain;
+    newparams->Gain = newGain + newGainStep*stepcount;
 }
 
 void MixDirectHrtf(ALfloat *restrict LeftOut, ALfloat *restrict RightOut,
diff --git a/Alc/mixer/mixer_c.c b/Alc/mixer/mixer_c.c
index 84485206..25149e00 100644
--- a/Alc/mixer/mixer_c.c
+++ b/Alc/mixer/mixer_c.c
@@ -134,13 +134,16 @@ void Mix_C(const ALfloat *data, ALsizei OutChans, ALfloat (*restrict OutBuffer)[
         if(fabsf(step) > FLT_EPSILON)
         {
             ALsizei minsize = mini(BufferSize, Counter);
+            ALfloat step_count = 0.0f;
             for(;pos < minsize;pos++)
             {
-                OutBuffer[c][OutPos+pos] += data[pos]*gain;
-                gain += step;
+                OutBuffer[c][OutPos+pos] += data[pos] * (gain + step*step_count);
+                step_count += 1.0f;
             }
             if(pos == Counter)
                 gain = TargetGains[c];
+            else
+                gain += step*step_count;
             CurrentGains[c] = gain;
         }
 
diff --git a/Alc/mixer/mixer_neon.c b/Alc/mixer/mixer_neon.c
index 1a5e8ee7..b6181b42 100644
--- a/Alc/mixer/mixer_neon.c
+++ b/Alc/mixer/mixer_neon.c
@@ -165,8 +165,7 @@ void Mix_Neon(const ALfloat *data, ALsizei OutChans, ALfloat (*restrict OutBuffe
               ALfloat *CurrentGains, const ALfloat *TargetGains, ALsizei Counter, ALsizei OutPos,
               ALsizei BufferSize)
 {
-    ALfloat gain, delta, step;
-    float32x4_t gain4;
+    const ALfloat delta = (Counter > 0) ? 1.0f/(ALfloat)Counter : 0.0f;
     ALsizei c;
 
     ASSUME(OutChans > 0);
@@ -174,47 +173,54 @@ void Mix_Neon(const ALfloat *data, ALsizei OutChans, ALfloat (*restrict OutBuffe
     data = ASSUME_ALIGNED(data, 16);
     OutBuffer = ASSUME_ALIGNED(OutBuffer, 16);
 
-    delta = (Counter > 0) ? 1.0f/(ALfloat)Counter : 0.0f;
-
     for(c = 0;c < OutChans;c++)
     {
         ALsizei pos = 0;
-        gain = CurrentGains[c];
-        step = (TargetGains[c] - gain) * delta;
+        ALfloat gain = CurrentGains[c];
+        const ALfloat step = (TargetGains[c] - gain) * delta;
+
         if(fabsf(step) > FLT_EPSILON)
         {
             ALsizei minsize = mini(BufferSize, Counter);
+            ALfloat step_count = 0.0f;
             /* Mix with applying gain steps in aligned multiples of 4. */
-            if(minsize-pos > 3)
+            if(LIKELY(minsize > 3))
             {
-                float32x4_t step4;
-                gain4 = vsetq_lane_f32(gain, gain4, 0);
-                gain4 = vsetq_lane_f32(gain + step, gain4, 1);
-                gain4 = vsetq_lane_f32(gain + step + step, gain4, 2);
-                gain4 = vsetq_lane_f32(gain + step + step + step, gain4, 3);
-                step4 = vdupq_n_f32(step + step + step + step);
+                const float32x4_t four4 = vdupq_n_f32(4.0f);
+                const float32x4_t step4 = vdupq_n_f32(step);
+                const float32x4_t gain4 = vdupq_n_f32(gain);
+                float32x4_t step_count4 = vsetq_lane_f32(0.0f,
+                    vsetq_lane_f32(1.0f,
+                    vsetq_lane_f32(2.0f,
+                    vsetq_lane_f32(3.0f, vdupq_n_f32(0.0f), 3),
+                    2), 1), 0
+                );
+                ALsizei todo = minsize >> 2;
+
                 do {
                     const float32x4_t val4 = vld1q_f32(&data[pos]);
                     float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
-                    dry4 = vmlaq_f32(dry4, val4, gain4);
-                    gain4 = vaddq_f32(gain4, step4);
+                    dry4 = vmlaq_f32(dry4, val4, vmlaq_f32(gain4, step4, step_count4));
+                    step_count4 = vaddq_f32(step_count4, four4);
                     vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
                     pos += 4;
-                } while(minsize-pos > 3);
-                /* NOTE: gain4 now represents the next four gains after the
-                 * last four mixed samples, so the lowest element represents
-                 * the next gain to apply.
+                } while(--todo);
+                /* NOTE: step_count4 now represents the next four counts after
+                 * the last four mixed samples, so the lowest element
+                 * represents the next step count to apply.
                  */
-                gain = vgetq_lane_f32(gain4, 0);
+                step_count = vgetq_lane_f32(step_count4, 0);
             }
             /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
             for(;pos < minsize;pos++)
             {
-                OutBuffer[c][OutPos+pos] += data[pos]*gain;
-                gain += step;
+                OutBuffer[c][OutPos+pos] += data[pos]*(gain + step*step_count);
+                step_count += 1.0f;
             }
             if(pos == Counter)
                 gain = TargetGains[c];
+            else
+                gain += step*step_count;
             CurrentGains[c] = gain;
 
             /* Mix until pos is aligned with 4 or the mix is done. */
@@ -225,13 +231,17 @@ void Mix_Neon(const ALfloat *data, ALsizei OutChans, ALfloat (*restrict OutBuffe
 
         if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
             continue;
-        gain4 = vdupq_n_f32(gain);
-        for(;BufferSize-pos > 3;pos += 4)
+        if(LIKELY(BufferSize-pos > 3))
         {
-            const float32x4_t val4 = vld1q_f32(&data[pos]);
-            float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
-            dry4 = vmlaq_f32(dry4, val4, gain4);
-            vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
+            ALsizei todo = (BufferSize-pos) >> 2;
+            const float32x4_t gain4 = vdupq_n_f32(gain);
+            do {
+                const float32x4_t val4 = vld1q_f32(&data[pos]);
+                float32x4_t dry4 = vld1q_f32(&OutBuffer[c][OutPos+pos]);
+                dry4 = vmlaq_f32(dry4, val4, gain4);
+                vst1q_f32(&OutBuffer[c][OutPos+pos], dry4);
+                pos += 4;
+            } while(--todo);
         }
         for(;pos < BufferSize;pos++)
             OutBuffer[c][OutPos+pos] += data[pos]*gain;
diff --git a/Alc/mixer/mixer_sse.c b/Alc/mixer/mixer_sse.c
index a178477f..fa79eb4d 100644
--- a/Alc/mixer/mixer_sse.c
+++ b/Alc/mixer/mixer_sse.c
@@ -135,55 +135,57 @@ void Mix_SSE(const ALfloat *data, ALsizei OutChans, ALfloat (*restrict OutBuffer
              ALfloat *CurrentGains, const ALfloat *TargetGains, ALsizei Counter, ALsizei OutPos,
              ALsizei BufferSize)
 {
-    ALfloat gain, delta, step;
-    __m128 gain4;
+    const ALfloat delta = (Counter > 0) ? 1.0f/(ALfloat)Counter : 0.0f;
     ALsizei c;
 
     ASSUME(OutChans > 0);
     ASSUME(BufferSize > 0);
-    delta = (Counter > 0) ? 1.0f/(ALfloat)Counter : 0.0f;
 
     for(c = 0;c < OutChans;c++)
     {
         ALsizei pos = 0;
-        gain = CurrentGains[c];
-        step = (TargetGains[c] - gain) * delta;
+        ALfloat gain = CurrentGains[c];
+        const ALfloat step = (TargetGains[c] - gain) * delta;
+
         if(fabsf(step) > FLT_EPSILON)
         {
             ALsizei minsize = mini(BufferSize, Counter);
+            ALfloat step_count = 0.0f;
             /* Mix with applying gain steps in aligned multiples of 4. */
-            if(minsize-pos > 3)
+            if(LIKELY(minsize > 3))
             {
-                __m128 step4;
-                gain4 = _mm_setr_ps(
-                    gain,
-                    gain + step,
-                    gain + step + step,
-                    gain + step + step + step
-                );
-                step4 = _mm_set1_ps(step + step + step + step);
+                const __m128 four4 = _mm_set1_ps(4.0f);
+                const __m128 step4 = _mm_set1_ps(step);
+                const __m128 gain4 = _mm_set1_ps(gain);
+                __m128 step_count4 = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
+                ALsizei todo = minsize >> 2;
                 do {
                     const __m128 val4 = _mm_load_ps(&data[pos]);
                     __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
-                    dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
-                    gain4 = _mm_add_ps(gain4, step4);
+#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
+                    /* dry += val * (gain + step*step_count) */
+                    dry4 = MLA4(dry4, val4, MLA4(gain4, step4, step_count4));
+#undef MLA4
                     _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
+                    step_count4 = _mm_add_ps(step_count4, four4);
                     pos += 4;
-                } while(minsize-pos > 3);
-                /* NOTE: gain4 now represents the next four gains after the
-                 * last four mixed samples, so the lowest element represents
-                 * the next gain to apply.
+                } while(--todo);
+                /* NOTE: step_count4 now represents the next four counts after
+                 * the last four mixed samples, so the lowest element
+                 * represents the next step count to apply.
                  */
-                gain = _mm_cvtss_f32(gain4);
+                step_count = _mm_cvtss_f32(step_count4);
             }
             /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
             for(;pos < minsize;pos++)
             {
-                OutBuffer[c][OutPos+pos] += data[pos]*gain;
-                gain += step;
+                OutBuffer[c][OutPos+pos] += data[pos]*(gain + step*step_count);
+                step_count += 1.0f;
             }
             if(pos == Counter)
                 gain = TargetGains[c];
+            else
+                gain += step*step_count;
             CurrentGains[c] = gain;
 
             /* Mix until pos is aligned with 4 or the mix is done. */
@@ -194,13 +196,17 @@ void Mix_SSE(const ALfloat *data, ALsizei OutChans, ALfloat (*restrict OutBuffer
 
         if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD))
             continue;
-        gain4 = _mm_set1_ps(gain);
-        for(;BufferSize-pos > 3;pos += 4)
+        if(LIKELY(BufferSize-pos > 3))
         {
-            const __m128 val4 = _mm_load_ps(&data[pos]);
-            __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
-            dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
-            _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
+            ALsizei todo = (BufferSize-pos) >> 2;
+            const __m128 gain4 = _mm_set1_ps(gain);
+            do {
+                const __m128 val4 = _mm_load_ps(&data[pos]);
+                __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]);
+                dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
+                _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4);
+                pos += 4;
+            } while(--todo);
         }
         for(;pos < BufferSize;pos++)
             OutBuffer[c][OutPos+pos] += data[pos]*gain;