Implement an SSE cubic resampler

author: Chris Robinson <[email protected]> 2012-09-14 07:01:58 -0700
committer: Chris Robinson <[email protected]> 2012-09-14 07:01:58 -0700
commit: 28086f6cb7f0e5c35a97ba60c8e28969981a8817 (patch)
tree: e403589004fd2fcbfd92698dda343913e938bd55 /Alc
parent: 45bb010b2801fefb0e154cc6d18c2f7d60d302db (diff)
3 files changed, 92 insertions, 1 deletions
diff --git a/Alc/ALu.c b/Alc/ALu.c
index 3ab723dd..b94216ff 100644
--- a/Alc/ALu.c
+++ b/Alc/ALu.c
@@ -66,13 +66,17 @@ static ResamplerFunc SelectResampler(enum Resampler Resampler, ALuint increment)
 #endif
             return Resample_lerp32_C;
         case CubicResampler:
+#ifdef HAVE_SSE
+            if((CPUCapFlags&CPU_CAP_SSE))
+                return Resample_cubic32_SSE;
+#endif
             return Resample_cubic32_C;
         case ResamplerMax:
             /* Shouldn't happen */
             break;
     }
 
-    return NULL;
+    return Resample_point32_C;
 }
 
 
diff --git a/Alc/mixer_defs.h b/Alc/mixer_defs.h
index 91ae24d6..f9dbadbb 100644
--- a/Alc/mixer_defs.h
+++ b/Alc/mixer_defs.h
@@ -17,6 +17,7 @@ void Resample_cubic32_C(const ALfloat *src, ALuint frac, ALuint increment, ALuin
 
 /* SSE resamplers */
 void Resample_lerp32_SSE(const ALfloat *src, ALuint frac, ALuint increment, ALuint NumChannels, ALfloat *RESTRICT dst, ALuint dstlen);
+void Resample_cubic32_SSE(const ALfloat *src, ALuint frac, ALuint increment, ALuint NumChannels, ALfloat *RESTRICT dst, ALuint dstlen);
 
 
 /* C mixers */
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c
index aff0152b..84884de3 100644
--- a/Alc/mixer_sse.c
+++ b/Alc/mixer_sse.c
@@ -58,6 +58,92 @@ void Resample_lerp32_SSE(const ALfloat *data, ALuint frac,
     }
 }
 
+/* Computes the four weighted terms of one cubic-interpolated sample.
+ *
+ * vals points at the "current" input sample, step is the distance (in
+ * floats) between consecutive samples of the same channel, and frac is the
+ * fixed-point fractional position between the current and next sample.
+ *
+ * The samples read are vals[-step], vals[0], vals[step] and vals[2*step].
+ * The horizontal sum of the returned vector's four lanes is the interpolated
+ * sample; the caller performs that sum.
+ */
+static __inline __m128 CubicTerms_SSE(const ALfloat *vals, ALuint step, ALuint frac)
+{
+    /* Row k holds the contribution of input sample k (previous, current,
+     * next, next+1) to the polynomial coefficients { a0, a1, a2, a3 }, where
+     * the interpolated value is a0*mu^3 + a1*mu^2 + a2*mu + a3. */
+    static const __m128 matrix[4] = {
+        { -0.5f,  1.0f, -0.5f,  0.0f },
+        {  1.5f, -2.5f,  0.0f,  1.0f },
+        { -1.5f,  2.0f,  0.5f,  0.0f },
+        {  0.5f, -0.5f,  0.0f,  0.0f },
+    };
+    const ALfloat mu = frac * (1.0f/FRACTIONONE);
+    /* _mm_set_ps fills from the highest lane down, so the lanes end up as
+     * { mu^3, mu^2, mu^1, mu^0 }, matching { a0, a1, a2, a3 }. */
+    const __m128 s = _mm_set_ps(1.0f, mu, mu*mu, mu*mu*mu);
+    __m128 coeffs;
+
+    /* coeffs = matrix * samples. Each sample has to be broadcast across all
+     * four lanes before scaling its row; an element-wise product with an
+     * unbroadcast sample vector would only scale the samples by the sum of
+     * the rows. The previous sample is addressed by pointer subtraction so
+     * the unsigned step never wraps into a huge positive index. */
+    coeffs =                    _mm_mul_ps(_mm_set1_ps(*(vals - step)), matrix[0]) ;
+    coeffs = _mm_add_ps(coeffs, _mm_mul_ps(_mm_set1_ps(vals[0]),        matrix[1]));
+    coeffs = _mm_add_ps(coeffs, _mm_mul_ps(_mm_set1_ps(vals[step]),     matrix[2]));
+    coeffs = _mm_add_ps(coeffs, _mm_mul_ps(_mm_set1_ps(vals[step*2]),   matrix[3]));
+
+    return _mm_mul_ps(coeffs, s);
+}
+
+/* SSE cubic resampler.
+ *
+ * Reads one channel of interleaved input from data (NumChannels floats per
+ * frame), starting at fractional offset frac and advancing by increment
+ * (both fixed-point with FRACTIONBITS fractional bits) per output sample, and
+ * writes BufferSize+1 samples to OutBuffer.
+ *
+ * The first output reads the frame before data[0] and the two frames after
+ * it, so data must be readable one frame before and two frames beyond every
+ * position visited (presumably the caller pads the source; confirm).
+ *
+ * NOTE(review): the vector path uses an aligned store, so OutBuffer is
+ * assumed to be 16-byte aligned; confirm against the callers.
+ */
+void Resample_cubic32_SSE(const ALfloat *data, ALuint frac,
+  ALuint increment, ALuint NumChannels, ALfloat *RESTRICT OutBuffer,
+  ALuint BufferSize)
+{
+    const ALuint todo = BufferSize+1;
+    ALIGN(16) float value[4];
+    ALuint pos = 0;
+    ALuint i, j;
+
+    /* Four samples per iteration. The bound is written as i+3 < todo rather
+     * than i < todo-3 so that it cannot underflow when fewer than four
+     * samples are requested. */
+    for(i = 0;i+3 < todo;i+=4)
+    {
+        __m128 result, final[4];
+
+        for(j = 0;j < 4;j++)
+        {
+            final[j] = CubicTerms_SSE(data + pos*NumChannels, NumChannels, frac);
+
+            frac += increment;
+            pos  += frac>>FRACTIONBITS;
+            frac &= FRACTIONMASK;
+        }
+        /* Transpose the final "matrix" so adding the rows will give the four
+         * samples. TODO: Is this faster than doing..
+         * _mm_store_ps(value, final[j]);
+         * OutBuffer[i+j] = value[0] + value[1] + value[2] + value[3];
+         * ..for each sample?
+         */
+        _MM_TRANSPOSE4_PS(final[0], final[1], final[2], final[3]);
+        result = _mm_add_ps(_mm_add_ps(final[0], final[1]),
+                            _mm_add_ps(final[2], final[3]));
+
+        _mm_store_ps(&OutBuffer[i], result);
+    }
+    /* Remaining 0 to 3 samples, one at a time. */
+    for(;i < todo;i++)
+    {
+        _mm_store_ps(value, CubicTerms_SSE(data + pos*NumChannels, NumChannels, frac));
+
+        OutBuffer[i] = value[0] + value[1] + value[2] + value[3];
+
+        frac += increment;
+        pos  += frac>>FRACTIONBITS;
+        frac &= FRACTIONMASK;
+    }
+}
 
 
 static __inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*RESTRICT Values)[2],
author	Chris Robinson <[email protected]>	2012-09-14 07:01:58 -0700
committer	Chris Robinson <[email protected]>	2012-09-14 07:01:58 -0700
commit	28086f6cb7f0e5c35a97ba60c8e28969981a8817 (patch)
tree	e403589004fd2fcbfd92698dda343913e938bd55 /Alc
parent	45bb010b2801fefb0e154cc6d18c2f7d60d302db (diff)