Implement a "fast" bsinc path

This takes advantage of the fact that when increment <= 1 (when not down-sampling), the scale factor is always 0. As a result, the scale and scale-phase deltas never contribute to the filtered output. Removing those multiply+add operations cuts half of the work done by the inner loop. Sounds that do need to down-sample (when played with a high pitch, or when they are 48kHz on a 44.1kHz output, for example) still go through the normal bsinc process.
author: Chris Robinson <[email protected]> 2019-09-26 19:24:29 -0700
committer: Chris Robinson <[email protected]> 2019-09-26 19:24:29 -0700
commit: 9b64e5e0db2a778313bb873bb38f481ce40efe31 (patch)
tree: 4ac057e992a8720abf57e1ce548d4bda1a750f42
parent: d50ca464cd5e4f07bc399fd244578b0b34d72aef (diff)
8 files changed, 135 insertions, 6 deletions
diff --git a/alc/alu.cpp b/alc/alu.cpp
index 8affbde4..86d2789e 100644
--- a/alc/alu.cpp
+++ b/alc/alu.cpp
@@ -951,7 +951,7 @@ void CalcNonAttnSourceParams(ALvoice *voice, const ALvoicePropsBase *props, cons
         BsincPrepare(voice->mStep, &voice->mResampleState.bsinc, &bsinc24);
     else if(props->mResampler == Resampler::BSinc12)
         BsincPrepare(voice->mStep, &voice->mResampleState.bsinc, &bsinc12);
-    voice->mResampler = SelectResampler(props->mResampler);
+    voice->mResampler = SelectResampler(props->mResampler, voice->mStep);
 
     /* Calculate gains */
     const ALlistener &Listener = ALContext->mListener;
@@ -1281,7 +1281,7 @@ void CalcAttnSourceParams(ALvoice *voice, const ALvoicePropsBase *props, const A
         BsincPrepare(voice->mStep, &voice->mResampleState.bsinc, &bsinc24);
     else if(props->mResampler == Resampler::BSinc12)
         BsincPrepare(voice->mStep, &voice->mResampleState.bsinc, &bsinc12);
-    voice->mResampler = SelectResampler(props->mResampler);
+    voice->mResampler = SelectResampler(props->mResampler, voice->mStep);
 
     ALfloat spread{0.0f};
     if(props->Radius > Distance)
diff --git a/alc/alu.h b/alc/alu.h
index aa698f7d..e9858ddb 100644
--- a/alc/alu.h
+++ b/alc/alu.h
@@ -370,7 +370,7 @@ void aluInit(void);
 
 void aluInitMixer(void);
 
-ResamplerFunc SelectResampler(Resampler resampler);
+ResamplerFunc SelectResampler(Resampler resampler, ALuint increment);
 
 /* aluInitRenderer
  *
diff --git a/alc/converter.cpp b/alc/converter.cpp
index 6622a997..b0efb839 100644
--- a/alc/converter.cpp
+++ b/alc/converter.cpp
@@ -171,7 +171,7 @@ SampleConverterPtr CreateSampleConverter(DevFmtType srcType, DevFmtType dstType,
             BsincPrepare(converter->mIncrement, &converter->mState.bsinc, &bsinc24);
         else if(resampler == Resampler::BSinc12)
             BsincPrepare(converter->mIncrement, &converter->mState.bsinc, &bsinc12);
-        converter->mResample = SelectResampler(resampler);
+        converter->mResample = SelectResampler(resampler, converter->mIncrement);
     }
 
     return converter;
diff --git a/alc/mixer/defs.h b/alc/mixer/defs.h
index 62e7d3ba..ce572973 100644
--- a/alc/mixer/defs.h
+++ b/alc/mixer/defs.h
@@ -23,7 +23,8 @@ enum ResampleType {
     PointTag,
     LerpTag,
     CubicTag,
-    BSincTag
+    BSincTag,
+    FastBSincTag
 };
 
 template<ResampleType TypeTag, InstSetType InstTag>
diff --git a/alc/mixer/mixer_c.cpp b/alc/mixer/mixer_c.cpp
index 42d515ae..fafda70d 100644
--- a/alc/mixer/mixer_c.cpp
+++ b/alc/mixer/mixer_c.cpp
@@ -41,6 +41,26 @@ inline ALfloat do_bsinc(const InterpState &istate, const ALfloat *RESTRICT vals,
         r += (fil[j_f] + istate.bsinc.sf*scd[j_f] + pf*(phd[j_f] + istate.bsinc.sf*spd[j_f])) * vals[j_f];
     return r;
 }
+inline ALfloat do_fastbsinc(const InterpState &istate, const ALfloat *RESTRICT vals, const ALuint frac)
+{
+    const size_t m{istate.bsinc.m};
+
+    // Calculate the phase index and factor.
+#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
+    const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
+    const ALfloat pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
+        (1.0f/(1<<FRAC_PHASE_BITDIFF))};
+#undef FRAC_PHASE_BITDIFF
+
+    const ALfloat *fil{istate.bsinc.filter + m*pi*4};
+    const ALfloat *phd{fil + m*2};
+
+    // Apply the phase interpolated filter.
+    ALfloat r{0.0f};
+    for(size_t j_f{0};j_f < m;j_f++)
+        r += (fil[j_f] + pf*phd[j_f]) * vals[j_f];
+    return r;
+}
 
 using SamplerT = ALfloat(const InterpState&, const ALfloat*RESTRICT, const ALuint);
 template<SamplerT &Sampler>
@@ -98,6 +118,11 @@ const ALfloat *Resample_<BSincTag,CTag>(const InterpState *state, const ALfloat
     ALuint frac, ALuint increment, const al::span<float> dst)
 { return DoResample<do_bsinc>(state, src-state->bsinc.l, frac, increment, dst); }
 
+template<>
+const ALfloat *Resample_<FastBSincTag,CTag>(const InterpState *state, const ALfloat *RESTRICT src,
+    ALuint frac, ALuint increment, const al::span<float> dst)
+{ return DoResample<do_fastbsinc>(state, src-state->bsinc.l, frac, increment, dst); }
+
 
 static inline void ApplyCoeffs(size_t /*Offset*/, float2 *RESTRICT Values, const ALuint IrSize,
     const HrirArray &Coeffs, const ALfloat left, const ALfloat right)
diff --git a/alc/mixer/mixer_neon.cpp b/alc/mixer/mixer_neon.cpp
index cd2b0ebc..178c7d6e 100644
--- a/alc/mixer/mixer_neon.cpp
+++ b/alc/mixer/mixer_neon.cpp
@@ -118,6 +118,50 @@ const ALfloat *Resample_<BSincTag,NEONTag>(const InterpState *state, const ALflo
     return dst.begin();
 }
 
+template<>
+const ALfloat *Resample_<FastBSincTag,NEONTag>(const InterpState *state,
+    const ALfloat *RESTRICT src, ALuint frac, ALuint increment, const al::span<float> dst)
+{
+    const ALfloat *const filter{state->bsinc.filter};
+    const size_t m{state->bsinc.m};
+
+    src -= state->bsinc.l;
+    for(float &out_sample : dst)
+    {
+        // Calculate the phase index and factor.
+#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
+        const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
+        const ALfloat pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
+            (1.0f/(1<<FRAC_PHASE_BITDIFF))};
+#undef FRAC_PHASE_BITDIFF
+
+        // Apply the phase interpolated filter.
+        float32x4_t r4{vdupq_n_f32(0.0f)};
+        {
+            const float32x4_t pf4{vdupq_n_f32(pf)};
+            const float *fil{filter + m*pi*4};
+            const float *phd{fil + m*2};
+            size_t td{m >> 2};
+            size_t j{0u};
+
+            do {
+                /* f = fil + pf*phd */
+                const float32x4_t f4 = vmlaq_f32(vld1q_f32(fil), pf4, vld1q_f32(phd));
+                /* r += f*src */
+                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
+                fil += 4; phd += 4; j += 4;
+            } while(--td);
+        }
+        r4 = vaddq_f32(r4, vrev64q_f32(r4));
+        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);
+
+        frac += increment;
+        src  += frac>>FRACTIONBITS;
+        frac &= FRACTIONMASK;
+    }
+    return dst.begin();
+}
+
 
 static inline void ApplyCoeffs(size_t /*Offset*/, float2 *RESTRICT Values, const ALuint IrSize,
     const HrirArray &Coeffs, const ALfloat left, const ALfloat right)
diff --git a/alc/mixer/mixer_sse.cpp b/alc/mixer/mixer_sse.cpp
index 9bb3bb8a..002d6064 100644
--- a/alc/mixer/mixer_sse.cpp
+++ b/alc/mixer/mixer_sse.cpp
@@ -66,6 +66,53 @@ const ALfloat *Resample_<BSincTag,SSETag>(const InterpState *state, const ALfloa
     return dst.begin();
 }
 
+template<>
+const ALfloat *Resample_<FastBSincTag,SSETag>(const InterpState *state,
+    const ALfloat *RESTRICT src, ALuint frac, ALuint increment, const al::span<float> dst)
+{
+    const ALfloat *const filter{state->bsinc.filter};
+    const size_t m{state->bsinc.m};
+
+    src -= state->bsinc.l;
+    for(float &out_sample : dst)
+    {
+        // Calculate the phase index and factor.
+#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
+        const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
+        const ALfloat pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
+            (1.0f/(1<<FRAC_PHASE_BITDIFF))};
+#undef FRAC_PHASE_BITDIFF
+
+        // Apply the phase interpolated filter.
+        __m128 r4{_mm_setzero_ps()};
+        {
+            const __m128 pf4{_mm_set1_ps(pf)};
+            const float *fil{filter + m*pi*4};
+            const float *phd{fil + m*2};
+            size_t td{m >> 2};
+            size_t j{0u};
+
+#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
+            do {
+                /* f = fil + pf*phd */
+                const __m128 f4 = MLA4(_mm_load_ps(fil), pf4, _mm_load_ps(phd));
+                /* r += f*src */
+                r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
+                fil += 4; phd += 4; j += 4;
+            } while(--td);
+#undef MLA4
+        }
+        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
+        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
+        out_sample = _mm_cvtss_f32(r4);
+
+        frac += increment;
+        src  += frac>>FRACTIONBITS;
+        frac &= FRACTIONMASK;
+    }
+    return dst.begin();
+}
+
 
 static inline void ApplyCoeffs(size_t Offset, float2 *RESTRICT Values, const ALuint IrSize,
     const HrirArray &Coeffs, const ALfloat left, const ALfloat right)
diff --git a/alc/mixvoice.cpp b/alc/mixvoice.cpp
index bfb3b663..58a08ce3 100644
--- a/alc/mixvoice.cpp
+++ b/alc/mixvoice.cpp
@@ -135,7 +135,7 @@ inline HrtfMixerBlendFunc SelectHrtfBlendMixer()
 } // namespace
 
 
-ResamplerFunc SelectResampler(Resampler resampler)
+ResamplerFunc SelectResampler(Resampler resampler, ALuint increment)
 {
     switch(resampler)
     {
@@ -159,6 +159,18 @@ ResamplerFunc SelectResampler(Resampler resampler)
             return Resample_<CubicTag,CTag>;
         case Resampler::BSinc12:
         case Resampler::BSinc24:
+            if(increment <= FRACTIONONE)
+            {
+#ifdef HAVE_NEON
+                if((CPUCapFlags&CPU_CAP_NEON))
+                    return Resample_<FastBSincTag,NEONTag>;
+#endif
+#ifdef HAVE_SSE
+                if((CPUCapFlags&CPU_CAP_SSE))
+                    return Resample_<FastBSincTag,SSETag>;
+#endif
+                return Resample_<FastBSincTag,CTag>;
+            }
 #ifdef HAVE_NEON
             if((CPUCapFlags&CPU_CAP_NEON))
                 return Resample_<BSincTag,NEONTag>;
author	Chris Robinson <[email protected]>	2019-09-26 19:24:29 -0700
committer	Chris Robinson <[email protected]>	2019-09-26 19:24:29 -0700
commit	9b64e5e0db2a778313bb873bb38f481ce40efe31 (patch)
tree	4ac057e992a8720abf57e1ce548d4bda1a750f42
parent	d50ca464cd5e4f07bc399fd244578b0b34d72aef (diff)