author     Chris Robinson <[email protected]>  2019-09-26 19:24:29 -0700
committer  Chris Robinson <[email protected]>  2019-09-26 19:24:29 -0700
commit     9b64e5e0db2a778313bb873bb38f481ce40efe31 (patch)
tree       4ac057e992a8720abf57e1ce548d4bda1a750f42 /alc/mixer/mixer_sse.cpp
parent     d50ca464cd5e4f07bc399fd244578b0b34d72aef (diff)
Implement a "fast" bsinc path
This takes advantage of the fact that when increment <= 1 (when not down-sampling), the scale factor is always 0. As a result, the scale and scale-phase deltas never contribute to the filtered output, and removing those multiply+add operations cuts half of the work done by the inner loop. Sounds that do need to down-sample (when played with a high pitch, or when a 48kHz sound plays on 44.1kHz output, for example) still go through the normal bsinc process.
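For illustration, here is a minimal scalar sketch of the per-sample filtering in both paths. The function names are hypothetical, and the sf scale factor plus the scd/spd delta tables come from the normal bsinc path rather than from this diff; the pointer layout (fil, scd, phd, spd) mirrors the arithmetic in the SSE code below. This is a sketch of the idea, not the shipped implementation.

    /* Illustrative sketch only: m = filter length, sf = scale factor,
     * pf = phase factor.  Names are chosen to match the SSE code's pointers. */
    float bsinc_point_normal(const float *fil, const float *scd, const float *phd,
        const float *spd, const float *src, size_t m, float sf, float pf)
    {
        float r{0.0f};
        for(size_t j{0};j < m;++j)
        {
            /* Scale, phase, and scale-phase deltas all contribute. */
            const float f{fil[j] + sf*scd[j] + pf*(phd[j] + sf*spd[j])};
            r += f*src[j];
        }
        return r;
    }

    float bsinc_point_fast(const float *fil, const float *phd, const float *src,
        size_t m, float pf)
    {
        /* increment <= 1 implies sf == 0, so the scd/spd terms above vanish
         * and only the phase interpolation remains. */
        float r{0.0f};
        for(size_t j{0};j < m;++j)
            r += (fil[j] + pf*phd[j]) * src[j];
        return r;
    }

With sf == 0 the scd and spd terms drop out entirely, which is the half of the per-tap multiply+add work the fast path avoids.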
Diffstat (limited to 'alc/mixer/mixer_sse.cpp')
-rw-r--r--  alc/mixer/mixer_sse.cpp | 47
1 file changed, 47 insertions(+), 0 deletions(-)
diff --git a/alc/mixer/mixer_sse.cpp b/alc/mixer/mixer_sse.cpp
index 9bb3bb8a..002d6064 100644
--- a/alc/mixer/mixer_sse.cpp
+++ b/alc/mixer/mixer_sse.cpp
@@ -66,6 +66,53 @@ const ALfloat *Resample_<BSincTag,SSETag>(const InterpState *state, const ALfloa
return dst.begin();
}
+template<>
+const ALfloat *Resample_<FastBSincTag,SSETag>(const InterpState *state,
+    const ALfloat *RESTRICT src, ALuint frac, ALuint increment, const al::span<float> dst)
+{
+    const ALfloat *const filter{state->bsinc.filter};
+    const size_t m{state->bsinc.m};
+
+    src -= state->bsinc.l;
+    for(float &out_sample : dst)
+    {
+        // Calculate the phase index and factor.
+#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS)
+        const ALuint pi{frac >> FRAC_PHASE_BITDIFF};
+        const ALfloat pf{static_cast<float>(frac & ((1<<FRAC_PHASE_BITDIFF)-1)) *
+            (1.0f/(1<<FRAC_PHASE_BITDIFF))};
+#undef FRAC_PHASE_BITDIFF
+
+        // Apply the phase interpolated filter.
+        __m128 r4{_mm_setzero_ps()};
+        {
+            const __m128 pf4{_mm_set1_ps(pf)};
+            const float *fil{filter + m*pi*4};
+            const float *phd{fil + m*2};
+            size_t td{m >> 2};
+            size_t j{0u};
+
+#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
+            do {
+                /* f = fil + pf*phd */
+                const __m128 f4 = MLA4(_mm_load_ps(fil), pf4, _mm_load_ps(phd));
+                /* r += f*src */
+                r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
+                fil += 4; phd += 4; j += 4;
+            } while(--td);
+#undef MLA4
+        }
+        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
+        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
+        out_sample = _mm_cvtss_f32(r4);
+
+        frac += increment;
+        src += frac>>FRACTIONBITS;
+        frac &= FRACTIONMASK;
+    }
+    return dst.begin();
+}
+
static inline void ApplyCoeffs(size_t Offset, float2 *RESTRICT Values, const ALuint IrSize,
const HrirArray &Coeffs, const ALfloat left, const ALfloat right)
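A side note on the reduction at the end of the new loop: the _mm_shuffle_ps/_mm_movehl_ps pair collapses the four partial sums held in r4 into lane 0, which _mm_cvtss_f32 then extracts. A rough scalar equivalent (hypothetical helper, not part of the commit) would be:

    #include <xmmintrin.h>

    /* Sketch only: sums the four lanes of an __m128 the slow way. */
    static float horizontal_sum(__m128 v)
    {
        alignas(16) float lanes[4];
        _mm_store_ps(lanes, v);
        return lanes[0] + lanes[1] + lanes[2] + lanes[3];
    }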