diff options
author | Sven Gothel <[email protected]> | 2019-04-07 23:39:04 +0200 |
---|---|---|
committer | Sven Gothel <[email protected]> | 2019-04-07 23:39:04 +0200 |
commit | 73233ce69919fc19c53ce8663c5b8cc05227f07e (patch) | |
tree | f2b6ccc1a14d7c387f33398a44ea4511d7ecb212 /Alc/mixer_sse.c | |
parent | 8efa4c7ba5ee8eb399d31a9884e45f743d4625ad (diff) | |
parent | 99a55c445211fea77af6ab61cbc6a6ec4fbdc9b9 (diff) |
Merge branch 'v1.19' of git://repo.or.cz/openal-soft into v1.19v1.19
Diffstat (limited to 'Alc/mixer_sse.c')
-rw-r--r-- | Alc/mixer_sse.c | 279 |
1 files changed, 0 insertions, 279 deletions
diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c deleted file mode 100644 index 090b7a5a..00000000 --- a/Alc/mixer_sse.c +++ /dev/null @@ -1,279 +0,0 @@ -#include "config.h" - -#include <xmmintrin.h> - -#include "AL/al.h" -#include "AL/alc.h" -#include "alMain.h" -#include "alu.h" - -#include "alSource.h" -#include "alAuxEffectSlot.h" -#include "mixer_defs.h" - - -const ALfloat *Resample_bsinc32_SSE(const BsincState *state, const ALfloat *src, ALuint frac, - ALuint increment, ALfloat *restrict dst, ALuint dstlen) -{ - const __m128 sf4 = _mm_set1_ps(state->sf); - const ALuint m = state->m; - const ALint l = state->l; - const ALfloat *fil, *scd, *phd, *spd; - ALuint pi, j_f, i; - ALfloat pf; - ALint j_s; - __m128 r4; - - for(i = 0;i < dstlen;i++) - { - // Calculate the phase index and factor. -#define FRAC_PHASE_BITDIFF (FRACTIONBITS-BSINC_PHASE_BITS) - pi = frac >> FRAC_PHASE_BITDIFF; - pf = (frac & ((1<<FRAC_PHASE_BITDIFF)-1)) * (1.0f/(1<<FRAC_PHASE_BITDIFF)); -#undef FRAC_PHASE_BITDIFF - - fil = state->coeffs[pi].filter; - scd = state->coeffs[pi].scDelta; - phd = state->coeffs[pi].phDelta; - spd = state->coeffs[pi].spDelta; - - // Apply the scale and phase interpolated filter. - r4 = _mm_setzero_ps(); - { - const __m128 pf4 = _mm_set1_ps(pf); - for(j_f = 0,j_s = l;j_f < m;j_f+=4,j_s+=4) - { - const __m128 f4 = _mm_add_ps( - _mm_add_ps( - _mm_load_ps(&fil[j_f]), - _mm_mul_ps(sf4, _mm_load_ps(&scd[j_f])) - ), - _mm_mul_ps( - pf4, - _mm_add_ps( - _mm_load_ps(&phd[j_f]), - _mm_mul_ps(sf4, _mm_load_ps(&spd[j_f])) - ) - ) - ); - r4 = _mm_add_ps(r4, _mm_mul_ps(f4, _mm_loadu_ps(&src[j_s]))); - } - } - r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3))); - r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); - dst[i] = _mm_cvtss_f32(r4); - - frac += increment; - src += frac>>FRACTIONBITS; - frac &= FRACTIONMASK; - } - return dst; -} - - -static inline void SetupCoeffs(ALfloat (*restrict OutCoeffs)[2], - const HrtfParams *hrtfparams, - ALuint IrSize, ALuint Counter) -{ - const __m128 counter4 = _mm_set1_ps((float)Counter); - __m128 coeffs, step4; - ALuint i; - - for(i = 0;i < IrSize;i += 2) - { - step4 = _mm_load_ps(&hrtfparams->CoeffStep[i][0]); - coeffs = _mm_load_ps(&hrtfparams->Coeffs[i][0]); - coeffs = _mm_sub_ps(coeffs, _mm_mul_ps(step4, counter4)); - _mm_store_ps(&OutCoeffs[i][0], coeffs); - } -} - -static inline void ApplyCoeffsStep(ALuint Offset, ALfloat (*restrict Values)[2], - const ALuint IrSize, - ALfloat (*restrict Coeffs)[2], - const ALfloat (*restrict CoeffStep)[2], - ALfloat left, ALfloat right) -{ - const __m128 lrlr = _mm_setr_ps(left, right, left, right); - __m128 coeffs, deltas, imp0, imp1; - __m128 vals = _mm_setzero_ps(); - ALuint i; - - if((Offset&1)) - { - const ALuint o0 = Offset&HRIR_MASK; - const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK; - - coeffs = _mm_load_ps(&Coeffs[0][0]); - deltas = _mm_load_ps(&CoeffStep[0][0]); - vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]); - imp0 = _mm_mul_ps(lrlr, coeffs); - coeffs = _mm_add_ps(coeffs, deltas); - vals = _mm_add_ps(imp0, vals); - _mm_store_ps(&Coeffs[0][0], coeffs); - _mm_storel_pi((__m64*)&Values[o0][0], vals); - for(i = 1;i < IrSize-1;i += 2) - { - const ALuint o2 = (Offset+i)&HRIR_MASK; - - coeffs = _mm_load_ps(&Coeffs[i+1][0]); - deltas = _mm_load_ps(&CoeffStep[i+1][0]); - vals = _mm_load_ps(&Values[o2][0]); - imp1 = _mm_mul_ps(lrlr, coeffs); - coeffs = _mm_add_ps(coeffs, deltas); - imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2)); - vals = _mm_add_ps(imp0, vals); - _mm_store_ps(&Coeffs[i+1][0], coeffs); - _mm_store_ps(&Values[o2][0], vals); - imp0 = imp1; - } - vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]); - imp0 = _mm_movehl_ps(imp0, imp0); - vals = _mm_add_ps(imp0, vals); - _mm_storel_pi((__m64*)&Values[o1][0], vals); - } - else - { - for(i = 0;i < IrSize;i += 2) - { - const ALuint o = (Offset + i)&HRIR_MASK; - - coeffs = _mm_load_ps(&Coeffs[i][0]); - deltas = _mm_load_ps(&CoeffStep[i][0]); - vals = _mm_load_ps(&Values[o][0]); - imp0 = _mm_mul_ps(lrlr, coeffs); - coeffs = _mm_add_ps(coeffs, deltas); - vals = _mm_add_ps(imp0, vals); - _mm_store_ps(&Coeffs[i][0], coeffs); - _mm_store_ps(&Values[o][0], vals); - } - } -} - -static inline void ApplyCoeffs(ALuint Offset, ALfloat (*restrict Values)[2], - const ALuint IrSize, - ALfloat (*restrict Coeffs)[2], - ALfloat left, ALfloat right) -{ - const __m128 lrlr = _mm_setr_ps(left, right, left, right); - __m128 vals = _mm_setzero_ps(); - __m128 coeffs; - ALuint i; - - if((Offset&1)) - { - const ALuint o0 = Offset&HRIR_MASK; - const ALuint o1 = (Offset+IrSize-1)&HRIR_MASK; - __m128 imp0, imp1; - - coeffs = _mm_load_ps(&Coeffs[0][0]); - vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]); - imp0 = _mm_mul_ps(lrlr, coeffs); - vals = _mm_add_ps(imp0, vals); - _mm_storel_pi((__m64*)&Values[o0][0], vals); - for(i = 1;i < IrSize-1;i += 2) - { - const ALuint o2 = (Offset+i)&HRIR_MASK; - - coeffs = _mm_load_ps(&Coeffs[i+1][0]); - vals = _mm_load_ps(&Values[o2][0]); - imp1 = _mm_mul_ps(lrlr, coeffs); - imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2)); - vals = _mm_add_ps(imp0, vals); - _mm_store_ps(&Values[o2][0], vals); - imp0 = imp1; - } - vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]); - imp0 = _mm_movehl_ps(imp0, imp0); - vals = _mm_add_ps(imp0, vals); - _mm_storel_pi((__m64*)&Values[o1][0], vals); - } - else - { - for(i = 0;i < IrSize;i += 2) - { - const ALuint o = (Offset + i)&HRIR_MASK; - - coeffs = _mm_load_ps(&Coeffs[i][0]); - vals = _mm_load_ps(&Values[o][0]); - vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs)); - _mm_store_ps(&Values[o][0], vals); - } - } -} - -#define MixHrtf MixHrtf_SSE -#include "mixer_inc.c" -#undef MixHrtf - - -void Mix_SSE(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE], - MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize) -{ - ALfloat gain, step; - __m128 gain4; - ALuint c; - - for(c = 0;c < OutChans;c++) - { - ALuint pos = 0; - gain = Gains[c].Current; - step = Gains[c].Step; - if(step != 0.0f && Counter > 0) - { - ALuint minsize = minu(BufferSize, Counter); - /* Mix with applying gain steps in aligned multiples of 4. */ - if(minsize-pos > 3) - { - __m128 step4; - gain4 = _mm_setr_ps( - gain, - gain + step, - gain + step + step, - gain + step + step + step - ); - step4 = _mm_set1_ps(step + step + step + step); - do { - const __m128 val4 = _mm_load_ps(&data[pos]); - __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]); - dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4)); - gain4 = _mm_add_ps(gain4, step4); - _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4); - pos += 4; - } while(minsize-pos > 3); - /* NOTE: gain4 now represents the next four gains after the - * last four mixed samples, so the lowest element represents - * the next gain to apply. - */ - gain = _mm_cvtss_f32(gain4); - } - /* Mix with applying left over gain steps that aren't aligned multiples of 4. */ - for(;pos < minsize;pos++) - { - OutBuffer[c][OutPos+pos] += data[pos]*gain; - gain += step; - } - if(pos == Counter) - gain = Gains[c].Target; - Gains[c].Current = gain; - - /* Mix until pos is aligned with 4 or the mix is done. */ - minsize = minu(BufferSize, (pos+3)&~3); - for(;pos < minsize;pos++) - OutBuffer[c][OutPos+pos] += data[pos]*gain; - } - - if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD)) - continue; - gain4 = _mm_set1_ps(gain); - for(;BufferSize-pos > 3;pos += 4) - { - const __m128 val4 = _mm_load_ps(&data[pos]); - __m128 dry4 = _mm_load_ps(&OutBuffer[c][OutPos+pos]); - dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4)); - _mm_store_ps(&OutBuffer[c][OutPos+pos], dry4); - } - for(;pos < BufferSize;pos++) - OutBuffer[c][OutPos+pos] += data[pos]*gain; - } -} |