diff options
author | Chris Robinson <[email protected]> | 2018-12-26 14:59:21 -0800 |
---|---|---|
committer | Chris Robinson <[email protected]> | 2018-12-26 14:59:21 -0800 |
commit | c5be03b51e8fd9bda3a46c345bdc945cfd965c2e (patch) | |
tree | 9560a78b363b6422e24dfc0c39209441e99c1789 /Alc/mixer/mixer_sse.cpp | |
parent | 5c449de73f491a73cbc948b3301b8305f20be648 (diff) |
Avoid masking in ApplyCoeffs's inner loop
Unfortunately, this does not apply to the NEON version, whose implementation
would need more reworking before the same approach could be used.
Diffstat (limited to 'Alc/mixer/mixer_sse.cpp')
-rw-r--r-- | Alc/mixer/mixer_sse.cpp | 68 |
1 files changed, 43 insertions, 25 deletions
diff --git a/Alc/mixer/mixer_sse.cpp b/Alc/mixer/mixer_sse.cpp index f0620cb5..09307697 100644 --- a/Alc/mixer/mixer_sse.cpp +++ b/Alc/mixer/mixer_sse.cpp @@ -87,46 +87,64 @@ static inline void ApplyCoeffs(ALsizei Offset, ALfloat (*RESTRICT Values)[2], const __m128 lrlr = _mm_setr_ps(left, right, left, right); __m128 vals = _mm_setzero_ps(); __m128 coeffs; - ALsizei i; + ASSUME(IrSize > 1); + + ALsizei off{Offset&HRIR_MASK}; if((Offset&1)) { - const ALsizei o0 = Offset&HRIR_MASK; - const ALsizei o1 = (Offset+IrSize-1)&HRIR_MASK; - __m128 imp0, imp1; + ALsizei count{mini(IrSize-1, HRIR_LENGTH - off)}; + ASSUME(count >= 1); + __m128 imp0, imp1; coeffs = _mm_load_ps(&Coeffs[0][0]); - vals = _mm_loadl_pi(vals, (__m64*)&Values[o0][0]); + vals = _mm_loadl_pi(vals, (__m64*)&Values[off][0]); imp0 = _mm_mul_ps(lrlr, coeffs); vals = _mm_add_ps(imp0, vals); - _mm_storel_pi((__m64*)&Values[o0][0], vals); - for(i = 1;i < IrSize-1;i += 2) + _mm_storel_pi((__m64*)&Values[off][0], vals); + ++off; + for(ALsizei i{1};;) { - const ALsizei o2 = (Offset+i)&HRIR_MASK; - - coeffs = _mm_load_ps(&Coeffs[i+1][0]); - vals = _mm_load_ps(&Values[o2][0]); - imp1 = _mm_mul_ps(lrlr, coeffs); - imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2)); - vals = _mm_add_ps(imp0, vals); - _mm_store_ps(&Values[o2][0], vals); - imp0 = imp1; + for(;i < count;i += 2) + { + coeffs = _mm_load_ps(&Coeffs[i+1][0]); + vals = _mm_load_ps(&Values[off][0]); + imp1 = _mm_mul_ps(lrlr, coeffs); + imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2)); + vals = _mm_add_ps(imp0, vals); + _mm_store_ps(&Values[off][0], vals); + imp0 = imp1; + off += 2; + } + off &= HRIR_MASK; + if(i >= IrSize-1) + break; + count = IrSize-1; } - vals = _mm_loadl_pi(vals, (__m64*)&Values[o1][0]); + vals = _mm_loadl_pi(vals, (__m64*)&Values[off][0]); imp0 = _mm_movehl_ps(imp0, imp0); vals = _mm_add_ps(imp0, vals); - _mm_storel_pi((__m64*)&Values[o1][0], vals); + _mm_storel_pi((__m64*)&Values[off][0], vals); } else { - for(i = 0;i < 
IrSize;i += 2) - { - const ALsizei o = (Offset + i)&HRIR_MASK; + ALsizei count{mini(IrSize, HRIR_LENGTH - off)}; + ASSUME(count >= 2); - coeffs = _mm_load_ps(&Coeffs[i][0]); - vals = _mm_load_ps(&Values[o][0]); - vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs)); - _mm_store_ps(&Values[o][0], vals); + for(ALsizei i{0};;) + { + for(;i < count;i += 2) + { + coeffs = _mm_load_ps(&Coeffs[i][0]); + vals = _mm_load_ps(&Values[off][0]); + vals = _mm_add_ps(vals, _mm_mul_ps(lrlr, coeffs)); + _mm_store_ps(&Values[off][0], vals); + off += 2; + } + if(i >= IrSize) + break; + off = 0; + count = IrSize; } } } |