diff options
author | Chris Robinson <[email protected]> | 2016-05-31 10:18:34 -0700 |
---|---|---|
committer | Chris Robinson <[email protected]> | 2016-05-31 10:18:34 -0700 |
commit | 5e64882be9ad3e3a1552e41befef5a6216f4ecfe (patch) | |
tree | 47059e787d77c86a0e2575e0d5a1d6b13798b8b4 /Alc | |
parent | 72d2febccbc670843669494fe5bc052839f54294 (diff) |
Use SSE for applying the HQ B-Format decoder matrices
Diffstat (limited to 'Alc')
-rw-r--r-- | Alc/bformatdec.c | 56 | ||||
-rw-r--r-- | Alc/mixer_c.c | 21 | ||||
-rw-r--r-- | Alc/mixer_defs.h | 4 | ||||
-rw-r--r-- | Alc/mixer_sse.c | 25 |
4 files changed, 79 insertions, 27 deletions
diff --git a/Alc/bformatdec.c b/Alc/bformatdec.c index 49052cb8..a871fb09 100644 --- a/Alc/bformatdec.c +++ b/Alc/bformatdec.c @@ -3,6 +3,7 @@ #include "bformatdec.h" #include "ambdec.h" +#include "mixer_defs.h" #include "alu.h" #include "threads.h" @@ -151,12 +152,27 @@ static const ALfloat CubeMatrixLF[8][MAX_AMBI_COEFFS] = { }; static ALfloat CubeEncoder[8][MAX_AMBI_COEFFS]; -static alonce_flag encoder_inited = AL_ONCE_FLAG_INIT; -static void init_encoder(void) +static inline MatrixMixerFunc SelectMixer(void) /* Returns MixRow_SSE when built with HAVE_SSE and CPUCapFlags has CPU_CAP_SSE set; otherwise returns MixRow_C. */ +{ +#ifdef HAVE_SSE + if((CPUCapFlags&CPU_CAP_SSE)) + return MixRow_SSE; +#endif + return MixRow_C; +} + /* Row mixer called by the decoder routines in this file; defaults to the C version until init_bformatdec() stores SelectMixer()'s result. */ +static MatrixMixerFunc MixMatrixRow = MixRow_C; + + /* Flag handed to alcall_once() in bformatdec_alloc() together with init_bformatdec. */ +static alonce_flag bformatdec_inited = AL_ONCE_FLAG_INIT; + +static void init_bformatdec(void) { ALuint i, j; + MixMatrixRow = SelectMixer(); /* pick the SSE or C row mixer */ + CalcXYZCoeffs(-0.577350269f, 0.577350269f, -0.577350269f, 0.0f, CubeEncoder[0]); CalcXYZCoeffs( 0.577350269f, 0.577350269f, -0.577350269f, 0.0f, CubeEncoder[1]); CalcXYZCoeffs(-0.577350269f, 0.577350269f, 0.577350269f, 0.0f, CubeEncoder[2]); @@ -226,7 +242,7 @@ typedef struct BFormatDec { BFormatDec *bformatdec_alloc() { - alcall_once(&encoder_inited, init_encoder); + alcall_once(&bformatdec_inited, init_bformatdec); /* presumably runs init_bformatdec only on the first call -- TODO confirm alcall_once semantics in threads.h */ return al_calloc(16, sizeof(BFormatDec)); } @@ -435,20 +451,6 @@ void bformatdec_reset(BFormatDec *dec, const AmbDecConf *conf, ALuint chancount, } -static void apply_row(ALfloat *out, const ALfloat *mtx, ALfloat (*restrict in)[BUFFERSIZE], ALuint inchans, ALuint todo) -{ - ALuint c, i; - - for(c = 0;c < inchans;c++) - { - ALfloat gain = mtx[c]; - if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD)) - continue; - for(i = 0;i < todo;i++) - out[i] += in[c][i] * gain; - } -} - void bformatdec_process(struct BFormatDec *dec, ALfloat (*restrict OutBuffer)[BUFFERSIZE], ALuint OutChannels, ALfloat (*restrict InSamples)[BUFFERSIZE], ALuint SamplesToDo) { ALuint chan, i; @@ -465,10 +467,10 @@ void bformatdec_process(struct BFormatDec *dec, ALfloat (*restrict
OutBuffer)[BU continue; memset(dec->ChannelMix, 0, SamplesToDo*sizeof(ALfloat)); - apply_row(dec->ChannelMix, dec->MatrixHF[chan], dec->SamplesHF, - dec->NumChannels, SamplesToDo); - apply_row(dec->ChannelMix, dec->MatrixLF[chan], dec->SamplesLF, - dec->NumChannels, SamplesToDo); + MixMatrixRow(dec->ChannelMix, dec->MatrixHF[chan], dec->SamplesHF, + dec->NumChannels, SamplesToDo); + MixMatrixRow(dec->ChannelMix, dec->MatrixLF[chan], dec->SamplesLF, + dec->NumChannels, SamplesToDo); if(dec->Delay[chan].Length > 0) { @@ -504,8 +506,8 @@ void bformatdec_process(struct BFormatDec *dec, ALfloat (*restrict OutBuffer)[BU continue; memset(dec->ChannelMix, 0, SamplesToDo*sizeof(ALfloat)); - apply_row(dec->ChannelMix, dec->MatrixHF[chan], InSamples, - dec->NumChannels, SamplesToDo); + MixMatrixRow(dec->ChannelMix, dec->MatrixHF[chan], InSamples, + dec->NumChannels, SamplesToDo); if(dec->Delay[chan].Length > 0) { @@ -556,10 +558,10 @@ void bformatdec_upSample(struct BFormatDec *dec, ALfloat (*restrict OutBuffer)[B for(k = 0;k < dec->UpSampler.NumChannels;k++) { memset(dec->ChannelMix, 0, SamplesToDo*sizeof(ALfloat)); - apply_row(dec->ChannelMix, dec->UpSampler.MatrixHF[k], dec->SamplesHF, - InChannels, SamplesToDo); - apply_row(dec->ChannelMix, dec->UpSampler.MatrixLF[k], dec->SamplesLF, - InChannels, SamplesToDo); + MixMatrixRow(dec->ChannelMix, dec->UpSampler.MatrixHF[k], dec->SamplesHF, + InChannels, SamplesToDo); + MixMatrixRow(dec->ChannelMix, dec->UpSampler.MatrixLF[k], dec->SamplesLF, + InChannels, SamplesToDo); for(j = 0;j < dec->NumChannels;j++) { diff --git a/Alc/mixer_c.c b/Alc/mixer_c.c index e9d26140..7952ec93 100644 --- a/Alc/mixer_c.c +++ b/Alc/mixer_c.c @@ -167,3 +167,24 @@ void Mix_C(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[B OutBuffer[c][OutPos+pos] += data[pos]*gain; } } + +/* Basically the inverse of the above. 
Rather than one input going to multiple + * outputs (each with its own gain), it's multiple inputs (each with its own + * gain) going to one output. This applies one row (vs one column) of a matrix + * transform. And as the matrices are more or less static once set up, no + * stepping is necessary. + */ +void MixRow_C(ALfloat *OutBuffer, const ALfloat *Mtx, ALfloat (*restrict data)[BUFFERSIZE], ALuint InChans, ALuint BufferSize) +{ + ALuint c, i; + /* OutBuffer is accumulated into (+=), never overwritten; Mtx supplies one gain per input channel. */ + for(c = 0;c < InChans;c++) + { + ALfloat gain = Mtx[c]; + if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD)) /* skip channels at or below the silence threshold; the negated '>' also skips a NaN gain */ + continue; + + for(i = 0;i < BufferSize;i++) + OutBuffer[i] += data[c][i] * gain; + } +} diff --git a/Alc/mixer_defs.h b/Alc/mixer_defs.h index 5db804b1..8d208b9c 100644 --- a/Alc/mixer_defs.h +++ b/Alc/mixer_defs.h @@ -27,6 +27,8 @@ void MixHrtf_C(ALfloat (*restrict OutBuffer)[BUFFERSIZE], ALuint lidx, ALuint ri struct HrtfState *hrtfstate, ALuint BufferSize); void Mix_C(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE], struct MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize); +void MixRow_C(ALfloat *OutBuffer, const ALfloat *Mtx, ALfloat (*restrict data)[BUFFERSIZE], + ALuint InChans, ALuint BufferSize); /* adds InChans gain-scaled input rows (gains from Mtx) into OutBuffer */ /* SSE mixers */ void MixHrtf_SSE(ALfloat (*restrict OutBuffer)[BUFFERSIZE], ALuint lidx, ALuint ridx, @@ -35,6 +37,8 @@ void MixHrtf_SSE(ALfloat (*restrict OutBuffer)[BUFFERSIZE], ALuint lidx, ALuint struct HrtfState *hrtfstate, ALuint BufferSize); void Mix_SSE(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer)[BUFFERSIZE], struct MixGains *Gains, ALuint Counter, ALuint OutPos, ALuint BufferSize); +void MixRow_SSE(ALfloat *OutBuffer, const ALfloat *Mtx, ALfloat (*restrict data)[BUFFERSIZE], + ALuint InChans, ALuint BufferSize); /* same parameter list as MixRow_C */ /* SSE resamplers */ inline void InitiatePositionArrays(ALuint frac, ALuint increment, ALuint *frac_arr, ALuint *pos_arr, ALuint size) diff --git a/Alc/mixer_sse.c b/Alc/mixer_sse.c index 942e0453..120ac4a0 100644 ---
a/Alc/mixer_sse.c +++ b/Alc/mixer_sse.c @@ -260,3 +260,28 @@ void Mix_SSE(const ALfloat *data, ALuint OutChans, ALfloat (*restrict OutBuffer) OutBuffer[c][OutPos+pos] += data[pos]*gain; } } + /* SSE row mixer: for every input channel c whose gain Mtx[c] is above GAIN_SILENCE_THRESHOLD in magnitude, adds data[c][i]*Mtx[c] to OutBuffer[i] for i in [0, BufferSize). Uses the aligned _mm_load_ps/_mm_store_ps intrinsics, so OutBuffer and each data[c] row are assumed to be 16-byte aligned -- TODO confirm at the call sites. */ +void MixRow_SSE(ALfloat *OutBuffer, const ALfloat *Mtx, ALfloat (*restrict data)[BUFFERSIZE], ALuint InChans, ALuint BufferSize) +{ + __m128 gain4; + ALuint c; + + for(c = 0;c < InChans;c++) + { + ALuint pos = 0; + ALfloat gain = Mtx[c]; + if(!(fabsf(gain) > GAIN_SILENCE_THRESHOLD)) /* skip silent channels; the negated '>' also skips a NaN gain */ + continue; + + gain4 = _mm_set1_ps(gain); /* same gain in all four lanes */ + for(;BufferSize-pos > 3;pos += 4) /* four samples per pass while at least four remain; pos never exceeds BufferSize, so the unsigned subtraction cannot wrap */ + { + const __m128 val4 = _mm_load_ps(&data[c][pos]); + __m128 dry4 = _mm_load_ps(&OutBuffer[pos]); + dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4)); + _mm_store_ps(&OutBuffer[pos], dry4); + } + for(;pos < BufferSize;pos++) /* scalar tail for the remaining 0-3 samples */ + OutBuffer[pos] += data[c][pos]*gain; + } +} |