From f424d7f7a623f610c4d87b3d161057ed4118316e Mon Sep 17 00:00:00 2001 From: Chris Robinson Date: Thu, 16 Feb 2023 12:31:16 -0800 Subject: Decode IMA4 blocks with fewer loops Rather than decoding samples in chunks of 8, calculate the read offset from an incrementing index. --- core/voice.cpp | 65 +++++++++++++++++++++++++++------------------------------- 1 file changed, 30 insertions(+), 35 deletions(-) (limited to 'core') diff --git a/core/voice.cpp b/core/voice.cpp index 9e349d8e..b390df49 100644 --- a/core/voice.cpp +++ b/core/voice.cpp @@ -318,47 +318,42 @@ inline void LoadSamples(float *RESTRICT dstSamples, const al::byte *src }; /* The rest of the block is arranged as a series of nibbles, contained - * in 4 *bytes* per channel interleaved. So we can decode a series of 8 - * samples at once from each of these 4 bytes. + * in 4 *bytes* per channel interleaved. So every 8 nibbles we need to + * skip 4 bytes per channel to get the next nibbles for this channel. * - * First, decode the 8 sample sets being skipped entirely (they still - * need to be decoded for proper state on the remaining samples). + * First, decode the samples that we need to skip in the block (will + * always be less than the block size). They need to be decoded despite + * being ignored for proper state on the remaining samples. 
*/ - const size_t startOffset{(skip&~size_t{7}) + 1}; const al::byte *nibbleData{src + (srcStep+srcChan)*4}; - for(;skip >= 8;skip-=8) + size_t nibbleOffset{0}; + const size_t startOffset{skip + 1}; + for(;skip;--skip) { - uint code{uint{nibbleData[0]} | (uint{nibbleData[1]} << 8) - | (uint{nibbleData[2]} << 16) | (uint{nibbleData[3]} << 24)}; - nibbleData += 4*srcStep; + const size_t byteShift{(nibbleOffset&1) * 4}; + const size_t wordOffset{(nibbleOffset>>1) & ~size_t{3}}; + const size_t byteOffset{wordOffset*srcStep + ((nibbleOffset>>1)&3u)}; + ++nibbleOffset; - for(size_t j{0};j < 8;++j) - { - std::ignore = decode_sample(code & 0xf); - code >>= 4; - } + std::ignore = decode_sample((nibbleData[byteOffset]>>byteShift) & 15u); } - int samples[8]{}; - for(size_t i{startOffset};i < samplesPerBlock;i+=8) + /* Second, decode the rest of the block and write to the output, until + * the end of the block or the end of output. + */ + const size_t todo{minz(samplesPerBlock-startOffset, samplesToLoad-wrote)}; + for(size_t i{0};i < todo;++i) { - uint code{uint{nibbleData[0]} | (uint{nibbleData[1]} << 8) - | (uint{nibbleData[2]} << 16) | (uint{nibbleData[3]} << 24)}; - nibbleData += 4*srcStep; + const size_t byteShift{(nibbleOffset&1) * 4}; + const size_t wordOffset{(nibbleOffset>>1) & ~size_t{3}}; + const size_t byteOffset{wordOffset*srcStep + ((nibbleOffset>>1)&3u)}; + ++nibbleOffset; - for(size_t j{0};j < 8;++j) - { - samples[j] = decode_sample(code & 0xf); - code >>= 4; - } - - const size_t todo{minz(8-skip, samplesToLoad-wrote)}; - for(size_t j{0};j < todo;++j) - dstSamples[wrote++] = static_cast<float>(samples[j+skip]) / 32768.0f; - if(wrote == samplesToLoad) - return; - skip = 0; + const int result{decode_sample((nibbleData[byteOffset]>>byteShift) & 15u)}; + dstSamples[wrote++] = static_cast<float>(result) / 32768.0f; } + if(wrote == samplesToLoad) + return; src += blockBytes; } while(true); @@ -434,8 +429,7 @@ inline void LoadSamples(float *RESTRICT dstSamples, const al::byte * }; 
/* The rest of the block is a series of nibbles, interleaved per- - * channel. Decode the number of samples that we need to skip in the - * block (will always be less than the block size). + * channel. First, skip samples. */ const size_t startOffset{skip + 2}; size_t nibbleOffset{srcChan}; @@ -443,12 +437,13 @@ inline void LoadSamples(float *RESTRICT dstSamples, const al::byte * { const size_t byteOffset{nibbleOffset>>1}; const size_t byteShift{((nibbleOffset&1)^1) * 4}; - std::ignore = decode_sample((input[byteOffset]>>byteShift) & 15); nibbleOffset += srcStep; + + std::ignore = decode_sample((input[byteOffset]>>byteShift) & 15); } /* Now decode the rest of the block, until the end of the block or the - * dst buffer is full. + * dst buffer is filled. */ const size_t todo{minz(samplesPerBlock-startOffset, samplesToLoad-wrote)}; for(size_t j{0};j < todo;++j) -- cgit v1.2.3