12 files changed, 215 insertions, 65 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 74eeb9fd..9ab92b7b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -612,6 +612,7 @@ set(COMMON_OBJS
     common/alspan.h
     common/alstring.cpp
     common/alstring.h
+    common/altraits.h
     common/atomic.h
     common/comptr.h
     common/dynload.cpp
diff --git a/al/buffer.cpp b/al/buffer.cpp
index ff416fda..bc007219 100644
--- a/al/buffer.cpp
+++ b/al/buffer.cpp
@@ -571,7 +571,7 @@ void LoadData(ALCcontext *context, ALbuffer *ALBuf, ALsizei freq, ALuint size,
         /* Can only preserve data with the same format and alignment. */
         if(ALBuf->mChannels != *DstChannels || ALBuf->OriginalType != SrcType) [[unlikely]]
             return context->setError(AL_INVALID_VALUE, "Preserving data of mismatched format");
-        if(ALBuf->OriginalAlign != align) [[unlikely]]
+        if(ALBuf->mBlockAlign != align) [[unlikely]]
             return context->setError(AL_INVALID_VALUE, "Preserving data of mismatched alignment");
         if(ALBuf->mAmbiOrder != ambiorder) [[unlikely]]
             return context->setError(AL_INVALID_VALUE, "Preserving data of mismatched order");
@@ -641,7 +641,7 @@ void LoadData(ALCcontext *context, ALbuffer *ALBuf, ALsizei freq, ALuint size,
         if(SrcData != nullptr && !ALBuf->mData.empty())
             Convert_int16_ima4(reinterpret_cast<int16_t*>(ALBuf->mData.data()), SrcData,
                 NumChannels, frames, align);
-        ALBuf->OriginalAlign = align;
+        ALBuf->mBlockAlign = align;
     }
     else if(SrcType == UserFmtMSADPCM)
     {
@@ -649,14 +649,14 @@ void LoadData(ALCcontext *context, ALbuffer *ALBuf, ALsizei freq, ALuint size,
         if(SrcData != nullptr && !ALBuf->mData.empty())
             Convert_int16_msadpcm(reinterpret_cast<int16_t*>(ALBuf->mData.data()), SrcData,
                 NumChannels, frames, align);
-        ALBuf->OriginalAlign = align;
+        ALBuf->mBlockAlign = align;
     }
     else
     {
         assert(DstType.has_value());
         if(SrcData != nullptr && !ALBuf->mData.empty())
             std::copy_n(SrcData, frames*FrameSize, ALBuf->mData.begin());
-        ALBuf->OriginalAlign = 1;
+        ALBuf->mBlockAlign = 1;
     }
     ALBuf->OriginalSize = size;
     ALBuf->OriginalType = SrcType;
@@ -722,7 +722,7 @@ void PrepareCallback(ALCcontext *context, ALbuffer *ALBuf, ALsizei freq,
 
     ALBuf->OriginalType = SrcType;
     ALBuf->OriginalSize = 0;
-    ALBuf->OriginalAlign = 1;
+    ALBuf->mBlockAlign = 1;
     ALBuf->Access = 0;
 
     ALBuf->mSampleRate = static_cast<ALuint>(freq);
@@ -1105,10 +1105,10 @@ START_API_FUNC
     else if(al::to_underlying(usrfmt->channels) != al::to_underlying(albuf->mChannels)
         || usrfmt->type != albuf->OriginalType) [[unlikely]]
         context->setError(AL_INVALID_ENUM, "Unpacking data with mismatched format");
-    else if(align != albuf->OriginalAlign) [[unlikely]]
+    else if(align != albuf->mBlockAlign) [[unlikely]]
         context->setError(AL_INVALID_VALUE,
             "Unpacking data with alignment %u does not match original alignment %u", align,
-            albuf->OriginalAlign);
+            albuf->mBlockAlign);
     else if(albuf->isBFormat() && albuf->UnpackAmbiOrder != albuf->mAmbiOrder) [[unlikely]]
         context->setError(AL_INVALID_VALUE, "Unpacking data with mismatched ambisonic order");
     else if(albuf->MappedAccess != 0) [[unlikely]]
diff --git a/al/buffer.h b/al/buffer.h
index 7ded83bd..322b918f 100644
--- a/al/buffer.h
+++ b/al/buffer.h
@@ -51,7 +51,6 @@ struct ALbuffer : public BufferStorage {
 
     UserFmtType OriginalType{UserFmtShort};
     ALuint OriginalSize{0};
-    ALuint OriginalAlign{0};
 
     ALuint UnpackAlign{0};
     ALuint PackAlign{0};
diff --git a/al/source.cpp b/al/source.cpp
index 7dc5df37..7db175ef 100644
--- a/al/source.cpp
+++ b/al/source.cpp
@@ -334,8 +334,8 @@ double GetSourceOffset(ALsource *Source, ALenum name, ALCcontext *context)
     case AL_BYTE_OFFSET:
         if(BufferFmt->OriginalType == UserFmtIMA4)
         {
-            ALuint FrameBlockSize{BufferFmt->OriginalAlign};
-            ALuint align{(BufferFmt->OriginalAlign-1)/2 + 4};
+            ALuint FrameBlockSize{BufferFmt->mBlockAlign};
+            ALuint align{(BufferFmt->mBlockAlign-1)/2 + 4};
             ALuint BlockSize{align * BufferFmt->channelsFromFmt()};
 
             /* Round down to nearest ADPCM block */
@@ -343,7 +343,7 @@ double GetSourceOffset(ALsource *Source, ALenum name, ALCcontext *context)
         }
         else if(BufferFmt->OriginalType == UserFmtMSADPCM)
         {
-            ALuint FrameBlockSize{BufferFmt->OriginalAlign};
+            ALuint FrameBlockSize{BufferFmt->mBlockAlign};
             ALuint align{(FrameBlockSize-2)/2 + 7};
             ALuint BlockSize{align * BufferFmt->channelsFromFmt()};
 
@@ -390,8 +390,8 @@ double GetSourceLength(const ALsource *source, ALenum name)
     case AL_BYTE_LENGTH_SOFT:
         if(BufferFmt->OriginalType == UserFmtIMA4)
         {
-            ALuint FrameBlockSize{BufferFmt->OriginalAlign};
-            ALuint align{(BufferFmt->OriginalAlign-1)/2 + 4};
+            ALuint FrameBlockSize{BufferFmt->mBlockAlign};
+            ALuint align{(BufferFmt->mBlockAlign-1)/2 + 4};
             ALuint BlockSize{align * BufferFmt->channelsFromFmt()};
 
             /* Round down to nearest ADPCM block */
@@ -399,7 +399,7 @@ double GetSourceLength(const ALsource *source, ALenum name)
         }
         else if(BufferFmt->OriginalType == UserFmtMSADPCM)
         {
-            ALuint FrameBlockSize{BufferFmt->OriginalAlign};
+            ALuint FrameBlockSize{BufferFmt->mBlockAlign};
             ALuint align{(FrameBlockSize-2)/2 + 7};
             ALuint BlockSize{align * BufferFmt->channelsFromFmt()};
 
@@ -474,15 +474,15 @@ al::optional<VoicePos> GetSampleOffset(al::deque<ALbufferQueueItem> &BufferList,
         /* Determine the ByteOffset (and ensure it is block aligned) */
         if(BufferFmt->OriginalType == UserFmtIMA4)
         {
-            const ALuint align{(BufferFmt->OriginalAlign-1)/2 + 4};
+            const ALuint align{(BufferFmt->mBlockAlign-1)/2 + 4};
             Offset = std::floor(Offset / align / BufferFmt->channelsFromFmt());
-            Offset *= BufferFmt->OriginalAlign;
+            Offset *= BufferFmt->mBlockAlign;
         }
         else if(BufferFmt->OriginalType == UserFmtMSADPCM)
         {
-            const ALuint align{(BufferFmt->OriginalAlign-2)/2 + 7};
+            const ALuint align{(BufferFmt->mBlockAlign-2)/2 + 7};
             Offset = std::floor(Offset / align / BufferFmt->channelsFromFmt());
-            Offset *= BufferFmt->OriginalAlign;
+            Offset *= BufferFmt->mBlockAlign;
         }
         else
             Offset = std::floor(Offset / BufferFmt->channelsFromFmt());
@@ -530,14 +530,15 @@ void InitVoice(Voice *voice, ALsource *source, ALbufferQueueItem *BufferList, AL
         FmtSuperStereo : buffer->mChannels;
     voice->mFmtType = buffer->mType;
     voice->mFrameStep = buffer->channelsFromFmt();
-    voice->mFrameSize = buffer->frameSizeFromFmt();
+    voice->mBytesPerBlock = buffer->blockSizeFromFmt();
+    voice->mSamplesPerBlock = buffer->mBlockAlign;
     voice->mAmbiLayout = IsUHJ(voice->mFmtChannels) ? AmbiLayout::FuMa : buffer->mAmbiLayout;
     voice->mAmbiScaling = IsUHJ(voice->mFmtChannels) ? AmbiScaling::UHJ : buffer->mAmbiScaling;
     voice->mAmbiOrder = (voice->mFmtChannels == FmtSuperStereo) ? 1 : buffer->mAmbiOrder;
 
     if(buffer->mCallback) voice->mFlags.set(VoiceIsCallback);
     else if(source->SourceType == AL_STATIC) voice->mFlags.set(VoiceIsStatic);
-    voice->mNumCallbackSamples = 0;
+    voice->mNumCallbackBlocks = 0;
 
     voice->prepare(device);
 
@@ -1536,6 +1537,7 @@ try {
             newlist.emplace_back();
             newlist.back().mCallback = buffer->mCallback;
             newlist.back().mUserData = buffer->mUserData;
+            newlist.back().mBlockAlign = buffer->mBlockAlign;
             newlist.back().mSampleLen = buffer->mSampleLen;
             newlist.back().mLoopStart = buffer->mLoopStart;
             newlist.back().mLoopEnd = buffer->mLoopEnd;
@@ -3604,6 +3606,7 @@ START_API_FUNC
             BufferList = &item;
         }
         if(!buffer) continue;
+        BufferList->mBlockAlign = buffer->mBlockAlign;
         BufferList->mSampleLen = buffer->mSampleLen;
         BufferList->mLoopEnd = buffer->mSampleLen;
         BufferList->mSamples = buffer->mData.data();
diff --git a/alc/effects/convolution.cpp b/alc/effects/convolution.cpp
index e88fb0d0..1c5c3691 100644
--- a/alc/effects/convolution.cpp
+++ b/alc/effects/convolution.cpp
@@ -84,6 +84,10 @@ void LoadSamples(float *RESTRICT dst, const al::byte *src, const size_t srcstep,
     HANDLE_FMT(FmtDouble);
     HANDLE_FMT(FmtMulaw);
     HANDLE_FMT(FmtAlaw);
+    /* FIXME: Handle ADPCM decoding here. */
+    case FmtIMA4:
+        std::fill_n(dst, samples, 0.0f);
+        break;
     }
 #undef HANDLE_FMT
 }
diff --git a/common/alnumeric.h b/common/alnumeric.h
index 13e61645..a426763f 100644
--- a/common/alnumeric.h
+++ b/common/alnumeric.h
@@ -12,6 +12,7 @@
 #include <xmmintrin.h>
 #endif
 
+#include "altraits.h"
 #include "opthelpers.h"
 
 
@@ -97,12 +98,20 @@ inline uint32_t NextPowerOf2(uint32_t value) noexcept
     return value+1;
 }
 
-/** Round up a value to the next multiple. */
-inline size_t RoundUp(size_t value, size_t r) noexcept
-{
-    value += r-1;
-    return value - (value%r);
-}
+/**
+ * Round value down to a multiple of r (towards zero for signed types); exact
+ * multiples are returned unchanged. r must be non-zero, as it is a divisor.
+ */
+template<typename T>
+constexpr T RoundDown(T value, al::type_identity_t<T> r) noexcept
+{ return value - (value%r); }
+
+/**
+ * Round a non-negative value up to a multiple of r; exact multiples are kept.
+ */
+template<typename T>
+constexpr T RoundUp(T value, al::type_identity_t<T> r) noexcept
+{ return RoundDown(value + r-1, r); }
 
 
 /**
diff --git a/common/alspan.h b/common/alspan.h
index 519f22e4..1d6cdfe5 100644
--- a/common/alspan.h
+++ b/common/alspan.h
@@ -8,6 +8,7 @@
 #include <type_traits>
 
 #include "almalloc.h"
+#include "altraits.h"
 
 namespace al {
 
@@ -37,13 +38,6 @@ constexpr const T* data(std::initializer_list<T> list) noexcept
 { return list.begin(); }
 
 
-template<typename T>
-struct type_identity { using type = T; };
-
-template<typename T>
-using type_identity_t = typename type_identity<T>::type;
-
-
 constexpr size_t dynamic_extent{static_cast<size_t>(-1)};
 
 template<typename T, size_t E=dynamic_extent>
diff --git a/common/altraits.h b/common/altraits.h
new file mode 100644
index 00000000..7ce0422e
--- /dev/null
+++ b/common/altraits.h
@@ -0,0 +1,14 @@
+#ifndef COMMON_ALTRAITS_H
+#define COMMON_ALTRAITS_H
+
+namespace al {
+
+template<typename T>
+struct type_identity { using type = T; };
+
+template<typename T>
+using type_identity_t = typename type_identity<T>::type;
+
+} // namespace al
+
+#endif /* COMMON_ALTRAITS_H */
diff --git a/core/buffer_storage.cpp b/core/buffer_storage.cpp
index 1c80e7ef..1e826bff 100644
--- a/core/buffer_storage.cpp
+++ b/core/buffer_storage.cpp
@@ -16,6 +16,7 @@ uint BytesFromFmt(FmtType type) noexcept
     case FmtDouble: return sizeof(double);
     case FmtMulaw: return sizeof(uint8_t);
     case FmtAlaw: return sizeof(uint8_t);
+    case FmtIMA4: break;
     }
     return 0;
 }
diff --git a/core/buffer_storage.h b/core/buffer_storage.h
index ec934681..a4d1b289 100644
--- a/core/buffer_storage.h
+++ b/core/buffer_storage.h
@@ -18,6 +18,7 @@ enum FmtType : unsigned char {
     FmtDouble,
     FmtMulaw,
     FmtAlaw,
+    FmtIMA4,
 };
 enum FmtChannels : unsigned char {
     FmtMono,
@@ -83,6 +84,7 @@ struct BufferStorage {
     FmtChannels mChannels{FmtMono};
     FmtType mType{FmtShort};
     uint mSampleLen{0u};
+    uint mBlockAlign{0u};
 
     AmbiLayout mAmbiLayout{AmbiLayout::FuMa};
     AmbiScaling mAmbiScaling{AmbiScaling::FuMa};
@@ -93,6 +95,12 @@ struct BufferStorage {
     { return ChannelsFromFmt(mChannels, mAmbiOrder); }
     inline uint frameSizeFromFmt() const noexcept { return channelsFromFmt() * bytesFromFmt(); }
 
+    /** Bytes per block: IMA4 is a 4-byte header plus nibbles per channel, else one frame. */
+    inline uint blockSizeFromFmt() const noexcept
+    {
+        return (mType == FmtIMA4) ? ((mBlockAlign-1)/2 + 4)*channelsFromFmt() : frameSizeFromFmt();
+    }
+
     inline bool isBFormat() const noexcept { return IsBFormat(mChannels); }
 };
 
diff --git a/core/voice.cpp b/core/voice.cpp
index 4ca62a02..f6954cbe 100644
--- a/core/voice.cpp
+++ b/core/voice.cpp
@@ -178,6 +178,32 @@ void Voice::InitMixer(al::optional<std::string> resampler)
 
 namespace {
 
+/* IMA ADPCM step size table (the 89 standard IMA/DVI values). */
+constexpr int IMAStep_size[89] = {
+       7,    8,    9,   10,   11,   12,   13,   14,   16,   17,   19,
+      21,   23,   25,   28,   31,   34,   37,   41,   45,   50,   55,
+      60,   66,   73,   80,   88,   97,  107,  118,  130,  143,  157,
+     173,  190,  209,  230,  253,  279,  307,  337,  371,  408,  449,
+     494,  544,  598,  658,  724,  796,  876,  963, 1060, 1166, 1282,
+    1411, 1552, 1707, 1878, 2066, 2272, 2499, 2749, 3024, 3327, 3660,
+    4026, 4428, 4871, 5358, 5894, 6484, 7132, 7845, 8630, 9493,10442,
+   11487,12635,13899,15289,16818,18500,20350,22385,24623,27086,29794,
+   32767
+};
+
+/* IMA4 ADPCM Codeword decode table */
+constexpr int IMA4Codeword[16] = {
+    1, 3, 5, 7, 9, 11, 13, 15,
+   -1,-3,-5,-7,-9,-11,-13,-15,
+};
+
+/* IMA4 ADPCM Step index adjust decode table */
+constexpr int IMA4Index_adjust[16] = {
+   -1,-1,-1,-1, 2, 4, 6, 8,
+   -1,-1,-1,-1, 2, 4, 6, 8
+};
+
+
 void SendSourceStoppedEvent(ContextBase *context, uint id)
 {
     RingBuffer *ring{context->mAsyncEvents.get()};
@@ -221,8 +247,9 @@ const float *DoFilters(BiquadFilter &lpfilter, BiquadFilter &hpfilter, float *ds
 
 
 template<FmtType Type>
-inline void LoadSamples(float *dstSamples, const al::byte *src, const size_t srcChan,
-    const size_t srcOffset, const size_t srcStep, const size_t samples) noexcept
+inline void LoadSamples(float *RESTRICT dstSamples, const al::byte *src, const size_t srcChan,
+    const size_t srcOffset, const size_t srcStep, const size_t /*samplesPerBlock*/,
+    const size_t samples) noexcept
 {
     constexpr size_t sampleSize{sizeof(typename al::FmtTypeTraits<Type>::Type)};
     auto s = src + (srcOffset*srcStep + srcChan)*sampleSize;
@@ -230,12 +257,91 @@ inline void LoadSamples(float *dstSamples, const al::byte *src, const size_t src
     al::LoadSampleArray<Type>(dstSamples, s, srcStep, samples);
 }
 
+template<>
+inline void LoadSamples<FmtIMA4>(float *RESTRICT dstSamples, const al::byte *src,
+    const size_t srcChan, const size_t srcOffset, const size_t srcStep,
+    const size_t samplesPerBlock, const size_t samples) noexcept
+{
+    /* Decode `samples` samples of channel srcChan, from srcOffset, as floats. */
+    const size_t blockBytes{((samplesPerBlock-1)/2 + 4)*srcStep};
+    /* Skip to the ADPCM block containing the srcOffset sample. */
+    src += srcOffset/samplesPerBlock*blockBytes;
+    /* Calculate how many samples need to be skipped in the block. */
+    size_t skip{srcOffset % samplesPerBlock};
+
+    /* NOTE: This could probably be optimized better. */
+    size_t wrote{0};
+    do {
+        /* Each IMA4 block starts with a signed 16-bit sample, and a signed
+         * 16-bit table index. The table index needs to be clamped.
+         */
+        int sample{src[srcChan*4] | (src[srcChan*4 + 1] << 8)};
+        int index{src[srcChan*4 + 2] | (src[srcChan*4 + 3] << 8)};
+
+        sample = (sample^0x8000) - 32768;
+        index = clampi((index^0x8000) - 32768, 0, al::size(IMAStep_size)-1);
+        /* The header sample doubles as the block's first output sample. */
+        if(!skip) [[likely]]
+        {
+            dstSamples[wrote++] = static_cast<float>(sample) / 32768.0f;
+            if(wrote == samples) return;
+        }
+        else
+            --skip;
+
+        int tempsamples[8]{};
+        const al::byte *nibbleData{src + (srcStep+srcChan)*4};
+        for(size_t i{1};i < samplesPerBlock;i+=8)
+        {
+            /* The rest of the block is arranged as a series of nibbles, with 4
+             * bytes per channel interleaved. So we can decode a series of 8
+             * samples at once from these next 4 bytes.
+             */
+            uint code{uint{nibbleData[0]} | (uint{nibbleData[1]} << 8)
+                | (uint{nibbleData[2]} << 16) | (uint{nibbleData[3]} << 24)};
+            for(size_t j{0};j < 8;++j)
+            {
+                const uint nibble{code & 0xf};
+                code >>= 4;
+
+                sample += IMA4Codeword[nibble] * IMAStep_size[index] / 8;
+                sample = clampi(sample, -32768, 32767);
+                tempsamples[j] = sample;
+
+                index += IMA4Index_adjust[nibble];
+                index = clampi(index, 0, al::size(IMAStep_size)-1);
+            }
+            nibbleData += 4*srcStep;
+
+            /* If we're skipping these 8 samples, go on to the next set. They
+             * still need to be decoded to update the predictor state for the
+             * next set.
+             */
+            if(skip >= 8)
+            {
+                skip -= 8;
+                continue;
+            }
+            /* wrote is the next free output slot, so store first, then advance. */
+            const size_t todo{minz(8-skip, samples-wrote)};
+            for(size_t j{0};j < todo;++j)
+                dstSamples[wrote++] = static_cast<float>(tempsamples[j+skip]) / 32768.0f;
+            if(wrote == samples)
+                return;
+            skip = 0;
+        }
+
+        src += blockBytes;
+    } while(1);
+}
+
 void LoadSamples(float *dstSamples, const al::byte *src, const size_t srcChan,
-    const size_t srcOffset, const FmtType srcType, const size_t srcStep, const size_t samples)
-    noexcept
+    const size_t srcOffset, const FmtType srcType, const size_t srcStep,
+    const size_t samplesPerBlock, const size_t samples) noexcept
 {
 #define HANDLE_FMT(T) case T:                                                 \
-    LoadSamples<T>(dstSamples, src, srcChan, srcOffset, srcStep, samples);    \
+    LoadSamples<T>(dstSamples, src, srcChan, srcOffset, srcStep,              \
+        samplesPerBlock, samples);                                            \
     break
 
     switch(srcType)
@@ -246,6 +352,7 @@ void LoadSamples(float *dstSamples, const al::byte *src, const size_t srcChan,
     HANDLE_FMT(FmtDouble);
     HANDLE_FMT(FmtMulaw);
     HANDLE_FMT(FmtAlaw);
+    HANDLE_FMT(FmtIMA4);
     }
 #undef HANDLE_FMT
 }
@@ -263,7 +370,7 @@ void LoadBufferStatic(VoiceBufferItem *buffer, VoiceBufferItem *bufferLoopItem,
             const size_t buffer_remaining{buffer->mSampleLen - dataPosInt};
             const size_t remaining{minz(samplesToLoad-samplesLoaded, buffer_remaining)};
             LoadSamples(voiceSamples+samplesLoaded, buffer->mSamples, srcChannel, dataPosInt,
-                sampleType, srcStep, remaining);
+                sampleType, srcStep, buffer->mBlockAlign, remaining);
             samplesLoaded += remaining;
         }
 
@@ -285,7 +392,7 @@ void LoadBufferStatic(VoiceBufferItem *buffer, VoiceBufferItem *bufferLoopItem,
         /* Load what's left of this loop iteration */
         const size_t remaining{minz(samplesToLoad-samplesLoaded, loopEnd-dataPosInt)};
         LoadSamples(voiceSamples+samplesLoaded, buffer->mSamples, srcChannel, intPos, sampleType,
-            srcStep, remaining);
+            srcStep, buffer->mBlockAlign, remaining);
         samplesLoaded += remaining;
 
         /* Load repeats of the loop to fill the buffer. */
@@ -293,7 +400,7 @@ void LoadBufferStatic(VoiceBufferItem *buffer, VoiceBufferItem *bufferLoopItem,
         while(const size_t toFill{minz(samplesToLoad - samplesLoaded, loopSize)})
         {
             LoadSamples(voiceSamples+samplesLoaded, buffer->mSamples, srcChannel, loopStart,
-                sampleType, srcStep, toFill);
+                sampleType, srcStep, buffer->mBlockAlign, toFill);
             samplesLoaded += toFill;
         }
     }
@@ -308,7 +415,7 @@ void LoadBufferCallback(VoiceBufferItem *buffer, const size_t dataPosInt,
     {
         const size_t remaining{minz(samplesToLoad-samplesLoaded, numCallbackSamples-dataPosInt)};
         LoadSamples(voiceSamples+samplesLoaded, buffer->mSamples, srcChannel, dataPosInt,
-            sampleType, srcStep, remaining);
+            sampleType, srcStep, buffer->mBlockAlign, remaining);
         samplesLoaded += remaining;
     }
 
@@ -337,7 +444,7 @@ void LoadBufferQueue(VoiceBufferItem *buffer, VoiceBufferItem *bufferLoopItem,
 
         const size_t remaining{minz(samplesToLoad-samplesLoaded, buffer->mSampleLen-dataPosInt)};
         LoadSamples(voiceSamples+samplesLoaded, buffer->mSamples, srcChannel, dataPosInt,
-            sampleType, srcStep, remaining);
+            sampleType, srcStep, buffer->mBlockAlign, remaining);
 
         samplesLoaded += remaining;
         if(samplesLoaded == samplesToLoad)
@@ -527,7 +634,7 @@ void Voice::mix(const State vstate, ContextBase *Context, const nanoseconds devi
     const uint samplesToLoad{samplesToMix + mDecoderPadding};
 
     /* Get a span of pointers to hold the floating point, deinterlaced,
-     * resampled buffer data.
+     * resampled buffer data to be mixed.
      */
     std::array<float*,DeviceBase::MixerChannelsMax> SamplePointers;
     const al::span<float*> MixingSamples{SamplePointers.data(), mChans.size()};
@@ -544,6 +651,12 @@ void Voice::mix(const State vstate, ContextBase *Context, const nanoseconds devi
             const al::span<float> dst) { std::copy_n(src, dst.size(), dst.begin()); }}
         : mResampler};
 
+    /* For callback buffers, this is the sample offset for the start of the
+     * buffer data. This is needed with compressed formats to track how many
+     * samples into a block we're starting from.
+     */
+    const uint callbackBase{RoundDown(static_cast<uint>(maxi(DataPosInt, 0)), mSamplesPerBlock)};
+
     /* UHJ2 and SuperStereo only have 2 buffer channels, but 3 mixing channels
      * (3rd channel is generated from decoding).
      */
@@ -557,7 +670,6 @@ void Voice::mix(const State vstate, ContextBase *Context, const nanoseconds devi
         const auto prevSamples = al::as_span(mPrevSamples[chan]);
         const auto resampleBuffer = std::copy(prevSamples.cbegin(), prevSamples.cend(),
             Device->mResampleData.begin()) - MaxResamplerEdge;
-        const uint callbackBase{static_cast<uint>(maxi(DataPosInt, 0))};
         int intPos{DataPosInt};
         uint fracPos{DataPosFrac};
 
@@ -567,7 +679,8 @@ void Voice::mix(const State vstate, ContextBase *Context, const nanoseconds devi
         for(uint samplesLoaded{0};samplesLoaded < samplesToLoad;)
         {
             /* Calculate the number of dst samples that can be loaded this
-             * iteration, given the available resampler buffer size.
+             * iteration, given the available resampler buffer size, and the
+             * number of src samples that are needed to load it.
              */
             auto calc_buffer_sizes = [fracPos,increment](uint dstBufferSize)
             {
@@ -599,7 +712,7 @@ void Voice::mix(const State vstate, ContextBase *Context, const nanoseconds devi
                 {
                     /* Some resamplers require the destination being 16-byte
                      * aligned, so limit to a multiple of 4 samples to maintain
-                     * alignment.
+                     * alignment if we need to do another iteration after this.
                      */
                     dstBufferSize = static_cast<uint>(dataSize64) & ~3u;
                 }
@@ -654,11 +767,12 @@ void Voice::mix(const State vstate, ContextBase *Context, const nanoseconds devi
                 else if(mFlags.test(VoiceIsCallback))
                 {
                     const size_t bufferOffset{uintPos - callbackBase};
-                    const size_t getTotal{bufferOffset + srcBufferSize - srcSampleDelay};
-                    if(!mFlags.test(VoiceCallbackStopped) && getTotal > mNumCallbackSamples)
+                    const size_t needSamples{bufferOffset + srcBufferSize - srcSampleDelay};
+                    const size_t needBlocks{(needSamples + mSamplesPerBlock-1) / mSamplesPerBlock};
+                    if(!mFlags.test(VoiceCallbackStopped) && needBlocks > mNumCallbackBlocks)
                     {
-                        const size_t byteOffset{mNumCallbackSamples*mFrameSize};
-                        const size_t needBytes{getTotal*mFrameSize - byteOffset};
+                        const size_t byteOffset{mNumCallbackBlocks*mBytesPerBlock};
+                        const size_t needBytes{(needBlocks-mNumCallbackBlocks)*mBytesPerBlock};
 
                         const int gotBytes{BufferListItem->mCallback(BufferListItem->mUserData,
                             &BufferListItem->mSamples[byteOffset], static_cast<int>(needBytes))};
@@ -667,14 +781,14 @@ void Voice::mix(const State vstate, ContextBase *Context, const nanoseconds devi
                         else if(static_cast<uint>(gotBytes) < needBytes)
                         {
                             mFlags.set(VoiceCallbackStopped);
-                            mNumCallbackSamples += static_cast<uint>(gotBytes) / mFrameSize;
+                            mNumCallbackBlocks += static_cast<uint>(gotBytes) / mBytesPerBlock;
                         }
                         else
-                            mNumCallbackSamples = static_cast<uint>(getTotal);
+                            mNumCallbackBlocks = static_cast<uint>(needBlocks);
                     }
-                    LoadBufferCallback(BufferListItem, bufferOffset, mNumCallbackSamples,
-                        mFmtType, chan, mFrameStep, srcSampleDelay, srcBufferSize,
-                        al::to_address(resampleBuffer));
+                    const size_t numSamples{uint{mNumCallbackBlocks} * mSamplesPerBlock};
+                    LoadBufferCallback(BufferListItem, bufferOffset, numSamples, mFmtType, chan,
+                        mFrameStep, srcSampleDelay, srcBufferSize, al::to_address(resampleBuffer));
                 }
                 else
                     LoadBufferQueue(BufferListItem, BufferLoopItem, uintPos, mFmtType, chan,
@@ -815,13 +929,12 @@ void Voice::mix(const State vstate, ContextBase *Context, const nanoseconds devi
         return;
     }
 
-    /* Update positions */
+    /* Update voice positions and buffers as needed. */
     DataPosFrac += increment*samplesToMix;
     const uint SrcSamplesDone{DataPosFrac>>MixerFracBits};
     DataPosInt  += SrcSamplesDone;
     DataPosFrac &= MixerFracMask;
 
-    /* Update voice positions and buffers as needed. */
     uint buffers_done{0u};
     if(BufferListItem && DataPosInt >= 0) [[likely]]
     {
@@ -850,18 +963,20 @@ void Voice::mix(const State vstate, ContextBase *Context, const nanoseconds devi
         else if(mFlags.test(VoiceIsCallback))
         {
             /* Handle callback buffer source */
-            if(SrcSamplesDone < mNumCallbackSamples)
+            const uint samplesDone{static_cast<uint>(DataPosInt) - callbackBase};
+            const uint blocksDone{samplesDone / mSamplesPerBlock};
+            if(blocksDone < mNumCallbackBlocks)
             {
-                const size_t byteOffset{SrcSamplesDone*mFrameSize};
-                const size_t byteEnd{mNumCallbackSamples*mFrameSize};
+                const size_t byteOffset{blocksDone*mBytesPerBlock};
+                const size_t byteEnd{mNumCallbackBlocks*mBytesPerBlock};
                 al::byte *data{BufferListItem->mSamples};
                 std::copy(data+byteOffset, data+byteEnd, data);
-                mNumCallbackSamples -= SrcSamplesDone;
+                mNumCallbackBlocks -= blocksDone;
             }
             else
             {
                 BufferListItem = nullptr;
-                mNumCallbackSamples = 0;
+                mNumCallbackBlocks = 0;
             }
         }
         else
diff --git a/core/voice.h b/core/voice.h
index cf7345a1..f197e463 100644
--- a/core/voice.h
+++ b/core/voice.h
@@ -100,6 +100,7 @@ struct VoiceBufferItem {
     CallbackType mCallback{nullptr};
     void *mUserData{nullptr};
 
+    uint mBlockAlign{0u};
     uint mSampleLen{0u};
     uint mLoopStart{0u};
     uint mLoopEnd{0u};
@@ -219,7 +220,8 @@ struct Voice {
     FmtType mFmtType;
     uint mFrequency;
     uint mFrameStep; /**< In steps of the sample type size. */
-    uint mFrameSize; /**< In bytes. */
+    uint mBytesPerBlock; /**< Or for PCM formats, BytesPerFrame. */
+    uint mSamplesPerBlock; /**< Always 1 for PCM formats. */
     AmbiLayout mAmbiLayout;
     AmbiScaling mAmbiScaling;
     uint mAmbiOrder;
@@ -235,7 +237,7 @@ struct Voice {
     InterpState mResampleState;
 
     std::bitset<VoiceFlagCount> mFlags{};
-    uint mNumCallbackSamples{0};
+    uint mNumCallbackBlocks{0};
 
     struct TargetData {
         int FilterType;