Diffstat (limited to 'core')
42 files changed, 5689 insertions, 324 deletions
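As a point of reference for the new AmbiScale::GetHFOrderScales() helper added in core/ambidefs.cpp below, here is a minimal usage sketch. It assumes the OpenAL Soft source tree is on the include path; the chosen orders and the printout are illustrative only and are not part of this change.

    // Hypothetical example: fetch the per-order HF gain corrections used when
    // feeding first-order ambisonic content to a third-order decoder.
    #include <array>
    #include <cstdio>
    #include "core/ambidefs.h"  // assumed include path within the build tree

    int main()
    {
        // Each entry is the input order's HF decoder scale divided by the
        // output order's; orders above the input order remain zero.
        const std::array<float,MaxAmbiOrder+1> hfscales{
            AmbiScale::GetHFOrderScales(1u, 3u)};
        for(size_t i{0};i < 2;++i)
            std::printf("order %zu HF gain: %f\n", i, static_cast<double>(hfscales[i]));
        return 0;
    }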
diff --git a/core/ambidefs.cpp b/core/ambidefs.cpp new file mode 100644 index 00000000..2725748e --- /dev/null +++ b/core/ambidefs.cpp @@ -0,0 +1,44 @@ + +#include "config.h" + +#include "ambidefs.h" + +#include <cassert> + + +namespace { + +constexpr std::array<float,MaxAmbiOrder+1> Ambi3DDecoderHFScale{{ + 1.00000000e+00f, 1.00000000e+00f +}}; +constexpr std::array<float,MaxAmbiOrder+1> Ambi3DDecoderHFScale2O{{ + 7.45355990e-01f, 1.00000000e+00f, 1.00000000e+00f +}}; +constexpr std::array<float,MaxAmbiOrder+1> Ambi3DDecoderHFScale3O{{ + 5.89792205e-01f, 8.79693856e-01f, 1.00000000e+00f, 1.00000000e+00f +}}; + +inline auto& GetDecoderHFScales(uint order) noexcept +{ + if(order >= 3) return Ambi3DDecoderHFScale3O; + if(order == 2) return Ambi3DDecoderHFScale2O; + return Ambi3DDecoderHFScale; +} + +} // namespace + +auto AmbiScale::GetHFOrderScales(const uint in_order, const uint out_order) noexcept + -> std::array<float,MaxAmbiOrder+1> +{ + std::array<float,MaxAmbiOrder+1> ret{}; + + assert(out_order >= in_order); + + const auto &target = GetDecoderHFScales(out_order); + const auto &input = GetDecoderHFScales(in_order); + + for(size_t i{0};i < in_order+1;++i) + ret[i] = input[i] / target[i]; + + return ret; +} diff --git a/core/ambidefs.h b/core/ambidefs.h index a72f7b78..22739359 100644 --- a/core/ambidefs.h +++ b/core/ambidefs.h @@ -97,6 +97,10 @@ struct AmbiScale { }}; return ret; } + + /* Retrieves per-order HF scaling factors for "upsampling" ambisonic data. */ + static std::array<float,MaxAmbiOrder+1> GetHFOrderScales(const uint in_order, + const uint out_order) noexcept; }; struct AmbiIndex { diff --git a/core/async_event.h b/core/async_event.h new file mode 100644 index 00000000..054f0563 --- /dev/null +++ b/core/async_event.h @@ -0,0 +1,55 @@ +#ifndef CORE_EVENT_H +#define CORE_EVENT_H + +#include "almalloc.h" + +struct EffectState; + +using uint = unsigned int; + + +enum { + /* End event thread processing. */ + EventType_KillThread = 0, + + /* User event types. */ + EventType_SourceStateChange = 1<<0, + EventType_BufferCompleted = 1<<1, + EventType_Disconnected = 1<<2, + + /* Internal events. 
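EventType_ReleaseEffectState passes an old EffectState reference to the event thread so it can be released outside of the mixer.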
*/ + EventType_ReleaseEffectState = 65536, +}; + +struct AsyncEvent { + enum class SrcState { + Reset, + Stop, + Play, + Pause + }; + + uint EnumType{0u}; + union { + char dummy; + struct { + uint id; + SrcState state; + } srcstate; + struct { + uint id; + uint count; + } bufcomp; + struct { + char msg[244]; + } disconnect; + EffectState *mEffectState; + } u{}; + + AsyncEvent() noexcept = default; + constexpr AsyncEvent(uint type) noexcept : EnumType{type} { } + + DISABLE_ALLOC() +}; + +#endif diff --git a/core/bformatdec.cpp b/core/bformatdec.cpp new file mode 100644 index 00000000..6bf85ec9 --- /dev/null +++ b/core/bformatdec.cpp @@ -0,0 +1,263 @@ + +#include "config.h" + +#include "bformatdec.h" + +#include <algorithm> +#include <array> +#include <cmath> +#include <utility> + +#include "almalloc.h" +#include "ambdec.h" +#include "filters/splitter.h" +#include "front_stablizer.h" +#include "math_defs.h" +#include "mixer.h" +#include "opthelpers.h" + + +namespace { + +inline auto& GetAmbiScales(AmbDecScale scaletype) noexcept +{ + if(scaletype == AmbDecScale::FuMa) return AmbiScale::FromFuMa(); + if(scaletype == AmbDecScale::SN3D) return AmbiScale::FromSN3D(); + return AmbiScale::FromN3D(); +} + +} // namespace + + +BFormatDec::BFormatDec(const AmbDecConf *conf, const bool allow_2band, const size_t inchans, + const uint srate, const uint (&chanmap)[MAX_OUTPUT_CHANNELS], + std::unique_ptr<FrontStablizer> stablizer) + : mStablizer{std::move(stablizer)}, mDualBand{allow_2band && (conf->FreqBands == 2)} + , mChannelDec{inchans} +{ + const bool periphonic{(conf->ChanMask&AmbiPeriphonicMask) != 0}; + auto&& coeff_scale = GetAmbiScales(conf->CoeffScale); + + if(!mDualBand) + { + for(size_t j{0},k{0};j < mChannelDec.size();++j) + { + const size_t acn{periphonic ? j : AmbiIndex::FromACN2D()[j]}; + if(!(conf->ChanMask&(1u<<acn))) continue; + const size_t order{AmbiIndex::OrderFromChannel()[acn]}; + const float gain{conf->HFOrderGain[order] / coeff_scale[acn]}; + for(size_t i{0u};i < conf->NumSpeakers;++i) + { + const size_t chanidx{chanmap[i]}; + mChannelDec[j].mGains.Single[chanidx] = conf->Matrix[i][k] * gain; + } + ++k; + } + } + else + { + mChannelDec[0].mXOver.init(conf->XOverFreq / static_cast<float>(srate)); + for(size_t j{1};j < mChannelDec.size();++j) + mChannelDec[j].mXOver = mChannelDec[0].mXOver; + + const float ratio{std::pow(10.0f, conf->XOverRatio / 40.0f)}; + for(size_t j{0},k{0};j < mChannelDec.size();++j) + { + const size_t acn{periphonic ? 
j : AmbiIndex::FromACN2D()[j]}; + if(!(conf->ChanMask&(1u<<acn))) continue; + const size_t order{AmbiIndex::OrderFromChannel()[acn]}; + const float hfGain{conf->HFOrderGain[order] * ratio / coeff_scale[acn]}; + const float lfGain{conf->LFOrderGain[order] / ratio / coeff_scale[acn]}; + for(size_t i{0u};i < conf->NumSpeakers;++i) + { + const size_t chanidx{chanmap[i]}; + mChannelDec[j].mGains.Dual[sHFBand][chanidx] = conf->HFMatrix[i][k] * hfGain; + mChannelDec[j].mGains.Dual[sLFBand][chanidx] = conf->LFMatrix[i][k] * lfGain; + } + ++k; + } + } +} + +BFormatDec::BFormatDec(const size_t inchans, const al::span<const ChannelDec> coeffs, + const al::span<const ChannelDec> coeffslf, std::unique_ptr<FrontStablizer> stablizer) + : mStablizer{std::move(stablizer)}, mDualBand{!coeffslf.empty()}, mChannelDec{inchans} +{ + if(!mDualBand) + { + for(size_t j{0};j < mChannelDec.size();++j) + { + float *outcoeffs{mChannelDec[j].mGains.Single}; + for(const ChannelDec &incoeffs : coeffs) + *(outcoeffs++) = incoeffs[j]; + } + } + else + { + for(size_t j{0};j < mChannelDec.size();++j) + { + float *outcoeffs{mChannelDec[j].mGains.Dual[sHFBand]}; + for(const ChannelDec &incoeffs : coeffs) + *(outcoeffs++) = incoeffs[j]; + + outcoeffs = mChannelDec[j].mGains.Dual[sLFBand]; + for(const ChannelDec &incoeffs : coeffslf) + *(outcoeffs++) = incoeffs[j]; + } + } +} + + +void BFormatDec::process(const al::span<FloatBufferLine> OutBuffer, + const FloatBufferLine *InSamples, const size_t SamplesToDo) +{ + ASSUME(SamplesToDo > 0); + + if(mDualBand) + { + const al::span<float> hfSamples{mSamples[sHFBand].data(), SamplesToDo}; + const al::span<float> lfSamples{mSamples[sLFBand].data(), SamplesToDo}; + for(auto &chandec : mChannelDec) + { + chandec.mXOver.process({InSamples->data(), SamplesToDo}, hfSamples.data(), + lfSamples.data()); + MixSamples(hfSamples, OutBuffer, chandec.mGains.Dual[sHFBand], + chandec.mGains.Dual[sHFBand], 0, 0); + MixSamples(lfSamples, OutBuffer, chandec.mGains.Dual[sLFBand], + chandec.mGains.Dual[sLFBand], 0, 0); + ++InSamples; + } + } + else + { + for(auto &chandec : mChannelDec) + { + MixSamples({InSamples->data(), SamplesToDo}, OutBuffer, chandec.mGains.Single, + chandec.mGains.Single, 0, 0); + ++InSamples; + } + } +} + +void BFormatDec::processStablize(const al::span<FloatBufferLine> OutBuffer, + const FloatBufferLine *InSamples, const size_t lidx, const size_t ridx, const size_t cidx, + const size_t SamplesToDo) +{ + ASSUME(SamplesToDo > 0); + + /* Move the existing direct L/R signal out so it doesn't get processed by + * the stablizer. Add a delay to it so it stays aligned with the stablizer + * delay. + */ + float *RESTRICT mid{al::assume_aligned<16>(mStablizer->MidDirect.data())}; + float *RESTRICT side{al::assume_aligned<16>(mStablizer->Side.data())}; + for(size_t i{0};i < SamplesToDo;++i) + { + mid[FrontStablizer::DelayLength+i] = OutBuffer[lidx][i] + OutBuffer[ridx][i]; + side[FrontStablizer::DelayLength+i] = OutBuffer[lidx][i] - OutBuffer[ridx][i]; + } + std::fill_n(OutBuffer[lidx].begin(), SamplesToDo, 0.0f); + std::fill_n(OutBuffer[ridx].begin(), SamplesToDo, 0.0f); + + /* Decode the B-Format input to OutBuffer. */ + process(OutBuffer, InSamples, SamplesToDo); + + /* Apply a delay to all channels, except the front-left and front-right, so + * they maintain correct timing. 
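* The delay length matches FrontStablizer::DelayLength, keeping these channels aligned with the delayed mid/side signal.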
+ */ + const size_t NumChannels{OutBuffer.size()}; + for(size_t i{0u};i < NumChannels;i++) + { + if(i == lidx || i == ridx) + continue; + + auto &DelayBuf = mStablizer->DelayBuf[i]; + auto buffer_end = OutBuffer[i].begin() + SamplesToDo; + if LIKELY(SamplesToDo >= FrontStablizer::DelayLength) + { + auto delay_end = std::rotate(OutBuffer[i].begin(), + buffer_end - FrontStablizer::DelayLength, buffer_end); + std::swap_ranges(OutBuffer[i].begin(), delay_end, DelayBuf.begin()); + } + else + { + auto delay_start = std::swap_ranges(OutBuffer[i].begin(), buffer_end, + DelayBuf.begin()); + std::rotate(DelayBuf.begin(), delay_start, DelayBuf.end()); + } + } + + /* Include the side signal for what was just decoded. */ + for(size_t i{0};i < SamplesToDo;++i) + side[FrontStablizer::DelayLength+i] += OutBuffer[lidx][i] - OutBuffer[ridx][i]; + + /* Combine the delayed mid signal with the decoded mid signal. Note that + * the samples are stored and combined in reverse, so the newest samples + * are at the front and the oldest at the back. + */ + al::span<float> tmpbuf{mStablizer->TempBuf.data(), SamplesToDo+FrontStablizer::DelayLength}; + auto tmpiter = tmpbuf.begin() + SamplesToDo; + std::copy(mStablizer->MidDelay.cbegin(), mStablizer->MidDelay.cend(), tmpiter); + for(size_t i{0};i < SamplesToDo;++i) + *--tmpiter = OutBuffer[lidx][i] + OutBuffer[ridx][i]; + /* Save the newest samples for next time. */ + std::copy_n(tmpbuf.cbegin(), mStablizer->MidDelay.size(), mStablizer->MidDelay.begin()); + + /* Apply an all-pass on the reversed signal, then reverse the samples to + * get the forward signal with a reversed phase shift. The future samples + * are included with the all-pass to reduce the error in the output + * samples (the smaller the delay, the more error is introduced). + */ + mStablizer->MidFilter.applyAllpass(tmpbuf); + tmpbuf = tmpbuf.subspan<FrontStablizer::DelayLength>(); + std::reverse(tmpbuf.begin(), tmpbuf.end()); + + /* Now apply the band-splitter, combining its phase shift with the reversed + * phase shift, restoring the original phase on the split signal. + */ + mStablizer->MidFilter.process(tmpbuf, mStablizer->MidHF.data(), mStablizer->MidLF.data()); + + /* This pans the separate low- and high-frequency signals between being on + * the center channel and the left+right channels. The low-frequency signal + * is panned 1/3rd toward center and the high-frequency signal is panned + * 1/4th toward center. These values can be tweaked. + */ + const float cos_lf{std::cos(1.0f/3.0f * (al::MathDefs<float>::Pi()*0.5f))}; + const float cos_hf{std::cos(1.0f/4.0f * (al::MathDefs<float>::Pi()*0.5f))}; + const float sin_lf{std::sin(1.0f/3.0f * (al::MathDefs<float>::Pi()*0.5f))}; + const float sin_hf{std::sin(1.0f/4.0f * (al::MathDefs<float>::Pi()*0.5f))}; + for(size_t i{0};i < SamplesToDo;i++) + { + const float m{mStablizer->MidLF[i]*cos_lf + mStablizer->MidHF[i]*cos_hf + mid[i]}; + const float c{mStablizer->MidLF[i]*sin_lf + mStablizer->MidHF[i]*sin_hf}; + const float s{side[i]}; + + /* The generated center channel signal adds to the existing signal, + * while the modified left and right channels replace. + */ + OutBuffer[lidx][i] = (m + s) * 0.5f; + OutBuffer[ridx][i] = (m - s) * 0.5f; + OutBuffer[cidx][i] += c * 0.5f; + } + /* Move the delayed mid/side samples to the front for next time. 
*/ + auto mid_end = mStablizer->MidDirect.cbegin() + SamplesToDo; + std::copy(mid_end, mid_end+FrontStablizer::DelayLength, mStablizer->MidDirect.begin()); + auto side_end = mStablizer->Side.cbegin() + SamplesToDo; + std::copy(side_end, side_end+FrontStablizer::DelayLength, mStablizer->Side.begin()); +} + + +std::unique_ptr<BFormatDec> BFormatDec::Create(const AmbDecConf *conf, const bool allow_2band, + const size_t inchans, const uint srate, const uint (&chanmap)[MAX_OUTPUT_CHANNELS], + std::unique_ptr<FrontStablizer> stablizer) +{ + return std::unique_ptr<BFormatDec>{new(FamCount(inchans)) + BFormatDec{conf, allow_2band, inchans, srate, chanmap, std::move(stablizer)}}; +} +std::unique_ptr<BFormatDec> BFormatDec::Create(const size_t inchans, + const al::span<const ChannelDec> coeffs, const al::span<const ChannelDec> coeffslf, + std::unique_ptr<FrontStablizer> stablizer) +{ + return std::unique_ptr<BFormatDec>{new(FamCount(inchans)) + BFormatDec{inchans, coeffs, coeffslf, std::move(stablizer)}}; +} diff --git a/core/bformatdec.h b/core/bformatdec.h new file mode 100644 index 00000000..a0ae3f27 --- /dev/null +++ b/core/bformatdec.h @@ -0,0 +1,71 @@ +#ifndef CORE_BFORMATDEC_H +#define CORE_BFORMATDEC_H + +#include <array> +#include <cstddef> +#include <memory> + +#include "almalloc.h" +#include "alspan.h" +#include "ambidefs.h" +#include "bufferline.h" +#include "devformat.h" +#include "filters/splitter.h" + +struct AmbDecConf; +struct FrontStablizer; + + +using ChannelDec = std::array<float,MaxAmbiChannels>; + +class BFormatDec { + static constexpr size_t sHFBand{0}; + static constexpr size_t sLFBand{1}; + static constexpr size_t sNumBands{2}; + + struct ChannelDecoder { + union MatrixU { + float Dual[sNumBands][MAX_OUTPUT_CHANNELS]; + float Single[MAX_OUTPUT_CHANNELS]; + } mGains{}; + + /* NOTE: BandSplitter filter is unused with single-band decoding. */ + BandSplitter mXOver; + }; + + alignas(16) std::array<FloatBufferLine,2> mSamples; + + const std::unique_ptr<FrontStablizer> mStablizer; + const bool mDualBand{false}; + + al::FlexArray<ChannelDecoder> mChannelDec; + +public: + BFormatDec(const AmbDecConf *conf, const bool allow_2band, const size_t inchans, + const uint srate, const uint (&chanmap)[MAX_OUTPUT_CHANNELS], + std::unique_ptr<FrontStablizer> stablizer); + BFormatDec(const size_t inchans, const al::span<const ChannelDec> coeffs, + const al::span<const ChannelDec> coeffslf, std::unique_ptr<FrontStablizer> stablizer); + + bool hasStablizer() const noexcept { return mStablizer != nullptr; }; + + /* Decodes the ambisonic input to the given output channels. */ + void process(const al::span<FloatBufferLine> OutBuffer, const FloatBufferLine *InSamples, + const size_t SamplesToDo); + + /* Decodes the ambisonic input to the given output channels with stablization. 
*/ + void processStablize(const al::span<FloatBufferLine> OutBuffer, + const FloatBufferLine *InSamples, const size_t lidx, const size_t ridx, const size_t cidx, + const size_t SamplesToDo); + + static std::unique_ptr<BFormatDec> Create(const AmbDecConf *conf, const bool allow_2band, + const size_t inchans, const uint srate, const uint (&chanmap)[MAX_OUTPUT_CHANNELS], + std::unique_ptr<FrontStablizer> stablizer); + static std::unique_ptr<BFormatDec> Create(const size_t inchans, + const al::span<const ChannelDec> coeffs, const al::span<const ChannelDec> coeffslf, + std::unique_ptr<FrontStablizer> stablizer); + + DEF_FAM_NEWDEL(BFormatDec, mChannelDec) +}; + +#endif /* CORE_BFORMATDEC_H */ diff --git a/core/bsinc_defs.h b/core/bsinc_defs.h index 43865289..f2958231 100644 --- a/core/bsinc_defs.h +++ b/core/bsinc_defs.h @@ -7,10 +7,4 @@ constexpr unsigned int BSincScaleCount{1 << BSincScaleBits}; constexpr unsigned int BSincPhaseBits{5}; constexpr unsigned int BSincPhaseCount{1 << BSincPhaseBits}; -/* The maximum number of sample points for the bsinc filters. The max points - * includes the doubling for downsampling, so the maximum number of base sample - * points is 24, which is 23rd order. - */ -constexpr unsigned int BSincPointsMax{48}; - #endif /* CORE_BSINC_DEFS_H */ diff --git a/core/bsinc_tables.cpp b/core/bsinc_tables.cpp index 315e1448..ff73c301 100644 --- a/core/bsinc_tables.cpp +++ b/core/bsinc_tables.cpp @@ -9,6 +9,7 @@ #include <memory> #include <stdexcept> +#include "core/mixer/defs.h" #include "math_defs.h" @@ -24,7 +25,8 @@ using uint = unsigned int; */ constexpr double Sinc(const double x) { - if(!(x > 1e-15 || x < -1e-15)) + constexpr double epsilon{std::numeric_limits<double>::epsilon()}; + if(!(x > epsilon || x < -epsilon)) return 1.0; return std::sin(al::MathDefs<double>::Pi()*x) / (al::MathDefs<double>::Pi()*x); } @@ -35,7 +37,7 @@ constexpr double Sinc(const double x) * I_0(x) = sum_{k=0}^inf (1 / k!)^2 (x / 2)^(2 k) * = sum_{k=0}^inf ((x / 2)^k / k!)^2 */ -constexpr double BesselI_0(const double x) +constexpr double BesselI_0(const double x) noexcept { /* Start at k=1 since k=0 is trivial. */ const double x2{x / 2.0}; @@ -82,7 +84,7 @@ constexpr double Kaiser(const double beta, const double k, const double besseli_ /* Calculates the (normalized frequency) transition width of the Kaiser window. * Rejection is in dB. */ -constexpr double CalcKaiserWidth(const double rejection, const uint order) +constexpr double CalcKaiserWidth(const double rejection, const uint order) noexcept { if(rejection > 21.19) return (rejection - 7.95) / (order * 2.285 * al::MathDefs<double>::Tau()); @@ -122,7 +124,7 @@ struct BSincHeader { uint num_points{Order+1}; for(uint si{0};si < BSincScaleCount;++si) { - const double scale{scaleBase + (scaleRange * si / (BSincScaleCount-1))}; + const double scale{scaleBase + (scaleRange * (si+1) / BSincScaleCount)}; const uint a_{std::min(static_cast<uint>(num_points / 2.0 / scale), num_points)}; const uint m{2 * a_}; @@ -144,21 +146,33 @@ constexpr BSincHeader bsinc24_hdr{60, 23}; * namespace while also being used as non-type template parameters. */ #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 6 + +/* The number of sample points is double the a value (rounded up to a multiple + * of 4), and scale index 0 includes the doubling for downsampling. bsinc24 is + * currently the highest quality filter, and will use the most sample points. 
+ */ +constexpr uint BSincPointsMax{(bsinc24_hdr.a[0]*2 + 3) & ~3u}; +static_assert(BSincPointsMax <= MaxResamplerPadding, "MaxResamplerPadding is too small"); + template<size_t total_size> struct BSincFilterArray { alignas(16) std::array<float, total_size> mTable; + const BSincHeader &hdr; - BSincFilterArray(const BSincHeader &hdr) + BSincFilterArray(const BSincHeader &hdr_) : hdr{hdr_} + { #else template<const BSincHeader &hdr> struct BSincFilterArray { - alignas(16) std::array<float, hdr.total_size> mTable; + alignas(16) std::array<float, hdr.total_size> mTable{}; BSincFilterArray() -#endif { - using filter_type = double[][BSincPhaseCount+1][BSincPointsMax]; - auto filter = std::make_unique<filter_type>(BSincScaleCount); + constexpr uint BSincPointsMax{(hdr.a[0]*2 + 3) & ~3u}; + static_assert(BSincPointsMax <= MaxResamplerPadding, "MaxResamplerPadding is too small"); +#endif + using filter_type = double[BSincPhaseCount+1][BSincPointsMax]; + auto filter = std::make_unique<filter_type[]>(BSincScaleCount); /* Calculate the Kaiser-windowed Sinc filter coefficients for each * scale and phase index. @@ -167,38 +181,38 @@ struct BSincFilterArray { { const uint m{hdr.a[si] * 2}; const size_t o{(BSincPointsMax-m) / 2}; - const double scale{hdr.scaleBase + (hdr.scaleRange * si / (BSincScaleCount-1))}; - const double cutoff{scale - (hdr.scaleBase * std::max(0.5, scale) * 2.0)}; + const double scale{hdr.scaleBase + (hdr.scaleRange * (si+1) / BSincScaleCount)}; + const double cutoff{scale - (hdr.scaleBase * std::max(1.0, scale*2.0))}; const auto a = static_cast<double>(hdr.a[si]); - const double l{a - 1.0}; + const double l{a - 1.0/BSincPhaseCount}; /* Do one extra phase index so that the phase delta has a proper * target for its last index. */ for(uint pi{0};pi <= BSincPhaseCount;++pi) { - const double phase{l + (pi/double{BSincPhaseCount})}; + const double phase{std::floor(l) + (pi/double{BSincPhaseCount})}; for(uint i{0};i < m;++i) { const double x{i - phase}; - filter[si][pi][o+i] = Kaiser(hdr.beta, x/a, hdr.besseli_0_beta) * cutoff * + filter[si][pi][o+i] = Kaiser(hdr.beta, x/l, hdr.besseli_0_beta) * cutoff * Sinc(cutoff*x); } } } size_t idx{0}; - for(size_t si{0};si < BSincScaleCount-1;++si) + for(size_t si{0};si < BSincScaleCount;++si) { const size_t m{((hdr.a[si]*2) + 3) & ~3u}; const size_t o{(BSincPointsMax-m) / 2}; + /* Write out each phase index's filter and phase delta for this + * quality scale. + */ for(size_t pi{0};pi < BSincPhaseCount;++pi) { - /* Write out the filter. Also calculate and write out the phase - * and scale deltas. - */ for(size_t i{0};i < m;++i) mTable[idx++] = static_cast<float>(filter[si][pi][o+i]); @@ -210,11 +224,22 @@ struct BSincFilterArray { const double phDelta{filter[si][pi+1][o+i] - filter[si][pi][o+i]}; mTable[idx++] = static_cast<float>(phDelta); } - + } + /* Calculate and write out each phase index's filter quality scale + * deltas. The last scale index doesn't have any scale or scale- + * phase deltas. + */ + if(si == BSincScaleCount-1) + { + for(size_t i{0};i < BSincPhaseCount*m*2;++i) + mTable[idx++] = 0.0f; + } + else for(size_t pi{0};pi < BSincPhaseCount;++pi) + { /* Linear interpolation between scales is also simplified. 
* - * Given a difference in points between scales, the destination - * points will be 0, thus: x = a + f (-a) + * Given a difference in the number of points between scales, + * the destination points will be 0, thus: x = a + f (-a) */ for(size_t i{0};i < m;++i) { @@ -233,31 +258,11 @@ struct BSincFilterArray { } } } - { - /* The last scale index doesn't have any scale or scale-phase - * deltas. - */ - constexpr size_t si{BSincScaleCount-1}; - const size_t m{((hdr.a[si]*2) + 3) & ~3u}; - const size_t o{(BSincPointsMax-m) / 2}; - - for(size_t pi{0};pi < BSincPhaseCount;++pi) - { - for(size_t i{0};i < m;++i) - mTable[idx++] = static_cast<float>(filter[si][pi][o+i]); - for(size_t i{0};i < m;++i) - { - const double phDelta{filter[si][pi+1][o+i] - filter[si][pi][o+i]}; - mTable[idx++] = static_cast<float>(phDelta); - } - for(size_t i{0};i < m;++i) - mTable[idx++] = 0.0f; - for(size_t i{0};i < m;++i) - mTable[idx++] = 0.0f; - } - } assert(idx == hdr.total_size); } + + constexpr const BSincHeader &getHeader() const noexcept { return hdr; } + constexpr const float *getTable() const noexcept { return &mTable.front(); } }; #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 6 @@ -268,9 +273,11 @@ const BSincFilterArray<bsinc12_hdr> bsinc12_filter{}; const BSincFilterArray<bsinc24_hdr> bsinc24_filter{}; #endif -constexpr BSincTable GenerateBSincTable(const BSincHeader &hdr, const float *tab) +template<typename T> +constexpr BSincTable GenerateBSincTable(const T &filter) { BSincTable ret{}; + const BSincHeader &hdr = filter.getHeader(); ret.scaleBase = static_cast<float>(hdr.scaleBase); ret.scaleRange = static_cast<float>(1.0 / hdr.scaleRange); for(size_t i{0};i < BSincScaleCount;++i) @@ -278,11 +285,11 @@ constexpr BSincTable GenerateBSincTable(const BSincHeader &hdr, const float *tab ret.filterOffset[0] = 0; for(size_t i{1};i < BSincScaleCount;++i) ret.filterOffset[i] = ret.filterOffset[i-1] + ret.m[i-1]*4*BSincPhaseCount; - ret.Tab = tab; + ret.Tab = filter.getTable(); return ret; } } // namespace -const BSincTable bsinc12{GenerateBSincTable(bsinc12_hdr, &bsinc12_filter.mTable.front())}; -const BSincTable bsinc24{GenerateBSincTable(bsinc24_hdr, &bsinc24_filter.mTable.front())}; +const BSincTable bsinc12{GenerateBSincTable(bsinc12_filter)}; +const BSincTable bsinc24{GenerateBSincTable(bsinc24_filter)}; diff --git a/core/buffer_storage.cpp b/core/buffer_storage.cpp new file mode 100644 index 00000000..5179db13 --- /dev/null +++ b/core/buffer_storage.cpp @@ -0,0 +1,41 @@ + +#include "config.h" + +#include "buffer_storage.h" + +#include <stdint.h> + + +uint BytesFromFmt(FmtType type) noexcept +{ + switch(type) + { + case FmtUByte: return sizeof(uint8_t); + case FmtShort: return sizeof(int16_t); + case FmtFloat: return sizeof(float); + case FmtDouble: return sizeof(double); + case FmtMulaw: return sizeof(uint8_t); + case FmtAlaw: return sizeof(uint8_t); + } + return 0; +} + +uint ChannelsFromFmt(FmtChannels chans, uint ambiorder) noexcept +{ + switch(chans) + { + case FmtMono: return 1; + case FmtStereo: return 2; + case FmtRear: return 2; + case FmtQuad: return 4; + case FmtX51: return 6; + case FmtX61: return 7; + case FmtX71: return 8; + case FmtBFormat2D: return (ambiorder*2) + 1; + case FmtBFormat3D: return (ambiorder+1) * (ambiorder+1); + case FmtUHJ2: return 2; + case FmtUHJ3: return 3; + case FmtUHJ4: return 4; + } + return 0; +} diff --git a/core/buffer_storage.h b/core/buffer_storage.h new file mode 100644 index 00000000..59280354 --- /dev/null +++ b/core/buffer_storage.h @@ -0,0 +1,75 
@@ +#ifndef CORE_BUFFER_STORAGE_H +#define CORE_BUFFER_STORAGE_H + +#include <atomic> + +#include "albyte.h" + + +using uint = unsigned int; + +/* Storable formats */ +enum FmtType : unsigned char { + FmtUByte, + FmtShort, + FmtFloat, + FmtDouble, + FmtMulaw, + FmtAlaw, +}; +enum FmtChannels : unsigned char { + FmtMono, + FmtStereo, + FmtRear, + FmtQuad, + FmtX51, /* (WFX order) */ + FmtX61, /* (WFX order) */ + FmtX71, /* (WFX order) */ + FmtBFormat2D, + FmtBFormat3D, + FmtUHJ2, /* 2-channel UHJ, aka "BHJ", stereo-compatible */ + FmtUHJ3, /* 3-channel UHJ, aka "THJ" */ + FmtUHJ4, /* 4-channel UHJ, aka "PHJ" */ +}; + +enum class AmbiLayout : unsigned char { + FuMa, + ACN, +}; +enum class AmbiScaling : unsigned char { + FuMa, + SN3D, + N3D, +}; + +uint BytesFromFmt(FmtType type) noexcept; +uint ChannelsFromFmt(FmtChannels chans, uint ambiorder) noexcept; +inline uint FrameSizeFromFmt(FmtChannels chans, FmtType type, uint ambiorder) noexcept +{ return ChannelsFromFmt(chans, ambiorder) * BytesFromFmt(type); } + + +using CallbackType = int(*)(void*, void*, int); + +struct BufferStorage { + CallbackType mCallback{nullptr}; + void *mUserData{nullptr}; + + uint mSampleRate{0u}; + FmtChannels mChannels{FmtMono}; + FmtType mType{FmtShort}; + uint mSampleLen{0u}; + + AmbiLayout mAmbiLayout{AmbiLayout::FuMa}; + AmbiScaling mAmbiScaling{AmbiScaling::FuMa}; + uint mAmbiOrder{0u}; + + inline uint bytesFromFmt() const noexcept { return BytesFromFmt(mType); } + inline uint channelsFromFmt() const noexcept + { return ChannelsFromFmt(mChannels, mAmbiOrder); } + inline uint frameSizeFromFmt() const noexcept { return channelsFromFmt() * bytesFromFmt(); } + + inline bool isBFormat() const noexcept + { return mChannels == FmtBFormat2D || mChannels == FmtBFormat3D; } +}; + +#endif /* CORE_BUFFER_STORAGE_H */ diff --git a/core/bufferline.h b/core/bufferline.h index 503e208d..8b445f3f 100644 --- a/core/bufferline.h +++ b/core/bufferline.h @@ -3,6 +3,8 @@ #include <array> +#include "alspan.h" + /* Size for temporary storage of buffer data, in floats. Larger values need * more memory and are harder on cache, while smaller values may need more * iterations for mixing. 
@@ -10,5 +12,6 @@ constexpr int BufferLineSize{1024}; using FloatBufferLine = std::array<float,BufferLineSize>; +using FloatBufferSpan = al::span<float,BufferLineSize>; #endif /* CORE_BUFFERLINE_H */ diff --git a/core/context.cpp b/core/context.cpp new file mode 100644 index 00000000..f1c310aa --- /dev/null +++ b/core/context.cpp @@ -0,0 +1,5 @@ + +#include "config.h" + +#include "context.h" + diff --git a/core/context.h b/core/context.h new file mode 100644 index 00000000..bf439053 --- /dev/null +++ b/core/context.h @@ -0,0 +1,171 @@ +#ifndef CORE_CONTEXT_H +#define CORE_CONTEXT_H + +#include <array> +#include <atomic> +#include <cstddef> +#include <memory> +#include <thread> + +#include "almalloc.h" +#include "alspan.h" +#include "atomic.h" +#include "core/bufferline.h" +#include "threads.h" +#include "vecmat.h" +#include "vector.h" + +struct DeviceBase; +struct EffectSlot; +struct EffectSlotProps; +struct RingBuffer; +struct Voice; +struct VoiceChange; +struct VoicePropsItem; + +using uint = unsigned int; + + +constexpr float SpeedOfSoundMetersPerSec{343.3f}; + +enum class DistanceModel : unsigned char { + Disable, + Inverse, InverseClamped, + Linear, LinearClamped, + Exponent, ExponentClamped, + + Default = InverseClamped +}; + + +struct WetBuffer { + bool mInUse; + al::FlexArray<FloatBufferLine, 16> mBuffer; + + WetBuffer(size_t count) : mBuffer{count} { } + + DEF_FAM_NEWDEL(WetBuffer, mBuffer) +}; +using WetBufferPtr = std::unique_ptr<WetBuffer>; + + +struct ContextProps { + float DopplerFactor; + float DopplerVelocity; + float SpeedOfSound; + bool SourceDistanceModel; + DistanceModel mDistanceModel; + + std::atomic<ContextProps*> next; + + DEF_NEWDEL(ContextProps) +}; + +struct ListenerProps { + std::array<float,3> Position; + std::array<float,3> Velocity; + std::array<float,3> OrientAt; + std::array<float,3> OrientUp; + float Gain; + float MetersPerUnit; + + std::atomic<ListenerProps*> next; + + DEF_NEWDEL(ListenerProps) +}; + +struct ContextParams { + /* Pointer to the most recent property values that are awaiting an update. */ + std::atomic<ContextProps*> ContextUpdate{nullptr}; + std::atomic<ListenerProps*> ListenerUpdate{nullptr}; + + alu::Matrix Matrix{alu::Matrix::Identity()}; + alu::Vector Velocity{}; + + float Gain{1.0f}; + float MetersPerUnit{1.0f}; + + float DopplerFactor{1.0f}; + float SpeedOfSound{343.3f}; /* in units per sec! */ + + bool SourceDistanceModel{false}; + DistanceModel mDistanceModel{}; +}; + +struct ContextBase { + DeviceBase *const mDevice; + + /* Counter for the pre-mixing updates, in 31.1 fixed point (lowest bit + * indicates if updates are currently happening). + */ + RefCount mUpdateCount{0u}; + std::atomic<bool> mHoldUpdates{false}; + std::atomic<bool> mStopVoicesOnDisconnect{true}; + + float mGainBoost{1.0f}; + + /* Linked lists of unused property containers, free to use for future + * updates. + */ + std::atomic<ContextProps*> mFreeContextProps{nullptr}; + std::atomic<ListenerProps*> mFreeListenerProps{nullptr}; + std::atomic<VoicePropsItem*> mFreeVoiceProps{nullptr}; + std::atomic<EffectSlotProps*> mFreeEffectslotProps{nullptr}; + + /* The voice change tail is the beginning of the "free" elements, up to and + * *excluding* the current. If tail==current, there's no free elements and + * new ones need to be allocated. The current voice change is the element + * last processed, and any after are pending. 
+ */ + VoiceChange *mVoiceChangeTail{}; + std::atomic<VoiceChange*> mCurrentVoiceChange{}; + + void allocVoiceChanges(size_t addcount); + + + ContextParams mParams; + + using VoiceArray = al::FlexArray<Voice*>; + std::atomic<VoiceArray*> mVoices{}; + std::atomic<size_t> mActiveVoiceCount{}; + + void allocVoices(size_t addcount); + al::span<Voice*> getVoicesSpan() const noexcept + { + return {mVoices.load(std::memory_order_relaxed)->data(), + mActiveVoiceCount.load(std::memory_order_relaxed)}; + } + al::span<Voice*> getVoicesSpanAcquired() const noexcept + { + return {mVoices.load(std::memory_order_acquire)->data(), + mActiveVoiceCount.load(std::memory_order_acquire)}; + } + + + using EffectSlotArray = al::FlexArray<EffectSlot*>; + std::atomic<EffectSlotArray*> mActiveAuxSlots{nullptr}; + + std::thread mEventThread; + al::semaphore mEventSem; + std::unique_ptr<RingBuffer> mAsyncEvents; + std::atomic<uint> mEnabledEvts{0u}; + + /* Asynchronous voice change actions are processed as a linked list of + * VoiceChange objects by the mixer, which is atomically appended to. + * However, to avoid allocating each object individually, they're allocated + * in clusters that are stored in a vector for easy automatic cleanup. + */ + using VoiceChangeCluster = std::unique_ptr<VoiceChange[]>; + al::vector<VoiceChangeCluster> mVoiceChangeClusters; + + using VoiceCluster = std::unique_ptr<Voice[]>; + al::vector<VoiceCluster> mVoiceClusters; + + + ContextBase(DeviceBase *device); + ContextBase(const ContextBase&) = delete; + ContextBase& operator=(const ContextBase&) = delete; + ~ContextBase(); +}; + +#endif /* CORE_CONTEXT_H */ diff --git a/core/converter.cpp b/core/converter.cpp new file mode 100644 index 00000000..6a06b464 --- /dev/null +++ b/core/converter.cpp @@ -0,0 +1,371 @@ + +#include "config.h" + +#include "converter.h" + +#include <algorithm> +#include <cmath> +#include <cstdint> +#include <iterator> +#include <limits.h> + +#include "albit.h" +#include "albyte.h" +#include "alnumeric.h" +#include "fpu_ctrl.h" + +struct CTag; +struct CopyTag; + + +namespace { + +constexpr uint MaxPitch{10}; + +static_assert((BufferLineSize-1)/MaxPitch > 0, "MaxPitch is too large for BufferLineSize!"); +static_assert((INT_MAX>>MixerFracBits)/MaxPitch > BufferLineSize, + "MaxPitch and/or BufferLineSize are too large for MixerFracBits!"); + +/* Base template left undefined. Should be marked =delete, but Clang 3.8.1 + * chokes on that given the inline specializations. 
+ */ +template<DevFmtType T> +inline float LoadSample(DevFmtType_t<T> val) noexcept; + +template<> inline float LoadSample<DevFmtByte>(DevFmtType_t<DevFmtByte> val) noexcept +{ return val * (1.0f/128.0f); } +template<> inline float LoadSample<DevFmtShort>(DevFmtType_t<DevFmtShort> val) noexcept +{ return val * (1.0f/32768.0f); } +template<> inline float LoadSample<DevFmtInt>(DevFmtType_t<DevFmtInt> val) noexcept +{ return static_cast<float>(val) * (1.0f/2147483648.0f); } +template<> inline float LoadSample<DevFmtFloat>(DevFmtType_t<DevFmtFloat> val) noexcept +{ return val; } + +template<> inline float LoadSample<DevFmtUByte>(DevFmtType_t<DevFmtUByte> val) noexcept +{ return LoadSample<DevFmtByte>(static_cast<int8_t>(val - 128)); } +template<> inline float LoadSample<DevFmtUShort>(DevFmtType_t<DevFmtUShort> val) noexcept +{ return LoadSample<DevFmtShort>(static_cast<int16_t>(val - 32768)); } +template<> inline float LoadSample<DevFmtUInt>(DevFmtType_t<DevFmtUInt> val) noexcept +{ return LoadSample<DevFmtInt>(static_cast<int32_t>(val - 2147483648u)); } + + +template<DevFmtType T> +inline void LoadSampleArray(float *RESTRICT dst, const void *src, const size_t srcstep, + const size_t samples) noexcept +{ + const DevFmtType_t<T> *ssrc = static_cast<const DevFmtType_t<T>*>(src); + for(size_t i{0u};i < samples;i++) + dst[i] = LoadSample<T>(ssrc[i*srcstep]); +} + +void LoadSamples(float *dst, const void *src, const size_t srcstep, const DevFmtType srctype, + const size_t samples) noexcept +{ +#define HANDLE_FMT(T) \ + case T: LoadSampleArray<T>(dst, src, srcstep, samples); break + switch(srctype) + { + HANDLE_FMT(DevFmtByte); + HANDLE_FMT(DevFmtUByte); + HANDLE_FMT(DevFmtShort); + HANDLE_FMT(DevFmtUShort); + HANDLE_FMT(DevFmtInt); + HANDLE_FMT(DevFmtUInt); + HANDLE_FMT(DevFmtFloat); + } +#undef HANDLE_FMT +} + + +template<DevFmtType T> +inline DevFmtType_t<T> StoreSample(float) noexcept; + +template<> inline float StoreSample<DevFmtFloat>(float val) noexcept +{ return val; } +template<> inline int32_t StoreSample<DevFmtInt>(float val) noexcept +{ return fastf2i(clampf(val*2147483648.0f, -2147483648.0f, 2147483520.0f)); } +template<> inline int16_t StoreSample<DevFmtShort>(float val) noexcept +{ return static_cast<int16_t>(fastf2i(clampf(val*32768.0f, -32768.0f, 32767.0f))); } +template<> inline int8_t StoreSample<DevFmtByte>(float val) noexcept +{ return static_cast<int8_t>(fastf2i(clampf(val*128.0f, -128.0f, 127.0f))); } + +/* Define unsigned output variations. 
*/ +template<> inline uint32_t StoreSample<DevFmtUInt>(float val) noexcept +{ return static_cast<uint32_t>(StoreSample<DevFmtInt>(val)) + 2147483648u; } +template<> inline uint16_t StoreSample<DevFmtUShort>(float val) noexcept +{ return static_cast<uint16_t>(StoreSample<DevFmtShort>(val) + 32768); } +template<> inline uint8_t StoreSample<DevFmtUByte>(float val) noexcept +{ return static_cast<uint8_t>(StoreSample<DevFmtByte>(val) + 128); } + +template<DevFmtType T> +inline void StoreSampleArray(void *dst, const float *RESTRICT src, const size_t dststep, + const size_t samples) noexcept +{ + DevFmtType_t<T> *sdst = static_cast<DevFmtType_t<T>*>(dst); + for(size_t i{0u};i < samples;i++) + sdst[i*dststep] = StoreSample<T>(src[i]); +} + + +void StoreSamples(void *dst, const float *src, const size_t dststep, const DevFmtType dsttype, + const size_t samples) noexcept +{ +#define HANDLE_FMT(T) \ + case T: StoreSampleArray<T>(dst, src, dststep, samples); break + switch(dsttype) + { + HANDLE_FMT(DevFmtByte); + HANDLE_FMT(DevFmtUByte); + HANDLE_FMT(DevFmtShort); + HANDLE_FMT(DevFmtUShort); + HANDLE_FMT(DevFmtInt); + HANDLE_FMT(DevFmtUInt); + HANDLE_FMT(DevFmtFloat); + } +#undef HANDLE_FMT +} + + +template<DevFmtType T> +void Mono2Stereo(float *RESTRICT dst, const void *src, const size_t frames) noexcept +{ + const DevFmtType_t<T> *ssrc = static_cast<const DevFmtType_t<T>*>(src); + for(size_t i{0u};i < frames;i++) + dst[i*2 + 1] = dst[i*2 + 0] = LoadSample<T>(ssrc[i]) * 0.707106781187f; +} + +template<DevFmtType T> +void Multi2Mono(uint chanmask, const size_t step, const float scale, float *RESTRICT dst, + const void *src, const size_t frames) noexcept +{ + const DevFmtType_t<T> *ssrc = static_cast<const DevFmtType_t<T>*>(src); + std::fill_n(dst, frames, 0.0f); + for(size_t c{0};chanmask;++c) + { + if LIKELY((chanmask&1)) + { + for(size_t i{0u};i < frames;i++) + dst[i] += LoadSample<T>(ssrc[i*step + c]); + } + chanmask >>= 1; + } + for(size_t i{0u};i < frames;i++) + dst[i] *= scale; +} + +} // namespace + +SampleConverterPtr CreateSampleConverter(DevFmtType srcType, DevFmtType dstType, size_t numchans, + uint srcRate, uint dstRate, Resampler resampler) +{ + if(numchans < 1 || srcRate < 1 || dstRate < 1) + return nullptr; + + SampleConverterPtr converter{new(FamCount(numchans)) SampleConverter{numchans}}; + converter->mSrcType = srcType; + converter->mDstType = dstType; + converter->mSrcTypeSize = BytesFromDevFmt(srcType); + converter->mDstTypeSize = BytesFromDevFmt(dstType); + + converter->mSrcPrepCount = 0; + converter->mFracOffset = 0; + + /* Have to set the mixer FPU mode since that's what the resampler code expects. */ + FPUCtl mixer_mode{}; + auto step = static_cast<uint>( + mind(srcRate*double{MixerFracOne}/dstRate + 0.5, MaxPitch*MixerFracOne)); + converter->mIncrement = maxu(step, 1); + if(converter->mIncrement == MixerFracOne) + converter->mResample = Resample_<CopyTag,CTag>; + else + converter->mResample = PrepareResampler(resampler, converter->mIncrement, + &converter->mState); + + return converter; +} + +uint SampleConverter::availableOut(uint srcframes) const +{ + int prepcount{mSrcPrepCount}; + if(prepcount < 0) + { + /* Negative prepcount means we need to skip that many input samples. */ + if(static_cast<uint>(-prepcount) >= srcframes) + return 0; + srcframes -= static_cast<uint>(-prepcount); + prepcount = 0; + } + + if(srcframes < 1) + { + /* No output samples if there's no input samples. 
*/ + return 0; + } + + if(prepcount < MaxResamplerPadding + && static_cast<uint>(MaxResamplerPadding - prepcount) >= srcframes) + { + /* Not enough input samples to generate an output sample. */ + return 0; + } + + auto DataSize64 = static_cast<uint64_t>(prepcount); + DataSize64 += srcframes; + DataSize64 -= MaxResamplerPadding; + DataSize64 <<= MixerFracBits; + DataSize64 -= mFracOffset; + + /* If we have a full prep, we can generate at least one sample. */ + return static_cast<uint>(clampu64((DataSize64 + mIncrement-1)/mIncrement, 1, + std::numeric_limits<int>::max())); +} + +uint SampleConverter::convert(const void **src, uint *srcframes, void *dst, uint dstframes) +{ + const uint SrcFrameSize{static_cast<uint>(mChan.size()) * mSrcTypeSize}; + const uint DstFrameSize{static_cast<uint>(mChan.size()) * mDstTypeSize}; + const uint increment{mIncrement}; + auto SamplesIn = static_cast<const al::byte*>(*src); + uint NumSrcSamples{*srcframes}; + + FPUCtl mixer_mode{}; + uint pos{0}; + while(pos < dstframes && NumSrcSamples > 0) + { + int prepcount{mSrcPrepCount}; + if(prepcount < 0) + { + /* Negative prepcount means we need to skip that many input samples. */ + if(static_cast<uint>(-prepcount) >= NumSrcSamples) + { + mSrcPrepCount = static_cast<int>(NumSrcSamples) + prepcount; + NumSrcSamples = 0; + break; + } + SamplesIn += SrcFrameSize*static_cast<uint>(-prepcount); + NumSrcSamples -= static_cast<uint>(-prepcount); + mSrcPrepCount = 0; + continue; + } + const uint toread{minu(NumSrcSamples, BufferLineSize - MaxResamplerPadding)}; + + if(prepcount < MaxResamplerPadding + && static_cast<uint>(MaxResamplerPadding - prepcount) >= toread) + { + /* Not enough input samples to generate an output sample. Store + * what we're given for later. + */ + for(size_t chan{0u};chan < mChan.size();chan++) + LoadSamples(&mChan[chan].PrevSamples[prepcount], SamplesIn + mSrcTypeSize*chan, + mChan.size(), mSrcType, toread); + + mSrcPrepCount = prepcount + static_cast<int>(toread); + NumSrcSamples = 0; + break; + } + + float *RESTRICT SrcData{mSrcSamples}; + float *RESTRICT DstData{mDstSamples}; + uint DataPosFrac{mFracOffset}; + auto DataSize64 = static_cast<uint64_t>(prepcount); + DataSize64 += toread; + DataSize64 -= MaxResamplerPadding; + DataSize64 <<= MixerFracBits; + DataSize64 -= DataPosFrac; + + /* If we have a full prep, we can generate at least one sample. */ + auto DstSize = static_cast<uint>( + clampu64((DataSize64 + increment-1)/increment, 1, BufferLineSize)); + DstSize = minu(DstSize, dstframes-pos); + + for(size_t chan{0u};chan < mChan.size();chan++) + { + const al::byte *SrcSamples{SamplesIn + mSrcTypeSize*chan}; + al::byte *DstSamples = static_cast<al::byte*>(dst) + mDstTypeSize*chan; + + /* Load the previous samples into the source data first, then the + * new samples from the input buffer. + */ + std::copy_n(mChan[chan].PrevSamples, prepcount, SrcData); + LoadSamples(SrcData + prepcount, SrcSamples, mChan.size(), mSrcType, toread); + + /* Store as many prep samples for next time as possible, given the + * number of output samples being generated. 
+ */ + uint SrcDataEnd{(DstSize*increment + DataPosFrac)>>MixerFracBits}; + if(SrcDataEnd >= static_cast<uint>(prepcount)+toread) + std::fill(std::begin(mChan[chan].PrevSamples), + std::end(mChan[chan].PrevSamples), 0.0f); + else + { + const size_t len{minz(al::size(mChan[chan].PrevSamples), + static_cast<uint>(prepcount)+toread-SrcDataEnd)}; + std::copy_n(SrcData+SrcDataEnd, len, mChan[chan].PrevSamples); + std::fill(std::begin(mChan[chan].PrevSamples)+len, + std::end(mChan[chan].PrevSamples), 0.0f); + } + + /* Now resample, and store the result in the output buffer. */ + const float *ResampledData{mResample(&mState, SrcData+(MaxResamplerPadding>>1), + DataPosFrac, increment, {DstData, DstSize})}; + + StoreSamples(DstSamples, ResampledData, mChan.size(), mDstType, DstSize); + } + + /* Update the number of prep samples still available, as well as the + * fractional offset. + */ + DataPosFrac += increment*DstSize; + mSrcPrepCount = mini(prepcount + static_cast<int>(toread - (DataPosFrac>>MixerFracBits)), + MaxResamplerPadding); + mFracOffset = DataPosFrac & MixerFracMask; + + /* Update the src and dst pointers in case there's still more to do. */ + SamplesIn += SrcFrameSize*(DataPosFrac>>MixerFracBits); + NumSrcSamples -= minu(NumSrcSamples, (DataPosFrac>>MixerFracBits)); + + dst = static_cast<al::byte*>(dst) + DstFrameSize*DstSize; + pos += DstSize; + } + + *src = SamplesIn; + *srcframes = NumSrcSamples; + + return pos; +} + + +void ChannelConverter::convert(const void *src, float *dst, uint frames) const +{ + if(mDstChans == DevFmtMono) + { + const float scale{std::sqrt(1.0f / static_cast<float>(al::popcount(mChanMask)))}; + switch(mSrcType) + { +#define HANDLE_FMT(T) case T: Multi2Mono<T>(mChanMask, mSrcStep, scale, dst, src, frames); break + HANDLE_FMT(DevFmtByte); + HANDLE_FMT(DevFmtUByte); + HANDLE_FMT(DevFmtShort); + HANDLE_FMT(DevFmtUShort); + HANDLE_FMT(DevFmtInt); + HANDLE_FMT(DevFmtUInt); + HANDLE_FMT(DevFmtFloat); +#undef HANDLE_FMT + } + } + else if(mChanMask == 0x1 && mDstChans == DevFmtStereo) + { + switch(mSrcType) + { +#define HANDLE_FMT(T) case T: Mono2Stereo<T>(dst, src, frames); break + HANDLE_FMT(DevFmtByte); + HANDLE_FMT(DevFmtUByte); + HANDLE_FMT(DevFmtShort); + HANDLE_FMT(DevFmtUShort); + HANDLE_FMT(DevFmtInt); + HANDLE_FMT(DevFmtUInt); + HANDLE_FMT(DevFmtFloat); +#undef HANDLE_FMT + } + } +} diff --git a/core/converter.h b/core/converter.h new file mode 100644 index 00000000..2d22ae38 --- /dev/null +++ b/core/converter.h @@ -0,0 +1,59 @@ +#ifndef CORE_CONVERTER_H +#define CORE_CONVERTER_H + +#include <cstddef> +#include <memory> + +#include "almalloc.h" +#include "devformat.h" +#include "mixer/defs.h" + +using uint = unsigned int; + + +struct SampleConverter { + DevFmtType mSrcType{}; + DevFmtType mDstType{}; + uint mSrcTypeSize{}; + uint mDstTypeSize{}; + + int mSrcPrepCount{}; + + uint mFracOffset{}; + uint mIncrement{}; + InterpState mState{}; + ResamplerFunc mResample{}; + + alignas(16) float mSrcSamples[BufferLineSize]{}; + alignas(16) float mDstSamples[BufferLineSize]{}; + + struct ChanSamples { + alignas(16) float PrevSamples[MaxResamplerPadding]; + }; + al::FlexArray<ChanSamples> mChan; + + SampleConverter(size_t numchans) : mChan{numchans} { } + + uint convert(const void **src, uint *srcframes, void *dst, uint dstframes); + uint availableOut(uint srcframes) const; + + DEF_FAM_NEWDEL(SampleConverter, mChan) +}; +using SampleConverterPtr = std::unique_ptr<SampleConverter>; + +SampleConverterPtr CreateSampleConverter(DevFmtType srcType, DevFmtType dstType, 
size_t numchans, + uint srcRate, uint dstRate, Resampler resampler); + + +struct ChannelConverter { + DevFmtType mSrcType{}; + uint mSrcStep{}; + uint mChanMask{}; + DevFmtChannels mDstChans{}; + + bool is_active() const noexcept { return mChanMask != 0; } + + void convert(const void *src, float *dst, uint frames) const; +}; + +#endif /* CORE_CONVERTER_H */ diff --git a/core/dbus_wrap.cpp b/core/dbus_wrap.cpp new file mode 100644 index 00000000..506dd815 --- /dev/null +++ b/core/dbus_wrap.cpp @@ -0,0 +1,46 @@ + +#include "config.h" + +#include "dbus_wrap.h" + +#ifdef HAVE_DYNLOAD + +#include <mutex> +#include <type_traits> + +#include "logging.h" + + +void *dbus_handle{nullptr}; +#define DECL_FUNC(x) decltype(x) *p##x{}; +DBUS_FUNCTIONS(DECL_FUNC) +#undef DECL_FUNC + +void PrepareDBus() +{ + static constexpr char libname[] = "libdbus-1.so.3"; + + auto load_func = [](auto &f, const char *name) -> void + { f = reinterpret_cast<std::remove_reference_t<decltype(f)>>(GetSymbol(dbus_handle, name)); }; +#define LOAD_FUNC(x) do { \ + load_func(p##x, #x); \ + if(!p##x) \ + { \ + WARN("Failed to load function %s\n", #x); \ + CloseLib(dbus_handle); \ + dbus_handle = nullptr; \ + return; \ + } \ +} while(0); + + dbus_handle = LoadLib(libname); + if(!dbus_handle) + { + WARN("Failed to load %s\n", libname); + return; + } + +DBUS_FUNCTIONS(LOAD_FUNC) +#undef LOAD_FUNC +} +#endif diff --git a/core/dbus_wrap.h b/core/dbus_wrap.h new file mode 100644 index 00000000..61dbb971 --- /dev/null +++ b/core/dbus_wrap.h @@ -0,0 +1,75 @@ +#ifndef CORE_DBUS_WRAP_H +#define CORE_DBUS_WRAP_H + +#include <memory> + +#include <dbus/dbus.h> + +#include "dynload.h" + + +#define DBUS_FUNCTIONS(MAGIC) \ +MAGIC(dbus_error_init) \ +MAGIC(dbus_error_free) \ +MAGIC(dbus_bus_get) \ +MAGIC(dbus_connection_set_exit_on_disconnect) \ +MAGIC(dbus_connection_unref) \ +MAGIC(dbus_connection_send_with_reply_and_block) \ +MAGIC(dbus_message_unref) \ +MAGIC(dbus_message_new_method_call) \ +MAGIC(dbus_message_append_args) \ +MAGIC(dbus_message_iter_init) \ +MAGIC(dbus_message_iter_next) \ +MAGIC(dbus_message_iter_recurse) \ +MAGIC(dbus_message_iter_get_arg_type) \ +MAGIC(dbus_message_iter_get_basic) \ +MAGIC(dbus_set_error_from_message) + +#ifdef HAVE_DYNLOAD + +#include <mutex> + +extern void *dbus_handle; +#define DECL_FUNC(x) extern decltype(x) *p##x; +DBUS_FUNCTIONS(DECL_FUNC) +#undef DECL_FUNC + +void PrepareDBus(); + +inline auto HasDBus() +{ + static std::once_flag init_dbus{}; + std::call_once(init_dbus, PrepareDBus); + return dbus_handle; +} + +#else + +#define DECL_FUNC(x) constexpr auto p##x = &x; +DBUS_FUNCTIONS(DECL_FUNC) +#undef DECL_FUNC + +constexpr bool HasDBus() noexcept { return true; } +#endif /* HAVE_DYNLOAD */ + + +namespace dbus { + +struct Error { + Error() { (*pdbus_error_init)(&mError); } + ~Error() { (*pdbus_error_free)(&mError); } + DBusError* operator->() { return &mError; } + DBusError &get() { return mError; } +private: + DBusError mError{}; +}; + +struct ConnectionDeleter { + void operator()(DBusConnection *c) { (*pdbus_connection_unref)(c); } +}; +using ConnectionPtr = std::unique_ptr<DBusConnection,ConnectionDeleter>; + +} // namespace dbus + + +#endif /* CORE_DBUS_WRAP_H */ diff --git a/core/device.cpp b/core/device.cpp new file mode 100644 index 00000000..9705c0ac --- /dev/null +++ b/core/device.cpp @@ -0,0 +1,7 @@ + +#include "config.h" + +#include "device.h" + + +al::FlexArray<ContextBase*> DeviceBase::sEmptyContextArray{0u}; diff --git a/core/device.h b/core/device.h new file mode 100644 index 
00000000..4cc822cc --- /dev/null +++ b/core/device.h @@ -0,0 +1,290 @@ +#ifndef CORE_DEVICE_H +#define CORE_DEVICE_H + +#include <stddef.h> + +#include <array> +#include <atomic> +#include <bitset> +#include <chrono> +#include <memory> +#include <mutex> +#include <string> + +#include "almalloc.h" +#include "alspan.h" +#include "ambidefs.h" +#include "atomic.h" +#include "core/bufferline.h" +#include "devformat.h" +#include "intrusive_ptr.h" +#include "mixer/hrtfdefs.h" +#include "opthelpers.h" +#include "vector.h" + +struct BackendBase; +class BFormatDec; +struct bs2b; +struct Compressor; +struct ContextBase; +struct DirectHrtfState; +struct HrtfStore; +struct UhjEncoder; + +using uint = unsigned int; + + +#define MIN_OUTPUT_RATE 8000 +#define MAX_OUTPUT_RATE 192000 +#define DEFAULT_OUTPUT_RATE 44100 + +#define DEFAULT_UPDATE_SIZE 882 /* 20ms */ +#define DEFAULT_NUM_UPDATES 3 + + +enum class DeviceType : unsigned char { + Playback, + Capture, + Loopback +}; + + +enum class RenderMode : unsigned char { + Normal, + Pairwise, + Hrtf +}; + + +struct InputRemixMap { + struct TargetMix { Channel channel; float mix; }; + + Channel channel; + std::array<TargetMix,2> targets; +}; + + +/* Maximum delay in samples for speaker distance compensation. */ +#define MAX_DELAY_LENGTH 1024 + +struct DistanceComp { + struct ChanData { + float Gain{1.0f}; + uint Length{0u}; /* Valid range is [0...MAX_DELAY_LENGTH). */ + float *Buffer{nullptr}; + }; + + std::array<ChanData,MAX_OUTPUT_CHANNELS> mChannels; + al::FlexArray<float,16> mSamples; + + DistanceComp(size_t count) : mSamples{count} { } + + static std::unique_ptr<DistanceComp> Create(size_t numsamples) + { return std::unique_ptr<DistanceComp>{new(FamCount(numsamples)) DistanceComp{numsamples}}; } + + DEF_FAM_NEWDEL(DistanceComp, mSamples) +}; + + +struct BFChannelConfig { + float Scale; + uint Index; +}; + + +struct MixParams { + /* Coefficient channel mapping for mixing to the buffer. */ + std::array<BFChannelConfig,MAX_OUTPUT_CHANNELS> AmbiMap{}; + + al::span<FloatBufferLine> Buffer; +}; + +struct RealMixParams { + al::span<const InputRemixMap> RemixMap; + std::array<uint,MaxChannels> ChannelIndex{}; + + al::span<FloatBufferLine> Buffer; +}; + +enum { + // Frequency was requested by the app or config file + FrequencyRequest, + // Channel configuration was requested by the config file + ChannelsRequest, + // Sample type was requested by the config file + SampleTypeRequest, + + // Specifies if the DSP is paused at user request + DevicePaused, + // Specifies if the device is currently running + DeviceRunning, + + DeviceFlagsCount +}; + +struct DeviceBase { + /* To avoid extraneous allocations, a 0-sized FlexArray<ContextBase*> is + * defined globally as a sharable object. + */ + static al::FlexArray<ContextBase*> sEmptyContextArray; + + std::atomic<bool> Connected{true}; + const DeviceType Type{}; + + uint Frequency{}; + uint UpdateSize{}; + uint BufferSize{}; + + DevFmtChannels FmtChans{}; + DevFmtType FmtType{}; + bool IsHeadphones{false}; + uint mAmbiOrder{0}; + float mXOverFreq{400.0f}; + /* For DevFmtAmbi* output only, specifies the channel order and + * normalization. + */ + DevAmbiLayout mAmbiLayout{DevAmbiLayout::Default}; + DevAmbiScaling mAmbiScale{DevAmbiScaling::Default}; + + std::string DeviceName; + + // Device flags + std::bitset<DeviceFlagsCount> Flags{}; + + uint NumAuxSends{}; + + /* Rendering mode. 
*/ + RenderMode mRenderMode{RenderMode::Normal}; + + /* The average speaker distance as determined by the ambdec configuration, + * HRTF data set, or the NFC-HOA reference delay. Only used for NFC. + */ + float AvgSpeakerDist{0.0f}; + + uint SamplesDone{0u}; + std::chrono::nanoseconds ClockBase{0}; + std::chrono::nanoseconds FixedLatency{0}; + + /* Temp storage used for mixer processing. */ + alignas(16) float ResampledData[BufferLineSize]; + alignas(16) float FilteredData[BufferLineSize]; + union { + alignas(16) float HrtfSourceData[BufferLineSize + HrtfHistoryLength]; + alignas(16) float NfcSampleData[BufferLineSize]; + }; + + /* Persistent storage for HRTF mixing. */ + alignas(16) float2 HrtfAccumData[BufferLineSize + HrirLength + HrtfDirectDelay]; + + /* Mixing buffer used by the Dry mix and Real output. */ + al::vector<FloatBufferLine, 16> MixBuffer; + + /* The "dry" path corresponds to the main output. */ + MixParams Dry; + uint NumChannelsPerOrder[MaxAmbiOrder+1]{}; + + /* "Real" output, which will be written to the device buffer. May alias the + * dry buffer. + */ + RealMixParams RealOut; + + /* HRTF state and info */ + std::unique_ptr<DirectHrtfState> mHrtfState; + al::intrusive_ptr<HrtfStore> mHrtf; + uint mIrSize{0}; + + /* Ambisonic-to-UHJ encoder */ + std::unique_ptr<UhjEncoder> mUhjEncoder; + + /* Ambisonic decoder for speakers */ + std::unique_ptr<BFormatDec> AmbiDecoder; + + /* Stereo-to-binaural filter */ + std::unique_ptr<bs2b> Bs2b; + + using PostProc = void(DeviceBase::*)(const size_t SamplesToDo); + PostProc PostProcess{nullptr}; + + std::unique_ptr<Compressor> Limiter; + + /* Delay buffers used to compensate for speaker distances. */ + std::unique_ptr<DistanceComp> ChannelDelays; + + /* Dithering control. */ + float DitherDepth{0.0f}; + uint DitherSeed{0u}; + + /* Running count of the mixer invocations, in 31.1 fixed point. This + * actually increments *twice* when mixing, first at the start and then at + * the end, so the bottom bit indicates if the device is currently mixing + * and the upper bits indicates how many mixes have been done. + */ + RefCount MixCount{0u}; + + // Contexts created on this device + std::atomic<al::FlexArray<ContextBase*>*> mContexts{nullptr}; + + /* This lock protects the device state (format, update size, etc) from + * being from being changed in multiple threads, or being accessed while + * being changed. It's also used to serialize calls to the backend. 
+ */ + std::mutex StateLock; + std::unique_ptr<BackendBase> Backend; + + + DeviceBase(DeviceType type); + DeviceBase(const DeviceBase&) = delete; + DeviceBase& operator=(const DeviceBase&) = delete; + ~DeviceBase(); + + uint bytesFromFmt() const noexcept { return BytesFromDevFmt(FmtType); } + uint channelsFromFmt() const noexcept { return ChannelsFromDevFmt(FmtChans, mAmbiOrder); } + uint frameSizeFromFmt() const noexcept { return bytesFromFmt() * channelsFromFmt(); } + + uint waitForMix() const noexcept + { + uint refcount; + while((refcount=MixCount.load(std::memory_order_acquire))&1) { + } + return refcount; + } + + void ProcessHrtf(const size_t SamplesToDo); + void ProcessAmbiDec(const size_t SamplesToDo); + void ProcessAmbiDecStablized(const size_t SamplesToDo); + void ProcessUhj(const size_t SamplesToDo); + void ProcessBs2b(const size_t SamplesToDo); + + inline void postProcess(const size_t SamplesToDo) + { if LIKELY(PostProcess) (this->*PostProcess)(SamplesToDo); } + + void renderSamples(void *outBuffer, const uint numSamples, const size_t frameStep); + + /* Caller must lock the device state, and the mixer must not be running. */ +#ifdef __USE_MINGW_ANSI_STDIO + [[gnu::format(gnu_printf,2,3)]] +#else + [[gnu::format(printf,2,3)]] +#endif + void handleDisconnect(const char *msg, ...); + + DISABLE_ALLOC() +}; + + +/* Must be less than 15 characters (16 including terminating null) for + * compatibility with pthread_setname_np limitations. */ +#define MIXER_THREAD_NAME "alsoft-mixer" + +#define RECORD_THREAD_NAME "alsoft-record" + + +/** + * Returns the index for the given channel name (e.g. FrontCenter), or + * INVALID_CHANNEL_INDEX if it doesn't exist. + */ +inline uint GetChannelIdxByName(const RealMixParams &real, Channel chan) noexcept +{ return real.ChannelIndex[chan]; } +#define INVALID_CHANNEL_INDEX ~0u + +#endif /* CORE_DEVICE_H */ diff --git a/core/front_stablizer.h b/core/front_stablizer.h new file mode 100644 index 00000000..3d328a8d --- /dev/null +++ b/core/front_stablizer.h @@ -0,0 +1,36 @@ +#ifndef CORE_FRONT_STABLIZER_H +#define CORE_FRONT_STABLIZER_H + +#include <array> +#include <memory> + +#include "almalloc.h" +#include "bufferline.h" +#include "filters/splitter.h" + + +struct FrontStablizer { + static constexpr size_t DelayLength{256u}; + + FrontStablizer(size_t numchans) : DelayBuf{numchans} { } + + alignas(16) std::array<float,BufferLineSize + DelayLength> Side{}; + alignas(16) std::array<float,BufferLineSize + DelayLength> MidDirect{}; + alignas(16) std::array<float,DelayLength> MidDelay{}; + + alignas(16) std::array<float,BufferLineSize + DelayLength> TempBuf{}; + + BandSplitter MidFilter; + alignas(16) FloatBufferLine MidLF{}; + alignas(16) FloatBufferLine MidHF{}; + + using DelayLine = std::array<float,DelayLength>; + al::FlexArray<DelayLine,16> DelayBuf; + + static std::unique_ptr<FrontStablizer> Create(size_t numchans) + { return std::unique_ptr<FrontStablizer>{new(FamCount(numchans)) FrontStablizer{numchans}}; } + + DEF_FAM_NEWDEL(FrontStablizer, DelayBuf) +}; + +#endif /* CORE_FRONT_STABLIZER_H */ diff --git a/core/helpers.cpp b/core/helpers.cpp new file mode 100644 index 00000000..dcb785c9 --- /dev/null +++ b/core/helpers.cpp @@ -0,0 +1,514 @@ + +#include "config.h" + +#include "helpers.h" + +#include <algorithm> +#include <cerrno> +#include <cstdarg> +#include <cstdlib> +#include <cstdio> +#include <cstring> +#include <mutex> +#include <limits> +#include <string> + +#include "almalloc.h" +#include "alfstream.h" +#include "aloptional.h" +#include 
"alspan.h" +#include "alstring.h" +#include "logging.h" +#include "strutils.h" +#include "vector.h" + + +/* Mixing thread piority level */ +int RTPrioLevel{1}; + +/* Allow reducing the process's RTTime limit for RTKit. */ +bool AllowRTTimeLimit{true}; + + +#ifdef _WIN32 + +#include <shlobj.h> + +const PathNamePair &GetProcBinary() +{ + static al::optional<PathNamePair> procbin; + if(procbin) return *procbin; + + auto fullpath = al::vector<WCHAR>(256); + DWORD len{GetModuleFileNameW(nullptr, fullpath.data(), static_cast<DWORD>(fullpath.size()))}; + while(len == fullpath.size()) + { + fullpath.resize(fullpath.size() << 1); + len = GetModuleFileNameW(nullptr, fullpath.data(), static_cast<DWORD>(fullpath.size())); + } + if(len == 0) + { + ERR("Failed to get process name: error %lu\n", GetLastError()); + procbin = al::make_optional<PathNamePair>(); + return *procbin; + } + + fullpath.resize(len); + if(fullpath.back() != 0) + fullpath.push_back(0); + + auto sep = std::find(fullpath.rbegin()+1, fullpath.rend(), '\\'); + sep = std::find(fullpath.rbegin()+1, sep, '/'); + if(sep != fullpath.rend()) + { + *sep = 0; + procbin = al::make_optional<PathNamePair>(wstr_to_utf8(fullpath.data()), + wstr_to_utf8(&*sep + 1)); + } + else + procbin = al::make_optional<PathNamePair>(std::string{}, wstr_to_utf8(fullpath.data())); + + TRACE("Got binary: %s, %s\n", procbin->path.c_str(), procbin->fname.c_str()); + return *procbin; +} + +namespace { + +void DirectorySearch(const char *path, const char *ext, al::vector<std::string> *const results) +{ + std::string pathstr{path}; + pathstr += "\\*"; + pathstr += ext; + TRACE("Searching %s\n", pathstr.c_str()); + + std::wstring wpath{utf8_to_wstr(pathstr.c_str())}; + WIN32_FIND_DATAW fdata; + HANDLE hdl{FindFirstFileW(wpath.c_str(), &fdata)}; + if(hdl == INVALID_HANDLE_VALUE) return; + + const auto base = results->size(); + + do { + results->emplace_back(); + std::string &str = results->back(); + str = path; + str += '\\'; + str += wstr_to_utf8(fdata.cFileName); + } while(FindNextFileW(hdl, &fdata)); + FindClose(hdl); + + const al::span<std::string> newlist{results->data()+base, results->size()-base}; + std::sort(newlist.begin(), newlist.end()); + for(const auto &name : newlist) + TRACE(" got %s\n", name.c_str()); +} + +} // namespace + +al::vector<std::string> SearchDataFiles(const char *ext, const char *subdir) +{ + auto is_slash = [](int c) noexcept -> int { return (c == '\\' || c == '/'); }; + + static std::mutex search_lock; + std::lock_guard<std::mutex> _{search_lock}; + + /* If the path is absolute, use it directly. */ + al::vector<std::string> results; + if(isalpha(subdir[0]) && subdir[1] == ':' && is_slash(subdir[2])) + { + std::string path{subdir}; + std::replace(path.begin(), path.end(), '/', '\\'); + DirectorySearch(path.c_str(), ext, &results); + return results; + } + if(subdir[0] == '\\' && subdir[1] == '\\' && subdir[2] == '?' && subdir[3] == '\\') + { + DirectorySearch(subdir, ext, &results); + return results; + } + + std::string path; + + /* Search the app-local directory. */ + if(auto localpath = al::getenv(L"ALSOFT_LOCAL_PATH")) + { + path = wstr_to_utf8(localpath->c_str()); + if(is_slash(path.back())) + path.pop_back(); + } + else if(WCHAR *cwdbuf{_wgetcwd(nullptr, 0)}) + { + path = wstr_to_utf8(cwdbuf); + if(is_slash(path.back())) + path.pop_back(); + free(cwdbuf); + } + else + path = "."; + std::replace(path.begin(), path.end(), '/', '\\'); + DirectorySearch(path.c_str(), ext, &results); + + /* Search the local and global data dirs. 
*/ + static const int ids[2]{ CSIDL_APPDATA, CSIDL_COMMON_APPDATA }; + for(int id : ids) + { + WCHAR buffer[MAX_PATH]; + if(SHGetSpecialFolderPathW(nullptr, buffer, id, FALSE) == FALSE) + continue; + + path = wstr_to_utf8(buffer); + if(!is_slash(path.back())) + path += '\\'; + path += subdir; + std::replace(path.begin(), path.end(), '/', '\\'); + + DirectorySearch(path.c_str(), ext, &results); + } + + return results; +} + +void SetRTPriority(void) +{ + if(RTPrioLevel > 0) + { + if(!SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL)) + ERR("Failed to set priority level for thread\n"); + } +} + +#else + +#include <sys/types.h> +#include <unistd.h> +#include <dirent.h> +#ifdef __FreeBSD__ +#include <sys/sysctl.h> +#endif +#ifdef __HAIKU__ +#include <FindDirectory.h> +#endif +#ifdef HAVE_PROC_PIDPATH +#include <libproc.h> +#endif +#if defined(HAVE_PTHREAD_SETSCHEDPARAM) && !defined(__OpenBSD__) +#include <pthread.h> +#include <sched.h> +#endif +#ifdef HAVE_RTKIT +#include <sys/time.h> +#include <sys/resource.h> + +#include "dbus_wrap.h" +#include "rtkit.h" +#ifndef RLIMIT_RTTIME +#define RLIMIT_RTTIME 15 +#endif +#endif + +const PathNamePair &GetProcBinary() +{ + static al::optional<PathNamePair> procbin; + if(procbin) return *procbin; + + al::vector<char> pathname; +#ifdef __FreeBSD__ + size_t pathlen; + int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 }; + if(sysctl(mib, 4, nullptr, &pathlen, nullptr, 0) == -1) + WARN("Failed to sysctl kern.proc.pathname: %s\n", strerror(errno)); + else + { + pathname.resize(pathlen + 1); + sysctl(mib, 4, pathname.data(), &pathlen, nullptr, 0); + pathname.resize(pathlen); + } +#endif +#ifdef HAVE_PROC_PIDPATH + if(pathname.empty()) + { + char procpath[PROC_PIDPATHINFO_MAXSIZE]{}; + const pid_t pid{getpid()}; + if(proc_pidpath(pid, procpath, sizeof(procpath)) < 1) + ERR("proc_pidpath(%d, ...) 
failed: %s\n", pid, strerror(errno)); + else + pathname.insert(pathname.end(), procpath, procpath+strlen(procpath)); + } +#endif +#ifdef __HAIKU__ + if(pathname.empty()) + { + char procpath[PATH_MAX]; + if(find_path(B_APP_IMAGE_SYMBOL, B_FIND_PATH_IMAGE_PATH, NULL, procpath, sizeof(procpath)) == B_OK) + pathname.insert(pathname.end(), procpath, procpath+strlen(procpath)); + } +#endif +#ifndef __SWITCH__ + if(pathname.empty()) + { + static const char SelfLinkNames[][32]{ + "/proc/self/exe", + "/proc/self/file", + "/proc/curproc/exe", + "/proc/curproc/file" + }; + + pathname.resize(256); + + const char *selfname{}; + ssize_t len{}; + for(const char *name : SelfLinkNames) + { + selfname = name; + len = readlink(selfname, pathname.data(), pathname.size()); + if(len >= 0 || errno != ENOENT) break; + } + + while(len > 0 && static_cast<size_t>(len) == pathname.size()) + { + pathname.resize(pathname.size() << 1); + len = readlink(selfname, pathname.data(), pathname.size()); + } + if(len <= 0) + { + WARN("Failed to readlink %s: %s\n", selfname, strerror(errno)); + len = 0; + } + + pathname.resize(static_cast<size_t>(len)); + } +#endif + while(!pathname.empty() && pathname.back() == 0) + pathname.pop_back(); + + auto sep = std::find(pathname.crbegin(), pathname.crend(), '/'); + if(sep != pathname.crend()) + procbin = al::make_optional<PathNamePair>(std::string(pathname.cbegin(), sep.base()-1), + std::string(sep.base(), pathname.cend())); + else + procbin = al::make_optional<PathNamePair>(std::string{}, + std::string(pathname.cbegin(), pathname.cend())); + + TRACE("Got binary: \"%s\", \"%s\"\n", procbin->path.c_str(), procbin->fname.c_str()); + return *procbin; +} + +namespace { + +void DirectorySearch(const char *path, const char *ext, al::vector<std::string> *const results) +{ + TRACE("Searching %s for *%s\n", path, ext); + DIR *dir{opendir(path)}; + if(!dir) return; + + const auto base = results->size(); + const size_t extlen{strlen(ext)}; + + while(struct dirent *dirent{readdir(dir)}) + { + if(strcmp(dirent->d_name, ".") == 0 || strcmp(dirent->d_name, "..") == 0) + continue; + + const size_t len{strlen(dirent->d_name)}; + if(len <= extlen) continue; + if(al::strcasecmp(dirent->d_name+len-extlen, ext) != 0) + continue; + + results->emplace_back(); + std::string &str = results->back(); + str = path; + if(str.back() != '/') + str.push_back('/'); + str += dirent->d_name; + } + closedir(dir); + + const al::span<std::string> newlist{results->data()+base, results->size()-base}; + std::sort(newlist.begin(), newlist.end()); + for(const auto &name : newlist) + TRACE(" got %s\n", name.c_str()); +} + +} // namespace + +al::vector<std::string> SearchDataFiles(const char *ext, const char *subdir) +{ + static std::mutex search_lock; + std::lock_guard<std::mutex> _{search_lock}; + + al::vector<std::string> results; + if(subdir[0] == '/') + { + DirectorySearch(subdir, ext, &results); + return results; + } + + /* Search the app-local directory. 
*/
+ if(auto localpath = al::getenv("ALSOFT_LOCAL_PATH"))
+ DirectorySearch(localpath->c_str(), ext, &results);
+ else
+ {
+ al::vector<char> cwdbuf(256);
+ while(!getcwd(cwdbuf.data(), cwdbuf.size()))
+ {
+ if(errno != ERANGE)
+ {
+ cwdbuf.clear();
+ break;
+ }
+ cwdbuf.resize(cwdbuf.size() << 1);
+ }
+ if(cwdbuf.empty())
+ DirectorySearch(".", ext, &results);
+ else
+ {
+ DirectorySearch(cwdbuf.data(), ext, &results);
+ cwdbuf.clear();
+ }
+ }
+
+ // Search local data dir
+ if(auto datapath = al::getenv("XDG_DATA_HOME"))
+ {
+ std::string &path = *datapath;
+ if(path.back() != '/')
+ path += '/';
+ path += subdir;
+ DirectorySearch(path.c_str(), ext, &results);
+ }
+ else if(auto homepath = al::getenv("HOME"))
+ {
+ std::string &path = *homepath;
+ if(path.back() == '/')
+ path.pop_back();
+ path += "/.local/share/";
+ path += subdir;
+ DirectorySearch(path.c_str(), ext, &results);
+ }
+
+ // Search global data dirs
+ std::string datadirs{al::getenv("XDG_DATA_DIRS").value_or("/usr/local/share/:/usr/share/")};
+
+ size_t curpos{0u};
+ while(curpos < datadirs.size())
+ {
+ size_t nextpos{datadirs.find(':', curpos)};
+
+ std::string path{(nextpos != std::string::npos) ?
+ datadirs.substr(curpos, nextpos++ - curpos) : datadirs.substr(curpos)};
+ curpos = nextpos;
+
+ if(path.empty()) continue;
+ if(path.back() != '/')
+ path += '/';
+ path += subdir;
+
+ DirectorySearch(path.c_str(), ext, &results);
+ }
+
+ return results;
+}
+
+void SetRTPriority()
+{
+ if(RTPrioLevel <= 0)
+ return;
+
+ int err{-ENOTSUP};
+#if defined(HAVE_PTHREAD_SETSCHEDPARAM) && !defined(__OpenBSD__)
+ struct sched_param param{};
+ /* Use the minimum real-time priority possible for now (on Linux this
+ * should be 1 for SCHED_RR).
+ */
+ param.sched_priority = sched_get_priority_min(SCHED_RR);
+#ifdef SCHED_RESET_ON_FORK
+ err = pthread_setschedparam(pthread_self(), SCHED_RR|SCHED_RESET_ON_FORK, &param);
+ if(err == EINVAL)
+#endif
+ err = pthread_setschedparam(pthread_self(), SCHED_RR, &param);
+ if(err == 0) return;
+
+ WARN("pthread_setschedparam failed: %s (%d)\n", std::strerror(err), err);
+#endif
+#ifdef HAVE_RTKIT
+ if(HasDBus())
+ {
+ dbus::Error error;
+ if(dbus::ConnectionPtr conn{(*pdbus_bus_get)(DBUS_BUS_SYSTEM, &error.get())})
+ {
+ using ulonglong = unsigned long long;
+ auto limit_rttime = [](DBusConnection *c) -> int
+ {
+ long long maxrttime{rtkit_get_rttime_usec_max(c)};
+ if(maxrttime <= 0) return static_cast<int>(std::abs(maxrttime));
+ const ulonglong umaxtime{static_cast<ulonglong>(maxrttime)};
+
+ struct rlimit rlim{};
+ if(getrlimit(RLIMIT_RTTIME, &rlim) != 0)
+ return errno;
+ TRACE("RTTime max: %llu (hard: %llu, soft: %llu)\n", umaxtime,
+ ulonglong{rlim.rlim_max}, ulonglong{rlim.rlim_cur});
+ if(rlim.rlim_max > umaxtime)
+ {
+ rlim.rlim_max = static_cast<rlim_t>(std::min<ulonglong>(umaxtime,
+ std::numeric_limits<rlim_t>::max()));
+ rlim.rlim_cur = std::min(rlim.rlim_cur, rlim.rlim_max);
+ if(setrlimit(RLIMIT_RTTIME, &rlim) != 0)
+ return errno;
+ }
+ return 0;
+ };
+
+ /* Don't stupidly exit if the connection dies while doing this.
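+ * By default, libdbus terminates the process if a shared bus connection
+ * like this one is lost, which is never acceptable from inside a library.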
*/ + (*pdbus_connection_set_exit_on_disconnect)(conn.get(), false); + + int nicemin{}; + err = rtkit_get_min_nice_level(conn.get(), &nicemin); + if(err == -ENOENT) + { + err = std::abs(err); + ERR("Could not query RTKit: %s (%d)\n", std::strerror(err), err); + return; + } + int rtmax{rtkit_get_max_realtime_priority(conn.get())}; + TRACE("Maximum real-time priority: %d, minimum niceness: %d\n", rtmax, nicemin); + + err = EINVAL; + if(rtmax > 0) + { + if(AllowRTTimeLimit) + { + err = limit_rttime(conn.get()); + if(err != 0) + WARN("Failed to set RLIMIT_RTTIME for RTKit: %s (%d)\n", + std::strerror(err), err); + } + + /* Use half the maximum real-time priority allowed. */ + TRACE("Making real-time with priority %d\n", (rtmax+1)/2); + err = rtkit_make_realtime(conn.get(), 0, (rtmax+1)/2); + if(err == 0) return; + + err = std::abs(err); + WARN("Failed to set real-time priority: %s (%d)\n", std::strerror(err), err); + } + if(nicemin < 0) + { + TRACE("Making high priority with niceness %d\n", nicemin); + err = rtkit_make_high_priority(conn.get(), 0, nicemin); + if(err == 0) return; + + err = std::abs(err); + WARN("Failed to set high priority: %s (%d)\n", std::strerror(err), err); + } + } + else + WARN("D-Bus connection failed with %s: %s\n", error->name, error->message); + } + else + WARN("D-Bus not available\n"); +#endif + ERR("Could not set elevated priority: %s (%d)\n", std::strerror(err), err); +} + +#endif diff --git a/core/helpers.h b/core/helpers.h new file mode 100644 index 00000000..f0bfcf1b --- /dev/null +++ b/core/helpers.h @@ -0,0 +1,18 @@ +#ifndef CORE_HELPERS_H +#define CORE_HELPERS_H + +#include <string> + +#include "vector.h" + + +struct PathNamePair { std::string path, fname; }; +const PathNamePair &GetProcBinary(void); + +extern int RTPrioLevel; +extern bool AllowRTTimeLimit; +void SetRTPriority(void); + +al::vector<std::string> SearchDataFiles(const char *match, const char *subdir); + +#endif /* CORE_HELPERS_H */ diff --git a/core/hrtf.cpp b/core/hrtf.cpp new file mode 100644 index 00000000..e0ab8f0a --- /dev/null +++ b/core/hrtf.cpp @@ -0,0 +1,1447 @@ + +#include "config.h" + +#include "hrtf.h" + +#include <algorithm> +#include <array> +#include <cassert> +#include <cctype> +#include <cmath> +#include <cstdint> +#include <cstdio> +#include <cstring> +#include <fstream> +#include <iterator> +#include <memory> +#include <mutex> +#include <numeric> +#include <type_traits> +#include <utility> + +#include "albit.h" +#include "albyte.h" +#include "alfstream.h" +#include "almalloc.h" +#include "alnumeric.h" +#include "aloptional.h" +#include "alspan.h" +#include "ambidefs.h" +#include "filters/splitter.h" +#include "helpers.h" +#include "logging.h" +#include "math_defs.h" +#include "mixer/hrtfdefs.h" +#include "opthelpers.h" +#include "polyphase_resampler.h" +#include "vector.h" + + +namespace { + +struct HrtfEntry { + std::string mDispName; + std::string mFilename; +}; + +struct LoadedHrtf { + std::string mFilename; + std::unique_ptr<HrtfStore> mEntry; +}; + +/* Data set limits must be the same as or more flexible than those defined in + * the makemhr utility. 
+ */ +constexpr uint MinFdCount{1}; +constexpr uint MaxFdCount{16}; + +constexpr uint MinFdDistance{50}; +constexpr uint MaxFdDistance{2500}; + +constexpr uint MinEvCount{5}; +constexpr uint MaxEvCount{181}; + +constexpr uint MinAzCount{1}; +constexpr uint MaxAzCount{255}; + +constexpr uint MaxHrirDelay{HrtfHistoryLength - 1}; + +constexpr uint HrirDelayFracBits{2}; +constexpr uint HrirDelayFracOne{1 << HrirDelayFracBits}; +constexpr uint HrirDelayFracHalf{HrirDelayFracOne >> 1}; + +static_assert(MaxHrirDelay*HrirDelayFracOne < 256, "MAX_HRIR_DELAY or DELAY_FRAC too large"); + +constexpr char magicMarker00[8]{'M','i','n','P','H','R','0','0'}; +constexpr char magicMarker01[8]{'M','i','n','P','H','R','0','1'}; +constexpr char magicMarker02[8]{'M','i','n','P','H','R','0','2'}; +constexpr char magicMarker03[8]{'M','i','n','P','H','R','0','3'}; + +/* First value for pass-through coefficients (remaining are 0), used for omni- + * directional sounds. */ +constexpr float PassthruCoeff{0.707106781187f/*sqrt(0.5)*/}; + +std::mutex LoadedHrtfLock; +al::vector<LoadedHrtf> LoadedHrtfs; + +std::mutex EnumeratedHrtfLock; +al::vector<HrtfEntry> EnumeratedHrtfs; + + +class databuf final : public std::streambuf { + int_type underflow() override + { return traits_type::eof(); } + + pos_type seekoff(off_type offset, std::ios_base::seekdir whence, std::ios_base::openmode mode) override + { + if((mode&std::ios_base::out) || !(mode&std::ios_base::in)) + return traits_type::eof(); + + char_type *cur; + switch(whence) + { + case std::ios_base::beg: + if(offset < 0 || offset > egptr()-eback()) + return traits_type::eof(); + cur = eback() + offset; + break; + + case std::ios_base::cur: + if((offset >= 0 && offset > egptr()-gptr()) || + (offset < 0 && -offset > gptr()-eback())) + return traits_type::eof(); + cur = gptr() + offset; + break; + + case std::ios_base::end: + if(offset > 0 || -offset > egptr()-eback()) + return traits_type::eof(); + cur = egptr() + offset; + break; + + default: + return traits_type::eof(); + } + + setg(eback(), cur, egptr()); + return cur - eback(); + } + + pos_type seekpos(pos_type pos, std::ios_base::openmode mode) override + { + // Simplified version of seekoff + if((mode&std::ios_base::out) || !(mode&std::ios_base::in)) + return traits_type::eof(); + + if(pos < 0 || pos > egptr()-eback()) + return traits_type::eof(); + + setg(eback(), eback() + static_cast<size_t>(pos), egptr()); + return pos; + } + +public: + databuf(const char_type *start_, const char_type *end_) noexcept + { + setg(const_cast<char_type*>(start_), const_cast<char_type*>(start_), + const_cast<char_type*>(end_)); + } +}; + +class idstream final : public std::istream { + databuf mStreamBuf; + +public: + idstream(const char *start_, const char *end_) + : std::istream{nullptr}, mStreamBuf{start_, end_} + { init(&mStreamBuf); } +}; + + +struct IdxBlend { uint idx; float blend; }; +/* Calculate the elevation index given the polar elevation in radians. This + * will return an index between 0 and (evcount - 1). + */ +IdxBlend CalcEvIndex(uint evcount, float ev) +{ + ev = (al::MathDefs<float>::Pi()*0.5f + ev) * static_cast<float>(evcount-1) / + al::MathDefs<float>::Pi(); + uint idx{float2uint(ev)}; + + return IdxBlend{minu(idx, evcount-1), ev-static_cast<float>(idx)}; +} + +/* Calculate the azimuth index given the polar azimuth in radians. This will + * return an index between 0 and (azcount - 1). 
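+ * For example, with azcount=4 and an azimuth of +pi/2:
+ *
+ *   az  = (2*pi + pi/2) * 4 / (2*pi) = 5.0
+ *   idx = 5 % 4 = 1,  blend = 5.0 - 5.0 = 0.0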
+ */ +IdxBlend CalcAzIndex(uint azcount, float az) +{ + az = (al::MathDefs<float>::Tau()+az) * static_cast<float>(azcount) / + al::MathDefs<float>::Tau(); + uint idx{float2uint(az)}; + + return IdxBlend{idx%azcount, az-static_cast<float>(idx)}; +} + +} // namespace + + +/* Calculates static HRIR coefficients and delays for the given polar elevation + * and azimuth in radians. The coefficients are normalized. + */ +void GetHrtfCoeffs(const HrtfStore *Hrtf, float elevation, float azimuth, float distance, + float spread, HrirArray &coeffs, const al::span<uint,2> delays) +{ + const float dirfact{1.0f - (spread / al::MathDefs<float>::Tau())}; + + const auto *field = Hrtf->field; + const auto *field_end = field + Hrtf->fdCount-1; + size_t ebase{0}; + while(distance < field->distance && field != field_end) + { + ebase += field->evCount; + ++field; + } + + /* Calculate the elevation indices. */ + const auto elev0 = CalcEvIndex(field->evCount, elevation); + const size_t elev1_idx{minu(elev0.idx+1, field->evCount-1)}; + const size_t ir0offset{Hrtf->elev[ebase + elev0.idx].irOffset}; + const size_t ir1offset{Hrtf->elev[ebase + elev1_idx].irOffset}; + + /* Calculate azimuth indices. */ + const auto az0 = CalcAzIndex(Hrtf->elev[ebase + elev0.idx].azCount, azimuth); + const auto az1 = CalcAzIndex(Hrtf->elev[ebase + elev1_idx].azCount, azimuth); + + /* Calculate the HRIR indices to blend. */ + const size_t idx[4]{ + ir0offset + az0.idx, + ir0offset + ((az0.idx+1) % Hrtf->elev[ebase + elev0.idx].azCount), + ir1offset + az1.idx, + ir1offset + ((az1.idx+1) % Hrtf->elev[ebase + elev1_idx].azCount) + }; + + /* Calculate bilinear blending weights, attenuated according to the + * directional panning factor. + */ + const float blend[4]{ + (1.0f-elev0.blend) * (1.0f-az0.blend) * dirfact, + (1.0f-elev0.blend) * ( az0.blend) * dirfact, + ( elev0.blend) * (1.0f-az1.blend) * dirfact, + ( elev0.blend) * ( az1.blend) * dirfact + }; + + /* Calculate the blended HRIR delays. */ + float d{Hrtf->delays[idx[0]][0]*blend[0] + Hrtf->delays[idx[1]][0]*blend[1] + + Hrtf->delays[idx[2]][0]*blend[2] + Hrtf->delays[idx[3]][0]*blend[3]}; + delays[0] = fastf2u(d * float{1.0f/HrirDelayFracOne}); + d = Hrtf->delays[idx[0]][1]*blend[0] + Hrtf->delays[idx[1]][1]*blend[1] + + Hrtf->delays[idx[2]][1]*blend[2] + Hrtf->delays[idx[3]][1]*blend[3]; + delays[1] = fastf2u(d * float{1.0f/HrirDelayFracOne}); + + /* Calculate the blended HRIR coefficients. 
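+ * The output starts as a pass-through impulse scaled by (1-dirfact), and the
+ * four surrounding HRIRs are then accumulated on top with their bilinear
+ * weights (already attenuated by dirfact above); so a zero spread gives a
+ * pure HRIR blend while a full Tau spread leaves only the omni pass-through.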
*/ + float *coeffout{al::assume_aligned<16>(&coeffs[0][0])}; + coeffout[0] = PassthruCoeff * (1.0f-dirfact); + coeffout[1] = PassthruCoeff * (1.0f-dirfact); + std::fill_n(coeffout+2, size_t{HrirLength-1}*2, 0.0f); + for(size_t c{0};c < 4;c++) + { + const float *srccoeffs{al::assume_aligned<16>(Hrtf->coeffs[idx[c]][0].data())}; + const float mult{blend[c]}; + auto blend_coeffs = [mult](const float src, const float coeff) noexcept -> float + { return src*mult + coeff; }; + std::transform(srccoeffs, srccoeffs + HrirLength*2, coeffout, coeffout, blend_coeffs); + } +} + + +std::unique_ptr<DirectHrtfState> DirectHrtfState::Create(size_t num_chans) +{ return std::unique_ptr<DirectHrtfState>{new(FamCount(num_chans)) DirectHrtfState{num_chans}}; } + +void DirectHrtfState::build(const HrtfStore *Hrtf, const uint irSize, + const al::span<const AngularPoint> AmbiPoints, const float (*AmbiMatrix)[MaxAmbiChannels], + const float XOverFreq, const al::span<const float,MaxAmbiOrder+1> AmbiOrderHFGain) +{ + using double2 = std::array<double,2>; + struct ImpulseResponse { + const ConstHrirSpan hrir; + uint ldelay, rdelay; + }; + + const double xover_norm{double{XOverFreq} / Hrtf->sampleRate}; + for(size_t i{0};i < mChannels.size();++i) + { + const size_t order{AmbiIndex::OrderFromChannel()[i]}; + mChannels[i].mSplitter.init(static_cast<float>(xover_norm)); + mChannels[i].mHfScale = AmbiOrderHFGain[order]; + } + + uint min_delay{HrtfHistoryLength*HrirDelayFracOne}, max_delay{0}; + al::vector<ImpulseResponse> impres; impres.reserve(AmbiPoints.size()); + auto calc_res = [Hrtf,&max_delay,&min_delay](const AngularPoint &pt) -> ImpulseResponse + { + auto &field = Hrtf->field[0]; + const auto elev0 = CalcEvIndex(field.evCount, pt.Elev.value); + const size_t elev1_idx{minu(elev0.idx+1, field.evCount-1)}; + const size_t ir0offset{Hrtf->elev[elev0.idx].irOffset}; + const size_t ir1offset{Hrtf->elev[elev1_idx].irOffset}; + + const auto az0 = CalcAzIndex(Hrtf->elev[elev0.idx].azCount, pt.Azim.value); + const auto az1 = CalcAzIndex(Hrtf->elev[elev1_idx].azCount, pt.Azim.value); + + const size_t idx[4]{ + ir0offset + az0.idx, + ir0offset + ((az0.idx+1) % Hrtf->elev[elev0.idx].azCount), + ir1offset + az1.idx, + ir1offset + ((az1.idx+1) % Hrtf->elev[elev1_idx].azCount) + }; + + const std::array<double,4> blend{{ + (1.0-elev0.blend) * (1.0-az0.blend), + (1.0-elev0.blend) * ( az0.blend), + ( elev0.blend) * (1.0-az1.blend), + ( elev0.blend) * ( az1.blend) + }}; + + /* The largest blend factor serves as the closest HRIR. */ + const size_t irOffset{idx[std::max_element(blend.begin(), blend.end()) - blend.begin()]}; + ImpulseResponse res{Hrtf->coeffs[irOffset], + Hrtf->delays[irOffset][0], Hrtf->delays[irOffset][1]}; + + min_delay = minu(min_delay, minu(res.ldelay, res.rdelay)); + max_delay = maxu(max_delay, maxu(res.ldelay, res.rdelay)); + + return res; + }; + std::transform(AmbiPoints.begin(), AmbiPoints.end(), std::back_inserter(impres), calc_res); + auto hrir_delay_round = [](const uint d) noexcept -> uint + { return (d+HrirDelayFracHalf) >> HrirDelayFracBits; }; + + TRACE("Min delay: %.2f, max delay: %.2f, FIR length: %u\n", + min_delay/double{HrirDelayFracOne}, max_delay/double{HrirDelayFracOne}, irSize); + + const bool per_hrir_min{mChannels.size() > AmbiChannelsFromOrder(1)}; + auto tmpres = al::vector<std::array<double2,HrirLength>>(mChannels.size()); + max_delay = 0; + for(size_t c{0u};c < AmbiPoints.size();++c) + { + const ConstHrirSpan hrir{impres[c].hrir}; + const uint base_delay{per_hrir_min ? 
minu(impres[c].ldelay, impres[c].rdelay) : min_delay}; + const uint ldelay{hrir_delay_round(impres[c].ldelay - base_delay)}; + const uint rdelay{hrir_delay_round(impres[c].rdelay - base_delay)}; + max_delay = maxu(max_delay, maxu(impres[c].ldelay, impres[c].rdelay) - base_delay); + + for(size_t i{0u};i < mChannels.size();++i) + { + const double mult{AmbiMatrix[c][i]}; + const size_t numirs{HrirLength - maxz(ldelay, rdelay)}; + size_t lidx{ldelay}, ridx{rdelay}; + for(size_t j{0};j < numirs;++j) + { + tmpres[i][lidx++][0] += hrir[j][0] * mult; + tmpres[i][ridx++][1] += hrir[j][1] * mult; + } + } + } + impres.clear(); + + for(size_t i{0u};i < mChannels.size();++i) + { + auto copy_arr = [](const double2 &in) noexcept -> float2 + { return float2{{static_cast<float>(in[0]), static_cast<float>(in[1])}}; }; + std::transform(tmpres[i].cbegin(), tmpres[i].cend(), mChannels[i].mCoeffs.begin(), + copy_arr); + } + tmpres.clear(); + + const uint max_length{minu(hrir_delay_round(max_delay) + irSize, HrirLength)}; + TRACE("New max delay: %.2f, FIR length: %u\n", max_delay/double{HrirDelayFracOne}, + max_length); + mIrSize = max_length; +} + + +namespace { + +std::unique_ptr<HrtfStore> CreateHrtfStore(uint rate, ushort irSize, + const al::span<const HrtfStore::Field> fields, + const al::span<const HrtfStore::Elevation> elevs, const HrirArray *coeffs, + const ubyte2 *delays, const char *filename) +{ + std::unique_ptr<HrtfStore> Hrtf; + + const size_t irCount{size_t{elevs.back().azCount} + elevs.back().irOffset}; + size_t total{sizeof(HrtfStore)}; + total = RoundUp(total, alignof(HrtfStore::Field)); /* Align for field infos */ + total += sizeof(HrtfStore::Field)*fields.size(); + total = RoundUp(total, alignof(HrtfStore::Elevation)); /* Align for elevation infos */ + total += sizeof(Hrtf->elev[0])*elevs.size(); + total = RoundUp(total, 16); /* Align for coefficients using SIMD */ + total += sizeof(Hrtf->coeffs[0])*irCount; + total += sizeof(Hrtf->delays[0])*irCount; + + Hrtf.reset(new (al_calloc(16, total)) HrtfStore{}); + if(!Hrtf) + ERR("Out of memory allocating storage for %s.\n", filename); + else + { + InitRef(Hrtf->mRef, 1u); + Hrtf->sampleRate = rate; + Hrtf->irSize = irSize; + Hrtf->fdCount = static_cast<uint>(fields.size()); + + /* Set up pointers to storage following the main HRTF struct. */ + char *base = reinterpret_cast<char*>(Hrtf.get()); + size_t offset{sizeof(HrtfStore)}; + + offset = RoundUp(offset, alignof(HrtfStore::Field)); /* Align for field infos */ + auto field_ = reinterpret_cast<HrtfStore::Field*>(base + offset); + offset += sizeof(field_[0])*fields.size(); + + offset = RoundUp(offset, alignof(HrtfStore::Elevation)); /* Align for elevation infos */ + auto elev_ = reinterpret_cast<HrtfStore::Elevation*>(base + offset); + offset += sizeof(elev_[0])*elevs.size(); + + offset = RoundUp(offset, 16); /* Align for coefficients using SIMD */ + auto coeffs_ = reinterpret_cast<HrirArray*>(base + offset); + offset += sizeof(coeffs_[0])*irCount; + + auto delays_ = reinterpret_cast<ubyte2*>(base + offset); + offset += sizeof(delays_[0])*irCount; + + assert(offset == total); + + /* Copy input data to storage. */ + std::copy(fields.cbegin(), fields.cend(), field_); + std::copy(elevs.cbegin(), elevs.cend(), elev_); + std::copy_n(coeffs, irCount, coeffs_); + std::copy_n(delays, irCount, delays_); + + /* Finally, assign the storage pointers. 
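+ * The resulting single allocation is laid out roughly as:
+ *
+ *   [HrtfStore][Field x fdCount][Elevation x total][HrirArray x irCount][delays x irCount]
+ *
+ * with the padding/alignment computed above.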
*/ + Hrtf->field = field_; + Hrtf->elev = elev_; + Hrtf->coeffs = coeffs_; + Hrtf->delays = delays_; + } + + return Hrtf; +} + +void MirrorLeftHrirs(const al::span<const HrtfStore::Elevation> elevs, HrirArray *coeffs, + ubyte2 *delays) +{ + for(const auto &elev : elevs) + { + const ushort evoffset{elev.irOffset}; + const ushort azcount{elev.azCount}; + for(size_t j{0};j < azcount;j++) + { + const size_t lidx{evoffset + j}; + const size_t ridx{evoffset + ((azcount-j) % azcount)}; + + const size_t irSize{coeffs[ridx].size()}; + for(size_t k{0};k < irSize;k++) + coeffs[ridx][k][1] = coeffs[lidx][k][0]; + delays[ridx][1] = delays[lidx][0]; + } + } +} + + +template<typename T, size_t num_bits=sizeof(T)*8> +inline T readle(std::istream &data) +{ + static_assert((num_bits&7) == 0, "num_bits must be a multiple of 8"); + static_assert(num_bits <= sizeof(T)*8, "num_bits is too large for the type"); + + T ret{}; + if_constexpr(al::endian::native == al::endian::little) + { + if(!data.read(reinterpret_cast<char*>(&ret), num_bits/8)) + return static_cast<T>(EOF); + } + else + { + al::byte b[sizeof(T)]{}; + if(!data.read(reinterpret_cast<char*>(b), num_bits/8)) + return static_cast<T>(EOF); + std::reverse_copy(std::begin(b), std::end(b), reinterpret_cast<al::byte*>(&ret)); + } + + if_constexpr(std::is_signed<T>::value && num_bits < sizeof(T)*8) + { + constexpr auto signbit = static_cast<T>(1u << (num_bits-1)); + return static_cast<T>((ret^signbit) - signbit); + } + return ret; +} + +template<> +inline uint8_t readle<uint8_t,8>(std::istream &data) +{ return static_cast<uint8_t>(data.get()); } + + +std::unique_ptr<HrtfStore> LoadHrtf00(std::istream &data, const char *filename) +{ + uint rate{readle<uint32_t>(data)}; + ushort irCount{readle<uint16_t>(data)}; + ushort irSize{readle<uint16_t>(data)}; + ubyte evCount{readle<uint8_t>(data)}; + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + + if(irSize < MinIrLength || irSize > HrirLength) + { + ERR("Unsupported HRIR size, irSize=%d (%d to %d)\n", irSize, MinIrLength, HrirLength); + return nullptr; + } + if(evCount < MinEvCount || evCount > MaxEvCount) + { + ERR("Unsupported elevation count: evCount=%d (%d to %d)\n", + evCount, MinEvCount, MaxEvCount); + return nullptr; + } + + auto elevs = al::vector<HrtfStore::Elevation>(evCount); + for(auto &elev : elevs) + elev.irOffset = readle<uint16_t>(data); + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + for(size_t i{1};i < evCount;i++) + { + if(elevs[i].irOffset <= elevs[i-1].irOffset) + { + ERR("Invalid evOffset: evOffset[%zu]=%d (last=%d)\n", i, elevs[i].irOffset, + elevs[i-1].irOffset); + return nullptr; + } + } + if(irCount <= elevs.back().irOffset) + { + ERR("Invalid evOffset: evOffset[%zu]=%d (irCount=%d)\n", + elevs.size()-1, elevs.back().irOffset, irCount); + return nullptr; + } + + for(size_t i{1};i < evCount;i++) + { + elevs[i-1].azCount = static_cast<ushort>(elevs[i].irOffset - elevs[i-1].irOffset); + if(elevs[i-1].azCount < MinAzCount || elevs[i-1].azCount > MaxAzCount) + { + ERR("Unsupported azimuth count: azCount[%zd]=%d (%d to %d)\n", + i-1, elevs[i-1].azCount, MinAzCount, MaxAzCount); + return nullptr; + } + } + elevs.back().azCount = static_cast<ushort>(irCount - elevs.back().irOffset); + if(elevs.back().azCount < MinAzCount || elevs.back().azCount > MaxAzCount) + { + ERR("Unsupported azimuth count: azCount[%zu]=%d (%d to %d)\n", + elevs.size()-1, elevs.back().azCount, MinAzCount, MaxAzCount); + return nullptr; + 
} + + auto coeffs = al::vector<HrirArray>(irCount, HrirArray{}); + auto delays = al::vector<ubyte2>(irCount); + for(auto &hrir : coeffs) + { + for(auto &val : al::span<float2>{hrir.data(), irSize}) + val[0] = readle<int16_t>(data) / 32768.0f; + } + for(auto &val : delays) + val[0] = readle<uint8_t>(data); + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + for(size_t i{0};i < irCount;i++) + { + if(delays[i][0] > MaxHrirDelay) + { + ERR("Invalid delays[%zd]: %d (%d)\n", i, delays[i][0], MaxHrirDelay); + return nullptr; + } + delays[i][0] <<= HrirDelayFracBits; + } + + /* Mirror the left ear responses to the right ear. */ + MirrorLeftHrirs({elevs.data(), elevs.size()}, coeffs.data(), delays.data()); + + const HrtfStore::Field field[1]{{0.0f, evCount}}; + return CreateHrtfStore(rate, irSize, field, {elevs.data(), elevs.size()}, coeffs.data(), + delays.data(), filename); +} + +std::unique_ptr<HrtfStore> LoadHrtf01(std::istream &data, const char *filename) +{ + uint rate{readle<uint32_t>(data)}; + ushort irSize{readle<uint8_t>(data)}; + ubyte evCount{readle<uint8_t>(data)}; + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + + if(irSize < MinIrLength || irSize > HrirLength) + { + ERR("Unsupported HRIR size, irSize=%d (%d to %d)\n", irSize, MinIrLength, HrirLength); + return nullptr; + } + if(evCount < MinEvCount || evCount > MaxEvCount) + { + ERR("Unsupported elevation count: evCount=%d (%d to %d)\n", + evCount, MinEvCount, MaxEvCount); + return nullptr; + } + + auto elevs = al::vector<HrtfStore::Elevation>(evCount); + for(auto &elev : elevs) + elev.azCount = readle<uint8_t>(data); + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + for(size_t i{0};i < evCount;++i) + { + if(elevs[i].azCount < MinAzCount || elevs[i].azCount > MaxAzCount) + { + ERR("Unsupported azimuth count: azCount[%zd]=%d (%d to %d)\n", i, elevs[i].azCount, + MinAzCount, MaxAzCount); + return nullptr; + } + } + + elevs[0].irOffset = 0; + for(size_t i{1};i < evCount;i++) + elevs[i].irOffset = static_cast<ushort>(elevs[i-1].irOffset + elevs[i-1].azCount); + const ushort irCount{static_cast<ushort>(elevs.back().irOffset + elevs.back().azCount)}; + + auto coeffs = al::vector<HrirArray>(irCount, HrirArray{}); + auto delays = al::vector<ubyte2>(irCount); + for(auto &hrir : coeffs) + { + for(auto &val : al::span<float2>{hrir.data(), irSize}) + val[0] = readle<int16_t>(data) / 32768.0f; + } + for(auto &val : delays) + val[0] = readle<uint8_t>(data); + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + for(size_t i{0};i < irCount;i++) + { + if(delays[i][0] > MaxHrirDelay) + { + ERR("Invalid delays[%zd]: %d (%d)\n", i, delays[i][0], MaxHrirDelay); + return nullptr; + } + delays[i][0] <<= HrirDelayFracBits; + } + + /* Mirror the left ear responses to the right ear. 
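+ * (Mono data sets only store the left ear; the right ear's response for
+ * azimuth index j is taken from the left ear at the mirrored index
+ * (azcount - j) % azcount of the same elevation.)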
*/ + MirrorLeftHrirs({elevs.data(), elevs.size()}, coeffs.data(), delays.data()); + + const HrtfStore::Field field[1]{{0.0f, evCount}}; + return CreateHrtfStore(rate, irSize, field, {elevs.data(), elevs.size()}, coeffs.data(), + delays.data(), filename); +} + +std::unique_ptr<HrtfStore> LoadHrtf02(std::istream &data, const char *filename) +{ + constexpr ubyte SampleType_S16{0}; + constexpr ubyte SampleType_S24{1}; + constexpr ubyte ChanType_LeftOnly{0}; + constexpr ubyte ChanType_LeftRight{1}; + + uint rate{readle<uint32_t>(data)}; + ubyte sampleType{readle<uint8_t>(data)}; + ubyte channelType{readle<uint8_t>(data)}; + ushort irSize{readle<uint8_t>(data)}; + ubyte fdCount{readle<uint8_t>(data)}; + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + + if(sampleType > SampleType_S24) + { + ERR("Unsupported sample type: %d\n", sampleType); + return nullptr; + } + if(channelType > ChanType_LeftRight) + { + ERR("Unsupported channel type: %d\n", channelType); + return nullptr; + } + + if(irSize < MinIrLength || irSize > HrirLength) + { + ERR("Unsupported HRIR size, irSize=%d (%d to %d)\n", irSize, MinIrLength, HrirLength); + return nullptr; + } + if(fdCount < 1 || fdCount > MaxFdCount) + { + ERR("Unsupported number of field-depths: fdCount=%d (%d to %d)\n", fdCount, MinFdCount, + MaxFdCount); + return nullptr; + } + + auto fields = al::vector<HrtfStore::Field>(fdCount); + auto elevs = al::vector<HrtfStore::Elevation>{}; + for(size_t f{0};f < fdCount;f++) + { + const ushort distance{readle<uint16_t>(data)}; + const ubyte evCount{readle<uint8_t>(data)}; + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + + if(distance < MinFdDistance || distance > MaxFdDistance) + { + ERR("Unsupported field distance[%zu]=%d (%d to %d millimeters)\n", f, distance, + MinFdDistance, MaxFdDistance); + return nullptr; + } + if(evCount < MinEvCount || evCount > MaxEvCount) + { + ERR("Unsupported elevation count: evCount[%zu]=%d (%d to %d)\n", f, evCount, + MinEvCount, MaxEvCount); + return nullptr; + } + + fields[f].distance = distance / 1000.0f; + fields[f].evCount = evCount; + if(f > 0 && fields[f].distance <= fields[f-1].distance) + { + ERR("Field distance[%zu] is not after previous (%f > %f)\n", f, fields[f].distance, + fields[f-1].distance); + return nullptr; + } + + const size_t ebase{elevs.size()}; + elevs.resize(ebase + evCount); + for(auto &elev : al::span<HrtfStore::Elevation>(elevs.data()+ebase, evCount)) + elev.azCount = readle<uint8_t>(data); + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + + for(size_t e{0};e < evCount;e++) + { + if(elevs[ebase+e].azCount < MinAzCount || elevs[ebase+e].azCount > MaxAzCount) + { + ERR("Unsupported azimuth count: azCount[%zu][%zu]=%d (%d to %d)\n", f, e, + elevs[ebase+e].azCount, MinAzCount, MaxAzCount); + return nullptr; + } + } + } + + elevs[0].irOffset = 0; + std::partial_sum(elevs.cbegin(), elevs.cend(), elevs.begin(), + [](const HrtfStore::Elevation &last, const HrtfStore::Elevation &cur) + -> HrtfStore::Elevation + { + return HrtfStore::Elevation{cur.azCount, + static_cast<ushort>(last.azCount + last.irOffset)}; + }); + const auto irTotal = static_cast<ushort>(elevs.back().azCount + elevs.back().irOffset); + + auto coeffs = al::vector<HrirArray>(irTotal, HrirArray{}); + auto delays = al::vector<ubyte2>(irTotal); + if(channelType == ChanType_LeftOnly) + { + if(sampleType == SampleType_S16) + { + for(auto &hrir : coeffs) + { + for(auto &val : 
al::span<float2>{hrir.data(), irSize}) + val[0] = readle<int16_t>(data) / 32768.0f; + } + } + else if(sampleType == SampleType_S24) + { + for(auto &hrir : coeffs) + { + for(auto &val : al::span<float2>{hrir.data(), irSize}) + val[0] = static_cast<float>(readle<int,24>(data)) / 8388608.0f; + } + } + for(auto &val : delays) + val[0] = readle<uint8_t>(data); + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + for(size_t i{0};i < irTotal;++i) + { + if(delays[i][0] > MaxHrirDelay) + { + ERR("Invalid delays[%zu][0]: %d (%d)\n", i, delays[i][0], MaxHrirDelay); + return nullptr; + } + delays[i][0] <<= HrirDelayFracBits; + } + + /* Mirror the left ear responses to the right ear. */ + MirrorLeftHrirs({elevs.data(), elevs.size()}, coeffs.data(), delays.data()); + } + else if(channelType == ChanType_LeftRight) + { + if(sampleType == SampleType_S16) + { + for(auto &hrir : coeffs) + { + for(auto &val : al::span<float2>{hrir.data(), irSize}) + { + val[0] = readle<int16_t>(data) / 32768.0f; + val[1] = readle<int16_t>(data) / 32768.0f; + } + } + } + else if(sampleType == SampleType_S24) + { + for(auto &hrir : coeffs) + { + for(auto &val : al::span<float2>{hrir.data(), irSize}) + { + val[0] = static_cast<float>(readle<int,24>(data)) / 8388608.0f; + val[1] = static_cast<float>(readle<int,24>(data)) / 8388608.0f; + } + } + } + for(auto &val : delays) + { + val[0] = readle<uint8_t>(data); + val[1] = readle<uint8_t>(data); + } + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + + for(size_t i{0};i < irTotal;++i) + { + if(delays[i][0] > MaxHrirDelay) + { + ERR("Invalid delays[%zu][0]: %d (%d)\n", i, delays[i][0], MaxHrirDelay); + return nullptr; + } + if(delays[i][1] > MaxHrirDelay) + { + ERR("Invalid delays[%zu][1]: %d (%d)\n", i, delays[i][1], MaxHrirDelay); + return nullptr; + } + delays[i][0] <<= HrirDelayFracBits; + delays[i][1] <<= HrirDelayFracBits; + } + } + + if(fdCount > 1) + { + auto fields_ = al::vector<HrtfStore::Field>(fields.size()); + auto elevs_ = al::vector<HrtfStore::Elevation>(elevs.size()); + auto coeffs_ = al::vector<HrirArray>(coeffs.size()); + auto delays_ = al::vector<ubyte2>(delays.size()); + + /* Simple reverse for the per-field elements. */ + std::reverse_copy(fields.cbegin(), fields.cend(), fields_.begin()); + + /* Each field has a group of elevations, which each have an azimuth + * count. Reverse the order of the groups, keeping the relative order + * of per-group azimuth counts. + */ + auto elevs__end = elevs_.end(); + auto copy_azs = [&elevs,&elevs__end](const ptrdiff_t ebase, const HrtfStore::Field &field) + -> ptrdiff_t + { + auto elevs_src = elevs.begin()+ebase; + elevs__end = std::copy_backward(elevs_src, elevs_src+field.evCount, elevs__end); + return ebase + field.evCount; + }; + (void)std::accumulate(fields.cbegin(), fields.cend(), ptrdiff_t{0}, copy_azs); + assert(elevs_.begin() == elevs__end); + + /* Reestablish the IR offset for each elevation index, given the new + * ordering of elevations. + */ + elevs_[0].irOffset = 0; + std::partial_sum(elevs_.cbegin(), elevs_.cend(), elevs_.begin(), + [](const HrtfStore::Elevation &last, const HrtfStore::Elevation &cur) + -> HrtfStore::Elevation + { + return HrtfStore::Elevation{cur.azCount, + static_cast<ushort>(last.azCount + last.irOffset)}; + }); + + /* Reverse the order of each field's group of IRs. 
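+ * As with the field and elevation info above, the v2 on-disk order (nearest
+ * field first) is flipped so field[0] ends up farthest, e.g. illustrative
+ * distances of 0.2m then 1.4m on disk become {1.4m, 0.2m} here, with each
+ * field's whole block of IRs moved as one unit.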
*/ + auto coeffs_end = coeffs_.end(); + auto delays_end = delays_.end(); + auto copy_irs = [&elevs,&coeffs,&delays,&coeffs_end,&delays_end]( + const ptrdiff_t ebase, const HrtfStore::Field &field) -> ptrdiff_t + { + auto accum_az = [](int count, const HrtfStore::Elevation &elev) noexcept -> int + { return count + elev.azCount; }; + const auto elevs_mid = elevs.cbegin() + ebase; + const auto elevs_end = elevs_mid + field.evCount; + const int abase{std::accumulate(elevs.cbegin(), elevs_mid, 0, accum_az)}; + const int num_azs{std::accumulate(elevs_mid, elevs_end, 0, accum_az)}; + + coeffs_end = std::copy_backward(coeffs.cbegin() + abase, + coeffs.cbegin() + (abase+num_azs), coeffs_end); + delays_end = std::copy_backward(delays.cbegin() + abase, + delays.cbegin() + (abase+num_azs), delays_end); + + return ebase + field.evCount; + }; + (void)std::accumulate(fields.cbegin(), fields.cend(), ptrdiff_t{0}, copy_irs); + assert(coeffs_.begin() == coeffs_end); + assert(delays_.begin() == delays_end); + + fields = std::move(fields_); + elevs = std::move(elevs_); + coeffs = std::move(coeffs_); + delays = std::move(delays_); + } + + return CreateHrtfStore(rate, irSize, {fields.data(), fields.size()}, + {elevs.data(), elevs.size()}, coeffs.data(), delays.data(), filename); +} + +std::unique_ptr<HrtfStore> LoadHrtf03(std::istream &data, const char *filename) +{ + constexpr ubyte ChanType_LeftOnly{0}; + constexpr ubyte ChanType_LeftRight{1}; + + uint rate{readle<uint32_t>(data)}; + ubyte channelType{readle<uint8_t>(data)}; + ushort irSize{readle<uint8_t>(data)}; + ubyte fdCount{readle<uint8_t>(data)}; + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + + if(channelType > ChanType_LeftRight) + { + ERR("Unsupported channel type: %d\n", channelType); + return nullptr; + } + + if(irSize < MinIrLength || irSize > HrirLength) + { + ERR("Unsupported HRIR size, irSize=%d (%d to %d)\n", irSize, MinIrLength, HrirLength); + return nullptr; + } + if(fdCount < 1 || fdCount > MaxFdCount) + { + ERR("Unsupported number of field-depths: fdCount=%d (%d to %d)\n", fdCount, MinFdCount, + MaxFdCount); + return nullptr; + } + + auto fields = al::vector<HrtfStore::Field>(fdCount); + auto elevs = al::vector<HrtfStore::Elevation>{}; + for(size_t f{0};f < fdCount;f++) + { + const ushort distance{readle<uint16_t>(data)}; + const ubyte evCount{readle<uint8_t>(data)}; + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + + if(distance < MinFdDistance || distance > MaxFdDistance) + { + ERR("Unsupported field distance[%zu]=%d (%d to %d millimeters)\n", f, distance, + MinFdDistance, MaxFdDistance); + return nullptr; + } + if(evCount < MinEvCount || evCount > MaxEvCount) + { + ERR("Unsupported elevation count: evCount[%zu]=%d (%d to %d)\n", f, evCount, + MinEvCount, MaxEvCount); + return nullptr; + } + + fields[f].distance = distance / 1000.0f; + fields[f].evCount = evCount; + if(f > 0 && fields[f].distance > fields[f-1].distance) + { + ERR("Field distance[%zu] is not before previous (%f <= %f)\n", f, fields[f].distance, + fields[f-1].distance); + return nullptr; + } + + const size_t ebase{elevs.size()}; + elevs.resize(ebase + evCount); + for(auto &elev : al::span<HrtfStore::Elevation>(elevs.data()+ebase, evCount)) + elev.azCount = readle<uint8_t>(data); + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + + for(size_t e{0};e < evCount;e++) + { + if(elevs[ebase+e].azCount < MinAzCount || elevs[ebase+e].azCount > 
MaxAzCount) + { + ERR("Unsupported azimuth count: azCount[%zu][%zu]=%d (%d to %d)\n", f, e, + elevs[ebase+e].azCount, MinAzCount, MaxAzCount); + return nullptr; + } + } + } + + elevs[0].irOffset = 0; + std::partial_sum(elevs.cbegin(), elevs.cend(), elevs.begin(), + [](const HrtfStore::Elevation &last, const HrtfStore::Elevation &cur) + -> HrtfStore::Elevation + { + return HrtfStore::Elevation{cur.azCount, + static_cast<ushort>(last.azCount + last.irOffset)}; + }); + const auto irTotal = static_cast<ushort>(elevs.back().azCount + elevs.back().irOffset); + + auto coeffs = al::vector<HrirArray>(irTotal, HrirArray{}); + auto delays = al::vector<ubyte2>(irTotal); + if(channelType == ChanType_LeftOnly) + { + for(auto &hrir : coeffs) + { + for(auto &val : al::span<float2>{hrir.data(), irSize}) + val[0] = static_cast<float>(readle<int,24>(data)) / 8388608.0f; + } + for(auto &val : delays) + val[0] = readle<uint8_t>(data); + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + for(size_t i{0};i < irTotal;++i) + { + if(delays[i][0] > MaxHrirDelay<<HrirDelayFracBits) + { + ERR("Invalid delays[%zu][0]: %f (%d)\n", i, + delays[i][0] / float{HrirDelayFracOne}, MaxHrirDelay); + return nullptr; + } + } + + /* Mirror the left ear responses to the right ear. */ + MirrorLeftHrirs({elevs.data(), elevs.size()}, coeffs.data(), delays.data()); + } + else if(channelType == ChanType_LeftRight) + { + for(auto &hrir : coeffs) + { + for(auto &val : al::span<float2>{hrir.data(), irSize}) + { + val[0] = static_cast<float>(readle<int,24>(data)) / 8388608.0f; + val[1] = static_cast<float>(readle<int,24>(data)) / 8388608.0f; + } + } + for(auto &val : delays) + { + val[0] = readle<uint8_t>(data); + val[1] = readle<uint8_t>(data); + } + if(!data || data.eof()) + { + ERR("Failed reading %s\n", filename); + return nullptr; + } + + for(size_t i{0};i < irTotal;++i) + { + if(delays[i][0] > MaxHrirDelay<<HrirDelayFracBits) + { + ERR("Invalid delays[%zu][0]: %f (%d)\n", i, + delays[i][0] / float{HrirDelayFracOne}, MaxHrirDelay); + return nullptr; + } + if(delays[i][1] > MaxHrirDelay<<HrirDelayFracBits) + { + ERR("Invalid delays[%zu][1]: %f (%d)\n", i, + delays[i][1] / float{HrirDelayFracOne}, MaxHrirDelay); + return nullptr; + } + } + } + + return CreateHrtfStore(rate, irSize, {fields.data(), fields.size()}, + {elevs.data(), elevs.size()}, coeffs.data(), delays.data(), filename); +} + + +bool checkName(const std::string &name) +{ + auto match_name = [&name](const HrtfEntry &entry) -> bool { return name == entry.mDispName; }; + auto &enum_names = EnumeratedHrtfs; + return std::find_if(enum_names.cbegin(), enum_names.cend(), match_name) != enum_names.cend(); +} + +void AddFileEntry(const std::string &filename) +{ + /* Check if this file has already been enumerated. */ + auto enum_iter = std::find_if(EnumeratedHrtfs.cbegin(), EnumeratedHrtfs.cend(), + [&filename](const HrtfEntry &entry) -> bool + { return entry.mFilename == filename; }); + if(enum_iter != EnumeratedHrtfs.cend()) + { + TRACE("Skipping duplicate file entry %s\n", filename.c_str()); + return; + } + + /* TODO: Get a human-readable name from the HRTF data (possibly coming in a + * format update). */ + size_t namepos{filename.find_last_of('/')+1}; + if(!namepos) namepos = filename.find_last_of('\\')+1; + + size_t extpos{filename.find_last_of('.')}; + if(extpos <= namepos) extpos = std::string::npos; + + const std::string basename{(extpos == std::string::npos) ? 
+ filename.substr(namepos) : filename.substr(namepos, extpos-namepos)}; + std::string newname{basename}; + int count{1}; + while(checkName(newname)) + { + newname = basename; + newname += " #"; + newname += std::to_string(++count); + } + EnumeratedHrtfs.emplace_back(HrtfEntry{newname, filename}); + const HrtfEntry &entry = EnumeratedHrtfs.back(); + + TRACE("Adding file entry \"%s\"\n", entry.mFilename.c_str()); +} + +/* Unfortunate that we have to duplicate AddFileEntry to take a memory buffer + * for input instead of opening the given filename. + */ +void AddBuiltInEntry(const std::string &dispname, uint residx) +{ + const std::string filename{'!'+std::to_string(residx)+'_'+dispname}; + + auto enum_iter = std::find_if(EnumeratedHrtfs.cbegin(), EnumeratedHrtfs.cend(), + [&filename](const HrtfEntry &entry) -> bool + { return entry.mFilename == filename; }); + if(enum_iter != EnumeratedHrtfs.cend()) + { + TRACE("Skipping duplicate file entry %s\n", filename.c_str()); + return; + } + + /* TODO: Get a human-readable name from the HRTF data (possibly coming in a + * format update). */ + + std::string newname{dispname}; + int count{1}; + while(checkName(newname)) + { + newname = dispname; + newname += " #"; + newname += std::to_string(++count); + } + EnumeratedHrtfs.emplace_back(HrtfEntry{newname, filename}); + const HrtfEntry &entry = EnumeratedHrtfs.back(); + + TRACE("Adding built-in entry \"%s\"\n", entry.mFilename.c_str()); +} + + +#define IDR_DEFAULT_HRTF_MHR 1 + +#ifndef ALSOFT_EMBED_HRTF_DATA + +al::span<const char> GetResource(int /*name*/) +{ return {}; } + +#else + +#include "hrtf_default.h" + +al::span<const char> GetResource(int name) +{ + if(name == IDR_DEFAULT_HRTF_MHR) + return {reinterpret_cast<const char*>(hrtf_default), sizeof(hrtf_default)}; + return {}; +} +#endif + +} // namespace + + +al::vector<std::string> EnumerateHrtf(al::optional<std::string> pathopt) +{ + std::lock_guard<std::mutex> _{EnumeratedHrtfLock}; + EnumeratedHrtfs.clear(); + + bool usedefaults{true}; + if(pathopt) + { + const char *pathlist{pathopt->c_str()}; + while(pathlist && *pathlist) + { + const char *next, *end; + + while(isspace(*pathlist) || *pathlist == ',') + pathlist++; + if(*pathlist == '\0') + continue; + + next = strchr(pathlist, ','); + if(next) + end = next++; + else + { + end = pathlist + strlen(pathlist); + usedefaults = false; + } + + while(end != pathlist && isspace(*(end-1))) + --end; + if(end != pathlist) + { + const std::string pname{pathlist, end}; + for(const auto &fname : SearchDataFiles(".mhr", pname.c_str())) + AddFileEntry(fname); + } + + pathlist = next; + } + } + + if(usedefaults) + { + for(const auto &fname : SearchDataFiles(".mhr", "openal/hrtf")) + AddFileEntry(fname); + + if(!GetResource(IDR_DEFAULT_HRTF_MHR).empty()) + AddBuiltInEntry("Built-In HRTF", IDR_DEFAULT_HRTF_MHR); + } + + al::vector<std::string> list; + list.reserve(EnumeratedHrtfs.size()); + for(auto &entry : EnumeratedHrtfs) + list.emplace_back(entry.mDispName); + + return list; +} + +HrtfStorePtr GetLoadedHrtf(const std::string &name, const uint devrate) +{ + std::lock_guard<std::mutex> _{EnumeratedHrtfLock}; + auto entry_iter = std::find_if(EnumeratedHrtfs.cbegin(), EnumeratedHrtfs.cend(), + [&name](const HrtfEntry &entry) -> bool { return entry.mDispName == name; }); + if(entry_iter == EnumeratedHrtfs.cend()) + return nullptr; + const std::string &fname = entry_iter->mFilename; + + std::lock_guard<std::mutex> __{LoadedHrtfLock}; + auto hrtf_lt_fname = [](LoadedHrtf &hrtf, const std::string &filename) -> bool 
+ { return hrtf.mFilename < filename; }; + auto handle = std::lower_bound(LoadedHrtfs.begin(), LoadedHrtfs.end(), fname, hrtf_lt_fname); + while(handle != LoadedHrtfs.end() && handle->mFilename == fname) + { + HrtfStore *hrtf{handle->mEntry.get()}; + if(hrtf && hrtf->sampleRate == devrate) + { + hrtf->add_ref(); + return HrtfStorePtr{hrtf}; + } + ++handle; + } + + std::unique_ptr<std::istream> stream; + int residx{}; + char ch{}; + if(sscanf(fname.c_str(), "!%d%c", &residx, &ch) == 2 && ch == '_') + { + TRACE("Loading %s...\n", fname.c_str()); + al::span<const char> res{GetResource(residx)}; + if(res.empty()) + { + ERR("Could not get resource %u, %s\n", residx, name.c_str()); + return nullptr; + } + stream = std::make_unique<idstream>(res.begin(), res.end()); + } + else + { + TRACE("Loading %s...\n", fname.c_str()); + auto fstr = std::make_unique<al::ifstream>(fname.c_str(), std::ios::binary); + if(!fstr->is_open()) + { + ERR("Could not open %s\n", fname.c_str()); + return nullptr; + } + stream = std::move(fstr); + } + + std::unique_ptr<HrtfStore> hrtf; + char magic[sizeof(magicMarker03)]; + stream->read(magic, sizeof(magic)); + if(stream->gcount() < static_cast<std::streamsize>(sizeof(magicMarker03))) + ERR("%s data is too short (%zu bytes)\n", name.c_str(), stream->gcount()); + else if(memcmp(magic, magicMarker03, sizeof(magicMarker03)) == 0) + { + TRACE("Detected data set format v3\n"); + hrtf = LoadHrtf03(*stream, name.c_str()); + } + else if(memcmp(magic, magicMarker02, sizeof(magicMarker02)) == 0) + { + TRACE("Detected data set format v2\n"); + hrtf = LoadHrtf02(*stream, name.c_str()); + } + else if(memcmp(magic, magicMarker01, sizeof(magicMarker01)) == 0) + { + TRACE("Detected data set format v1\n"); + hrtf = LoadHrtf01(*stream, name.c_str()); + } + else if(memcmp(magic, magicMarker00, sizeof(magicMarker00)) == 0) + { + TRACE("Detected data set format v0\n"); + hrtf = LoadHrtf00(*stream, name.c_str()); + } + else + ERR("Invalid header in %s: \"%.8s\"\n", name.c_str(), magic); + stream.reset(); + + if(!hrtf) + { + ERR("Failed to load %s\n", name.c_str()); + return nullptr; + } + + if(hrtf->sampleRate != devrate) + { + TRACE("Resampling HRTF %s (%uhz -> %uhz)\n", name.c_str(), hrtf->sampleRate, devrate); + + /* Calculate the last elevation's index and get the total IR count. */ + const size_t lastEv{std::accumulate(hrtf->field, hrtf->field+hrtf->fdCount, size_t{0}, + [](const size_t curval, const HrtfStore::Field &field) noexcept -> size_t + { return curval + field.evCount; } + ) - 1}; + const size_t irCount{size_t{hrtf->elev[lastEv].irOffset} + hrtf->elev[lastEv].azCount}; + + /* Resample all the IRs. */ + std::array<std::array<double,HrirLength>,2> inout; + PPhaseResampler rs; + rs.init(hrtf->sampleRate, devrate); + for(size_t i{0};i < irCount;++i) + { + HrirArray &coeffs = const_cast<HrirArray&>(hrtf->coeffs[i]); + for(size_t j{0};j < 2;++j) + { + std::transform(coeffs.cbegin(), coeffs.cend(), inout[0].begin(), + [j](const float2 &in) noexcept -> double { return in[j]; }); + rs.process(HrirLength, inout[0].data(), HrirLength, inout[1].data()); + for(size_t k{0};k < HrirLength;++k) + coeffs[k][j] = static_cast<float>(inout[1][k]); + } + } + rs = {}; + + /* Scale the delays for the new sample rate. 
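+ * Delays are stored in 1/4-sample units (HrirDelayFracBits), so e.g. going
+ * from 22050Hz to 44100Hz simply doubles them (10.25 samples -> 20.5), while
+ * non-integer rate ratios are rounded to the nearest 1/4 sample here.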
*/ + float max_delay{0.0f}; + auto new_delays = al::vector<float2>(irCount); + const float rate_scale{static_cast<float>(devrate)/static_cast<float>(hrtf->sampleRate)}; + for(size_t i{0};i < irCount;++i) + { + for(size_t j{0};j < 2;++j) + { + const float new_delay{std::round(hrtf->delays[i][j] * rate_scale) / + float{HrirDelayFracOne}}; + max_delay = maxf(max_delay, new_delay); + new_delays[i][j] = new_delay; + } + } + + /* If the new delays exceed the max, scale it down to fit (essentially + * shrinking the head radius; not ideal but better than a per-delay + * clamp). + */ + float delay_scale{HrirDelayFracOne}; + if(max_delay > MaxHrirDelay) + { + WARN("Resampled delay exceeds max (%.2f > %d)\n", max_delay, MaxHrirDelay); + delay_scale *= float{MaxHrirDelay} / max_delay; + } + + for(size_t i{0};i < irCount;++i) + { + ubyte2 &delays = const_cast<ubyte2&>(hrtf->delays[i]); + for(size_t j{0};j < 2;++j) + delays[j] = static_cast<ubyte>(float2int(new_delays[i][j]*delay_scale + 0.5f)); + } + + /* Scale the IR size for the new sample rate and update the stored + * sample rate. + */ + const float newIrSize{std::round(static_cast<float>(hrtf->irSize) * rate_scale)}; + hrtf->irSize = static_cast<uint>(minf(HrirLength, newIrSize)); + hrtf->sampleRate = devrate; + } + + TRACE("Loaded HRTF %s for sample rate %uhz, %u-sample filter\n", name.c_str(), + hrtf->sampleRate, hrtf->irSize); + handle = LoadedHrtfs.emplace(handle, LoadedHrtf{fname, std::move(hrtf)}); + + return HrtfStorePtr{handle->mEntry.get()}; +} + + +void HrtfStore::add_ref() +{ + auto ref = IncrementRef(mRef); + TRACE("HrtfStore %p increasing refcount to %u\n", decltype(std::declval<void*>()){this}, ref); +} + +void HrtfStore::release() +{ + auto ref = DecrementRef(mRef); + TRACE("HrtfStore %p decreasing refcount to %u\n", decltype(std::declval<void*>()){this}, ref); + if(ref == 0) + { + std::lock_guard<std::mutex> _{LoadedHrtfLock}; + + /* Go through and remove all unused HRTFs. */ + auto remove_unused = [](LoadedHrtf &hrtf) -> bool + { + HrtfStore *entry{hrtf.mEntry.get()}; + if(entry && ReadRef(entry->mRef) == 0) + { + TRACE("Unloading unused HRTF %s\n", hrtf.mFilename.data()); + hrtf.mEntry = nullptr; + return true; + } + return false; + }; + auto iter = std::remove_if(LoadedHrtfs.begin(), LoadedHrtfs.end(), remove_unused); + LoadedHrtfs.erase(iter, LoadedHrtfs.end()); + } +} diff --git a/core/hrtf.h b/core/hrtf.h new file mode 100644 index 00000000..61e5bada --- /dev/null +++ b/core/hrtf.h @@ -0,0 +1,90 @@ +#ifndef CORE_HRTF_H +#define CORE_HRTF_H + +#include <array> +#include <cstddef> +#include <memory> +#include <string> + +#include "almalloc.h" +#include "aloptional.h" +#include "alspan.h" +#include "atomic.h" +#include "ambidefs.h" +#include "bufferline.h" +#include "mixer/hrtfdefs.h" +#include "intrusive_ptr.h" +#include "vector.h" + + +struct HrtfStore { + RefCount mRef; + + uint sampleRate; + uint irSize; + + struct Field { + float distance; + ubyte evCount; + }; + /* NOTE: Fields are stored *backwards*. field[0] is the farthest field, and + * field[fdCount-1] is the nearest. 
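+ * GetHrtfCoeffs depends on this ordering: it steps forward while the source
+ * is closer than the current field, landing on the farthest field whose
+ * distance doesn't exceed the source's, or the nearest field if the source
+ * is closer than all of them.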
+ */ + uint fdCount; + const Field *field; + + struct Elevation { + ushort azCount; + ushort irOffset; + }; + Elevation *elev; + const HrirArray *coeffs; + const ubyte2 *delays; + + void add_ref(); + void release(); + + DEF_PLACE_NEWDEL() +}; +using HrtfStorePtr = al::intrusive_ptr<HrtfStore>; + + +struct EvRadians { float value; }; +struct AzRadians { float value; }; +struct AngularPoint { + EvRadians Elev; + AzRadians Azim; +}; + + +struct DirectHrtfState { + std::array<float,HrtfDirectDelay+BufferLineSize> mTemp; + + /* HRTF filter state for dry buffer content */ + uint mIrSize{0}; + al::FlexArray<HrtfChannelState> mChannels; + + DirectHrtfState(size_t numchans) : mChannels{numchans} { } + /** + * Produces HRTF filter coefficients for decoding B-Format, given a set of + * virtual speaker positions, a matching decoding matrix, and per-order + * high-frequency gains for the decoder. The calculated impulse responses + * are ordered and scaled according to the matrix input. + */ + void build(const HrtfStore *Hrtf, const uint irSize, + const al::span<const AngularPoint> AmbiPoints, const float (*AmbiMatrix)[MaxAmbiChannels], + const float XOverFreq, const al::span<const float,MaxAmbiOrder+1> AmbiOrderHFGain); + + static std::unique_ptr<DirectHrtfState> Create(size_t num_chans); + + DEF_FAM_NEWDEL(DirectHrtfState, mChannels) +}; + + +al::vector<std::string> EnumerateHrtf(al::optional<std::string> pathopt); +HrtfStorePtr GetLoadedHrtf(const std::string &name, const uint devrate); + +void GetHrtfCoeffs(const HrtfStore *Hrtf, float elevation, float azimuth, float distance, + float spread, HrirArray &coeffs, const al::span<uint,2> delays); + +#endif /* CORE_HRTF_H */ diff --git a/core/logging.h b/core/logging.h index b931c27e..81465929 100644 --- a/core/logging.h +++ b/core/logging.h @@ -35,7 +35,12 @@ extern FILE *gLogFile; #else -[[gnu::format(printf,3,4)]] void al_print(LogLevel level, FILE *logfile, const char *fmt, ...); +#ifdef __USE_MINGW_ANSI_STDIO +[[gnu::format(gnu_printf,3,4)]] +#else +[[gnu::format(printf,3,4)]] +#endif +void al_print(LogLevel level, FILE *logfile, const char *fmt, ...); #define TRACE(...) 
al_print(LogLevel::Trace, gLogFile, "[ALSOFT] (II) " __VA_ARGS__) diff --git a/core/mixer.cpp b/core/mixer.cpp new file mode 100644 index 00000000..71e48fe3 --- /dev/null +++ b/core/mixer.cpp @@ -0,0 +1,126 @@ + +#include "config.h" + +#include "mixer.h" + +#include <cmath> + +#include "devformat.h" +#include "device.h" +#include "math_defs.h" +#include "mixer/defs.h" + +struct CTag; + + +MixerFunc MixSamples{Mix_<CTag>}; + + +std::array<float,MaxAmbiChannels> CalcAmbiCoeffs(const float y, const float z, const float x, + const float spread) +{ + std::array<float,MaxAmbiChannels> coeffs; + + /* Zeroth-order */ + coeffs[0] = 1.0f; /* ACN 0 = 1 */ + /* First-order */ + coeffs[1] = 1.732050808f * y; /* ACN 1 = sqrt(3) * Y */ + coeffs[2] = 1.732050808f * z; /* ACN 2 = sqrt(3) * Z */ + coeffs[3] = 1.732050808f * x; /* ACN 3 = sqrt(3) * X */ + /* Second-order */ + const float xx{x*x}, yy{y*y}, zz{z*z}, xy{x*y}, yz{y*z}, xz{x*z}; + coeffs[4] = 3.872983346f * xy; /* ACN 4 = sqrt(15) * X * Y */ + coeffs[5] = 3.872983346f * yz; /* ACN 5 = sqrt(15) * Y * Z */ + coeffs[6] = 1.118033989f * (3.0f*zz - 1.0f); /* ACN 6 = sqrt(5)/2 * (3*Z*Z - 1) */ + coeffs[7] = 3.872983346f * xz; /* ACN 7 = sqrt(15) * X * Z */ + coeffs[8] = 1.936491673f * (xx - yy); /* ACN 8 = sqrt(15)/2 * (X*X - Y*Y) */ + /* Third-order */ + coeffs[9] = 2.091650066f * (y*(3.0f*xx - yy)); /* ACN 9 = sqrt(35/8) * Y * (3*X*X - Y*Y) */ + coeffs[10] = 10.246950766f * (z*xy); /* ACN 10 = sqrt(105) * Z * X * Y */ + coeffs[11] = 1.620185175f * (y*(5.0f*zz - 1.0f)); /* ACN 11 = sqrt(21/8) * Y * (5*Z*Z - 1) */ + coeffs[12] = 1.322875656f * (z*(5.0f*zz - 3.0f)); /* ACN 12 = sqrt(7)/2 * Z * (5*Z*Z - 3) */ + coeffs[13] = 1.620185175f * (x*(5.0f*zz - 1.0f)); /* ACN 13 = sqrt(21/8) * X * (5*Z*Z - 1) */ + coeffs[14] = 5.123475383f * (z*(xx - yy)); /* ACN 14 = sqrt(105)/2 * Z * (X*X - Y*Y) */ + coeffs[15] = 2.091650066f * (x*(xx - 3.0f*yy)); /* ACN 15 = sqrt(35/8) * X * (X*X - 3*Y*Y) */ + /* Fourth-order */ + /* ACN 16 = sqrt(35)*3/2 * X * Y * (X*X - Y*Y) */ + /* ACN 17 = sqrt(35/2)*3/2 * (3*X*X - Y*Y) * Y * Z */ + /* ACN 18 = sqrt(5)*3/2 * X * Y * (7*Z*Z - 1) */ + /* ACN 19 = sqrt(5/2)*3/2 * Y * Z * (7*Z*Z - 3) */ + /* ACN 20 = 3/8 * (35*Z*Z*Z*Z - 30*Z*Z + 3) */ + /* ACN 21 = sqrt(5/2)*3/2 * X * Z * (7*Z*Z - 3) */ + /* ACN 22 = sqrt(5)*3/4 * (X*X - Y*Y) * (7*Z*Z - 1) */ + /* ACN 23 = sqrt(35/2)*3/2 * (X*X - 3*Y*Y) * X * Z */ + /* ACN 24 = sqrt(35)*3/8 * (X*X*X*X - 6*X*X*Y*Y + Y*Y*Y*Y) */ + + if(spread > 0.0f) + { + /* Implement the spread by using a spherical source that subtends the + * angle spread. See: + * http://www.ppsloan.org/publications/StupidSH36.pdf - Appendix A3 + * + * When adjusted for N3D normalization instead of SN3D, these + * calculations are: + * + * ZH0 = -sqrt(pi) * (-1+ca); + * ZH1 = 0.5*sqrt(pi) * sa*sa; + * ZH2 = -0.5*sqrt(pi) * ca*(-1+ca)*(ca+1); + * ZH3 = -0.125*sqrt(pi) * (-1+ca)*(ca+1)*(5*ca*ca - 1); + * ZH4 = -0.125*sqrt(pi) * ca*(-1+ca)*(ca+1)*(7*ca*ca - 3); + * ZH5 = -0.0625*sqrt(pi) * (-1+ca)*(ca+1)*(21*ca*ca*ca*ca - 14*ca*ca + 1); + * + * The gain of the source is compensated for size, so that the + * loudness doesn't depend on the spread. Thus: + * + * ZH0 = 1.0f; + * ZH1 = 0.5f * (ca+1.0f); + * ZH2 = 0.5f * (ca+1.0f)*ca; + * ZH3 = 0.125f * (ca+1.0f)*(5.0f*ca*ca - 1.0f); + * ZH4 = 0.125f * (ca+1.0f)*(7.0f*ca*ca - 3.0f)*ca; + * ZH5 = 0.0625f * (ca+1.0f)*(21.0f*ca*ca*ca*ca - 14.0f*ca*ca + 1.0f); + */ + const float ca{std::cos(spread * 0.5f)}; + /* Increase the source volume by up to +3dB for a full spread. 
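 * As a quick sanity check of these formulas: with spread = 0, ca = cos(0) = 1,
 * so ZH0..ZH3 all evaluate to 1 and the coefficients pass through unchanged
 * (scale = sqrt(1 + 0) = 1). With spread = tau, ca = cos(pi) = -1, so
 * ZH1..ZH3 collapse to 0, leaving only the omnidirectional ACN 0 term,
 * boosted by scale = sqrt(2), i.e. roughly +3dB.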
*/ + const float scale{std::sqrt(1.0f + spread/al::MathDefs<float>::Tau())}; + + const float ZH0_norm{scale}; + const float ZH1_norm{scale * 0.5f * (ca+1.f)}; + const float ZH2_norm{scale * 0.5f * (ca+1.f)*ca}; + const float ZH3_norm{scale * 0.125f * (ca+1.f)*(5.f*ca*ca-1.f)}; + + /* Zeroth-order */ + coeffs[0] *= ZH0_norm; + /* First-order */ + coeffs[1] *= ZH1_norm; + coeffs[2] *= ZH1_norm; + coeffs[3] *= ZH1_norm; + /* Second-order */ + coeffs[4] *= ZH2_norm; + coeffs[5] *= ZH2_norm; + coeffs[6] *= ZH2_norm; + coeffs[7] *= ZH2_norm; + coeffs[8] *= ZH2_norm; + /* Third-order */ + coeffs[9] *= ZH3_norm; + coeffs[10] *= ZH3_norm; + coeffs[11] *= ZH3_norm; + coeffs[12] *= ZH3_norm; + coeffs[13] *= ZH3_norm; + coeffs[14] *= ZH3_norm; + coeffs[15] *= ZH3_norm; + } + + return coeffs; +} + +void ComputePanGains(const MixParams *mix, const float*RESTRICT coeffs, const float ingain, + const al::span<float,MAX_OUTPUT_CHANNELS> gains) +{ + auto ambimap = mix->AmbiMap.cbegin(); + + auto iter = std::transform(ambimap, ambimap+mix->Buffer.size(), gains.begin(), + [coeffs,ingain](const BFChannelConfig &chanmap) noexcept -> float + { return chanmap.Scale * coeffs[chanmap.Index] * ingain; } + ); + std::fill(iter, gains.end(), 0.0f); +} diff --git a/core/mixer.h b/core/mixer.h new file mode 100644 index 00000000..309f4224 --- /dev/null +++ b/core/mixer.h @@ -0,0 +1,101 @@ +#ifndef CORE_MIXER_H +#define CORE_MIXER_H + +#include <array> +#include <cmath> +#include <stddef.h> +#include <type_traits> + +#include "alspan.h" +#include "ambidefs.h" +#include "bufferline.h" +#include "devformat.h" + +struct MixParams; + +using MixerFunc = void(*)(const al::span<const float> InSamples, + const al::span<FloatBufferLine> OutBuffer, float *CurrentGains, const float *TargetGains, + const size_t Counter, const size_t OutPos); + +extern MixerFunc MixSamples; + + +/** + * Calculates ambisonic encoder coefficients using the X, Y, and Z direction + * components, which must represent a normalized (unit length) vector, and the + * spread is the angular width of the sound (0...tau). + * + * NOTE: The components use ambisonic coordinates. As a result: + * + * Ambisonic Y = OpenAL -X + * Ambisonic Z = OpenAL Y + * Ambisonic X = OpenAL -Z + * + * The components are ordered such that OpenAL's X, Y, and Z are the first, + * second, and third parameters respectively -- simply negate X and Z. + */ +std::array<float,MaxAmbiChannels> CalcAmbiCoeffs(const float y, const float z, const float x, + const float spread); + +/** + * CalcDirectionCoeffs + * + * Calculates ambisonic coefficients based on an OpenAL direction vector. The + * vector must be normalized (unit length), and the spread is the angular width + * of the sound (0...tau). + */ +inline std::array<float,MaxAmbiChannels> CalcDirectionCoeffs(const float (&dir)[3], + const float spread) +{ + /* Convert from OpenAL coords to Ambisonics. */ + return CalcAmbiCoeffs(-dir[0], dir[1], -dir[2], spread); +} + +/** + * CalcAngleCoeffs + * + * Calculates ambisonic coefficients based on azimuth and elevation. The + * azimuth and elevation parameters are in radians, going right and up + * respectively. 
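 * For illustration, a minimal sketch of panning a mono source straight ahead
 * with no spread (the 'mix' and 'gains' objects here are assumptions, not
 * part of this header):
 *
 *   const auto coeffs = CalcAngleCoeffs(0.0f, 0.0f, 0.0f);
 *   std::array<float,MAX_OUTPUT_CHANNELS> gains{};
 *   ComputePanGains(&mix, coeffs.data(), 1.0f, gains);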
+ */ +inline std::array<float,MaxAmbiChannels> CalcAngleCoeffs(const float azimuth, + const float elevation, const float spread) +{ + const float x{-std::sin(azimuth) * std::cos(elevation)}; + const float y{ std::sin(elevation)}; + const float z{ std::cos(azimuth) * std::cos(elevation)}; + + return CalcAmbiCoeffs(x, y, z, spread); +} + + +/** + * ComputePanGains + * + * Computes panning gains using the given channel decoder coefficients and the + * pre-calculated direction or angle coefficients. For B-Format sources, the + * coeffs are a 'slice' of a transform matrix for the input channel, used to + * scale and orient the sound samples. + */ +void ComputePanGains(const MixParams *mix, const float*RESTRICT coeffs, const float ingain, + const al::span<float,MAX_OUTPUT_CHANNELS> gains); + + +/** Helper to set an identity/pass-through panning for ambisonic mixing (3D input). */ +template<typename T, typename I, typename F> +auto SetAmbiPanIdentity(T iter, I count, F func) -> std::enable_if_t<std::is_integral<I>::value> +{ + if(count < 1) return; + + std::array<float,MaxAmbiChannels> coeffs{{1.0f}}; + func(*iter, coeffs); + ++iter; + for(I i{1};i < count;++i,++iter) + { + coeffs[i-1] = 0.0f; + coeffs[i ] = 1.0f; + func(*iter, coeffs); + } +} + +#endif /* CORE_MIXER_H */ diff --git a/core/mixer/defs.h b/core/mixer/defs.h index acf60350..ba304f22 100644 --- a/core/mixer/defs.h +++ b/core/mixer/defs.h @@ -6,6 +6,7 @@ #include "alspan.h" #include "core/bufferline.h" +#include "core/resampler_limits.h" struct HrtfChannelState; struct HrtfFilter; @@ -19,12 +20,6 @@ constexpr int MixerFracBits{12}; constexpr int MixerFracOne{1 << MixerFracBits}; constexpr int MixerFracMask{MixerFracOne - 1}; -/* Maximum number of samples to pad on the ends of a buffer for resampling. - * Note that the padding is symmetric (half at the beginning and half at the - * end)! 
- */ -constexpr int MaxResamplerPadding{48}; - constexpr float GainSilenceThreshold{0.00001f}; /* -100dB */ @@ -80,7 +75,7 @@ template<typename InstTag> void MixHrtfBlend_(const float *InSamples, float2 *AccumSamples, const uint IrSize, const HrtfFilter *oldparams, const MixHrtfFilter *newparams, const size_t BufferSize); template<typename InstTag> -void MixDirectHrtf_(FloatBufferLine &LeftOut, FloatBufferLine &RightOut, +void MixDirectHrtf_(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut, const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples, float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize); diff --git a/core/mixer/hrtfbase.h b/core/mixer/hrtfbase.h index 7419f960..79b09a3d 100644 --- a/core/mixer/hrtfbase.h +++ b/core/mixer/hrtfbase.h @@ -12,7 +12,7 @@ using uint = unsigned int; using ApplyCoeffsT = void(&)(float2 *RESTRICT Values, const size_t irSize, - const HrirArray &Coeffs, const float left, const float right); + const ConstHrirSpan Coeffs, const float left, const float right); template<ApplyCoeffsT ApplyCoeffs> inline void MixHrtfBase(const float *InSamples, float2 *RESTRICT AccumSamples, const size_t IrSize, @@ -20,7 +20,7 @@ inline void MixHrtfBase(const float *InSamples, float2 *RESTRICT AccumSamples, c { ASSUME(BufferSize > 0); - const HrirArray &Coeffs = *hrtfparams->Coeffs; + const ConstHrirSpan Coeffs{hrtfparams->Coeffs}; const float gainstep{hrtfparams->GainStep}; const float gain{hrtfparams->Gain}; @@ -45,9 +45,9 @@ inline void MixHrtfBlendBase(const float *InSamples, float2 *RESTRICT AccumSampl { ASSUME(BufferSize > 0); - const auto &OldCoeffs = oldparams->Coeffs; + const ConstHrirSpan OldCoeffs{oldparams->Coeffs}; const float oldGainStep{oldparams->Gain / static_cast<float>(BufferSize)}; - const auto &NewCoeffs = *newparams->Coeffs; + const ConstHrirSpan NewCoeffs{newparams->Coeffs}; const float newGainStep{newparams->GainStep}; if LIKELY(oldparams->Gain > GainSilenceThreshold) @@ -84,7 +84,7 @@ inline void MixHrtfBlendBase(const float *InSamples, float2 *RESTRICT AccumSampl } template<ApplyCoeffsT ApplyCoeffs> -inline void MixDirectHrtfBase(FloatBufferLine &LeftOut, FloatBufferLine &RightOut, +inline void MixDirectHrtfBase(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut, const al::span<const FloatBufferLine> InSamples, float2 *RESTRICT AccumSamples, float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize) { @@ -133,7 +133,7 @@ inline void MixDirectHrtfBase(FloatBufferLine &LeftOut, FloatBufferLine &RightOu ChanState->mSplitter.processHfScale(tempbuf, ChanState->mHfScale); /* Now apply the HRIR coefficients to this channel. 
*/ - const auto &Coeffs = ChanState->mCoeffs; + const ConstHrirSpan Coeffs{ChanState->mCoeffs}; for(size_t i{0u};i < BufferSize;++i) { const float insample{tempbuf[i]}; diff --git a/core/mixer/hrtfdefs.h b/core/mixer/hrtfdefs.h index 89a9bb8d..7046a31e 100644 --- a/core/mixer/hrtfdefs.h +++ b/core/mixer/hrtfdefs.h @@ -3,6 +3,7 @@ #include <array> +#include "alspan.h" #include "core/ambidefs.h" #include "core/bufferline.h" #include "core/filters/splitter.h" @@ -28,9 +29,11 @@ constexpr uint MinIrLength{8}; constexpr uint HrtfDirectDelay{256}; using HrirArray = std::array<float2,HrirLength>; +using HrirSpan = al::span<float2,HrirLength>; +using ConstHrirSpan = al::span<const float2,HrirLength>; struct MixHrtfFilter { - const HrirArray *Coeffs; + const ConstHrirSpan Coeffs; uint2 Delay; float Gain; float GainStep; diff --git a/core/mixer/mixer_c.cpp b/core/mixer/mixer_c.cpp index ff9538a4..f82f7dd1 100644 --- a/core/mixer/mixer_c.cpp +++ b/core/mixer/mixer_c.cpp @@ -32,15 +32,16 @@ inline float do_cubic(const InterpState&, const float *RESTRICT vals, const uint inline float do_bsinc(const InterpState &istate, const float *RESTRICT vals, const uint frac) { const size_t m{istate.bsinc.m}; + ASSUME(m > 0); // Calculate the phase index and factor. const uint pi{frac >> FracPhaseBitDiff}; const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)}; - const float *fil{istate.bsinc.filter + m*pi*4}; - const float *phd{fil + m}; - const float *scd{phd + m}; - const float *spd{scd + m}; + const float *RESTRICT fil{istate.bsinc.filter + m*pi*2}; + const float *RESTRICT phd{fil + m}; + const float *RESTRICT scd{fil + BSincPhaseCount*2*m}; + const float *RESTRICT spd{scd + m}; // Apply the scale and phase interpolated filter. float r{0.0f}; @@ -51,13 +52,14 @@ inline float do_bsinc(const InterpState &istate, const float *RESTRICT vals, con inline float do_fastbsinc(const InterpState &istate, const float *RESTRICT vals, const uint frac) { const size_t m{istate.bsinc.m}; + ASSUME(m > 0); // Calculate the phase index and factor. const uint pi{frac >> FracPhaseBitDiff}; const float pf{static_cast<float>(frac & (FracPhaseDiffOne-1)) * (1.0f/FracPhaseDiffOne)}; - const float *fil{istate.bsinc.filter + m*pi*4}; - const float *phd{fil + m}; + const float *RESTRICT fil{istate.bsinc.filter + m*pi*2}; + const float *RESTRICT phd{fil + m}; // Apply the phase interpolated filter. 
float r{0.0f}; @@ -83,7 +85,7 @@ float *DoResample(const InterpState *state, float *RESTRICT src, uint frac, uint return dst.data(); } -inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const HrirArray &Coeffs, +inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs, const float left, const float right) { ASSUME(IrSize >= MinIrLength); @@ -149,7 +151,7 @@ void MixHrtfBlend_<CTag>(const float *InSamples, float2 *AccumSamples, const uin } template<> -void MixDirectHrtf_<CTag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut, +void MixDirectHrtf_<CTag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut, const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples, float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize) { diff --git a/core/mixer/mixer_neon.cpp b/core/mixer/mixer_neon.cpp index f3e5f130..a3afdc6b 100644 --- a/core/mixer/mixer_neon.cpp +++ b/core/mixer/mixer_neon.cpp @@ -34,7 +34,7 @@ inline float32x4_t set_f4(float l0, float l1, float l2, float l3) constexpr uint FracPhaseBitDiff{MixerFracBits - BSincPhaseBits}; constexpr uint FracPhaseDiffOne{1 << FracPhaseBitDiff}; -inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const HrirArray &Coeffs, +inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs, const float left, const float right) { float32x4_t leftright4; @@ -118,6 +118,7 @@ float *Resample_<BSincTag,NEONTag>(const InterpState *state, float *RESTRICT src const float *const filter{state->bsinc.filter}; const float32x4_t sf4{vdupq_n_f32(state->bsinc.sf)}; const size_t m{state->bsinc.m}; + ASSUME(m > 0); src -= state->bsinc.l; for(float &out_sample : dst) @@ -130,10 +131,10 @@ float *Resample_<BSincTag,NEONTag>(const InterpState *state, float *RESTRICT src float32x4_t r4{vdupq_n_f32(0.0f)}; { const float32x4_t pf4{vdupq_n_f32(pf)}; - const float *fil{filter + m*pi*4}; - const float *phd{fil + m}; - const float *scd{phd + m}; - const float *spd{scd + m}; + const float *RESTRICT fil{filter + m*pi*2}; + const float *RESTRICT phd{fil + m}; + const float *RESTRICT scd{fil + BSincPhaseCount*2*m}; + const float *RESTRICT spd{scd + m}; size_t td{m >> 2}; size_t j{0u}; @@ -163,6 +164,7 @@ float *Resample_<FastBSincTag,NEONTag>(const InterpState *state, float *RESTRICT { const float *const filter{state->bsinc.filter}; const size_t m{state->bsinc.m}; + ASSUME(m > 0); src -= state->bsinc.l; for(float &out_sample : dst) @@ -175,8 +177,8 @@ float *Resample_<FastBSincTag,NEONTag>(const InterpState *state, float *RESTRICT float32x4_t r4{vdupq_n_f32(0.0f)}; { const float32x4_t pf4{vdupq_n_f32(pf)}; - const float *fil{filter + m*pi*4}; - const float *phd{fil + m}; + const float *RESTRICT fil{filter + m*pi*2}; + const float *RESTRICT phd{fil + m}; size_t td{m >> 2}; size_t j{0u}; @@ -213,7 +215,7 @@ void MixHrtfBlend_<NEONTag>(const float *InSamples, float2 *AccumSamples, const } template<> -void MixDirectHrtf_<NEONTag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut, +void MixDirectHrtf_<NEONTag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut, const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples, float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize) { diff --git a/core/mixer/mixer_sse.cpp b/core/mixer/mixer_sse.cpp index c0fd8fa1..3cfb00a5 100644 --- a/core/mixer/mixer_sse.cpp +++ b/core/mixer/mixer_sse.cpp @@ -26,7 +26,7 @@ constexpr uint 
FracPhaseDiffOne{1 << FracPhaseBitDiff}; #define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z)) -inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const HrirArray &Coeffs, +inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs, const float left, const float right) { const __m128 lrlr{_mm_setr_ps(left, right, left, right)}; @@ -82,6 +82,7 @@ float *Resample_<BSincTag,SSETag>(const InterpState *state, float *RESTRICT src, const float *const filter{state->bsinc.filter}; const __m128 sf4{_mm_set1_ps(state->bsinc.sf)}; const size_t m{state->bsinc.m}; + ASSUME(m > 0); src -= state->bsinc.l; for(float &out_sample : dst) @@ -94,10 +95,10 @@ float *Resample_<BSincTag,SSETag>(const InterpState *state, float *RESTRICT src, __m128 r4{_mm_setzero_ps()}; { const __m128 pf4{_mm_set1_ps(pf)}; - const float *fil{filter + m*pi*4}; - const float *phd{fil + m}; - const float *scd{phd + m}; - const float *spd{scd + m}; + const float *RESTRICT fil{filter + m*pi*2}; + const float *RESTRICT phd{fil + m}; + const float *RESTRICT scd{fil + BSincPhaseCount*2*m}; + const float *RESTRICT spd{scd + m}; size_t td{m >> 2}; size_t j{0u}; @@ -128,6 +129,7 @@ float *Resample_<FastBSincTag,SSETag>(const InterpState *state, float *RESTRICT { const float *const filter{state->bsinc.filter}; const size_t m{state->bsinc.m}; + ASSUME(m > 0); src -= state->bsinc.l; for(float &out_sample : dst) @@ -140,8 +142,8 @@ float *Resample_<FastBSincTag,SSETag>(const InterpState *state, float *RESTRICT __m128 r4{_mm_setzero_ps()}; { const __m128 pf4{_mm_set1_ps(pf)}; - const float *fil{filter + m*pi*4}; - const float *phd{fil + m}; + const float *RESTRICT fil{filter + m*pi*2}; + const float *RESTRICT phd{fil + m}; size_t td{m >> 2}; size_t j{0u}; @@ -179,7 +181,7 @@ void MixHrtfBlend_<SSETag>(const float *InSamples, float2 *AccumSamples, const u } template<> -void MixDirectHrtf_<SSETag>(FloatBufferLine &LeftOut, FloatBufferLine &RightOut, +void MixDirectHrtf_<SSETag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut, const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples, float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize) { diff --git a/core/mixer/mixer_sse2.cpp b/core/mixer/mixer_sse2.cpp index f91d5dcd..99d04210 100644 --- a/core/mixer/mixer_sse2.cpp +++ b/core/mixer/mixer_sse2.cpp @@ -52,10 +52,10 @@ float *Resample_<LerpTag,SSE2Tag>(const InterpState*, float *RESTRICT src, uint auto dst_iter = dst.begin(); for(size_t todo{dst.size()>>2};todo;--todo) { - const int pos0{_mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(0, 0, 0, 0)))}; - const int pos1{_mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(1, 1, 1, 1)))}; - const int pos2{_mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(2, 2, 2, 2)))}; - const int pos3{_mm_cvtsi128_si32(_mm_shuffle_epi32(pos4, _MM_SHUFFLE(3, 3, 3, 3)))}; + const int pos0{_mm_cvtsi128_si32(pos4)}; + const int pos1{_mm_cvtsi128_si32(_mm_srli_si128(pos4, 4))}; + const int pos2{_mm_cvtsi128_si32(_mm_srli_si128(pos4, 8))}; + const int pos3{_mm_cvtsi128_si32(_mm_srli_si128(pos4, 12))}; const __m128 val1{_mm_setr_ps(src[pos0 ], src[pos1 ], src[pos2 ], src[pos3 ])}; const __m128 val2{_mm_setr_ps(src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1])}; diff --git a/core/resampler_limits.h b/core/resampler_limits.h new file mode 100644 index 00000000..9d4cefda --- /dev/null +++ b/core/resampler_limits.h @@ -0,0 +1,12 @@ +#ifndef CORE_RESAMPLER_LIMITS_H +#define CORE_RESAMPLER_LIMITS_H + 
+/* Maximum number of samples to pad on the ends of a buffer for resampling. + * Note that the padding is symmetric (half at the beginning and half at the + * end)! + */ +constexpr int MaxResamplerPadding{48}; + +constexpr int MaxResamplerEdge{MaxResamplerPadding >> 1}; + +#endif /* CORE_RESAMPLER_LIMITS_H */ diff --git a/core/rtkit.cpp b/core/rtkit.cpp new file mode 100644 index 00000000..8b489e71 --- /dev/null +++ b/core/rtkit.cpp @@ -0,0 +1,240 @@ +/*-*- Mode: C; c-basic-offset: 8 -*-*/ + +/*** + Copyright 2009 Lennart Poettering + Copyright 2010 David Henningsson <[email protected]> + Copyright 2021 Chris Robinson + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation files + (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +***/ + +#include "config.h" + +#include "rtkit.h" + +#include <errno.h> + +#ifdef __linux__ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <memory> +#include <string.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/syscall.h> + + +namespace dbus { + constexpr int TypeString{'s'}; + constexpr int TypeVariant{'v'}; + constexpr int TypeInt32{'i'}; + constexpr int TypeUInt32{'u'}; + constexpr int TypeInt64{'x'}; + constexpr int TypeUInt64{'t'}; + constexpr int TypeInvalid{'\0'}; + + struct MessageDeleter { + void operator()(DBusMessage *m) { (*pdbus_message_unref)(m); } + }; + using MessagePtr = std::unique_ptr<DBusMessage,MessageDeleter>; +} // namespace dbus + +namespace { + +inline pid_t _gettid() +{ return static_cast<pid_t>(syscall(SYS_gettid)); } + +int translate_error(const char *name) +{ + if(strcmp(name, DBUS_ERROR_NO_MEMORY) == 0) + return -ENOMEM; + if(strcmp(name, DBUS_ERROR_SERVICE_UNKNOWN) == 0 + || strcmp(name, DBUS_ERROR_NAME_HAS_NO_OWNER) == 0) + return -ENOENT; + if(strcmp(name, DBUS_ERROR_ACCESS_DENIED) == 0 + || strcmp(name, DBUS_ERROR_AUTH_FAILED) == 0) + return -EACCES; + return -EIO; +} + +int rtkit_get_int_property(DBusConnection *connection, const char *propname, long long *propval) +{ + dbus::MessagePtr m{(*pdbus_message_new_method_call)(RTKIT_SERVICE_NAME, RTKIT_OBJECT_PATH, + "org.freedesktop.DBus.Properties", "Get")}; + if(!m) return -ENOMEM; + + const char *interfacestr = RTKIT_SERVICE_NAME; + auto ready = (*pdbus_message_append_args)(m.get(), + dbus::TypeString, &interfacestr, + dbus::TypeString, &propname, + dbus::TypeInvalid); + if(!ready) return -ENOMEM; + + dbus::Error error; + dbus::MessagePtr r{(*pdbus_connection_send_with_reply_and_block)(connection, m.get(), -1, + &error.get())}; + if(!r) return translate_error(error->name); + + 
if((*pdbus_set_error_from_message)(&error.get(), r.get())) + return translate_error(error->name); + + int ret{-EBADMSG}; + DBusMessageIter iter{}; + (*pdbus_message_iter_init)(r.get(), &iter); + while(int curtype{(*pdbus_message_iter_get_arg_type)(&iter)}) + { + if(curtype == dbus::TypeVariant) + { + DBusMessageIter subiter{}; + (*pdbus_message_iter_recurse)(&iter, &subiter); + + while((curtype=(*pdbus_message_iter_get_arg_type)(&subiter)) != dbus::TypeInvalid) + { + if(curtype == dbus::TypeInt32) + { + dbus_int32_t i32{}; + (*pdbus_message_iter_get_basic)(&subiter, &i32); + *propval = i32; + ret = 0; + } + + if(curtype == dbus::TypeInt64) + { + dbus_int64_t i64{}; + (*pdbus_message_iter_get_basic)(&subiter, &i64); + *propval = i64; + ret = 0; + } + + (*pdbus_message_iter_next)(&subiter); + } + } + (*pdbus_message_iter_next)(&iter); + } + + return ret; +} + +} // namespace + +extern "C" { +int rtkit_get_max_realtime_priority(DBusConnection *connection) +{ + long long retval{}; + int err{rtkit_get_int_property(connection, "MaxRealtimePriority", &retval)}; + return err < 0 ? err : static_cast<int>(retval); +} + +int rtkit_get_min_nice_level(DBusConnection *connection, int *min_nice_level) +{ + long long retval{}; + int err{rtkit_get_int_property(connection, "MinNiceLevel", &retval)}; + if(err >= 0) *min_nice_level = static_cast<int>(retval); + return err; +} + +long long rtkit_get_rttime_usec_max(DBusConnection *connection) +{ + long long retval{}; + int err{rtkit_get_int_property(connection, "RTTimeUSecMax", &retval)}; + return err < 0 ? err : retval; +} + +int rtkit_make_realtime(DBusConnection *connection, pid_t thread, int priority) +{ + if(thread == 0) + thread = _gettid(); + + dbus::MessagePtr m{(*pdbus_message_new_method_call)(RTKIT_SERVICE_NAME, RTKIT_OBJECT_PATH, + "org.freedesktop.RealtimeKit1", "MakeThreadRealtime")}; + if(!m) return -ENOMEM; + + auto u64 = static_cast<dbus_uint64_t>(thread); + auto u32 = static_cast<dbus_uint32_t>(priority); + auto ready = (*pdbus_message_append_args)(m.get(), + dbus::TypeUInt64, &u64, + dbus::TypeUInt32, &u32, + dbus::TypeInvalid); + if(!ready) return -ENOMEM; + + dbus::Error error; + dbus::MessagePtr r{(*pdbus_connection_send_with_reply_and_block)(connection, m.get(), -1, + &error.get())}; + if(!r) return translate_error(error->name); + + if((*pdbus_set_error_from_message)(&error.get(), r.get())) + return translate_error(error->name); + + return 0; +} + +int rtkit_make_high_priority(DBusConnection *connection, pid_t thread, int nice_level) +{ + if(thread == 0) + thread = _gettid(); + + dbus::MessagePtr m{(*pdbus_message_new_method_call)(RTKIT_SERVICE_NAME, RTKIT_OBJECT_PATH, + "org.freedesktop.RealtimeKit1", "MakeThreadHighPriority")}; + if(!m) return -ENOMEM; + + auto u64 = static_cast<dbus_uint64_t>(thread); + auto s32 = static_cast<dbus_int32_t>(nice_level); + auto ready = (*pdbus_message_append_args)(m.get(), + dbus::TypeUInt64, &u64, + dbus::TypeInt32, &s32, + dbus::TypeInvalid); + if(!ready) return -ENOMEM; + + dbus::Error error; + dbus::MessagePtr r{(*pdbus_connection_send_with_reply_and_block)(connection, m.get(), -1, + &error.get())}; + if(!r) return translate_error(error->name); + + if((*pdbus_set_error_from_message)(&error.get(), r.get())) + return translate_error(error->name); + + return 0; +} +} // extern "C" + +#else + +extern "C" { +int rtkit_make_realtime(DBusConnection *connection, pid_t thread, int priority) +{ return -ENOTSUP; } + +int rtkit_make_high_priority(DBusConnection *connection, pid_t thread, int nice_level) +{ 
return -ENOTSUP; } + +int rtkit_get_max_realtime_priority(DBusConnection *connection) +{ return -ENOTSUP; } + +int rtkit_get_min_nice_level(DBusConnection *connection, int *min_nice_level) +{ return -ENOTSUP; } + +long long rtkit_get_rttime_usec_max(DBusConnection *connection) +{ return -ENOTSUP; } +} // extern "C" + +#endif diff --git a/core/rtkit.h b/core/rtkit.h new file mode 100644 index 00000000..96e81d4a --- /dev/null +++ b/core/rtkit.h @@ -0,0 +1,80 @@ +/*-*- Mode: C; c-basic-offset: 8 -*-*/ + +#ifndef foortkithfoo +#define foortkithfoo + +/*** + Copyright 2009 Lennart Poettering + Copyright 2010 David Henningsson <[email protected]> + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation files + (the "Software"), to deal in the Software without restriction, + including without limitation the rights to use, copy, modify, merge, + publish, distribute, sublicense, and/or sell copies of the Software, + and to permit persons to whom the Software is furnished to do so, + subject to the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +***/ + +#include <sys/types.h> + +#include "dbus_wrap.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* This is the reference implementation for a client for + * RealtimeKit. You don't have to use this, but if do, just copy these + * sources into your repository */ + +#define RTKIT_SERVICE_NAME "org.freedesktop.RealtimeKit1" +#define RTKIT_OBJECT_PATH "/org/freedesktop/RealtimeKit1" + +/* This is mostly equivalent to sched_setparam(thread, SCHED_RR, { + * .sched_priority = priority }). 'thread' needs to be a kernel thread + * id as returned by gettid(), not a pthread_t! If 'thread' is 0 the + * current thread is used. The returned value is a negative errno + * style error code, or 0 on success. */ +int rtkit_make_realtime(DBusConnection *system_bus, pid_t thread, int priority); + +/* This is mostly equivalent to setpriority(PRIO_PROCESS, thread, + * nice_level). 'thread' needs to be a kernel thread id as returned by + * gettid(), not a pthread_t! If 'thread' is 0 the current thread is + * used. The returned value is a negative errno style error code, or 0 + * on success.*/ +int rtkit_make_high_priority(DBusConnection *system_bus, pid_t thread, int nice_level); + +/* Return the maximum value of realtime priority available. Realtime requests + * above this value will fail. A negative value is an errno style error code. + */ +int rtkit_get_max_realtime_priority(DBusConnection *system_bus); + +/* Retreive the minimum value of nice level available. High prio requests + * below this value will fail. The returned value is a negative errno + * style error code, or 0 on success.*/ +int rtkit_get_min_nice_level(DBusConnection *system_bus, int *min_nice_level); + +/* Return the maximum value of RLIMIT_RTTIME to set before attempting a + * realtime request. A negative value is an errno style error code. 
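 * For illustration, a minimal client sketch combining these calls (it links
 * libdbus directly instead of the loader wrappers used elsewhere in this
 * tree, trims error handling, and the helper name is an assumption):
 *
 *   #include <errno.h>
 *   #include <sys/resource.h>
 *   #include <dbus/dbus.h>
 *   #include "rtkit.h"
 *
 *   static int try_make_realtime(int priority)
 *   {
 *       DBusError error;
 *       dbus_error_init(&error);
 *       DBusConnection *conn = dbus_bus_get(DBUS_BUS_SYSTEM, &error);
 *       if(!conn) { dbus_error_free(&error); return -EIO; }
 *
 *       // RealtimeKit rejects requests unless RLIMIT_RTTIME is capped.
 *       long long maxrttime = rtkit_get_rttime_usec_max(conn);
 *       if(maxrttime > 0)
 *       {
 *           struct rlimit rlim;
 *           rlim.rlim_cur = rlim.rlim_max = (rlim_t)maxrttime;
 *           setrlimit(RLIMIT_RTTIME, &rlim);
 *       }
 *
 *       int maxprio = rtkit_get_max_realtime_priority(conn);
 *       if(maxprio < 0) return maxprio;
 *       if(priority > maxprio) priority = maxprio;
 *
 *       return rtkit_make_realtime(conn, 0, priority); // 0 = current thread
 *   }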
+ */ +long long rtkit_get_rttime_usec_max(DBusConnection *system_bus); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/core/uhjfilter.cpp b/core/uhjfilter.cpp index 92f35901..f1af4b94 100644 --- a/core/uhjfilter.cpp +++ b/core/uhjfilter.cpp @@ -3,227 +3,49 @@ #include "uhjfilter.h" -#ifdef HAVE_SSE_INTRINSICS -#include <xmmintrin.h> -#elif defined(HAVE_NEON) -#include <arm_neon.h> -#endif - #include <algorithm> #include <iterator> #include "alcomplex.h" #include "alnumeric.h" #include "opthelpers.h" +#include "phase_shifter.h" namespace { -using complex_d = std::complex<double>; - -struct PhaseShifterT { - alignas(16) std::array<float,Uhj2Encoder::sFilterSize> Coeffs; - - /* Some notes on this filter construction. - * - * A wide-band phase-shift filter needs a delay to maintain linearity. A - * dirac impulse in the center of a time-domain buffer represents a filter - * passing all frequencies through as-is with a pure delay. Converting that - * to the frequency domain, adjusting the phase of each frequency bin by - * +90 degrees, then converting back to the time domain, results in a FIR - * filter that applies a +90 degree wide-band phase-shift. - * - * A particularly notable aspect of the time-domain filter response is that - * every other coefficient is 0. This allows doubling the effective size of - * the filter, by storing only the non-0 coefficients and double-stepping - * over the input to apply it. - * - * Additionally, the resulting filter is independent of the sample rate. - * The same filter can be applied regardless of the device's sample rate - * and achieve the same effect. - */ - PhaseShifterT() - { - constexpr size_t fft_size{Uhj2Encoder::sFilterSize * 2}; - constexpr size_t half_size{fft_size / 2}; - - /* Generate a frequency domain impulse with a +90 degree phase offset. - * Reconstruct the mirrored frequencies to convert to the time domain. - */ - auto fftBuffer = std::make_unique<complex_d[]>(fft_size); - std::fill_n(fftBuffer.get(), fft_size, complex_d{}); - fftBuffer[half_size] = 1.0; - - forward_fft({fftBuffer.get(), fft_size}); - for(size_t i{0};i < half_size+1;++i) - fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()}; - for(size_t i{half_size+1};i < fft_size;++i) - fftBuffer[i] = std::conj(fftBuffer[fft_size - i]); - inverse_fft({fftBuffer.get(), fft_size}); - - /* Reverse the filter for simpler processing, and store only the non-0 - * coefficients. 
- */ - auto fftiter = fftBuffer.get() + half_size + (Uhj2Encoder::sFilterSize-1); - for(float &coeff : Coeffs) - { - coeff = static_cast<float>(fftiter->real() / double{fft_size}); - fftiter -= 2; - } - } -}; -const PhaseShifterT PShift{}; - -void allpass_process(al::span<float> dst, const float *RESTRICT src) -{ -#ifdef HAVE_SSE_INTRINSICS - size_t pos{0}; - if(size_t todo{dst.size()>>1}) - { - do { - __m128 r04{_mm_setzero_ps()}; - __m128 r14{_mm_setzero_ps()}; - for(size_t j{0};j < PShift.Coeffs.size();j+=4) - { - const __m128 coeffs{_mm_load_ps(&PShift.Coeffs[j])}; - const __m128 s0{_mm_loadu_ps(&src[j*2])}; - const __m128 s1{_mm_loadu_ps(&src[j*2 + 4])}; - - __m128 s{_mm_shuffle_ps(s0, s1, _MM_SHUFFLE(2, 0, 2, 0))}; - r04 = _mm_add_ps(r04, _mm_mul_ps(s, coeffs)); - - s = _mm_shuffle_ps(s0, s1, _MM_SHUFFLE(3, 1, 3, 1)); - r14 = _mm_add_ps(r14, _mm_mul_ps(s, coeffs)); - } - r04 = _mm_add_ps(r04, _mm_shuffle_ps(r04, r04, _MM_SHUFFLE(0, 1, 2, 3))); - r04 = _mm_add_ps(r04, _mm_movehl_ps(r04, r04)); - dst[pos++] += _mm_cvtss_f32(r04); - - r14 = _mm_add_ps(r14, _mm_shuffle_ps(r14, r14, _MM_SHUFFLE(0, 1, 2, 3))); - r14 = _mm_add_ps(r14, _mm_movehl_ps(r14, r14)); - dst[pos++] += _mm_cvtss_f32(r14); - - src += 2; - } while(--todo); - } - if((dst.size()&1)) - { - __m128 r4{_mm_setzero_ps()}; - for(size_t j{0};j < PShift.Coeffs.size();j+=4) - { - const __m128 coeffs{_mm_load_ps(&PShift.Coeffs[j])}; - /* NOTE: This could alternatively be done with two unaligned loads - * and a shuffle. Which would be better? - */ - const __m128 s{_mm_setr_ps(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])}; - r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs)); - } - r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3))); - r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4)); - - dst[pos] += _mm_cvtss_f32(r4); - } - -#elif defined(HAVE_NEON) +static_assert(UhjEncoder::sFilterDelay==UhjDecoder::sFilterDelay, "UHJ filter delays mismatch"); - size_t pos{0}; - if(size_t todo{dst.size()>>1}) - { - /* There doesn't seem to be NEON intrinsics to do this kind of stipple - * shuffling, so there's two custom methods for it. 
- */ - auto shuffle_2020 = [](float32x4_t a, float32x4_t b) - { - float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 0))}; - ret = vsetq_lane_f32(vgetq_lane_f32(a, 2), ret, 1); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 0), ret, 2); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 2), ret, 3); - return ret; - }; - auto shuffle_3131 = [](float32x4_t a, float32x4_t b) - { - float32x4_t ret{vmovq_n_f32(vgetq_lane_f32(a, 1))}; - ret = vsetq_lane_f32(vgetq_lane_f32(a, 3), ret, 1); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 1), ret, 2); - ret = vsetq_lane_f32(vgetq_lane_f32(b, 3), ret, 3); - return ret; - }; - do { - float32x4_t r04{vdupq_n_f32(0.0f)}; - float32x4_t r14{vdupq_n_f32(0.0f)}; - for(size_t j{0};j < PShift.Coeffs.size();j+=4) - { - const float32x4_t coeffs{vld1q_f32(&PShift.Coeffs[j])}; - const float32x4_t s0{vld1q_f32(&src[j*2])}; - const float32x4_t s1{vld1q_f32(&src[j*2 + 4])}; - - r04 = vmlaq_f32(r04, shuffle_2020(s0, s1), coeffs); - r14 = vmlaq_f32(r14, shuffle_3131(s0, s1), coeffs); - } - r04 = vaddq_f32(r04, vrev64q_f32(r04)); - dst[pos++] = vget_lane_f32(vadd_f32(vget_low_f32(r04), vget_high_f32(r04)), 0); - - r14 = vaddq_f32(r14, vrev64q_f32(r14)); - dst[pos++] = vget_lane_f32(vadd_f32(vget_low_f32(r14), vget_high_f32(r14)), 0); - - src += 2; - } while(--todo); - } - if((dst.size()&1)) - { - auto load4 = [](float32_t a, float32_t b, float32_t c, float32_t d) - { - float32x4_t ret{vmovq_n_f32(a)}; - ret = vsetq_lane_f32(b, ret, 1); - ret = vsetq_lane_f32(c, ret, 2); - ret = vsetq_lane_f32(d, ret, 3); - return ret; - }; - float32x4_t r4{vdupq_n_f32(0.0f)}; - for(size_t j{0};j < PShift.Coeffs.size();j+=4) - { - const float32x4_t coeffs{vld1q_f32(&PShift.Coeffs[j])}; - const float32x4_t s{load4(src[j*2], src[j*2 + 2], src[j*2 + 4], src[j*2 + 6])}; - r4 = vmlaq_f32(r4, s, coeffs); - } - r4 = vaddq_f32(r4, vrev64q_f32(r4)); - dst[pos] = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0); - } - -#else - - for(float &output : dst) - { - float ret{0.0f}; - for(size_t j{0};j < PShift.Coeffs.size();++j) - ret += src[j*2] * PShift.Coeffs[j]; +using complex_d = std::complex<double>; - output += ret; - ++src; - } -#endif -} +const PhaseShifterT<UhjEncoder::sFilterDelay*2> PShift{}; } // namespace -/* Encoding 2-channel UHJ from B-Format is done as: +/* Encoding UHJ from B-Format is done as: * * S = 0.9396926*W + 0.1855740*X * D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y * * Left = (S + D)/2.0 * Right = (S - D)/2.0 + * T = j(-0.1432*W + 0.6511746*X) - 0.7071068*Y + * Q = 0.9772*Z * - * where j is a wide-band +90 degree phase shift. + * where j is a wide-band +90 degree phase shift. T is excluded from 2-channel + * output, and Q is excluded from 2- and 3-channel output. * * The phase shift is done using a FIR filter derived from an FFT'd impulse * with the desired shift. */ -void Uhj2Encoder::encode(FloatBufferLine &LeftOut, FloatBufferLine &RightOut, +void UhjEncoder::encode(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut, const FloatBufferLine *InSamples, const size_t SamplesToDo) { + /* Given FuMa input, a +3dB boost is needed for the expected levels. 
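 * The boost is the sqrt2 factor applied below: 20*log10(sqrt(2)) is roughly
 * +3.01dB.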
*/ + static constexpr float sqrt2{1.41421356237f}; + ASSUME(SamplesToDo > 0); float *RESTRICT left{al::assume_aligned<16>(LeftOut.data())}; @@ -233,43 +55,120 @@ void Uhj2Encoder::encode(FloatBufferLine &LeftOut, FloatBufferLine &RightOut, const float *RESTRICT xinput{al::assume_aligned<16>(InSamples[1].data())}; const float *RESTRICT yinput{al::assume_aligned<16>(InSamples[2].data())}; - /* Combine the previously delayed mid/side signal with the input. */ + /* Combine the previously delayed S/D signal with the input. Include any + * existing direct signal with it. + */ /* S = 0.9396926*W + 0.1855740*X */ - auto miditer = std::copy(mMidDelay.cbegin(), mMidDelay.cend(), mMid.begin()); + auto miditer = mS.begin() + sFilterDelay; std::transform(winput, winput+SamplesToDo, xinput, miditer, [](const float w, const float x) noexcept -> float - { return 0.9396926f*w + 0.1855740f*x; }); + { return 0.9396926f*sqrt2*w + 0.1855740f*sqrt2*x; }); + for(size_t i{0};i < SamplesToDo;++i,++miditer) + *miditer += left[i] + right[i]; /* D = 0.6554516*Y */ - auto sideiter = std::copy(mSideDelay.cbegin(), mSideDelay.cend(), mSide.begin()); + auto sideiter = mD.begin() + sFilterDelay; std::transform(yinput, yinput+SamplesToDo, sideiter, - [](const float y) noexcept -> float { return 0.6554516f*y; }); - - /* Include any existing direct signal in the mid/side buffers. */ - for(size_t i{0};i < SamplesToDo;++i,++miditer) - *miditer += left[i] + right[i]; + [](const float y) noexcept -> float { return 0.6554516f*sqrt2*y; }); for(size_t i{0};i < SamplesToDo;++i,++sideiter) *sideiter += left[i] - right[i]; - /* Copy the future samples back to the delay buffers for next time. */ - std::copy_n(mMid.cbegin()+SamplesToDo, mMidDelay.size(), mMidDelay.begin()); - std::copy_n(mSide.cbegin()+SamplesToDo, mSideDelay.size(), mSideDelay.begin()); - - /* Now add the all-passed signal into the side signal. */ - /* D += j(-0.3420201*W + 0.5098604*X) */ - auto tmpiter = std::copy(mSideHistory.cbegin(), mSideHistory.cend(), mTemp.begin()); + auto tmpiter = std::copy(mWXHistory.cbegin(), mWXHistory.cend(), mTemp.begin()); std::transform(winput, winput+SamplesToDo, xinput, tmpiter, [](const float w, const float x) noexcept -> float - { return -0.3420201f*w + 0.5098604f*x; }); - std::copy_n(mTemp.cbegin()+SamplesToDo, mSideHistory.size(), mSideHistory.begin()); - allpass_process({mSide.data(), SamplesToDo}, mTemp.data()); + { return -0.3420201f*sqrt2*w + 0.5098604f*sqrt2*x; }); + std::copy_n(mTemp.cbegin()+SamplesToDo, mWXHistory.size(), mWXHistory.begin()); + PShift.processAccum({mD.data(), SamplesToDo}, mTemp.data()); /* Left = (S + D)/2.0 */ for(size_t i{0};i < SamplesToDo;i++) - left[i] = (mMid[i] + mSide[i]) * 0.5f; + left[i] = (mS[i] + mD[i]) * 0.5f; /* Right = (S - D)/2.0 */ for(size_t i{0};i < SamplesToDo;i++) - right[i] = (mMid[i] - mSide[i]) * 0.5f; + right[i] = (mS[i] - mD[i]) * 0.5f; + + /* Copy the future samples to the front for next time. */ + std::copy(mS.cbegin()+SamplesToDo, mS.cbegin()+SamplesToDo+sFilterDelay, mS.begin()); + std::copy(mD.cbegin()+SamplesToDo, mD.cbegin()+SamplesToDo+sFilterDelay, mD.begin()); +} + + +/* Decoding UHJ is done as: + * + * S = Left + Right + * D = Left - Right + * + * W = 0.981530*S + 0.197484*j(0.828347*D + 0.767835*T) + * X = 0.418504*S - j(0.828347*D + 0.767835*T) + * Y = 0.795954*D - 0.676406*T + j(0.186626*S) + * Z = 1.023332*Q + * + * where j is a +90 degree phase shift. 3-channel UHJ excludes Q, while 2- + * channel excludes Q and T. 
The B-Format signal reconstructed from 2-channel + * UHJ should not be run through a normal B-Format decoder, as it needs + * different shelf filters. + */ +void UhjDecoder::decode(const al::span<BufferLine> samples, const size_t offset, + const size_t samplesToDo, const size_t forwardSamples) +{ + /* A -3dB attenuation is needed for FuMa output. */ + static constexpr float sqrt1_2{0.707106781187f}; + + ASSUME(samplesToDo > 0); + + { + const float *RESTRICT left{al::assume_aligned<16>(samples[0].data() + offset)}; + const float *RESTRICT right{al::assume_aligned<16>(samples[1].data() + offset)}; + const float *RESTRICT t{al::assume_aligned<16>(samples[2].data() + offset)}; + + /* S = Left + Right */ + for(size_t i{0};i < samplesToDo+sFilterDelay;++i) + mS[i] = (left[i] + right[i]) * sqrt1_2; + + /* D = Left - Right */ + for(size_t i{0};i < samplesToDo+sFilterDelay;++i) + mD[i] = (left[i] - right[i]) * sqrt1_2; + + /* T */ + for(size_t i{0};i < samplesToDo+sFilterDelay;++i) + mT[i] = t[i] * sqrt1_2; + } + + float *RESTRICT woutput{al::assume_aligned<16>(samples[0].data() + offset)}; + float *RESTRICT xoutput{al::assume_aligned<16>(samples[1].data() + offset)}; + float *RESTRICT youtput{al::assume_aligned<16>(samples[2].data() + offset)}; + + /* Precompute j(0.828347*D + 0.767835*T) and store in xoutput. */ + auto tmpiter = std::copy(mDTHistory.cbegin(), mDTHistory.cend(), mTemp.begin()); + std::transform(mD.cbegin(), mD.cbegin()+samplesToDo+sFilterDelay, mT.cbegin(), tmpiter, + [](const float d, const float t) noexcept { return 0.828347f*d + 0.767835f*t; }); + std::copy_n(mTemp.cbegin()+forwardSamples, mDTHistory.size(), mDTHistory.begin()); + PShift.process({xoutput, samplesToDo}, mTemp.data()); + + /* W = 0.981530*S + 0.197484*j(0.828347*D + 0.767835*T) */ + for(size_t i{0};i < samplesToDo;++i) + woutput[i] = 0.981530f*mS[i] + 0.197484f*xoutput[i]; + /* X = 0.418504*S - j(0.828347*D + 0.767835*T) */ + for(size_t i{0};i < samplesToDo;++i) + xoutput[i] = 0.418504f*mS[i] - xoutput[i]; + + /* Precompute j*S and store in youtput. */ + tmpiter = std::copy(mSHistory.cbegin(), mSHistory.cend(), mTemp.begin()); + std::copy_n(mS.cbegin(), samplesToDo+sFilterDelay, tmpiter); + std::copy_n(mTemp.cbegin()+forwardSamples, mSHistory.size(), mSHistory.begin()); + PShift.process({youtput, samplesToDo}, mTemp.data()); + + /* Y = 0.795954*D - 0.676406*T + j(0.186626*S) */ + for(size_t i{0};i < samplesToDo;++i) + youtput[i] = 0.795954f*mD[i] - 0.676406f*mT[i] + 0.186626f*youtput[i]; + + if(samples.size() > 3) + { + float *RESTRICT zoutput{samples[3].data() + offset}; + /* Z = 1.023332*Q */ + for(size_t i{0};i < samplesToDo;++i) + zoutput[i] = 1.023332f*sqrt1_2*zoutput[i]; + } } diff --git a/core/uhjfilter.h b/core/uhjfilter.h index c2cb8722..c04913b4 100644 --- a/core/uhjfilter.h +++ b/core/uhjfilter.h @@ -5,35 +5,60 @@ #include "almalloc.h" #include "bufferline.h" +#include "resampler_limits.h" -struct Uhj2Encoder { - /* A particular property of the filter allows it to cover nearly twice its - * length, so the filter size is also the effective delay (despite being - * center-aligned). +struct UhjEncoder { + /* The filter delay is half it's effective size, so a delay of 128 has a + * FIR length of 256. */ - constexpr static size_t sFilterSize{128}; + constexpr static size_t sFilterDelay{128}; - /* Delays for the unfiltered signal. 
*/ - alignas(16) std::array<float,sFilterSize> mMidDelay{}; - alignas(16) std::array<float,sFilterSize> mSideDelay{}; - - alignas(16) std::array<float,BufferLineSize+sFilterSize> mMid{}; - alignas(16) std::array<float,BufferLineSize+sFilterSize> mSide{}; + /* Delays and processing storage for the unfiltered signal. */ + alignas(16) std::array<float,BufferLineSize+sFilterDelay> mS{}; + alignas(16) std::array<float,BufferLineSize+sFilterDelay> mD{}; /* History for the FIR filter. */ - alignas(16) std::array<float,sFilterSize*2 - 1> mSideHistory{}; + alignas(16) std::array<float,sFilterDelay*2 - 1> mWXHistory{}; - alignas(16) std::array<float,BufferLineSize + sFilterSize*2> mTemp{}; + alignas(16) std::array<float,BufferLineSize + sFilterDelay*2> mTemp{}; /** * Encodes a 2-channel UHJ (stereo-compatible) signal from a B-Format input * signal. The input must use FuMa channel ordering and scaling. */ - void encode(FloatBufferLine &LeftOut, FloatBufferLine &RightOut, + void encode(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut, const FloatBufferLine *InSamples, const size_t SamplesToDo); - DEF_NEWDEL(Uhj2Encoder) + DEF_NEWDEL(UhjEncoder) +}; + + +struct UhjDecoder { + constexpr static size_t sFilterDelay{128}; + + constexpr static size_t sLineSize{BufferLineSize+MaxResamplerPadding+sFilterDelay}; + using BufferLine = std::array<float,sLineSize>; + + alignas(16) std::array<float,BufferLineSize+MaxResamplerEdge+sFilterDelay> mS{}; + alignas(16) std::array<float,BufferLineSize+MaxResamplerEdge+sFilterDelay> mD{}; + alignas(16) std::array<float,BufferLineSize+MaxResamplerEdge+sFilterDelay> mT{}; + + alignas(16) std::array<float,sFilterDelay-1> mDTHistory{}; + alignas(16) std::array<float,sFilterDelay-1> mSHistory{}; + + alignas(16) std::array<float,BufferLineSize+MaxResamplerEdge + sFilterDelay*2> mTemp{}; + + /** + * Decodes a 3- or 4-channel UHJ signal into a B-Format signal with FuMa + * channel ordering and scaling. For 3-channel, the 3rd channel may be + * attenuated by 'n', where 0 <= n <= 1. So 2-channel UHJ can be decoded by + * leaving the 3rd channel input silent (n=0). 
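 * A rough usage sketch (buffer handling is simplified and the names are
 * assumptions, not part of this header):
 *
 *   UhjDecoder uhj;
 *   std::array<UhjDecoder::BufferLine,3> chans{}; // Left, Right, T (T silent => 2-channel)
 *   // ...copy 'todo' input frames into chans[0] and chans[1]...
 *   uhj.decode({chans.data(), chans.size()}, 0, todo, todo);
 *   // chans[0..2] now hold the FuMa-scaled W, X and Y signals.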
+ */ + void decode(const al::span<BufferLine> samples, const size_t offset, const size_t samplesToDo, + const size_t forwardSamples); + + DEF_NEWDEL(UhjDecoder) }; #endif /* CORE_UHJFILTER_H */ diff --git a/core/uiddefs.cpp b/core/uiddefs.cpp new file mode 100644 index 00000000..244c01a5 --- /dev/null +++ b/core/uiddefs.cpp @@ -0,0 +1,37 @@ + +#include "config.h" + + +#ifndef AL_NO_UID_DEFS + +#if defined(HAVE_GUIDDEF_H) || defined(HAVE_INITGUID_H) +#define INITGUID +#include <windows.h> +#ifdef HAVE_GUIDDEF_H +#include <guiddef.h> +#else +#include <initguid.h> +#endif + +DEFINE_GUID(KSDATAFORMAT_SUBTYPE_PCM, 0x00000001, 0x0000, 0x0010, 0x80,0x00, 0x00,0xaa,0x00,0x38,0x9b,0x71); +DEFINE_GUID(KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, 0x00000003, 0x0000, 0x0010, 0x80,0x00, 0x00,0xaa,0x00,0x38,0x9b,0x71); + +DEFINE_GUID(IID_IDirectSoundNotify, 0xb0210783, 0x89cd, 0x11d0, 0xaf,0x08, 0x00,0xa0,0xc9,0x25,0xcd,0x16); + +DEFINE_GUID(CLSID_MMDeviceEnumerator, 0xbcde0395, 0xe52f, 0x467c, 0x8e,0x3d, 0xc4,0x57,0x92,0x91,0x69,0x2e); +DEFINE_GUID(IID_IMMDeviceEnumerator, 0xa95664d2, 0x9614, 0x4f35, 0xa7,0x46, 0xde,0x8d,0xb6,0x36,0x17,0xe6); +DEFINE_GUID(IID_IAudioClient, 0x1cb9ad4c, 0xdbfa, 0x4c32, 0xb1,0x78, 0xc2,0xf5,0x68,0xa7,0x03,0xb2); +DEFINE_GUID(IID_IAudioRenderClient, 0xf294acfc, 0x3146, 0x4483, 0xa7,0xbf, 0xad,0xdc,0xa7,0xc2,0x60,0xe2); +DEFINE_GUID(IID_IAudioCaptureClient, 0xc8adbd64, 0xe71e, 0x48a0, 0xa4,0xde, 0x18,0x5c,0x39,0x5c,0xd3,0x17); + +#ifdef HAVE_WASAPI +#include <wtypes.h> +#include <devpropdef.h> +#include <propkeydef.h> +DEFINE_DEVPROPKEY(DEVPKEY_Device_FriendlyName, 0xa45c254e, 0xdf1c, 0x4efd, 0x80,0x20, 0x67,0xd1,0x46,0xa8,0x50,0xe0, 14); +DEFINE_PROPERTYKEY(PKEY_AudioEndpoint_FormFactor, 0x1da5d803, 0xd492, 0x4edd, 0x8c,0x23, 0xe0,0xc0,0xff,0xee,0x7f,0x0e, 0); +DEFINE_PROPERTYKEY(PKEY_AudioEndpoint_GUID, 0x1da5d803, 0xd492, 0x4edd, 0x8c, 0x23,0xe0, 0xc0,0xff,0xee,0x7f,0x0e, 4 ); +#endif +#endif + +#endif /* AL_NO_UID_DEFS */ diff --git a/core/voice.cpp b/core/voice.cpp new file mode 100644 index 00000000..c764a277 --- /dev/null +++ b/core/voice.cpp @@ -0,0 +1,849 @@ + +#include "config.h" + +#include "voice.h" + +#include <algorithm> +#include <array> +#include <atomic> +#include <cassert> +#include <cstdint> +#include <iterator> +#include <memory> +#include <new> +#include <stdlib.h> +#include <utility> +#include <vector> + +#include "albyte.h" +#include "alnumeric.h" +#include "aloptional.h" +#include "alspan.h" +#include "alstring.h" +#include "ambidefs.h" +#include "async_event.h" +#include "buffer_storage.h" +#include "context.h" +#include "cpu_caps.h" +#include "devformat.h" +#include "device.h" +#include "filters/biquad.h" +#include "filters/nfc.h" +#include "filters/splitter.h" +#include "fmt_traits.h" +#include "logging.h" +#include "mixer.h" +#include "mixer/defs.h" +#include "mixer/hrtfdefs.h" +#include "opthelpers.h" +#include "resampler_limits.h" +#include "ringbuffer.h" +#include "vector.h" +#include "voice_change.h" + +struct CTag; +#ifdef HAVE_SSE +struct SSETag; +#endif +#ifdef HAVE_NEON +struct NEONTag; +#endif +struct CopyTag; + + +static_assert(!(sizeof(Voice::BufferLine)&15), "Voice::BufferLine must be a multiple of 16 bytes"); + +Resampler ResamplerDefault{Resampler::Linear}; + +namespace { + +using uint = unsigned int; + +using HrtfMixerFunc = void(*)(const float *InSamples, float2 *AccumSamples, const uint IrSize, + const MixHrtfFilter *hrtfparams, const size_t BufferSize); +using HrtfMixerBlendFunc = void(*)(const float *InSamples, float2 *AccumSamples, + 
const uint IrSize, const HrtfFilter *oldparams, const MixHrtfFilter *newparams, + const size_t BufferSize); + +HrtfMixerFunc MixHrtfSamples{MixHrtf_<CTag>}; +HrtfMixerBlendFunc MixHrtfBlendSamples{MixHrtfBlend_<CTag>}; + +inline MixerFunc SelectMixer() +{ +#ifdef HAVE_NEON + if((CPUCapFlags&CPU_CAP_NEON)) + return Mix_<NEONTag>; +#endif +#ifdef HAVE_SSE + if((CPUCapFlags&CPU_CAP_SSE)) + return Mix_<SSETag>; +#endif + return Mix_<CTag>; +} + +inline HrtfMixerFunc SelectHrtfMixer() +{ +#ifdef HAVE_NEON + if((CPUCapFlags&CPU_CAP_NEON)) + return MixHrtf_<NEONTag>; +#endif +#ifdef HAVE_SSE + if((CPUCapFlags&CPU_CAP_SSE)) + return MixHrtf_<SSETag>; +#endif + return MixHrtf_<CTag>; +} + +inline HrtfMixerBlendFunc SelectHrtfBlendMixer() +{ +#ifdef HAVE_NEON + if((CPUCapFlags&CPU_CAP_NEON)) + return MixHrtfBlend_<NEONTag>; +#endif +#ifdef HAVE_SSE + if((CPUCapFlags&CPU_CAP_SSE)) + return MixHrtfBlend_<SSETag>; +#endif + return MixHrtfBlend_<CTag>; +} + +} // namespace + +void Voice::InitMixer(al::optional<std::string> resampler) +{ + if(resampler) + { + struct ResamplerEntry { + const char name[16]; + const Resampler resampler; + }; + constexpr ResamplerEntry ResamplerList[]{ + { "none", Resampler::Point }, + { "point", Resampler::Point }, + { "linear", Resampler::Linear }, + { "cubic", Resampler::Cubic }, + { "bsinc12", Resampler::BSinc12 }, + { "fast_bsinc12", Resampler::FastBSinc12 }, + { "bsinc24", Resampler::BSinc24 }, + { "fast_bsinc24", Resampler::FastBSinc24 }, + }; + + const char *str{resampler->c_str()}; + if(al::strcasecmp(str, "bsinc") == 0) + { + WARN("Resampler option \"%s\" is deprecated, using bsinc12\n", str); + str = "bsinc12"; + } + else if(al::strcasecmp(str, "sinc4") == 0 || al::strcasecmp(str, "sinc8") == 0) + { + WARN("Resampler option \"%s\" is deprecated, using cubic\n", str); + str = "cubic"; + } + + auto iter = std::find_if(std::begin(ResamplerList), std::end(ResamplerList), + [str](const ResamplerEntry &entry) -> bool + { return al::strcasecmp(str, entry.name) == 0; }); + if(iter == std::end(ResamplerList)) + ERR("Invalid resampler: %s\n", str); + else + ResamplerDefault = iter->resampler; + } + + MixSamples = SelectMixer(); + MixHrtfBlendSamples = SelectHrtfBlendMixer(); + MixHrtfSamples = SelectHrtfMixer(); +} + + +namespace { + +void SendSourceStoppedEvent(ContextBase *context, uint id) +{ + RingBuffer *ring{context->mAsyncEvents.get()}; + auto evt_vec = ring->getWriteVector(); + if(evt_vec.first.len < 1) return; + + AsyncEvent *evt{::new(evt_vec.first.buf) AsyncEvent{EventType_SourceStateChange}}; + evt->u.srcstate.id = id; + evt->u.srcstate.state = AsyncEvent::SrcState::Stop; + + ring->writeAdvance(1); +} + + +const float *DoFilters(BiquadFilter &lpfilter, BiquadFilter &hpfilter, float *dst, + const al::span<const float> src, int type) +{ + switch(type) + { + case AF_None: + lpfilter.clear(); + hpfilter.clear(); + break; + + case AF_LowPass: + lpfilter.process(src, dst); + hpfilter.clear(); + return dst; + case AF_HighPass: + lpfilter.clear(); + hpfilter.process(src, dst); + return dst; + + case AF_BandPass: + DualBiquad{lpfilter, hpfilter}.process(src, dst); + return dst; + } + return src.data(); +} + + +void LoadSamples(const al::span<Voice::BufferLine> dstSamples, const size_t dstOffset, + const al::byte *src, const size_t srcOffset, const FmtType srctype, const FmtChannels srcchans, + const size_t samples) noexcept +{ +#define HANDLE_FMT(T) case T: \ + { \ + constexpr size_t sampleSize{sizeof(al::FmtTypeTraits<T>::Type)}; \ + if(srcchans == FmtUHJ2) \ + { \ + 
constexpr size_t srcstep{2u}; \ + src += srcOffset*srcstep*sampleSize; \ + al::LoadSampleArray<T>(dstSamples[0].data() + dstOffset, src, \ + srcstep, samples); \ + al::LoadSampleArray<T>(dstSamples[1].data() + dstOffset, \ + src + sampleSize, srcstep, samples); \ + std::fill_n(dstSamples[2].data() + dstOffset, samples, 0.0f); \ + } \ + else \ + { \ + const size_t srcstep{dstSamples.size()}; \ + src += srcOffset*srcstep*sampleSize; \ + for(auto &dst : dstSamples) \ + { \ + al::LoadSampleArray<T>(dst.data() + dstOffset, src, srcstep, \ + samples); \ + src += sampleSize; \ + } \ + } \ + } \ + break + + switch(srctype) + { + HANDLE_FMT(FmtUByte); + HANDLE_FMT(FmtShort); + HANDLE_FMT(FmtFloat); + HANDLE_FMT(FmtDouble); + HANDLE_FMT(FmtMulaw); + HANDLE_FMT(FmtAlaw); + } +#undef HANDLE_FMT +} + +void LoadBufferStatic(VoiceBufferItem *buffer, VoiceBufferItem *bufferLoopItem, + const size_t dataPosInt, const FmtType sampleType, const FmtChannels sampleChannels, + const size_t samplesToLoad, const al::span<Voice::BufferLine> voiceSamples) +{ + const uint loopStart{buffer->mLoopStart}; + const uint loopEnd{buffer->mLoopEnd}; + ASSUME(loopEnd > loopStart); + + /* If current pos is beyond the loop range, do not loop */ + if(!bufferLoopItem || dataPosInt >= loopEnd) + { + /* Load what's left to play from the buffer */ + const size_t remaining{minz(samplesToLoad, buffer->mSampleLen-dataPosInt)}; + LoadSamples(voiceSamples, MaxResamplerEdge, buffer->mSamples, dataPosInt, sampleType, + sampleChannels, remaining); + + if(const size_t toFill{samplesToLoad - remaining}) + { + for(auto &chanbuffer : voiceSamples) + { + auto srcsamples = chanbuffer.data() + MaxResamplerEdge - 1 + remaining; + std::fill_n(srcsamples + 1, toFill, *srcsamples); + } + } + } + else + { + /* Load what's left of this loop iteration */ + const size_t remaining{minz(samplesToLoad, loopEnd-dataPosInt)}; + LoadSamples(voiceSamples, MaxResamplerEdge, buffer->mSamples, dataPosInt, sampleType, + sampleChannels, remaining); + + /* Load repeats of the loop to fill the buffer. 
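A note on the loader above: once the sample type is dispatched, LoadSamples reduces to a stride-based deinterleave, where each destination channel line takes every Nth sample of the interleaved source (N being the interleaved channel count, with 2-channel UHJ also zero-filling a third line for the decoder). A minimal standalone sketch of that access pattern, with illustrative names only:

#include <cstddef>
#include <vector>

// Deinterleave 'frames' frames of interleaved float samples into one
// destination vector per channel: channel c takes src[i*step + c].
void deinterleave(const float *src, std::vector<std::vector<float>> &dst,
    std::size_t frames)
{
    const std::size_t step{dst.size()};
    for(std::size_t c{0};c < step;++c)
    {
        for(std::size_t i{0};i < frames;++i)
            dst[c][i] = src[i*step + c];
    }
}

The real loader does the same walk per source type, converting mulaw/alaw and integer samples to float as it copies.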
*/ + const auto loopSize = static_cast<size_t>(loopEnd - loopStart); + size_t samplesLoaded{remaining}; + while(const size_t toFill{minz(samplesToLoad - samplesLoaded, loopSize)}) + { + LoadSamples(voiceSamples, MaxResamplerEdge + samplesLoaded, buffer->mSamples, + loopStart, sampleType, sampleChannels, toFill); + samplesLoaded += toFill; + } + } +} + +void LoadBufferCallback(VoiceBufferItem *buffer, const size_t numCallbackSamples, + const FmtType sampleType, const FmtChannels sampleChannels, const size_t samplesToLoad, + const al::span<Voice::BufferLine> voiceSamples) +{ + /* Load what's left to play from the buffer */ + const size_t remaining{minz(samplesToLoad, numCallbackSamples)}; + LoadSamples(voiceSamples, MaxResamplerEdge, buffer->mSamples, 0, sampleType, sampleChannels, + remaining); + + if(const size_t toFill{samplesToLoad - remaining}) + { + for(auto &chanbuffer : voiceSamples) + { + auto srcsamples = chanbuffer.data() + MaxResamplerEdge - 1 + remaining; + std::fill_n(srcsamples + 1, toFill, *srcsamples); + } + } +} + +void LoadBufferQueue(VoiceBufferItem *buffer, VoiceBufferItem *bufferLoopItem, + size_t dataPosInt, const FmtType sampleType, const FmtChannels sampleChannels, + const size_t samplesToLoad, const al::span<Voice::BufferLine> voiceSamples) +{ + /* Crawl the buffer queue to fill in the temp buffer */ + size_t samplesLoaded{0}; + while(buffer && samplesLoaded != samplesToLoad) + { + if(dataPosInt >= buffer->mSampleLen) + { + dataPosInt -= buffer->mSampleLen; + buffer = buffer->mNext.load(std::memory_order_acquire); + if(!buffer) buffer = bufferLoopItem; + continue; + } + + const size_t remaining{minz(samplesToLoad-samplesLoaded, buffer->mSampleLen-dataPosInt)}; + LoadSamples(voiceSamples, MaxResamplerEdge+samplesLoaded, buffer->mSamples, dataPosInt, + sampleType, sampleChannels, remaining); + + samplesLoaded += remaining; + if(samplesLoaded == samplesToLoad) + break; + + dataPosInt = 0; + buffer = buffer->mNext.load(std::memory_order_acquire); + if(!buffer) buffer = bufferLoopItem; + } + if(const size_t toFill{samplesToLoad - samplesLoaded}) + { + size_t chanidx{0}; + for(auto &chanbuffer : voiceSamples) + { + auto srcsamples = chanbuffer.data() + MaxResamplerEdge - 1 + samplesLoaded; + std::fill_n(srcsamples + 1, toFill, *srcsamples); + ++chanidx; + } + } +} + + +void DoHrtfMix(const float *samples, const uint DstBufferSize, DirectParams &parms, + const float TargetGain, const uint Counter, uint OutPos, DeviceBase *Device) +{ + const uint IrSize{Device->mIrSize}; + auto &HrtfSamples = Device->HrtfSourceData; + /* Source HRTF mixing needs to include the direct delay so it remains + * aligned with the direct mix's HRTF filtering. + */ + float2 *AccumSamples{Device->HrtfAccumData + HrtfDirectDelay}; + + /* Copy the HRTF history and new input samples into a temp buffer. */ + auto src_iter = std::copy(parms.Hrtf.History.begin(), parms.Hrtf.History.end(), + std::begin(HrtfSamples)); + std::copy_n(samples, DstBufferSize, src_iter); + /* Copy the last used samples back into the history buffer for later. */ + std::copy_n(std::begin(HrtfSamples) + DstBufferSize, parms.Hrtf.History.size(), + parms.Hrtf.History.begin()); + + /* If fading and this is the first mixing pass, fade between the IRs. */ + uint fademix{0u}; + if(Counter && OutPos == 0) + { + fademix = minu(DstBufferSize, Counter); + + float gain{TargetGain}; + + /* The new coefficients need to fade in completely since they're + * replacing the old ones. 
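Looking back at LoadBufferStatic's looping branch above: after copying what remains of the current pass, it keeps re-reading the loop region until the requested sample count is met. A small sketch of that fill pattern on plain vectors, assuming the position is inside the loop, the loop end is past the loop start, and the output is already sized; names are illustrative:

#include <algorithm>
#include <cstddef>
#include <vector>

// Fill 'out' with 'toLoad' samples starting at 'pos' in 'buffer', wrapping
// within the loop region [loopStart, loopEnd) whenever the end is reached.
void load_looped(const std::vector<float> &buffer, std::size_t pos,
    std::size_t loopStart, std::size_t loopEnd, std::vector<float> &out,
    std::size_t toLoad)
{
    // What's left of the current pass through the loop.
    std::size_t loaded{std::min(toLoad, loopEnd - pos)};
    std::copy_n(buffer.data() + pos, loaded, out.data());

    // Whole or partial repeats of the loop region until satisfied.
    const std::size_t loopLen{loopEnd - loopStart};
    while(loaded < toLoad)
    {
        const std::size_t chunk{std::min(toLoad - loaded, loopLen)};
        std::copy_n(buffer.data() + loopStart, chunk, out.data() + loaded);
        loaded += chunk;
    }
}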
To keep the gain fading consistent, + * interpolate between the old and new target gains given how much of + * the fade time this mix handles. + */ + if(Counter > fademix) + { + const float a{static_cast<float>(fademix) / static_cast<float>(Counter)}; + gain = lerp(parms.Hrtf.Old.Gain, TargetGain, a); + } + + MixHrtfFilter hrtfparams{ + parms.Hrtf.Target.Coeffs, + parms.Hrtf.Target.Delay, + 0.0f, gain / static_cast<float>(fademix)}; + MixHrtfBlendSamples(HrtfSamples, AccumSamples+OutPos, IrSize, &parms.Hrtf.Old, &hrtfparams, + fademix); + + /* Update the old parameters with the result. */ + parms.Hrtf.Old = parms.Hrtf.Target; + parms.Hrtf.Old.Gain = gain; + OutPos += fademix; + } + + if(fademix < DstBufferSize) + { + const uint todo{DstBufferSize - fademix}; + float gain{TargetGain}; + + /* Interpolate the target gain if the gain fading lasts longer than + * this mix. + */ + if(Counter > DstBufferSize) + { + const float a{static_cast<float>(todo) / static_cast<float>(Counter-fademix)}; + gain = lerp(parms.Hrtf.Old.Gain, TargetGain, a); + } + + MixHrtfFilter hrtfparams{ + parms.Hrtf.Target.Coeffs, + parms.Hrtf.Target.Delay, + parms.Hrtf.Old.Gain, + (gain - parms.Hrtf.Old.Gain) / static_cast<float>(todo)}; + MixHrtfSamples(HrtfSamples+fademix, AccumSamples+OutPos, IrSize, &hrtfparams, todo); + + /* Store the now-current gain for next time. */ + parms.Hrtf.Old.Gain = gain; + } +} + +void DoNfcMix(const al::span<const float> samples, FloatBufferLine *OutBuffer, DirectParams &parms, + const float *TargetGains, const uint Counter, const uint OutPos, DeviceBase *Device) +{ + using FilterProc = void (NfcFilter::*)(const al::span<const float>, float*); + static constexpr FilterProc NfcProcess[MaxAmbiOrder+1]{ + nullptr, &NfcFilter::process1, &NfcFilter::process2, &NfcFilter::process3}; + + float *CurrentGains{parms.Gains.Current.data()}; + MixSamples(samples, {OutBuffer, 1u}, CurrentGains, TargetGains, Counter, OutPos); + ++OutBuffer; + ++CurrentGains; + ++TargetGains; + + const al::span<float> nfcsamples{Device->NfcSampleData, samples.size()}; + size_t order{1}; + while(const size_t chancount{Device->NumChannelsPerOrder[order]}) + { + (parms.NFCtrlFilter.*NfcProcess[order])(samples, nfcsamples.data()); + MixSamples(nfcsamples, {OutBuffer, chancount}, CurrentGains, TargetGains, Counter, OutPos); + OutBuffer += chancount; + CurrentGains += chancount; + TargetGains += chancount; + if(++order == MaxAmbiOrder+1) + break; + } +} + +} // namespace + +void Voice::mix(const State vstate, ContextBase *Context, const uint SamplesToDo) +{ + static constexpr std::array<float,MAX_OUTPUT_CHANNELS> SilentTarget{}; + + ASSUME(SamplesToDo > 0); + + /* Get voice info */ + uint DataPosInt{mPosition.load(std::memory_order_relaxed)}; + uint DataPosFrac{mPositionFrac.load(std::memory_order_relaxed)}; + VoiceBufferItem *BufferListItem{mCurrentBuffer.load(std::memory_order_relaxed)}; + VoiceBufferItem *BufferLoopItem{mLoopBuffer.load(std::memory_order_relaxed)}; + const uint increment{mStep}; + if UNLIKELY(increment < 1) + { + /* If the voice is supposed to be stopping but can't be mixed, just + * stop it before bailing. + */ + if(vstate == Stopping) + mPlayState.store(Stopped, std::memory_order_release); + return; + } + + DeviceBase *Device{Context->mDevice}; + const uint NumSends{Device->NumAuxSends}; + + ResamplerFunc Resample{(increment == MixerFracOne && DataPosFrac == 0) ? + Resample_<CopyTag,CTag> : mResampler}; + + uint Counter{(mFlags&VoiceIsFading) ? 
SamplesToDo : 0}; + if(!Counter) + { + /* No fading, just overwrite the old/current params. */ + for(auto &chandata : mChans) + { + { + DirectParams &parms = chandata.mDryParams; + if(!(mFlags&VoiceHasHrtf)) + parms.Gains.Current = parms.Gains.Target; + else + parms.Hrtf.Old = parms.Hrtf.Target; + } + for(uint send{0};send < NumSends;++send) + { + if(mSend[send].Buffer.empty()) + continue; + + SendParams &parms = chandata.mWetParams[send]; + parms.Gains.Current = parms.Gains.Target; + } + } + } + else if UNLIKELY(!BufferListItem) + Counter = std::min(Counter, 64u); + + const uint PostPadding{MaxResamplerEdge + + ((mFmtChannels==FmtUHJ2 || mFmtChannels==FmtUHJ3 || mFmtChannels==FmtUHJ4) + ? uint{UhjDecoder::sFilterDelay} : 0u)}; + uint buffers_done{0u}; + uint OutPos{0u}; + do { + /* Figure out how many buffer samples will be needed */ + uint DstBufferSize{SamplesToDo - OutPos}; + uint SrcBufferSize; + + if(increment <= MixerFracOne) + { + /* Calculate the last written dst sample pos. */ + uint64_t DataSize64{DstBufferSize - 1}; + /* Calculate the last read src sample pos. */ + DataSize64 = (DataSize64*increment + DataPosFrac) >> MixerFracBits; + /* +1 to get the src sample count, include padding. */ + DataSize64 += 1 + PostPadding; + + /* Result is guaranteed to be <= BufferLineSize+ResamplerPrePadding + * since we won't use more src samples than dst samples+padding. + */ + SrcBufferSize = static_cast<uint>(DataSize64); + } + else + { + uint64_t DataSize64{DstBufferSize}; + /* Calculate the end src sample pos, include padding. */ + DataSize64 = (DataSize64*increment + DataPosFrac) >> MixerFracBits; + DataSize64 += PostPadding; + + if(DataSize64 <= LineSize - MaxResamplerEdge) + SrcBufferSize = static_cast<uint>(DataSize64); + else + { + /* If the source size got saturated, we can't fill the desired + * dst size. Figure out how many samples we can actually mix. + */ + SrcBufferSize = LineSize - MaxResamplerEdge; + + DataSize64 = SrcBufferSize - PostPadding; + DataSize64 = ((DataSize64<<MixerFracBits) - DataPosFrac) / increment; + if(DataSize64 < DstBufferSize) + { + /* Some mixers require being 16-byte aligned, so also limit + * to a multiple of 4 samples to maintain alignment. + */ + DstBufferSize = static_cast<uint>(DataSize64) & ~3u; + } + ASSUME(DstBufferSize > 0); + } + } + + if((mFlags&(VoiceIsCallback|VoiceCallbackStopped)) == VoiceIsCallback && BufferListItem) + { + if(SrcBufferSize > mNumCallbackSamples) + { + const size_t byteOffset{mNumCallbackSamples*mFrameSize}; + const size_t needBytes{SrcBufferSize*mFrameSize - byteOffset}; + + const int gotBytes{BufferListItem->mCallback(BufferListItem->mUserData, + &BufferListItem->mSamples[byteOffset], static_cast<int>(needBytes))}; + if(gotBytes < 0) + mFlags |= VoiceCallbackStopped; + else if(static_cast<uint>(gotBytes) < needBytes) + { + mFlags |= VoiceCallbackStopped; + mNumCallbackSamples += static_cast<uint>(static_cast<uint>(gotBytes) / + mFrameSize); + } + else + mNumCallbackSamples = SrcBufferSize; + } + } + + if UNLIKELY(!BufferListItem) + { + for(auto &chanbuffer : mVoiceSamples) + { + auto srciter = chanbuffer.data() + MaxResamplerEdge; + auto srcend = chanbuffer.data() + MaxResamplerPadding; + + /* When loading from a voice that ended prematurely, only take + * the samples that get closest to 0 amplitude. This helps + * certain sounds fade out better. 
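The SrcBufferSize math in mix() above works in fixed point: the resampling step 'increment' and the phase 'DataPosFrac' carry MixerFracBits fractional bits, so the index of the last source sample read for a block of output is ((DstBufferSize-1)*increment + DataPosFrac) >> MixerFracBits, and adding one plus the resampler padding gives the number of source samples to load. A small worked sketch, using an assumed 12 fractional bits purely to get concrete numbers:

#include <cstdint>
#include <cstdio>

// Assumed for illustration; the real constant comes from the mixer headers.
constexpr unsigned FracBits{12};
constexpr std::uint64_t FracOne{1u << FracBits};

// Source samples covered (before padding) by 'dstSize' output samples at the
// given fractional step and starting phase.
std::uint64_t src_span(std::uint64_t dstSize, std::uint64_t increment,
    std::uint64_t posFrac)
{
    const std::uint64_t lastIdx{((dstSize-1)*increment + posFrac) >> FracBits};
    return lastIdx + 1;
}

int main()
{
    // A pitch of 1.5x means increment = 1.5*FracOne = 6144; 256 output
    // samples starting at phase 0 read source indices 0..382, so 383
    // source samples are needed before edge padding is added.
    std::printf("%llu\n", static_cast<unsigned long long>(src_span(256, 6144, 0)));
}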
+ */ + auto abs_lt = [](const float lhs, const float rhs) noexcept -> bool + { return std::abs(lhs) < std::abs(rhs); }; + srciter = std::min_element(srciter, srcend, abs_lt); + + SrcBufferSize = SrcBufferSize - PostPadding + MaxResamplerPadding; + std::fill(srciter+1, chanbuffer.data() + SrcBufferSize, *srciter); + } + } + else + { + if((mFlags&VoiceIsStatic)) + LoadBufferStatic(BufferListItem, BufferLoopItem, DataPosInt, mFmtType, mFmtChannels, + SrcBufferSize, mVoiceSamples); + else if((mFlags&VoiceIsCallback)) + LoadBufferCallback(BufferListItem, mNumCallbackSamples, mFmtType, mFmtChannels, + SrcBufferSize, mVoiceSamples); + else + LoadBufferQueue(BufferListItem, BufferLoopItem, DataPosInt, mFmtType, mFmtChannels, + SrcBufferSize, mVoiceSamples); + + if(mDecoder) + { + const size_t srcOffset{(increment*DstBufferSize + DataPosFrac)>>MixerFracBits}; + SrcBufferSize = SrcBufferSize - PostPadding + MaxResamplerEdge; + mDecoder->decode(mVoiceSamples, MaxResamplerEdge, SrcBufferSize, srcOffset); + } + } + + auto voiceSamples = mVoiceSamples.begin(); + for(auto &chandata : mChans) + { + /* Resample, then apply ambisonic upsampling as needed. */ + float *ResampledData{Resample(&mResampleState, + voiceSamples->data() + MaxResamplerEdge, DataPosFrac, increment, + {Device->ResampledData, DstBufferSize})}; + if((mFlags&VoiceIsAmbisonic)) + chandata.mAmbiSplitter.processHfScale({ResampledData, DstBufferSize}, + chandata.mAmbiScale); + + /* Now filter and mix to the appropriate outputs. */ + const al::span<float,BufferLineSize> FilterBuf{Device->FilteredData}; + { + DirectParams &parms = chandata.mDryParams; + const float *samples{DoFilters(parms.LowPass, parms.HighPass, FilterBuf.data(), + {ResampledData, DstBufferSize}, mDirect.FilterType)}; + + if((mFlags&VoiceHasHrtf)) + { + const float TargetGain{UNLIKELY(vstate == Stopping) ? 0.0f : + parms.Hrtf.Target.Gain}; + DoHrtfMix(samples, DstBufferSize, parms, TargetGain, Counter, OutPos, Device); + } + else if((mFlags&VoiceHasNfc)) + { + const float *TargetGains{UNLIKELY(vstate == Stopping) ? SilentTarget.data() + : parms.Gains.Target.data()}; + DoNfcMix({samples, DstBufferSize}, mDirect.Buffer.data(), parms, TargetGains, + Counter, OutPos, Device); + } + else + { + const float *TargetGains{UNLIKELY(vstate == Stopping) ? SilentTarget.data() + : parms.Gains.Target.data()}; + MixSamples({samples, DstBufferSize}, mDirect.Buffer, + parms.Gains.Current.data(), TargetGains, Counter, OutPos); + } + } + + for(uint send{0};send < NumSends;++send) + { + if(mSend[send].Buffer.empty()) + continue; + + SendParams &parms = chandata.mWetParams[send]; + const float *samples{DoFilters(parms.LowPass, parms.HighPass, FilterBuf.data(), + {ResampledData, DstBufferSize}, mSend[send].FilterType)}; + + const float *TargetGains{UNLIKELY(vstate == Stopping) ? SilentTarget.data() + : parms.Gains.Target.data()}; + MixSamples({samples, DstBufferSize}, mSend[send].Buffer, + parms.Gains.Current.data(), TargetGains, Counter, OutPos); + } + + /* Store the last source samples used for next time. 
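The Counter and OutPos values threaded through the mixing calls above request gain stepping: over the first Counter output samples of the block, each channel's current gain moves linearly toward its target, after which it holds there. The actual mixers live under mixer/ and are not part of this diff; the following is only a conceptual single-channel sketch of that behaviour:

#include <algorithm>
#include <cstddef>

// Accumulate 'in' into 'out', stepping 'curGain' linearly toward 'tgtGain'
// over the next 'counter' samples, then holding at the target.
void mix_one_stepped(const float *in, float *out, std::size_t todo,
    float &curGain, float tgtGain, std::size_t counter)
{
    const float step{counter ? (tgtGain - curGain)/static_cast<float>(counter) : 0.0f};

    std::size_t i{0};
    const std::size_t fadeTodo{std::min(todo, counter)};
    for(;i < fadeTodo;++i)
        out[i] += in[i] * (curGain + step*static_cast<float>(i));

    // Remember how far the fade got so the next call continues from here.
    curGain = (counter > todo) ? curGain + step*static_cast<float>(todo) : tgtGain;

    for(;i < todo;++i)
        out[i] += in[i] * tgtGain;
}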
*/ + const size_t srcOffset{(increment*DstBufferSize + DataPosFrac)>>MixerFracBits}; + std::copy_n(voiceSamples->data()+srcOffset, MaxResamplerPadding, voiceSamples->data()); + ++voiceSamples; + } + /* Update positions */ + DataPosFrac += increment*DstBufferSize; + const uint SrcSamplesDone{DataPosFrac>>MixerFracBits}; + DataPosInt += SrcSamplesDone; + DataPosFrac &= MixerFracMask; + + OutPos += DstBufferSize; + Counter = maxu(DstBufferSize, Counter) - DstBufferSize; + + if UNLIKELY(!BufferListItem) + { + /* Do nothing extra when there's no buffers. */ + } + else if((mFlags&VoiceIsStatic)) + { + if(BufferLoopItem) + { + /* Handle looping static source */ + const uint LoopStart{BufferListItem->mLoopStart}; + const uint LoopEnd{BufferListItem->mLoopEnd}; + if(DataPosInt >= LoopEnd) + { + assert(LoopEnd > LoopStart); + DataPosInt = ((DataPosInt-LoopStart)%(LoopEnd-LoopStart)) + LoopStart; + } + } + else + { + /* Handle non-looping static source */ + if(DataPosInt >= BufferListItem->mSampleLen) + { + BufferListItem = nullptr; + break; + } + } + } + else if((mFlags&VoiceIsCallback)) + { + if(SrcSamplesDone < mNumCallbackSamples) + { + const size_t byteOffset{SrcSamplesDone*mFrameSize}; + const size_t byteEnd{mNumCallbackSamples*mFrameSize}; + al::byte *data{BufferListItem->mSamples}; + std::copy(data+byteOffset, data+byteEnd, data); + mNumCallbackSamples -= SrcSamplesDone; + } + else + { + BufferListItem = nullptr; + mNumCallbackSamples = 0; + } + } + else + { + /* Handle streaming source */ + do { + if(BufferListItem->mSampleLen > DataPosInt) + break; + + DataPosInt -= BufferListItem->mSampleLen; + + ++buffers_done; + BufferListItem = BufferListItem->mNext.load(std::memory_order_relaxed); + if(!BufferListItem) BufferListItem = BufferLoopItem; + } while(BufferListItem); + } + } while(OutPos < SamplesToDo); + + mFlags |= VoiceIsFading; + + /* Don't update positions and buffers if we were stopping. */ + if UNLIKELY(vstate == Stopping) + { + mPlayState.store(Stopped, std::memory_order_release); + return; + } + + /* Capture the source ID in case it's reset for stopping. */ + const uint SourceID{mSourceID.load(std::memory_order_relaxed)}; + + /* Update voice info */ + mPosition.store(DataPosInt, std::memory_order_relaxed); + mPositionFrac.store(DataPosFrac, std::memory_order_relaxed); + mCurrentBuffer.store(BufferListItem, std::memory_order_relaxed); + if(!BufferListItem) + { + mLoopBuffer.store(nullptr, std::memory_order_relaxed); + mSourceID.store(0u, std::memory_order_relaxed); + } + std::atomic_thread_fence(std::memory_order_release); + + /* Send any events now, after the position/buffer info was updated. */ + const uint enabledevt{Context->mEnabledEvts.load(std::memory_order_acquire)}; + if(buffers_done > 0 && (enabledevt&EventType_BufferCompleted)) + { + RingBuffer *ring{Context->mAsyncEvents.get()}; + auto evt_vec = ring->getWriteVector(); + if(evt_vec.first.len > 0) + { + AsyncEvent *evt{::new(evt_vec.first.buf) AsyncEvent{EventType_BufferCompleted}}; + evt->u.bufcomp.id = SourceID; + evt->u.bufcomp.count = buffers_done; + ring->writeAdvance(1); + } + } + + if(!BufferListItem) + { + /* If the voice just ended, set it to Stopping so the next render + * ensures any residual noise fades to 0 amplitude. 
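The buffer-completed notification above (like the source-stopped one earlier) is produced by placement-constructing an AsyncEvent directly into a slot reserved from the context's async ring buffer, so the mixer thread never allocates. A minimal sketch of that pattern with a hypothetical fixed-size ring; the names and layout here are not the library's RingBuffer API:

#include <atomic>
#include <cstddef>
#include <new>

struct Event { unsigned type; unsigned id; unsigned count; };

// Single-producer/single-consumer ring of raw slots; the producer constructs
// events in place with placement new, the consumer reads and then advances.
struct EventRing {
    static constexpr std::size_t NumSlots{64};
    alignas(Event) unsigned char storage[NumSlots*sizeof(Event)]{};
    std::atomic<std::size_t> writeIdx{0}, readIdx{0};

    bool post(const Event &evt) noexcept
    {
        const std::size_t w{writeIdx.load(std::memory_order_relaxed)};
        if(w - readIdx.load(std::memory_order_acquire) >= NumSlots)
            return false; // ring full: drop the notification
        ::new(storage + (w%NumSlots)*sizeof(Event)) Event{evt};
        writeIdx.store(w+1, std::memory_order_release);
        return true;
    }
};

Since the event type is trivially destructible, the consumer only needs to read the slot and bump readIdx with a release store; no destructor call is required.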
+ */ + mPlayState.store(Stopping, std::memory_order_release); + if((enabledevt&EventType_SourceStateChange)) + SendSourceStoppedEvent(Context, SourceID); + } +} + +void Voice::prepare(DeviceBase *device) +{ + if((mFmtChannels == FmtUHJ2 || mFmtChannels == FmtUHJ3 || mFmtChannels==FmtUHJ4) && !mDecoder) + mDecoder = std::make_unique<UhjDecoder>(); + else if(mFmtChannels != FmtUHJ2 && mFmtChannels != FmtUHJ3 && mFmtChannels != FmtUHJ4) + mDecoder = nullptr; + + /* Clear the stepping value explicitly so the mixer knows not to mix this + * until the update gets applied. + */ + mStep = 0; + + /* Make sure the sample history is cleared. */ + std::fill(mVoiceSamples.begin(), mVoiceSamples.end(), BufferLine{}); + + /* Don't need to set the VoiceIsAmbisonic flag if the device is not higher + * order than the voice. No HF scaling is necessary to mix it. + */ + if(mAmbiOrder && device->mAmbiOrder > mAmbiOrder) + { + const uint8_t *OrderFromChan{(mFmtChannels == FmtBFormat2D) ? + AmbiIndex::OrderFrom2DChannel().data() : AmbiIndex::OrderFromChannel().data()}; + const auto scales = AmbiScale::GetHFOrderScales(mAmbiOrder, device->mAmbiOrder); + + const BandSplitter splitter{device->mXOverFreq / static_cast<float>(device->Frequency)}; + for(auto &chandata : mChans) + { + chandata.mAmbiScale = scales[*(OrderFromChan++)]; + chandata.mAmbiSplitter = splitter; + chandata.mDryParams = DirectParams{}; + std::fill_n(chandata.mWetParams.begin(), device->NumAuxSends, SendParams{}); + } + mFlags |= VoiceIsAmbisonic; + } + else + { + for(auto &chandata : mChans) + { + chandata.mDryParams = DirectParams{}; + std::fill_n(chandata.mWetParams.begin(), device->NumAuxSends, SendParams{}); + } + mFlags &= ~VoiceIsAmbisonic; + } + + if(device->AvgSpeakerDist > 0.0f) + { + const float w1{SpeedOfSoundMetersPerSec / + (device->AvgSpeakerDist * static_cast<float>(device->Frequency))}; + for(auto &chandata : mChans) + chandata.mDryParams.NFCtrlFilter.init(w1); + } +} diff --git a/core/voice.h b/core/voice.h new file mode 100644 index 00000000..c3347cda --- /dev/null +++ b/core/voice.h @@ -0,0 +1,270 @@ +#ifndef CORE_VOICE_H +#define CORE_VOICE_H + +#include <array> +#include <atomic> +#include <memory> +#include <stddef.h> +#include <string> + +#include "albyte.h" +#include "almalloc.h" +#include "aloptional.h" +#include "alspan.h" +#include "bufferline.h" +#include "buffer_storage.h" +#include "devformat.h" +#include "filters/biquad.h" +#include "filters/nfc.h" +#include "filters/splitter.h" +#include "mixer/defs.h" +#include "mixer/hrtfdefs.h" +#include "resampler_limits.h" +#include "uhjfilter.h" +#include "vector.h" + +struct ContextBase; +struct DeviceBase; +struct EffectSlot; +enum class DistanceModel : unsigned char; + +using uint = unsigned int; + + +#define MAX_SENDS 6 + + +enum class SpatializeMode : unsigned char { + Off, + On, + Auto +}; + +enum class DirectMode : unsigned char { + Off, + DropMismatch, + RemixMismatch +}; + + +/* Maximum number of extra source samples that may need to be loaded, for + * resampling or conversion purposes. 
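In prepare() above, each channel's near-field control filter is seeded with w1 = SpeedOfSound / (AvgSpeakerDist * SampleRate), the value the NFC filters use to represent the average speaker distance at the output rate. A quick worked sketch with assumed values (roughly 343 m/s, 1.5 m average speaker distance, 48 kHz output):

#include <cstdio>

int main()
{
    // Assumed illustration values, not taken from any particular device.
    const float speedOfSound{343.3f};   // metres per second
    const float avgSpeakerDist{1.5f};   // metres
    const float sampleRate{48000.0f};   // Hz

    const float w1{speedOfSound / (avgSpeakerDist * sampleRate)};
    std::printf("w1 = %f\n", w1); // ~0.004768
}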
+ */ +constexpr uint MaxPostVoiceLoad{MaxResamplerEdge + UhjDecoder::sFilterDelay}; + + +enum { + AF_None = 0, + AF_LowPass = 1, + AF_HighPass = 2, + AF_BandPass = AF_LowPass | AF_HighPass +}; + + +struct DirectParams { + BiquadFilter LowPass; + BiquadFilter HighPass; + + NfcFilter NFCtrlFilter; + + struct { + HrtfFilter Old; + HrtfFilter Target; + alignas(16) std::array<float,HrtfHistoryLength> History; + } Hrtf; + + struct { + std::array<float,MAX_OUTPUT_CHANNELS> Current; + std::array<float,MAX_OUTPUT_CHANNELS> Target; + } Gains; +}; + +struct SendParams { + BiquadFilter LowPass; + BiquadFilter HighPass; + + struct { + std::array<float,MAX_OUTPUT_CHANNELS> Current; + std::array<float,MAX_OUTPUT_CHANNELS> Target; + } Gains; +}; + + +struct VoiceBufferItem { + std::atomic<VoiceBufferItem*> mNext{nullptr}; + + CallbackType mCallback{nullptr}; + void *mUserData{nullptr}; + + uint mSampleLen{0u}; + uint mLoopStart{0u}; + uint mLoopEnd{0u}; + + al::byte *mSamples{nullptr}; +}; + + +struct VoiceProps { + float Pitch; + float Gain; + float OuterGain; + float MinGain; + float MaxGain; + float InnerAngle; + float OuterAngle; + float RefDistance; + float MaxDistance; + float RolloffFactor; + std::array<float,3> Position; + std::array<float,3> Velocity; + std::array<float,3> Direction; + std::array<float,3> OrientAt; + std::array<float,3> OrientUp; + bool HeadRelative; + DistanceModel mDistanceModel; + Resampler mResampler; + DirectMode DirectChannels; + SpatializeMode mSpatializeMode; + + bool DryGainHFAuto; + bool WetGainAuto; + bool WetGainHFAuto; + float OuterGainHF; + + float AirAbsorptionFactor; + float RoomRolloffFactor; + float DopplerFactor; + + std::array<float,2> StereoPan; + + float Radius; + + /** Direct filter and auxiliary send info. */ + struct { + float Gain; + float GainHF; + float HFReference; + float GainLF; + float LFReference; + } Direct; + struct SendData { + EffectSlot *Slot; + float Gain; + float GainHF; + float HFReference; + float GainLF; + float LFReference; + } Send[MAX_SENDS]; +}; + +struct VoicePropsItem : public VoiceProps { + std::atomic<VoicePropsItem*> next{nullptr}; + + DEF_NEWDEL(VoicePropsItem) +}; + +constexpr uint VoiceIsStatic{ 1u<<0}; +constexpr uint VoiceIsCallback{ 1u<<1}; +constexpr uint VoiceIsAmbisonic{ 1u<<2}; /* Needs HF scaling for ambisonic upsampling. */ +constexpr uint VoiceCallbackStopped{1u<<3}; +constexpr uint VoiceIsFading{ 1u<<4}; /* Use gain stepping for smooth transitions. */ +constexpr uint VoiceHasHrtf{ 1u<<5}; +constexpr uint VoiceHasNfc{ 1u<<6}; + +struct Voice { + enum State { + Stopped, + Playing, + Stopping, + Pending + }; + + std::atomic<VoicePropsItem*> mUpdate{nullptr}; + + VoiceProps mProps; + + std::atomic<uint> mSourceID{0u}; + std::atomic<State> mPlayState{Stopped}; + std::atomic<bool> mPendingChange{false}; + + /** + * Source offset in samples, relative to the currently playing buffer, NOT + * the whole queue. + */ + std::atomic<uint> mPosition; + /** Fractional (fixed-point) offset to the next sample. */ + std::atomic<uint> mPositionFrac; + + /* Current buffer queue item being played. */ + std::atomic<VoiceBufferItem*> mCurrentBuffer; + + /* Buffer queue item to loop to at end of queue (will be NULL for non- + * looping voices). + */ + std::atomic<VoiceBufferItem*> mLoopBuffer; + + /* Properties for the attached buffer(s). 
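The VoiceIs... and VoiceHas... constants above are single-bit flags stored together in Voice::mFlags, and mix() tests combinations of them against a mask in one expression (for example, callback-driven and not yet stopped). A tiny sketch of that idiom, repeating the two relevant bit values under shorter names:

using uint = unsigned int;

constexpr uint IsCallback{1u<<1};
constexpr uint CallbackStopped{1u<<3};

// Masking with both bits and comparing against only IsCallback checks that
// the first flag is set and the second is clear in a single expression.
constexpr bool callback_active(uint flags) noexcept
{ return (flags & (IsCallback|CallbackStopped)) == IsCallback; }

static_assert(callback_active(IsCallback), "callback set, not stopped");
static_assert(!callback_active(IsCallback|CallbackStopped), "callback stopped");
static_assert(!callback_active(0u), "not a callback voice");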
*/ + FmtChannels mFmtChannels; + FmtType mFmtType; + uint mFrequency; + uint mFrameSize; + AmbiLayout mAmbiLayout; + AmbiScaling mAmbiScaling; + uint mAmbiOrder; + + std::unique_ptr<UhjDecoder> mDecoder; + + /** Current target parameters used for mixing. */ + uint mStep{0}; + + ResamplerFunc mResampler; + + InterpState mResampleState; + + uint mFlags{}; + uint mNumCallbackSamples{0}; + + struct TargetData { + int FilterType; + al::span<FloatBufferLine> Buffer; + }; + TargetData mDirect; + std::array<TargetData,MAX_SENDS> mSend; + + /* The first MaxResamplerPadding/2 elements are the sample history from the + * previous mix, with an additional MaxResamplerPadding/2 elements that are + * now current (which may be overwritten if the buffer data is still + * available). + */ + static constexpr size_t LineSize{BufferLineSize + MaxResamplerPadding + + UhjDecoder::sFilterDelay}; + using BufferLine = std::array<float,LineSize>; + al::vector<BufferLine,16> mVoiceSamples{2}; + + struct ChannelData { + float mAmbiScale; + BandSplitter mAmbiSplitter; + + DirectParams mDryParams; + std::array<SendParams,MAX_SENDS> mWetParams; + }; + al::vector<ChannelData> mChans{2}; + + Voice() = default; + ~Voice() { delete mUpdate.exchange(nullptr, std::memory_order_acq_rel); } + + Voice(const Voice&) = delete; + Voice& operator=(const Voice&) = delete; + + void mix(const State vstate, ContextBase *Context, const uint SamplesToDo); + + void prepare(DeviceBase *device); + + static void InitMixer(al::optional<std::string> resampler); + + DEF_NEWDEL(Voice) +}; + +extern Resampler ResamplerDefault; + +#endif /* CORE_VOICE_H */ diff --git a/core/voice_change.h b/core/voice_change.h new file mode 100644 index 00000000..ddc6186f --- /dev/null +++ b/core/voice_change.h @@ -0,0 +1,31 @@ +#ifndef VOICE_CHANGE_H +#define VOICE_CHANGE_H + +#include <atomic> + +#include "almalloc.h" + +struct Voice; + +using uint = unsigned int; + + +enum class VChangeState { + Reset, + Stop, + Play, + Pause, + Restart +}; +struct VoiceChange { + Voice *mOldVoice{nullptr}; + Voice *mVoice{nullptr}; + uint mSourceID{0}; + VChangeState mState{}; + + std::atomic<VoiceChange*> mNext{nullptr}; + + DEF_NEWDEL(VoiceChange) +}; + +#endif /* VOICE_CHANGE_H */ |
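A closing note on voice_change.h: play/stop/pause requests are modelled as VoiceChange nodes chained through an atomic mNext pointer, which lets the API thread hand a batch of state changes to the mixer without locking. A generic sketch of that hand-off shape using release/acquire ordering; this is not the library's actual queue protocol, only the basic idea:

#include <atomic>
#include <cstdio>

struct Change {
    int state{};
    std::atomic<Change*> next{nullptr};
};

// Producer: link a fully-constructed node after the current tail. The
// release store makes the node's contents visible to the consumer.
void publish(Change *tail, Change *node) noexcept
{ tail->next.store(node, std::memory_order_release); }

// Consumer: walk the chain with acquire loads and act on each change.
void drain(const Change *head)
{
    for(const Change *cur{head};cur != nullptr;
        cur = cur->next.load(std::memory_order_acquire))
        std::printf("apply change: state %d\n", cur->state);
}

Each real node additionally carries the affected Voice pointers and source ID shown in the struct above.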