74 files changed, 12481 insertions, 0 deletions
diff --git a/core/ambdec.cpp b/core/ambdec.cpp
new file mode 100644
index 00000000..8ca182c4
--- /dev/null
+++ b/core/ambdec.cpp
@@ -0,0 +1,306 @@
+
+#include "config.h"
+
+#include "ambdec.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstdarg>
+#include <cstddef>
+#include <cstdio>
+#include <iterator>
+#include <sstream>
+#include <string>
+
+#include "albit.h"
+#include "alfstream.h"
+#include "alspan.h"
+#include "opthelpers.h"
+
+
+namespace {
+
+std::string read_word(std::istream &f)
+{
+    /* Formatted extraction skips leading whitespace; a failed read yields an empty string. */
+    std::string word;
+    return (f >> word) ? word : std::string{};
+}
+
+bool is_at_end(const std::string &buffer, std::size_t endpos)
+{
+    /* True when only whitespace, then end of line or a '#' comment, remains from endpos on. */
+    const auto rest = buffer.find_first_not_of(" \t\n\v\f\r", endpos);
+    return rest == std::string::npos || buffer[rest] == '#';
+}
+
+
+enum class ReaderScope { /* Which section of the file load() is currently inside. */
+    Global, /* Top-level "/..." commands; "/}" returns here */
+    Speakers, /* Entered by "/speakers/{"; accepts add_spkr */
+    LFMatrix, /* Entered by "/lfmatrix/{"; accepts order_gain and add_row */
+    HFMatrix, /* Entered by "/hfmatrix/{", or by "/matrix/{" when FreqBands == 1 */
+};
+
+#ifdef __USE_MINGW_ANSI_STDIO
+[[gnu::format(gnu_printf,2,3)]]
+#else
+[[gnu::format(printf,2,3)]]
+#endif
+al::optional<std::string> make_error(size_t linenum, const char *fmt, ...)
+{
+    /* Builds "Line <linenum>: <message>" from the printf-style fmt, with no NUL padding. */
+    al::optional<std::string> ret;
+    auto &str = ret.emplace();
+    str.resize(256);
+    int printed{std::snprintf(&str[0], str.length(), "Line %zu: ", linenum)};
+    if(printed < 0) printed = 0;
+    const auto plen = std::min(static_cast<size_t>(printed), str.length());
+
+    std::va_list args, args2;
+    va_start(args, fmt);
+    va_copy(args2, args);
+    int msglen{std::vsnprintf(&str[plen], str.size()-plen, fmt, args)};
+    if(msglen >= 0 && static_cast<size_t>(msglen) >= str.size()-plen)
+    {   /* Truncated: grow to the reported length plus terminator and format again. */
+        str.resize(static_cast<size_t>(msglen) + plen + 1u);
+        msglen = std::vsnprintf(&str[plen], str.size()-plen, fmt, args2);
+    }
+    va_end(args2);
+    va_end(args);
+    str.resize(plen + static_cast<size_t>(std::max(msglen, 0)));
+    return ret;
+}
+
+} // namespace
+
+AmbDecConf::~AmbDecConf() = default; /* Defaulted out of line; frees Speakers and Matrix. */
+
+
+al::optional<std::string> AmbDecConf::load(const char *fname) noexcept
+{
+    /* Parses `fname` into this object; returns al::nullopt on success, else an error message. */
+    al::ifstream f{fname};
+    if(!f.is_open()) return std::string("Failed to open file \"")+fname+"\"";
+
+    ReaderScope scope{ReaderScope::Global};
+    size_t speaker_pos{0}; /* add_spkr lines consumed so far */
+    size_t lfmatrix_pos{0}; /* add_row lines consumed in LFMatrix scope */
+    size_t hfmatrix_pos{0}; /* add_row lines consumed in HFMatrix scope */
+    size_t linenum{0};
+
+    std::string buffer;
+    while(f.good() && std::getline(f, buffer))
+    {
+        ++linenum;
+
+        std::istringstream istr{buffer};
+        std::string command{read_word(istr)};
+        if(command.empty() || command[0] == '#')
+            continue;
+
+        if(command == "/}")
+        {
+            if(scope == ReaderScope::Global)
+                return make_error(linenum, "Unexpected /} in global scope");
+            scope = ReaderScope::Global;
+            continue;
+        }
+
+        if(scope == ReaderScope::Speakers)
+        {
+            if(command == "add_spkr")
+            {
+                if(speaker_pos == NumSpeakers)
+                    return make_error(linenum, "Too many speakers specified");
+
+                AmbDecConf::SpeakerConf &spkr = Speakers[speaker_pos++];
+                istr >> spkr.Name;
+                istr >> spkr.Distance;
+                istr >> spkr.Azimuth;
+                istr >> spkr.Elevation;
+                istr >> spkr.Connection;
+            }
+            else
+                return make_error(linenum, "Unexpected speakers command: %s", command.c_str());
+        }
+        else if(scope == ReaderScope::LFMatrix || scope == ReaderScope::HFMatrix)
+        {
+            auto &gains = (scope == ReaderScope::LFMatrix) ? LFOrderGain : HFOrderGain;
+            auto *matrix = (scope == ReaderScope::LFMatrix) ? LFMatrix : HFMatrix;
+            auto &pos = (scope == ReaderScope::LFMatrix) ? lfmatrix_pos : hfmatrix_pos;
+
+            if(command == "order_gain")
+            {
+                size_t toread{(ChanMask > Ambi3OrderMask) ? 5u : 4u};
+                std::size_t curgain{0u};
+                float value{};
+                while(toread)
+                {
+                    --toread;
+                    istr >> value;
+                    if(curgain < al::size(gains)) /* surplus values are parsed and dropped */
+                        gains[curgain++] = value;
+                }
+            }
+            else if(command == "add_row")
+            {
+                if(pos == NumSpeakers)
+                    return make_error(linenum, "Too many matrix rows specified");
+
+                unsigned int mask{ChanMask};
+
+                AmbDecConf::CoeffArray &mtxrow = matrix[pos++];
+                mtxrow.fill(0.0f);
+
+                float value{};
+                while(mask) /* one value per set ChanMask bit, lowest bit first */
+                {
+                    auto idx = static_cast<unsigned>(al::countr_zero(mask));
+                    mask &= ~(1u << idx);
+
+                    istr >> value;
+                    if(idx < mtxrow.size())
+                        mtxrow[idx] = value;
+                }
+            }
+            else
+                return make_error(linenum, "Unexpected matrix command: %s", command.c_str());
+        }
+        // Global scope commands
+        else if(command == "/description")
+        {
+            while(istr.good() && std::isspace(istr.peek()))
+                istr.ignore();
+            std::getline(istr, Description);
+            /* Trim trailing whitespace; npos+1 wraps to 0, clearing an all-space string. */
+            Description.erase(Description.find_last_not_of(" \t\n\v\f\r") + 1);
+        }
+        else if(command == "/version")
+        {
+            if(Version)
+                return make_error(linenum, "Duplicate version definition");
+            istr >> Version;
+            if(Version != 3)
+                return make_error(linenum, "Unsupported version: %d", Version);
+        }
+        else if(command == "/dec/chan_mask")
+        {
+            if(ChanMask)
+                return make_error(linenum, "Duplicate chan_mask definition");
+            istr >> std::hex >> ChanMask >> std::dec;
+
+            if(!ChanMask || ChanMask > Ambi4OrderMask)
+                return make_error(linenum, "Invalid chan_mask: 0x%x", ChanMask);
+            if(ChanMask > Ambi3OrderMask && CoeffScale == AmbDecScale::FuMa)
+                return make_error(linenum, "FuMa not compatible with over third-order");
+        }
+        else if(command == "/dec/freq_bands")
+        {
+            if(FreqBands)
+                return make_error(linenum, "Duplicate freq_bands");
+            istr >> FreqBands;
+            if(FreqBands != 1 && FreqBands != 2)
+                return make_error(linenum, "Invalid freq_bands: %u", FreqBands);
+        }
+        else if(command == "/dec/speakers")
+        {
+            if(NumSpeakers)
+                return make_error(linenum, "Duplicate speakers");
+            istr >> NumSpeakers;
+            if(!NumSpeakers)
+                return make_error(linenum, "Invalid speakers: %zu", NumSpeakers);
+            Speakers = std::make_unique<SpeakerConf[]>(NumSpeakers);
+        }
+        else if(command == "/dec/coeff_scale")
+        {
+            if(CoeffScale != AmbDecScale::Unset)
+                return make_error(linenum, "Duplicate coeff_scale");
+
+            std::string scale{read_word(istr)};
+            if(scale == "n3d") CoeffScale = AmbDecScale::N3D;
+            else if(scale == "sn3d") CoeffScale = AmbDecScale::SN3D;
+            else if(scale == "fuma") CoeffScale = AmbDecScale::FuMa;
+            else
+                return make_error(linenum, "Unexpected coeff_scale: %s", scale.c_str());
+
+            if(ChanMask > Ambi3OrderMask && CoeffScale == AmbDecScale::FuMa)
+                return make_error(linenum, "FuMa not compatible with over third-order");
+        }
+        else if(command == "/opt/xover_freq")
+        {
+            istr >> XOverFreq;
+        }
+        else if(command == "/opt/xover_ratio")
+        {
+            istr >> XOverRatio;
+        }
+        else if(command == "/opt/input_scale" || command == "/opt/nfeff_comp"
+            || command == "/opt/delay_comp" || command == "/opt/level_comp")
+        {
+            /* Unused */
+            read_word(istr);
+        }
+        else if(command == "/speakers/{")
+        {
+            if(!NumSpeakers)
+                return make_error(linenum, "Speakers defined without a count");
+            scope = ReaderScope::Speakers;
+        }
+        else if(command == "/lfmatrix/{" || command == "/hfmatrix/{" || command == "/matrix/{")
+        {
+            if(!NumSpeakers)
+                return make_error(linenum, "Matrix defined without a speaker count");
+            if(!ChanMask)
+                return make_error(linenum, "Matrix defined without a channel mask");
+            if(!FreqBands) /* else no rows get allocated and FreqBands-1 wraps below */
+                return make_error(linenum, "Matrix defined without a frequency band count");
+            if(!Matrix)
+            {
+                Matrix = std::make_unique<CoeffArray[]>(NumSpeakers * FreqBands);
+                LFMatrix = Matrix.get();
+                HFMatrix = LFMatrix + NumSpeakers*(FreqBands-1);
+            }
+            if(FreqBands == 1)
+            {
+                if(command != "/matrix/{")
+                    return make_error(linenum, "Unexpected \"%s\" for a single-band decoder",
+                        command.c_str());
+                scope = ReaderScope::HFMatrix;
+            }
+            else
+            {
+                if(command == "/lfmatrix/{")
+                    scope = ReaderScope::LFMatrix;
+                else if(command == "/hfmatrix/{")
+                    scope = ReaderScope::HFMatrix;
+                else
+                    return make_error(linenum, "Unexpected \"%s\" for a dual-band decoder",
+                        command.c_str());
+            }
+        }
+        else if(command == "/end")
+        {
+            const auto endpos = static_cast<std::size_t>(istr.tellg());
+            if(!is_at_end(buffer, endpos))
+                return make_error(linenum, "Extra junk on end: %s", buffer.substr(endpos).c_str());
+
+            if(speaker_pos < NumSpeakers || hfmatrix_pos < NumSpeakers
+                || (FreqBands == 2 && lfmatrix_pos < NumSpeakers))
+                return make_error(linenum, "Incomplete decoder definition");
+            if(CoeffScale == AmbDecScale::Unset)
+                return make_error(linenum, "No coefficient scaling defined");
+
+            return al::nullopt;
+        }
+        else
+            return make_error(linenum, "Unexpected command: %s", command.c_str());
+
+        istr.clear();
+        const auto endpos = static_cast<std::size_t>(istr.tellg());
+        if(!is_at_end(buffer, endpos))
+            return make_error(linenum, "Extra junk on line: %s", buffer.substr(endpos).c_str());
+        buffer.clear();
+    }
+    return make_error(linenum, "Unexpected end of file");
+}
diff --git a/core/ambdec.h b/core/ambdec.h
new file mode 100644
index 00000000..7f739781
--- /dev/null
+++ b/core/ambdec.h
@@ -0,0 +1,55 @@
+#ifndef CORE_AMBDEC_H
+#define CORE_AMBDEC_H
+
+#include <array>
+#include <memory>
+#include <string>
+
+#include "aloptional.h"
+#include "core/ambidefs.h"
+
+/* Helpers to read .ambdec configuration files. */
+
+enum class AmbDecScale { /* Coefficient scaling selected by "/dec/coeff_scale". */
+    Unset, /* No coeff_scale read yet; load() rejects "/end" in this state */
+    N3D, /* "n3d" */
+    SN3D, /* "sn3d" */
+    FuMa, /* "fuma"; load() rejects it when ChanMask exceeds Ambi3OrderMask */
+};
+struct AmbDecConf { /* Decoder settings that load() reads from an .ambdec file. */
+    std::string Description;
+    int Version{0}; /* Must be 3 */
+
+    unsigned int ChanMask{0u}; /* Each set bit N gets one value per add_row, stored at index N */
+    unsigned int FreqBands{0u}; /* Must be 1 or 2 */
+    AmbDecScale CoeffScale{AmbDecScale::Unset};
+
+    float XOverFreq{0.0f};
+    float XOverRatio{0.0f};
+
+    struct SpeakerConf {
+        std::string Name;
+        float Distance{0.0f};
+        float Azimuth{0.0f};
+        float Elevation{0.0f};
+        std::string Connection;
+    };
+    size_t NumSpeakers{0};
+    std::unique_ptr<SpeakerConf[]> Speakers; /* NumSpeakers entries, allocated by load() */
+
+    using CoeffArray = std::array<float,MaxAmbiChannels>;
+    std::unique_ptr<CoeffArray[]> Matrix; /* NumSpeakers*FreqBands rows backing both views */
+
+    /* Unused when FreqBands == 1 */
+    float LFOrderGain[MaxAmbiOrder+1]{};
+    CoeffArray *LFMatrix{nullptr}; /* Points into Matrix; null until a matrix section is read */
+
+    float HFOrderGain[MaxAmbiOrder+1]{};
+    CoeffArray *HFMatrix{nullptr}; /* Points into Matrix; equals LFMatrix when FreqBands == 1 */
+
+    ~AmbDecConf();
+
+    al::optional<std::string> load(const char *fname) noexcept;
+};
+
+#endif /* CORE_AMBDEC_H */
diff --git a/core/ambidefs.cpp b/core/ambidefs.cpp
new file mode 100644
index 00000000..70d6f356
--- /dev/null
+++ b/core/ambidefs.cpp
@@ -0,0 +1,308 @@
+
+#include "config.h"
+
+#include "ambidefs.h"
+
+#include "alnumbers.h"
+
+
+namespace {
+
+using AmbiChannelFloatArray = std::array<float,MaxAmbiChannels>; /* Encoder table element */
+
+constexpr auto inv_sqrt2f = static_cast<float>(1.0/al::numbers::sqrt2); /* 1/sqrt(2) as float */
+constexpr auto inv_sqrt3f = static_cast<float>(1.0/al::numbers::sqrt3); /* 1/sqrt(3) as float */
+
+
+/* These HF gains are derived from the same 32-point speaker array. The scale
+ * factor between orders represents the same scale factors for any (regular)
+ * speaker array decoder. e.g. Given a first-order source and second-order
+ * output, applying an HF scale of HFScales[1][0] / HFScales[2][0] to channel 0
+ * will result in that channel being subsequently decoded for second-order as
+ * if it was a first-order decoder for that same speaker array.
+ * Row 0 repeats row 1; the commented-out last row (five values) is not compiled. */
+constexpr std::array<std::array<float,MaxAmbiOrder+1>,MaxAmbiOrder+1> HFScales{{
+    {{ 4.000000000e+00f, 2.309401077e+00f, 1.192569588e+00f, 7.189495850e-01f }},
+    {{ 4.000000000e+00f, 2.309401077e+00f, 1.192569588e+00f, 7.189495850e-01f }},
+    {{ 2.981423970e+00f, 2.309401077e+00f, 1.192569588e+00f, 7.189495850e-01f }},
+    {{ 2.359168820e+00f, 2.031565936e+00f, 1.444598386e+00f, 7.189495850e-01f }},
+    /* 1.947005434e+00f, 1.764337084e+00f, 1.424707344e+00f, 9.755104127e-01f, 4.784482742e-01f */
+}};
+
+/* Same as above, but using a 10-point horizontal-only speaker array. Should
+ * only be used when the device is mixing in 2D B-Format for horizontal-only
+ * output.
+ * As in HFScales, row 0 repeats row 1 and the commented-out last row is not compiled. */
+constexpr std::array<std::array<float,MaxAmbiOrder+1>,MaxAmbiOrder+1> HFScales2D{{
+    {{ 2.236067977e+00f, 1.581138830e+00f, 9.128709292e-01f, 6.050756345e-01f }},
+    {{ 2.236067977e+00f, 1.581138830e+00f, 9.128709292e-01f, 6.050756345e-01f }},
+    {{ 1.825741858e+00f, 1.581138830e+00f, 9.128709292e-01f, 6.050756345e-01f }},
+    {{ 1.581138830e+00f, 1.460781803e+00f, 1.118033989e+00f, 6.050756345e-01f }},
+    /* 1.414213562e+00f, 1.344997024e+00f, 1.144122806e+00f, 8.312538756e-01f, 4.370160244e-01f */
+}};
+
+
+/* This calculates a first-order "upsampler" matrix. It combines a first-order
+ * decoder matrix with a max-order encoder matrix, creating a matrix that
+ * behaves as if the B-Format input signal is first decoded to a speaker array
+ * at first-order, then those speaker feeds are encoded to a higher-order
+ * signal. While not perfect, this should accurately encode a lower-order
+ * signal into a higher-order signal.
+ * Both tables have 8 entries; the encoder uses every sign combination of 1/sqrt(3). */
+constexpr std::array<std::array<float,4>,8> FirstOrderDecoder{{
+    {{ 1.250000000e-01f,  1.250000000e-01f,  1.250000000e-01f,  1.250000000e-01f, }},
+    {{ 1.250000000e-01f,  1.250000000e-01f,  1.250000000e-01f, -1.250000000e-01f, }},
+    {{ 1.250000000e-01f, -1.250000000e-01f,  1.250000000e-01f,  1.250000000e-01f, }},
+    {{ 1.250000000e-01f, -1.250000000e-01f,  1.250000000e-01f, -1.250000000e-01f, }},
+    {{ 1.250000000e-01f,  1.250000000e-01f, -1.250000000e-01f,  1.250000000e-01f, }},
+    {{ 1.250000000e-01f,  1.250000000e-01f, -1.250000000e-01f, -1.250000000e-01f, }},
+    {{ 1.250000000e-01f, -1.250000000e-01f, -1.250000000e-01f,  1.250000000e-01f, }},
+    {{ 1.250000000e-01f, -1.250000000e-01f, -1.250000000e-01f, -1.250000000e-01f, }},
+}};
+constexpr std::array<AmbiChannelFloatArray,8> FirstOrderEncoder{{
+    CalcAmbiCoeffs( inv_sqrt3f,  inv_sqrt3f,  inv_sqrt3f),
+    CalcAmbiCoeffs( inv_sqrt3f,  inv_sqrt3f, -inv_sqrt3f),
+    CalcAmbiCoeffs(-inv_sqrt3f,  inv_sqrt3f,  inv_sqrt3f),
+    CalcAmbiCoeffs(-inv_sqrt3f,  inv_sqrt3f, -inv_sqrt3f),
+    CalcAmbiCoeffs( inv_sqrt3f, -inv_sqrt3f,  inv_sqrt3f),
+    CalcAmbiCoeffs( inv_sqrt3f, -inv_sqrt3f, -inv_sqrt3f),
+    CalcAmbiCoeffs(-inv_sqrt3f, -inv_sqrt3f,  inv_sqrt3f),
+    CalcAmbiCoeffs(-inv_sqrt3f, -inv_sqrt3f, -inv_sqrt3f),
+}};
+static_assert(FirstOrderDecoder.size() == FirstOrderEncoder.size(), "First-order mismatch");
+
+/* This calculates a 2D first-order "upsampler" matrix. Same as the first-order
+ * matrix, just using a more optimized speaker array for horizontal-only
+ * content.
+ * Both tables have 4 entries; decoder column 2 and the encoder's middle argument are 0. */
+constexpr std::array<std::array<float,4>,4> FirstOrder2DDecoder{{
+    {{ 2.500000000e-01f,  2.041241452e-01f, 0.0f,  2.041241452e-01f, }},
+    {{ 2.500000000e-01f,  2.041241452e-01f, 0.0f, -2.041241452e-01f, }},
+    {{ 2.500000000e-01f, -2.041241452e-01f, 0.0f,  2.041241452e-01f, }},
+    {{ 2.500000000e-01f, -2.041241452e-01f, 0.0f, -2.041241452e-01f, }},
+}};
+constexpr std::array<AmbiChannelFloatArray,4> FirstOrder2DEncoder{{
+    CalcAmbiCoeffs( inv_sqrt2f, 0.0f,  inv_sqrt2f),
+    CalcAmbiCoeffs( inv_sqrt2f, 0.0f, -inv_sqrt2f),
+    CalcAmbiCoeffs(-inv_sqrt2f, 0.0f,  inv_sqrt2f),
+    CalcAmbiCoeffs(-inv_sqrt2f, 0.0f, -inv_sqrt2f),
+}};
+static_assert(FirstOrder2DDecoder.size() == FirstOrder2DEncoder.size(), "First-order 2D mismatch");
+
+
+/* This calculates a second-order "upsampler" matrix. Same as the first-order
+ * matrix, just using a slightly more dense speaker array suitable for second-
+ * order content.
+ * Both tables have 12 entries; each decoder row holds 9 gains. */
+constexpr std::array<std::array<float,9>,12> SecondOrderDecoder{{
+    {{ 8.333333333e-02f,  0.000000000e+00f, -7.588274978e-02f,  1.227808683e-01f,  0.000000000e+00f,  0.000000000e+00f, -1.591525047e-02f, -1.443375673e-01f,  1.167715449e-01f, }},
+    {{ 8.333333333e-02f, -1.227808683e-01f,  0.000000000e+00f,  7.588274978e-02f, -1.443375673e-01f,  0.000000000e+00f, -9.316949906e-02f,  0.000000000e+00f, -7.216878365e-02f, }},
+    {{ 8.333333333e-02f, -7.588274978e-02f,  1.227808683e-01f,  0.000000000e+00f,  0.000000000e+00f, -1.443375673e-01f,  1.090847495e-01f,  0.000000000e+00f, -4.460276122e-02f, }},
+    {{ 8.333333333e-02f,  0.000000000e+00f,  7.588274978e-02f,  1.227808683e-01f,  0.000000000e+00f,  0.000000000e+00f, -1.591525047e-02f,  1.443375673e-01f,  1.167715449e-01f, }},
+    {{ 8.333333333e-02f, -1.227808683e-01f,  0.000000000e+00f, -7.588274978e-02f,  1.443375673e-01f,  0.000000000e+00f, -9.316949906e-02f,  0.000000000e+00f, -7.216878365e-02f, }},
+    {{ 8.333333333e-02f,  7.588274978e-02f, -1.227808683e-01f,  0.000000000e+00f,  0.000000000e+00f, -1.443375673e-01f,  1.090847495e-01f,  0.000000000e+00f, -4.460276122e-02f, }},
+    {{ 8.333333333e-02f,  0.000000000e+00f, -7.588274978e-02f, -1.227808683e-01f,  0.000000000e+00f,  0.000000000e+00f, -1.591525047e-02f,  1.443375673e-01f,  1.167715449e-01f, }},
+    {{ 8.333333333e-02f,  1.227808683e-01f,  0.000000000e+00f, -7.588274978e-02f, -1.443375673e-01f,  0.000000000e+00f, -9.316949906e-02f,  0.000000000e+00f, -7.216878365e-02f, }},
+    {{ 8.333333333e-02f,  7.588274978e-02f,  1.227808683e-01f,  0.000000000e+00f,  0.000000000e+00f,  1.443375673e-01f,  1.090847495e-01f,  0.000000000e+00f, -4.460276122e-02f, }},
+    {{ 8.333333333e-02f,  0.000000000e+00f,  7.588274978e-02f, -1.227808683e-01f,  0.000000000e+00f,  0.000000000e+00f, -1.591525047e-02f, -1.443375673e-01f,  1.167715449e-01f, }},
+    {{ 8.333333333e-02f,  1.227808683e-01f,  0.000000000e+00f,  7.588274978e-02f,  1.443375673e-01f,  0.000000000e+00f, -9.316949906e-02f,  0.000000000e+00f, -7.216878365e-02f, }},
+    {{ 8.333333333e-02f, -7.588274978e-02f, -1.227808683e-01f,  0.000000000e+00f,  0.000000000e+00f,  1.443375673e-01f,  1.090847495e-01f,  0.000000000e+00f, -4.460276122e-02f, }},
+}};
+constexpr std::array<AmbiChannelFloatArray,12> SecondOrderEncoder{{
+    CalcAmbiCoeffs( 0.000000000e+00f, -5.257311121e-01f,  8.506508084e-01f),
+    CalcAmbiCoeffs(-8.506508084e-01f,  0.000000000e+00f,  5.257311121e-01f),
+    CalcAmbiCoeffs(-5.257311121e-01f,  8.506508084e-01f,  0.000000000e+00f),
+    CalcAmbiCoeffs( 0.000000000e+00f,  5.257311121e-01f,  8.506508084e-01f),
+    CalcAmbiCoeffs(-8.506508084e-01f,  0.000000000e+00f, -5.257311121e-01f),
+    CalcAmbiCoeffs( 5.257311121e-01f, -8.506508084e-01f,  0.000000000e+00f),
+    CalcAmbiCoeffs( 0.000000000e+00f, -5.257311121e-01f, -8.506508084e-01f),
+    CalcAmbiCoeffs( 8.506508084e-01f,  0.000000000e+00f, -5.257311121e-01f),
+    CalcAmbiCoeffs( 5.257311121e-01f,  8.506508084e-01f,  0.000000000e+00f),
+    CalcAmbiCoeffs( 0.000000000e+00f,  5.257311121e-01f, -8.506508084e-01f),
+    CalcAmbiCoeffs( 8.506508084e-01f,  0.000000000e+00f,  5.257311121e-01f),
+    CalcAmbiCoeffs(-5.257311121e-01f, -8.506508084e-01f,  0.000000000e+00f),
+}};
+static_assert(SecondOrderDecoder.size() == SecondOrderEncoder.size(), "Second-order mismatch");
+
+/* This calculates a 2D second-order "upsampler" matrix. Same as the second-
+ * order matrix, just using a more optimized speaker array for horizontal-only
+ * content.
+ * Both tables have 6 entries; each decoder row holds 9 gains. */
+constexpr std::array<std::array<float,9>,6> SecondOrder2DDecoder{{
+    {{ 1.666666667e-01f, -9.622504486e-02f, 0.0f,  1.666666667e-01f, -1.490711985e-01f, 0.0f, 0.0f, 0.0f,  8.606629658e-02f, }},
+    {{ 1.666666667e-01f, -1.924500897e-01f, 0.0f,  0.000000000e+00f,  0.000000000e+00f, 0.0f, 0.0f, 0.0f, -1.721325932e-01f, }},
+    {{ 1.666666667e-01f, -9.622504486e-02f, 0.0f, -1.666666667e-01f,  1.490711985e-01f, 0.0f, 0.0f, 0.0f,  8.606629658e-02f, }},
+    {{ 1.666666667e-01f,  9.622504486e-02f, 0.0f, -1.666666667e-01f, -1.490711985e-01f, 0.0f, 0.0f, 0.0f,  8.606629658e-02f, }},
+    {{ 1.666666667e-01f,  1.924500897e-01f, 0.0f,  0.000000000e+00f,  0.000000000e+00f, 0.0f, 0.0f, 0.0f, -1.721325932e-01f, }},
+    {{ 1.666666667e-01f,  9.622504486e-02f, 0.0f,  1.666666667e-01f,  1.490711985e-01f, 0.0f, 0.0f, 0.0f,  8.606629658e-02f, }},
+}};
+constexpr std::array<AmbiChannelFloatArray,6> SecondOrder2DEncoder{{
+    CalcAmbiCoeffs(-0.50000000000f, 0.0f,  0.86602540379f),
+    CalcAmbiCoeffs(-1.00000000000f, 0.0f,  0.00000000000f),
+    CalcAmbiCoeffs(-0.50000000000f, 0.0f, -0.86602540379f),
+    CalcAmbiCoeffs( 0.50000000000f, 0.0f, -0.86602540379f),
+    CalcAmbiCoeffs( 1.00000000000f, 0.0f,  0.00000000000f),
+    CalcAmbiCoeffs( 0.50000000000f, 0.0f,  0.86602540379f),
+}};
+static_assert(SecondOrder2DDecoder.size() == SecondOrder2DEncoder.size(),
+    "Second-order 2D mismatch");
+
+
+/* This calculates a third-order "upsampler" matrix. Same as the first-order
+ * matrix, just using a more dense speaker array suitable for third-order
+ * content.
+ * Both tables have 20 entries; each decoder row holds 16 gains. */
+constexpr std::array<std::array<float,16>,20> ThirdOrderDecoder{{
+    {{ 5.000000000e-02f,  3.090169944e-02f,  8.090169944e-02f,  0.000000000e+00f,  0.000000000e+00f,  6.454972244e-02f,  9.045084972e-02f,  0.000000000e+00f, -1.232790000e-02f, -1.256118221e-01f,  0.000000000e+00f,  1.126112056e-01f,  7.944389175e-02f,  0.000000000e+00f,  2.421151497e-02f,  0.000000000e+00f, }},
+    {{ 5.000000000e-02f, -3.090169944e-02f,  8.090169944e-02f,  0.000000000e+00f,  0.000000000e+00f, -6.454972244e-02f,  9.045084972e-02f,  0.000000000e+00f, -1.232790000e-02f,  1.256118221e-01f,  0.000000000e+00f, -1.126112056e-01f,  7.944389175e-02f,  0.000000000e+00f,  2.421151497e-02f,  0.000000000e+00f, }},
+    {{ 5.000000000e-02f,  3.090169944e-02f, -8.090169944e-02f,  0.000000000e+00f,  0.000000000e+00f, -6.454972244e-02f,  9.045084972e-02f,  0.000000000e+00f, -1.232790000e-02f, -1.256118221e-01f,  0.000000000e+00f,  1.126112056e-01f, -7.944389175e-02f,  0.000000000e+00f, -2.421151497e-02f,  0.000000000e+00f, }},
+    {{ 5.000000000e-02f, -3.090169944e-02f, -8.090169944e-02f,  0.000000000e+00f,  0.000000000e+00f,  6.454972244e-02f,  9.045084972e-02f,  0.000000000e+00f, -1.232790000e-02f,  1.256118221e-01f,  0.000000000e+00f, -1.126112056e-01f, -7.944389175e-02f,  0.000000000e+00f, -2.421151497e-02f,  0.000000000e+00f, }},
+    {{ 5.000000000e-02f,  8.090169944e-02f,  0.000000000e+00f,  3.090169944e-02f,  6.454972244e-02f,  0.000000000e+00f, -5.590169944e-02f,  0.000000000e+00f, -7.216878365e-02f, -7.763237543e-02f,  0.000000000e+00f, -2.950836627e-02f,  0.000000000e+00f, -1.497759251e-01f,  0.000000000e+00f, -7.763237543e-02f, }},
+    {{ 5.000000000e-02f,  8.090169944e-02f,  0.000000000e+00f, -3.090169944e-02f, -6.454972244e-02f,  0.000000000e+00f, -5.590169944e-02f,  0.000000000e+00f, -7.216878365e-02f, -7.763237543e-02f,  0.000000000e+00f, -2.950836627e-02f,  0.000000000e+00f,  1.497759251e-01f,  0.000000000e+00f,  7.763237543e-02f, }},
+    {{ 5.000000000e-02f, -8.090169944e-02f,  0.000000000e+00f,  3.090169944e-02f, -6.454972244e-02f,  0.000000000e+00f, -5.590169944e-02f,  0.000000000e+00f, -7.216878365e-02f,  7.763237543e-02f,  0.000000000e+00f,  2.950836627e-02f,  0.000000000e+00f, -1.497759251e-01f,  0.000000000e+00f, -7.763237543e-02f, }},
+    {{ 5.000000000e-02f, -8.090169944e-02f,  0.000000000e+00f, -3.090169944e-02f,  6.454972244e-02f,  0.000000000e+00f, -5.590169944e-02f,  0.000000000e+00f, -7.216878365e-02f,  7.763237543e-02f,  0.000000000e+00f,  2.950836627e-02f,  0.000000000e+00f,  1.497759251e-01f,  0.000000000e+00f,  7.763237543e-02f, }},
+    {{ 5.000000000e-02f,  0.000000000e+00f,  3.090169944e-02f,  8.090169944e-02f,  0.000000000e+00f,  0.000000000e+00f, -3.454915028e-02f,  6.454972244e-02f,  8.449668365e-02f,  0.000000000e+00f,  0.000000000e+00f,  0.000000000e+00f,  3.034486645e-02f, -6.779013272e-02f,  1.659481923e-01f,  4.797944664e-02f, }},
+    {{ 5.000000000e-02f,  0.000000000e+00f,  3.090169944e-02f, -8.090169944e-02f,  0.000000000e+00f,  0.000000000e+00f, -3.454915028e-02f, -6.454972244e-02f,  8.449668365e-02f,  0.000000000e+00f,  0.000000000e+00f,  0.000000000e+00f,  3.034486645e-02f,  6.779013272e-02f,  1.659481923e-01f, -4.797944664e-02f, }},
+    {{ 5.000000000e-02f,  0.000000000e+00f, -3.090169944e-02f,  8.090169944e-02f,  0.000000000e+00f,  0.000000000e+00f, -3.454915028e-02f, -6.454972244e-02f,  8.449668365e-02f,  0.000000000e+00f,  0.000000000e+00f,  0.000000000e+00f, -3.034486645e-02f, -6.779013272e-02f, -1.659481923e-01f,  4.797944664e-02f, }},
+    {{ 5.000000000e-02f,  0.000000000e+00f, -3.090169944e-02f, -8.090169944e-02f,  0.000000000e+00f,  0.000000000e+00f, -3.454915028e-02f,  6.454972244e-02f,  8.449668365e-02f,  0.000000000e+00f,  0.000000000e+00f,  0.000000000e+00f, -3.034486645e-02f,  6.779013272e-02f, -1.659481923e-01f, -4.797944664e-02f, }},
+    {{ 5.000000000e-02f,  5.000000000e-02f,  5.000000000e-02f,  5.000000000e-02f,  6.454972244e-02f,  6.454972244e-02f,  0.000000000e+00f,  6.454972244e-02f,  0.000000000e+00f,  1.016220987e-01f,  6.338656910e-02f, -1.092600649e-02f, -7.364853795e-02f,  1.011266756e-01f, -7.086833869e-02f, -1.482646439e-02f, }},
+    {{ 5.000000000e-02f,  5.000000000e-02f,  5.000000000e-02f, -5.000000000e-02f, -6.454972244e-02f,  6.454972244e-02f,  0.000000000e+00f, -6.454972244e-02f,  0.000000000e+00f,  1.016220987e-01f, -6.338656910e-02f, -1.092600649e-02f, -7.364853795e-02f, -1.011266756e-01f, -7.086833869e-02f,  1.482646439e-02f, }},
+    {{ 5.000000000e-02f, -5.000000000e-02f,  5.000000000e-02f,  5.000000000e-02f, -6.454972244e-02f, -6.454972244e-02f,  0.000000000e+00f,  6.454972244e-02f,  0.000000000e+00f, -1.016220987e-01f, -6.338656910e-02f,  1.092600649e-02f, -7.364853795e-02f,  1.011266756e-01f, -7.086833869e-02f, -1.482646439e-02f, }},
+    {{ 5.000000000e-02f, -5.000000000e-02f,  5.000000000e-02f, -5.000000000e-02f,  6.454972244e-02f, -6.454972244e-02f,  0.000000000e+00f, -6.454972244e-02f,  0.000000000e+00f, -1.016220987e-01f,  6.338656910e-02f,  1.092600649e-02f, -7.364853795e-02f, -1.011266756e-01f, -7.086833869e-02f,  1.482646439e-02f, }},
+    {{ 5.000000000e-02f,  5.000000000e-02f, -5.000000000e-02f,  5.000000000e-02f,  6.454972244e-02f, -6.454972244e-02f,  0.000000000e+00f, -6.454972244e-02f,  0.000000000e+00f,  1.016220987e-01f, -6.338656910e-02f, -1.092600649e-02f,  7.364853795e-02f,  1.011266756e-01f,  7.086833869e-02f, -1.482646439e-02f, }},
+    {{ 5.000000000e-02f,  5.000000000e-02f, -5.000000000e-02f, -5.000000000e-02f, -6.454972244e-02f, -6.454972244e-02f,  0.000000000e+00f,  6.454972244e-02f,  0.000000000e+00f,  1.016220987e-01f,  6.338656910e-02f, -1.092600649e-02f,  7.364853795e-02f, -1.011266756e-01f,  7.086833869e-02f,  1.482646439e-02f, }},
+    {{ 5.000000000e-02f, -5.000000000e-02f, -5.000000000e-02f,  5.000000000e-02f, -6.454972244e-02f,  6.454972244e-02f,  0.000000000e+00f, -6.454972244e-02f,  0.000000000e+00f, -1.016220987e-01f,  6.338656910e-02f,  1.092600649e-02f,  7.364853795e-02f,  1.011266756e-01f,  7.086833869e-02f, -1.482646439e-02f, }},
+    {{ 5.000000000e-02f, -5.000000000e-02f, -5.000000000e-02f, -5.000000000e-02f,  6.454972244e-02f,  6.454972244e-02f,  0.000000000e+00f,  6.454972244e-02f,  0.000000000e+00f, -1.016220987e-01f, -6.338656910e-02f,  1.092600649e-02f,  7.364853795e-02f, -1.011266756e-01f,  7.086833869e-02f,  1.482646439e-02f, }},
+}};
+constexpr std::array<AmbiChannelFloatArray,20> ThirdOrderEncoder{{
+    CalcAmbiCoeffs( 0.35682208976f,  0.93417235897f,  0.00000000000f),
+    CalcAmbiCoeffs(-0.35682208976f,  0.93417235897f,  0.00000000000f),
+    CalcAmbiCoeffs( 0.35682208976f, -0.93417235897f,  0.00000000000f),
+    CalcAmbiCoeffs(-0.35682208976f, -0.93417235897f,  0.00000000000f),
+    CalcAmbiCoeffs( 0.93417235897f,  0.00000000000f,  0.35682208976f),
+    CalcAmbiCoeffs( 0.93417235897f,  0.00000000000f, -0.35682208976f),
+    CalcAmbiCoeffs(-0.93417235897f,  0.00000000000f,  0.35682208976f),
+    CalcAmbiCoeffs(-0.93417235897f,  0.00000000000f, -0.35682208976f),
+    CalcAmbiCoeffs( 0.00000000000f,  0.35682208976f,  0.93417235897f),
+    CalcAmbiCoeffs( 0.00000000000f,  0.35682208976f, -0.93417235897f),
+    CalcAmbiCoeffs( 0.00000000000f, -0.35682208976f,  0.93417235897f),
+    CalcAmbiCoeffs( 0.00000000000f, -0.35682208976f, -0.93417235897f),
+    CalcAmbiCoeffs(     inv_sqrt3f,      inv_sqrt3f,      inv_sqrt3f),
+    CalcAmbiCoeffs(     inv_sqrt3f,      inv_sqrt3f,     -inv_sqrt3f),
+    CalcAmbiCoeffs(    -inv_sqrt3f,      inv_sqrt3f,      inv_sqrt3f),
+    CalcAmbiCoeffs(    -inv_sqrt3f,      inv_sqrt3f,     -inv_sqrt3f),
+    CalcAmbiCoeffs(     inv_sqrt3f,     -inv_sqrt3f,      inv_sqrt3f),
+    CalcAmbiCoeffs(     inv_sqrt3f,     -inv_sqrt3f,     -inv_sqrt3f),
+    CalcAmbiCoeffs(    -inv_sqrt3f,     -inv_sqrt3f,      inv_sqrt3f),
+    CalcAmbiCoeffs(    -inv_sqrt3f,     -inv_sqrt3f,     -inv_sqrt3f),
+}};
+static_assert(ThirdOrderDecoder.size() == ThirdOrderEncoder.size(), "Third-order mismatch");
+
+/* Decoder half of the 2D third-order "upsampler" matrix: one row per entry of
+ * ThirdOrder2DEncoder (a horizontal-only speaker array), one column per
+ * channel. Only columns 0, 1, 3, 4, 8, 9 and 15 are non-zero.
+ */
+constexpr std::array<std::array<float,16>,8> ThirdOrder2DDecoder{{
+    {{ 1.250000000e-01f, -5.523559567e-02f, 0.0f,  1.333505242e-01f, -9.128709292e-02f, 0.0f, 0.0f, 0.0f,  9.128709292e-02f, -1.104247249e-01f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  4.573941867e-02f, }},
+    {{ 1.250000000e-01f, -1.333505242e-01f, 0.0f,  5.523559567e-02f, -9.128709292e-02f, 0.0f, 0.0f, 0.0f, -9.128709292e-02f,  4.573941867e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -1.104247249e-01f, }},
+    {{ 1.250000000e-01f, -1.333505242e-01f, 0.0f, -5.523559567e-02f,  9.128709292e-02f, 0.0f, 0.0f, 0.0f, -9.128709292e-02f,  4.573941867e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  1.104247249e-01f, }},
+    {{ 1.250000000e-01f, -5.523559567e-02f, 0.0f, -1.333505242e-01f,  9.128709292e-02f, 0.0f, 0.0f, 0.0f,  9.128709292e-02f, -1.104247249e-01f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -4.573941867e-02f, }},
+    {{ 1.250000000e-01f,  5.523559567e-02f, 0.0f, -1.333505242e-01f, -9.128709292e-02f, 0.0f, 0.0f, 0.0f,  9.128709292e-02f,  1.104247249e-01f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -4.573941867e-02f, }},
+    {{ 1.250000000e-01f,  1.333505242e-01f, 0.0f, -5.523559567e-02f, -9.128709292e-02f, 0.0f, 0.0f, 0.0f, -9.128709292e-02f, -4.573941867e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  1.104247249e-01f, }},
+    {{ 1.250000000e-01f,  1.333505242e-01f, 0.0f,  5.523559567e-02f,  9.128709292e-02f, 0.0f, 0.0f, 0.0f, -9.128709292e-02f, -4.573941867e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -1.104247249e-01f, }},
+    {{ 1.250000000e-01f,  5.523559567e-02f, 0.0f,  1.333505242e-01f,  9.128709292e-02f, 0.0f, 0.0f, 0.0f,  9.128709292e-02f,  1.104247249e-01f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  4.573941867e-02f, }},
+}};
+constexpr std::array<AmbiChannelFloatArray,8> ThirdOrder2DEncoder{{ /* middle argument is 0 for every entry */
+    CalcAmbiCoeffs(-0.38268343237f, 0.0f,  0.92387953251f),
+    CalcAmbiCoeffs(-0.92387953251f, 0.0f,  0.38268343237f),
+    CalcAmbiCoeffs(-0.92387953251f, 0.0f, -0.38268343237f),
+    CalcAmbiCoeffs(-0.38268343237f, 0.0f, -0.92387953251f),
+    CalcAmbiCoeffs( 0.38268343237f, 0.0f, -0.92387953251f),
+    CalcAmbiCoeffs( 0.92387953251f, 0.0f, -0.38268343237f),
+    CalcAmbiCoeffs( 0.92387953251f, 0.0f,  0.38268343237f),
+    CalcAmbiCoeffs( 0.38268343237f, 0.0f,  0.92387953251f),
+}};
+static_assert(ThirdOrder2DDecoder.size() == ThirdOrder2DEncoder.size(), "Third-order 2D mismatch"); /* rows are paired by index */
+
+
+/* Decoder half of the 2D fourth-order "upsampler" matrix (one row per entry
+ * of FourthOrder2DEncoder). There is no 3D fourth-order upsampler since
+ * fourth-order is the max order we'll be supporting for the foreseeable
+ * future. This is only needed to mix horizontal-only fourth-order content to 3D.
+ */
+constexpr std::array<std::array<float,25>,10> FourthOrder2DDecoder{{
+    {{ 1.000000000e-01f,  3.568220898e-02f, 0.0f,  1.098185471e-01f,  6.070619982e-02f, 0.0f, 0.0f, 0.0f,  8.355491589e-02f,  7.735682057e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  5.620301997e-02f,  8.573754253e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  2.785781628e-02f, }},
+    {{ 1.000000000e-01f,  9.341723590e-02f, 0.0f,  6.787159473e-02f,  9.822469464e-02f, 0.0f, 0.0f, 0.0f, -3.191513794e-02f,  2.954767620e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -9.093839659e-02f, -5.298871540e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -7.293270986e-02f, }},
+    {{ 1.000000000e-01f,  1.154700538e-01f, 0.0f,  0.000000000e+00f,  0.000000000e+00f, 0.0f, 0.0f, 0.0f, -1.032795559e-01f, -9.561828875e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  0.000000000e+00f,  0.000000000e+00f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  9.014978717e-02f, }},
+    {{ 1.000000000e-01f,  9.341723590e-02f, 0.0f, -6.787159473e-02f, -9.822469464e-02f, 0.0f, 0.0f, 0.0f, -3.191513794e-02f,  2.954767620e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  9.093839659e-02f,  5.298871540e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -7.293270986e-02f, }},
+    {{ 1.000000000e-01f,  3.568220898e-02f, 0.0f, -1.098185471e-01f, -6.070619982e-02f, 0.0f, 0.0f, 0.0f,  8.355491589e-02f,  7.735682057e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -5.620301997e-02f, -8.573754253e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  2.785781628e-02f, }},
+    {{ 1.000000000e-01f, -3.568220898e-02f, 0.0f, -1.098185471e-01f,  6.070619982e-02f, 0.0f, 0.0f, 0.0f,  8.355491589e-02f, -7.735682057e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -5.620301997e-02f,  8.573754253e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  2.785781628e-02f, }},
+    {{ 1.000000000e-01f, -9.341723590e-02f, 0.0f, -6.787159473e-02f,  9.822469464e-02f, 0.0f, 0.0f, 0.0f, -3.191513794e-02f, -2.954767620e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  9.093839659e-02f, -5.298871540e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -7.293270986e-02f, }},
+    {{ 1.000000000e-01f, -1.154700538e-01f, 0.0f,  0.000000000e+00f,  0.000000000e+00f, 0.0f, 0.0f, 0.0f, -1.032795559e-01f,  9.561828875e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  0.000000000e+00f,  0.000000000e+00f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  9.014978717e-02f, }},
+    {{ 1.000000000e-01f, -9.341723590e-02f, 0.0f,  6.787159473e-02f, -9.822469464e-02f, 0.0f, 0.0f, 0.0f, -3.191513794e-02f, -2.954767620e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -9.093839659e-02f,  5.298871540e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, -7.293270986e-02f, }},
+    {{ 1.000000000e-01f, -3.568220898e-02f, 0.0f,  1.098185471e-01f, -6.070619982e-02f, 0.0f, 0.0f, 0.0f,  8.355491589e-02f, -7.735682057e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  5.620301997e-02f, -8.573754253e-02f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,  2.785781628e-02f, }},
+}};
+constexpr std::array<AmbiChannelFloatArray,10> FourthOrder2DEncoder{{ /* middle argument is 0 for every entry */
+    CalcAmbiCoeffs( 3.090169944e-01f,  0.000000000e+00f,  9.510565163e-01f),
+    CalcAmbiCoeffs( 8.090169944e-01f,  0.000000000e+00f,  5.877852523e-01f),
+    CalcAmbiCoeffs( 1.000000000e+00f,  0.000000000e+00f,  0.000000000e+00f),
+    CalcAmbiCoeffs( 8.090169944e-01f,  0.000000000e+00f, -5.877852523e-01f),
+    CalcAmbiCoeffs( 3.090169944e-01f,  0.000000000e+00f, -9.510565163e-01f),
+    CalcAmbiCoeffs(-3.090169944e-01f,  0.000000000e+00f, -9.510565163e-01f),
+    CalcAmbiCoeffs(-8.090169944e-01f,  0.000000000e+00f, -5.877852523e-01f),
+    CalcAmbiCoeffs(-1.000000000e+00f,  0.000000000e+00f,  0.000000000e+00f),
+    CalcAmbiCoeffs(-8.090169944e-01f,  0.000000000e+00f,  5.877852523e-01f),
+    CalcAmbiCoeffs(-3.090169944e-01f,  0.000000000e+00f,  9.510565163e-01f),
+}};
+static_assert(FourthOrder2DDecoder.size() == FourthOrder2DEncoder.size(), "Fourth-order 2D mismatch"); /* rows are paired by index */
+
+
+template<size_t N, size_t M>
+auto CalcAmbiUpsampler(const std::array<std::array<float,N>,M> &decoder,
+    const std::array<AmbiChannelFloatArray,M> &encoder)
+{
+    /* Computes transpose(decoder) * encoder, summing each element in double. */
+    std::array<AmbiChannelFloatArray,N> upsampler{};
+    for(size_t row{0};row < N;++row)
+    {
+        AmbiChannelFloatArray &outrow = upsampler[row];
+        for(size_t col{0};col < outrow.size();++col)
+        {
+            double accum{0.0};
+            for(size_t src{0};src < M;++src)
+                accum += double{decoder[src][row]} * encoder[src][col];
+            outrow[col] = static_cast<float>(accum);
+        }
+    }
+    return upsampler;
+}
+
+} // namespace
+
+/* The AmbiScale upsampler matrices, each computed by CalcAmbiUpsampler from the matching decoder/encoder tables. */
+const std::array<AmbiChannelFloatArray,4> AmbiScale::FirstOrderUp{CalcAmbiUpsampler(FirstOrderDecoder, FirstOrderEncoder)};
+const std::array<AmbiChannelFloatArray,4> AmbiScale::FirstOrder2DUp{CalcAmbiUpsampler(FirstOrder2DDecoder, FirstOrder2DEncoder)};
+const std::array<AmbiChannelFloatArray,9> AmbiScale::SecondOrderUp{CalcAmbiUpsampler(SecondOrderDecoder, SecondOrderEncoder)};
+const std::array<AmbiChannelFloatArray,9> AmbiScale::SecondOrder2DUp{CalcAmbiUpsampler(SecondOrder2DDecoder, SecondOrder2DEncoder)};
+const std::array<AmbiChannelFloatArray,16> AmbiScale::ThirdOrderUp{CalcAmbiUpsampler(ThirdOrderDecoder, ThirdOrderEncoder)};
+const std::array<AmbiChannelFloatArray,16> AmbiScale::ThirdOrder2DUp{CalcAmbiUpsampler(ThirdOrder2DDecoder, ThirdOrder2DEncoder)};
+const std::array<AmbiChannelFloatArray,25> AmbiScale::FourthOrder2DUp{CalcAmbiUpsampler(FourthOrder2DDecoder, FourthOrder2DEncoder)};
+
+std::array<float,MaxAmbiOrder+1> AmbiScale::GetHFOrderScales(const uint src_order,
+    const uint dev_order, const bool horizontalOnly) noexcept
+{
+    /* Builds the per-order ratio of the source order's HF scale to the device
+     * order's HF scale, reading both from whichever table is passed in.
+     */
+    const auto calc_ratios = [src_order,dev_order](const auto &table) noexcept
+    {
+        std::array<float,MaxAmbiOrder+1> ratios{};
+        for(size_t order{0};order <= MaxAmbiOrder;++order)
+            ratios[order] = table[src_order][order] / table[dev_order][order];
+        return ratios;
+    };
+
+    /* Horizontal-only requests use HFScales2D, all others use HFScales. */
+    if(horizontalOnly) return calc_ratios(HFScales2D);
+    return calc_ratios(HFScales);
+}
diff --git a/core/ambidefs.h b/core/ambidefs.h
new file mode 100644
index 00000000..b7d2bcd1
--- /dev/null
+++ b/core/ambidefs.h
@@ -0,0 +1,250 @@
+#ifndef CORE_AMBIDEFS_H
+#define CORE_AMBIDEFS_H
+
+#include <array>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "alnumbers.h"
+
+
+using uint = unsigned int;
+
+/* The highest supported ambisonic order and the channel count it needs. An
+ * order (o) sound field uses (o+1)**2 channels, thus zero-order has 1, first-
+ * order has 4, second-order has 9, third-order has 16, and fourth-order has 25.
+ */
+constexpr uint8_t MaxAmbiOrder{3};
+constexpr inline size_t AmbiChannelsFromOrder(size_t order) noexcept
+{ const size_t side{order + 1}; return side * side; }
+constexpr size_t MaxAmbiChannels{AmbiChannelsFromOrder(MaxAmbiOrder)};
+
+/* Bitmasks of the ambisonic channels for 0 to 4th order, each with its low
+ * (o+1)**2 bits set. 4th order is the highest order a 32-bit mask value can
+ * specify (a 64-bit mask could handle up to 7th order).
+ */
+constexpr uint Ambi0OrderMask{(1u << AmbiChannelsFromOrder(0)) - 1u};
+constexpr uint Ambi1OrderMask{(1u << AmbiChannelsFromOrder(1)) - 1u};
+constexpr uint Ambi2OrderMask{(1u << AmbiChannelsFromOrder(2)) - 1u};
+constexpr uint Ambi3OrderMask{(1u << AmbiChannelsFromOrder(3)) - 1u};
+constexpr uint Ambi4OrderMask{(1u << AmbiChannelsFromOrder(4)) - 1u};
+
+/* A bitmask of ambisonic channels with height information, up to 4th order.
+ * If none of these channels are used/needed, there's no height (e.g. with most
+ * surround sound speaker setups). This is ACN ordering, bit 0 being ACN 0, etc.
+ */
+constexpr uint AmbiPeriphonicMask{0xfe7ce4};
+
+/* The maximum number of ambisonic channels for 2D (non-periphonic)
+ * representation: one channel for zero-order plus two for each order above
+ * it, i.e. o*2 + 1.
+ */
+constexpr inline size_t Ambi2DChannelsFromOrder(size_t order) noexcept
+{ return 1 + 2*order; }
+constexpr size_t MaxAmbi2DChannels{Ambi2DChannelsFromOrder(MaxAmbiOrder)};
+
+
+/* NOTE: These are scale factors as applied to Ambisonics content. Decoder
+ * coefficients should be divided by these values to get proper scalings.
+ */
+struct AmbiScale {
+    static auto& FromN3D() noexcept /* Every factor is 1. */
+    {
+        static constexpr const std::array<float,MaxAmbiChannels> ret{{
+            1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+            1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f
+        }};
+        return ret;
+    }
+    static auto& FromSN3D() noexcept /* sqrt(2*order + 1) for each channel. */
+    {
+        static constexpr const std::array<float,MaxAmbiChannels> ret{{
+            1.000000000f, /* ACN  0, sqrt(1) */
+            1.732050808f, /* ACN  1, sqrt(3) */
+            1.732050808f, /* ACN  2, sqrt(3) */
+            1.732050808f, /* ACN  3, sqrt(3) */
+            2.236067978f, /* ACN  4, sqrt(5) */
+            2.236067978f, /* ACN  5, sqrt(5) */
+            2.236067978f, /* ACN  6, sqrt(5) */
+            2.236067978f, /* ACN  7, sqrt(5) */
+            2.236067978f, /* ACN  8, sqrt(5) */
+            2.645751311f, /* ACN  9, sqrt(7) */
+            2.645751311f, /* ACN 10, sqrt(7) */
+            2.645751311f, /* ACN 11, sqrt(7) */
+            2.645751311f, /* ACN 12, sqrt(7) */
+            2.645751311f, /* ACN 13, sqrt(7) */
+            2.645751311f, /* ACN 14, sqrt(7) */
+            2.645751311f, /* ACN 15, sqrt(7) */
+        }};
+        return ret;
+    }
+    static auto& FromFuMa() noexcept /* Indexed by ACN; the FuMa letter is noted per entry. */
+    {
+        static constexpr const std::array<float,MaxAmbiChannels> ret{{
+            1.414213562f, /* ACN  0 (W), sqrt(2) */
+            1.732050808f, /* ACN  1 (Y), sqrt(3) */
+            1.732050808f, /* ACN  2 (Z), sqrt(3) */
+            1.732050808f, /* ACN  3 (X), sqrt(3) */
+            1.936491673f, /* ACN  4 (V), sqrt(15)/2 */
+            1.936491673f, /* ACN  5 (T), sqrt(15)/2 */
+            2.236067978f, /* ACN  6 (R), sqrt(5) */
+            1.936491673f, /* ACN  7 (S), sqrt(15)/2 */
+            1.936491673f, /* ACN  8 (U), sqrt(15)/2 */
+            2.091650066f, /* ACN  9 (Q), sqrt(35/8) */
+            1.972026594f, /* ACN 10 (O), sqrt(35)/3 */
+            2.231093404f, /* ACN 11 (M), sqrt(224/45) */
+            2.645751311f, /* ACN 12 (K), sqrt(7) */
+            2.231093404f, /* ACN 13 (L), sqrt(224/45) */
+            1.972026594f, /* ACN 14 (N), sqrt(35)/3 */
+            2.091650066f, /* ACN 15 (P), sqrt(35/8) */
+        }};
+        return ret;
+    }
+    static auto& FromUHJ() noexcept /* Only the first four entries are meaningful. */
+    {
+        static constexpr const std::array<float,MaxAmbiChannels> ret{{
+            1.000000000f, /* ACN  0 (W), sqrt(1) */
+            1.224744871f, /* ACN  1 (Y), sqrt(3/2) */
+            1.224744871f, /* ACN  2 (Z), sqrt(3/2) */
+            1.224744871f, /* ACN  3 (X), sqrt(3/2) */
+            /* Higher orders not relevant for UHJ. */
+            1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+        }};
+        return ret;
+    }
+
+    /* Retrieves per-order HF scaling factors for "upsampling" ambisonic data. */
+    static std::array<float,MaxAmbiOrder+1> GetHFOrderScales(const uint src_order,
+        const uint dev_order, const bool horizontalOnly) noexcept;
+    /* Upsampler matrices: the outer size is the source channel count, and each row has MaxAmbiChannels gains. */
+    static const std::array<std::array<float,MaxAmbiChannels>,4> FirstOrderUp;
+    static const std::array<std::array<float,MaxAmbiChannels>,4> FirstOrder2DUp;
+    static const std::array<std::array<float,MaxAmbiChannels>,9> SecondOrderUp;
+    static const std::array<std::array<float,MaxAmbiChannels>,9> SecondOrder2DUp;
+    static const std::array<std::array<float,MaxAmbiChannels>,16> ThirdOrderUp;
+    static const std::array<std::array<float,MaxAmbiChannels>,16> ThirdOrder2DUp;
+    static const std::array<std::array<float,MaxAmbiChannels>,25> FourthOrder2DUp;
+};
+
+struct AmbiIndex {
+    static auto& FromFuMa() noexcept /* FuMa channel position -> ACN index. */
+    {
+        static constexpr const std::array<uint8_t,MaxAmbiChannels> ret{{
+            0,  /* W */
+            3,  /* X */
+            1,  /* Y */
+            2,  /* Z */
+            6,  /* R */
+            7,  /* S */
+            5,  /* T */
+            8,  /* U */
+            4,  /* V */
+            12, /* K */
+            13, /* L */
+            11, /* M */
+            14, /* N */
+            10, /* O */
+            15, /* P */
+            9,  /* Q */
+        }};
+        return ret;
+    }
+    static auto& FromFuMa2D() noexcept /* As FromFuMa, restricted to the W, X, Y, U, V, P, Q channels. */
+    {
+        static constexpr const std::array<uint8_t,MaxAmbi2DChannels> ret{{
+            0,  /* W */
+            3,  /* X */
+            1,  /* Y */
+            8,  /* U */
+            4,  /* V */
+            15, /* P */
+            9,  /* Q */
+        }};
+        return ret;
+    }
+
+    static auto& FromACN() noexcept /* Identity mapping: entry i is i. */
+    {
+        static constexpr const std::array<uint8_t,MaxAmbiChannels> ret{{
+            0,  1,  2,  3,  4,  5,  6,  7,
+            8,  9, 10, 11, 12, 13, 14, 15
+        }};
+        return ret;
+    }
+    static auto& FromACN2D() noexcept /* The first and last ACN index of each order, 0 through 3. */
+    {
+        static constexpr const std::array<uint8_t,MaxAmbi2DChannels> ret{{
+            0, 1,3, 4,8, 9,15
+        }};
+        return ret;
+    }
+
+    static auto& OrderFromChannel() noexcept /* Ambisonic order of each of the MaxAmbiChannels channels. */
+    {
+        static constexpr const std::array<uint8_t,MaxAmbiChannels> ret{{
+            0, 1,1,1, 2,2,2,2,2, 3,3,3,3,3,3,3,
+        }};
+        return ret;
+    }
+    static auto& OrderFrom2DChannel() noexcept /* Ambisonic order of each of the MaxAmbi2DChannels channels. */
+    {
+        static constexpr const std::array<uint8_t,MaxAmbi2DChannels> ret{{
+            0, 1,1, 2,2, 3,3,
+        }};
+        return ret;
+    }
+};
+
+
+/**
+ * Calculates ambisonic encoder coefficients using the X, Y, and Z direction
+ * components, which must represent a normalized (unit length) vector. Returns
+ * MaxAmbiChannels coefficients in ACN order (zeroth through third order).
+ *
+ * NOTE: The components use ambisonic coordinates. As a result:
+ * Ambisonic Y = OpenAL -X
+ * Ambisonic Z = OpenAL Y
+ * Ambisonic X = OpenAL -Z
+ *
+ * The components are ordered such that OpenAL's X, Y, and Z are the first,
+ * second, and third parameters respectively -- simply negate X and Z.
+ */
+constexpr auto CalcAmbiCoeffs(const float y, const float z, const float x)
+{
+    const float xx{x*x}, yy{y*y}, zz{z*z}, xy{x*y}, yz{y*z}, xz{x*z};
+
+    return std::array<float,MaxAmbiChannels>{{
+        /* Zeroth-order */
+        1.0f, /* ACN 0 = 1 */
+        /* First-order */
+        al::numbers::sqrt3_v<float> * y, /* ACN 1 = sqrt(3) * Y */
+        al::numbers::sqrt3_v<float> * z, /* ACN 2 = sqrt(3) * Z */
+        al::numbers::sqrt3_v<float> * x, /* ACN 3 = sqrt(3) * X */
+        /* Second-order */
+        3.872983346e+00f * xy,               /* ACN 4 = sqrt(15) * X * Y */
+        3.872983346e+00f * yz,               /* ACN 5 = sqrt(15) * Y * Z */
+        1.118033989e+00f * (3.0f*zz - 1.0f), /* ACN 6 = sqrt(5)/2 * (3*Z*Z - 1) */
+        3.872983346e+00f * xz,               /* ACN 7 = sqrt(15) * X * Z */
+        1.936491673e+00f * (xx - yy),        /* ACN 8 = sqrt(15)/2 * (X*X - Y*Y) */
+        /* Third-order */
+        2.091650066e+00f * (y*(3.0f*xx - yy)),   /* ACN  9 = sqrt(35/8) * Y * (3*X*X - Y*Y) */
+        1.024695076e+01f * (z*xy),               /* ACN 10 = sqrt(105) * Z * X * Y */
+        1.620185175e+00f * (y*(5.0f*zz - 1.0f)), /* ACN 11 = sqrt(21/8) * Y * (5*Z*Z - 1) */
+        1.322875656e+00f * (z*(5.0f*zz - 3.0f)), /* ACN 12 = sqrt(7)/2 * Z * (5*Z*Z - 3) */
+        1.620185175e+00f * (x*(5.0f*zz - 1.0f)), /* ACN 13 = sqrt(21/8) * X * (5*Z*Z - 1) */
+        5.123475383e+00f * (z*(xx - yy)),        /* ACN 14 = sqrt(105)/2 * Z * (X*X - Y*Y) */
+        2.091650066e+00f * (x*(xx - 3.0f*yy)),   /* ACN 15 = sqrt(35/8) * X * (X*X - 3*Y*Y) */
+        /* Fourth-order (formulas listed for reference only; not computed) */
+        /* ACN 16 = sqrt(35)*3/2 * X * Y * (X*X - Y*Y) */
+        /* ACN 17 = sqrt(35/2)*3/2 * (3*X*X - Y*Y) * Y * Z */
+        /* ACN 18 = sqrt(5)*3/2 * X * Y * (7*Z*Z - 1) */
+        /* ACN 19 = sqrt(5/2)*3/2 * Y * Z * (7*Z*Z - 3) */
+        /* ACN 20 = 3/8 * (35*Z*Z*Z*Z - 30*Z*Z + 3) */
+        /* ACN 21 = sqrt(5/2)*3/2 * X * Z * (7*Z*Z - 3) */
+        /* ACN 22 = sqrt(5)*3/4 * (X*X - Y*Y) * (7*Z*Z - 1) */
+        /* ACN 23 = sqrt(35/2)*3/2 * (X*X - 3*Y*Y) * X * Z */
+        /* ACN 24 = sqrt(35)*3/8 * (X*X*X*X - 6*X*X*Y*Y + Y*Y*Y*Y) */
+    }};
+}
+
+#endif /* CORE_AMBIDEFS_H */
diff --git a/core/async_event.h b/core/async_event.h
new file mode 100644
index 00000000..5a2f5f91
--- /dev/null
+++ b/core/async_event.h
@@ -0,0 +1,55 @@
+#ifndef CORE_EVENT_H
+#define CORE_EVENT_H
+
+#include "almalloc.h"
+
+struct EffectState;
+
+using uint = unsigned int;
+
+
+struct AsyncEvent {
+    enum : uint {
+        /* User event types. */
+        SourceStateChange,
+        BufferCompleted,
+        Disconnected,
+        UserEventCount, /* Count of the user event types above (3). */
+
+        /* Internal events, always processed. */
+        ReleaseEffectState = 128,
+
+        /* End event thread processing. */
+        KillThread, /* Implicitly 129. */
+    };
+
+    enum class SrcState {
+        Reset,
+        Stop,
+        Play,
+        Pause
+    };
+
+    const uint EnumType; /* The event type given at construction. */
+    union { /* Event payload; value-initialized. NOTE(review): member names suggest one per event type -- confirm. */
+        char dummy;
+        struct {
+            uint id;
+            SrcState state;
+        } srcstate;
+        struct {
+            uint id;
+            uint count;
+        } bufcomp;
+        struct {
+            char msg[244];
+        } disconnect;
+        EffectState *mEffectState;
+    } u{};
+
+    constexpr AsyncEvent(uint type) noexcept : EnumType{type} { }
+
+    DISABLE_ALLOC()
+};
+
+#endif
diff --git a/core/bformatdec.cpp b/core/bformatdec.cpp
new file mode 100644
index 00000000..129b9976
--- /dev/null
+++ b/core/bformatdec.cpp
@@ -0,0 +1,170 @@
+
+#include "config.h"
+
+#include "bformatdec.h"
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <utility>
+
+#include "almalloc.h"
+#include "alnumbers.h"
+#include "filters/splitter.h"
+#include "front_stablizer.h"
+#include "mixer.h"
+#include "opthelpers.h"
+
+
+BFormatDec::BFormatDec(const size_t inchans, const al::span<const ChannelDec> coeffs,
+    const al::span<const ChannelDec> coeffslf, const float xover_f0norm,
+    std::unique_ptr<FrontStablizer> stablizer)
+    : mStablizer{std::move(stablizer)}, mDualBand{!coeffslf.empty()}, mChannelDec{inchans}
+{
+    /* Each input matrix has one row per output, indexed by ambisonic channel.
+     * The stored gains are the transpose: for each ambisonic channel, one gain
+     * per output in the order the input rows were given.
+     */
+    const auto gather_column = [](const al::span<const ChannelDec> matrix, const size_t chan,
+        float *dst) noexcept
+    {
+        for(const ChannelDec &row : matrix)
+            *(dst++) = row[chan];
+    };
+    if(mDualBand)
+    {
+        /* Initialize one crossover filter and copy it to the other channels. */
+        mChannelDec[0].mXOver.init(xover_f0norm);
+        for(size_t chan{1};chan < mChannelDec.size();++chan)
+            mChannelDec[chan].mXOver = mChannelDec[0].mXOver;
+
+        for(size_t chan{0};chan < mChannelDec.size();++chan)
+        {
+            gather_column(coeffs, chan, mChannelDec[chan].mGains.Dual[sHFBand]);
+            gather_column(coeffslf, chan, mChannelDec[chan].mGains.Dual[sLFBand]);
+        }
+        return;
+    }
+    for(size_t chan{0};chan < mChannelDec.size();++chan)
+        gather_column(coeffs, chan, mChannelDec[chan].mGains.Single);
+}
+
+
+void BFormatDec::process(const al::span<FloatBufferLine> OutBuffer,
+    const FloatBufferLine *InSamples, const size_t SamplesToDo)
+{
+    ASSUME(SamplesToDo > 0);
+
+    if(!mDualBand)
+    {
+        /* Single-band: mix each input channel directly with its gains. */
+        for(auto &decoder : mChannelDec)
+        {
+            MixSamples({InSamples->data(), SamplesToDo}, OutBuffer, decoder.mGains.Single,
+                decoder.mGains.Single, 0, 0);
+            ++InSamples;
+        }
+        return;
+    }
+    /* Dual-band: split each input channel into its high- and low-frequency
+     * parts, and mix each part with that band's gains.
+     */
+    const al::span<float> highband{mSamples[sHFBand].data(), SamplesToDo};
+    const al::span<float> lowband{mSamples[sLFBand].data(), SamplesToDo};
+    for(auto &decoder : mChannelDec)
+    {
+        decoder.mXOver.process({InSamples->data(), SamplesToDo}, highband.data(), lowband.data());
+        auto &gains = decoder.mGains.Dual;
+        MixSamples(highband, OutBuffer, gains[sHFBand], gains[sHFBand], 0, 0);
+        MixSamples(lowband, OutBuffer, gains[sLFBand], gains[sLFBand], 0, 0);
+        ++InSamples;
+    }
+}
+
+void BFormatDec::processStablize(const al::span<FloatBufferLine> OutBuffer,
+    const FloatBufferLine *InSamples, const size_t lidx, const size_t ridx, const size_t cidx,
+    const size_t SamplesToDo)
+{
+    ASSUME(SamplesToDo > 0);
+
+    /* Move the existing direct L/R signal out, as mid (L+R) and side (L-R),
+     * so it doesn't get processed by the stablizer.
+     */
+    float *RESTRICT mid{al::assume_aligned<16>(mStablizer->MidDirect.data())};
+    float *RESTRICT side{al::assume_aligned<16>(mStablizer->Side.data())};
+    for(size_t i{0};i < SamplesToDo;++i)
+    {
+        mid[i] = OutBuffer[lidx][i] + OutBuffer[ridx][i];
+        side[i] = OutBuffer[lidx][i] - OutBuffer[ridx][i];
+    }
+    std::fill_n(OutBuffer[lidx].begin(), SamplesToDo, 0.0f);
+    std::fill_n(OutBuffer[ridx].begin(), SamplesToDo, 0.0f);
+
+    /* Decode the B-Format input to OutBuffer (L/R were zeroed just above). */
+    process(OutBuffer, InSamples, SamplesToDo);
+
+    /* Include the decoded side signal with the direct side signal. */
+    for(size_t i{0};i < SamplesToDo;++i)
+        side[i] += OutBuffer[lidx][i] - OutBuffer[ridx][i];
+
+    /* Get the decoded mid signal (L+R) and band-split it into MidHF/MidLF. */
+    std::transform(OutBuffer[lidx].cbegin(), OutBuffer[lidx].cbegin()+SamplesToDo,
+        OutBuffer[ridx].cbegin(), mStablizer->Temp.begin(),
+        [](const float l, const float r) noexcept { return l + r; });
+
+    mStablizer->MidFilter.process({mStablizer->Temp.data(), SamplesToDo}, mStablizer->MidHF.data(),
+        mStablizer->MidLF.data());
+
+    /* Apply an all-pass to all channels to match the band-splitter's phase
+     * shift. This is to keep the phase synchronized between the existing
+     * signal and the split mid signal.
+     */
+    const size_t NumChannels{OutBuffer.size()};
+    for(size_t i{0u};i < NumChannels;i++)
+    {
+        /* Skip the left and right channels, which are going to get overwritten,
+         * and substitute the direct mid signal and direct+decoded side signal.
+         */
+        if(i == lidx)
+            mStablizer->ChannelFilters[i].processAllPass({mid, SamplesToDo});
+        else if(i == ridx)
+            mStablizer->ChannelFilters[i].processAllPass({side, SamplesToDo});
+        else
+            mStablizer->ChannelFilters[i].processAllPass({OutBuffer[i].data(), SamplesToDo});
+    }
+
+    /* This pans the separate low- and high-frequency signals between being on
+     * the center channel and the left+right channels. The low-frequency signal
+     * is panned 1/3rd toward center and the high-frequency signal is panned
+     * 1/4th toward center. These values can be tweaked.
+     */
+    const float cos_lf{std::cos(1.0f/3.0f * (al::numbers::pi_v<float>*0.5f))};
+    const float cos_hf{std::cos(1.0f/4.0f * (al::numbers::pi_v<float>*0.5f))};
+    const float sin_lf{std::sin(1.0f/3.0f * (al::numbers::pi_v<float>*0.5f))};
+    const float sin_hf{std::sin(1.0f/4.0f * (al::numbers::pi_v<float>*0.5f))};
+    for(size_t i{0};i < SamplesToDo;i++)
+    {
+        /* Add the direct mid signal to the processed mid signal so it can be
+         * properly combined with the direct+decoded side signal.
+         */
+        const float m{mStablizer->MidLF[i]*cos_lf + mStablizer->MidHF[i]*cos_hf + mid[i]};
+        const float c{mStablizer->MidLF[i]*sin_lf + mStablizer->MidHF[i]*sin_hf};
+        const float s{side[i]};
+
+        /* The generated center channel signal adds to the existing signal,
+         * while the modified left and right channels replace. The 0.5 factor
+         * undoes the doubling from the mid/side (L+R, L-R) encoding.
+         */
+        OutBuffer[lidx][i] = (m + s) * 0.5f;
+        OutBuffer[ridx][i] = (m - s) * 0.5f;
+        OutBuffer[cidx][i] += c * 0.5f;
+    }
+}
+
+
+std::unique_ptr<BFormatDec> BFormatDec::Create(const size_t inchans,
+    const al::span<const ChannelDec> coeffs, const al::span<const ChannelDec> coeffslf,
+    const float xover_f0norm, std::unique_ptr<FrontStablizer> stablizer)
+{
+    return std::make_unique<BFormatDec>(inchans, coeffs, coeffslf, xover_f0norm,
+        std::move(stablizer)); /* the new decoder takes ownership of the stablizer */
+}
diff --git a/core/bformatdec.h b/core/bformatdec.h
new file mode 100644
index 00000000..7a27a5a4
--- /dev/null
+++ b/core/bformatdec.h
@@ -0,0 +1,71 @@
+#ifndef CORE_BFORMATDEC_H
+#define CORE_BFORMATDEC_H
+
+#include <array>
+#include <cstddef>
+#include <memory>
+
+#include "almalloc.h"
+#include "alspan.h"
+#include "ambidefs.h"
+#include "bufferline.h"
+#include "devformat.h"
+#include "filters/splitter.h"
+#include "vector.h"
+
+struct FrontStablizer;
+
+
+using ChannelDec = std::array<float,MaxAmbiChannels>;
+
+class BFormatDec {
+    static constexpr size_t sHFBand{0}; /* band index of the high-frequency gains/samples */
+    static constexpr size_t sLFBand{1}; /* band index of the low-frequency gains/samples */
+    static constexpr size_t sNumBands{2};
+
+    struct ChannelDecoder {
+        union MatrixU { /* Dual is indexed [band][output]; Single is indexed [output]. */
+            float Dual[sNumBands][MAX_OUTPUT_CHANNELS];
+            float Single[MAX_OUTPUT_CHANNELS];
+        } mGains{};
+
+        /* NOTE: BandSplitter filter is unused with single-band decoding. */
+        BandSplitter mXOver;
+    };
+
+    alignas(16) std::array<FloatBufferLine,2> mSamples; /* per-band scratch for dual-band decoding */
+
+    const std::unique_ptr<FrontStablizer> mStablizer; /* may be null; see hasStablizer() */
+    const bool mDualBand{false}; /* set when LF coefficients are given to the constructor */
+
+    /* TODO: This should ideally be a FlexArray, since ChannelDecoder is rather
+     * small and only a few are needed (3, 4, 5, 7, typically). But that can
+     * only be used in a standard layout struct, and a std::unique_ptr member
+     * (mStablizer) causes GCC and Clang to warn it's not.
+     */
+    al::vector<ChannelDecoder> mChannelDec; /* one entry per input (ambisonic) channel */
+
+public:
+    BFormatDec(const size_t inchans, const al::span<const ChannelDec> coeffs,
+        const al::span<const ChannelDec> coeffslf, const float xover_f0norm,
+        std::unique_ptr<FrontStablizer> stablizer);
+
+    bool hasStablizer() const noexcept { return mStablizer != nullptr; }
+
+    /* Decodes the ambisonic input to the given output channels. */
+    void process(const al::span<FloatBufferLine> OutBuffer, const FloatBufferLine *InSamples,
+        const size_t SamplesToDo);
+
+    /* Decodes the ambisonic input to the given output channels with stablization. */
+    void processStablize(const al::span<FloatBufferLine> OutBuffer,
+        const FloatBufferLine *InSamples, const size_t lidx, const size_t ridx, const size_t cidx,
+        const size_t SamplesToDo);
+
+    static std::unique_ptr<BFormatDec> Create(const size_t inchans,
+        const al::span<const ChannelDec> coeffs, const al::span<const ChannelDec> coeffslf,
+        const float xover_f0norm, std::unique_ptr<FrontStablizer> stablizer);
+
+    DEF_NEWDEL(BFormatDec)
+};
+
+#endif /* CORE_BFORMATDEC_H */
diff --git a/core/bs2b.cpp b/core/bs2b.cpp
new file mode 100644
index 00000000..303bf9bd
--- /dev/null
+++ b/core/bs2b.cpp
@@ -0,0 +1,183 @@
+/*-
+ * Copyright (c) 2005 Boris Mikhaylov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "config.h"
+
+#include <algorithm>
+#include <cmath>
+#include <iterator>
+
+#include "alnumbers.h"
+#include "bs2b.h"
+
+
+/* Computes the filter coefficients from bs2b->level and bs2b->srate. */
+static void init(struct bs2b *bs2b)
+{
+    float Fc_lo, Fc_hi; /* lo/hi filter frequencies; divided by srate below */
+    float G_lo, G_hi;   /* lo/hi gains */
+    float x, g;
+
+    switch(bs2b->level)
+    {
+    case BS2B_LOW_CLEVEL: /* Low crossfeed level */
+        Fc_lo = 360.0f;
+        Fc_hi = 501.0f;
+        G_lo  = 0.398107170553497f;
+        G_hi  = 0.205671765275719f;
+        break;
+
+    case BS2B_MIDDLE_CLEVEL: /* Middle crossfeed level */
+        Fc_lo = 500.0f;
+        Fc_hi = 711.0f;
+        G_lo  = 0.459726988530872f;
+        G_hi  = 0.228208484414988f;
+        break;
+
+    case BS2B_HIGH_CLEVEL: /* High crossfeed level (virtual speakers are closer to itself) */
+        Fc_lo = 700.0f;
+        Fc_hi = 1021.0f;
+        G_lo  = 0.530884444230988f;
+        G_hi  = 0.250105790667544f;
+        break;
+
+    case BS2B_LOW_ECLEVEL: /* Low easy crossfeed level */
+        Fc_lo = 360.0f;
+        Fc_hi = 494.0f;
+        G_lo  = 0.316227766016838f;
+        G_hi  = 0.168236228897329f;
+        break;
+
+    case BS2B_MIDDLE_ECLEVEL: /* Middle easy crossfeed level */
+        Fc_lo = 500.0f;
+        Fc_hi = 689.0f;
+        G_lo  = 0.354813389233575f;
+        G_hi  = 0.187169483835901f;
+        break;
+
+    default: /* High easy crossfeed level */
+        bs2b->level = BS2B_HIGH_ECLEVEL; /* any unrecognized level is reset to this one */
+
+        Fc_lo = 700.0f;
+        Fc_hi = 975.0f;
+        G_lo  = 0.398107170553497f;
+        G_hi  = 0.205671765275719f;
+        break;
+    } /* switch */
+
+    g = 1.0f / (1.0f - G_hi + G_lo); /* common factor applied to the a0/a1 coefficients */
+
+    /* $fc = $Fc / $s;
+     * $d  = 1 / 2 / pi / $fc;
+     * $x  = exp(-1 / $d);   i.e. x = exp(-2*pi*Fc / srate)
+     */
+    x           = std::exp(-al::numbers::pi_v<float>*2.0f*Fc_lo/static_cast<float>(bs2b->srate));
+    bs2b->b1_lo = x;
+    bs2b->a0_lo = G_lo * (1.0f - x) * g;
+
+    x           = std::exp(-al::numbers::pi_v<float>*2.0f*Fc_hi/static_cast<float>(bs2b->srate));
+    bs2b->b1_hi = x;
+    bs2b->a0_hi = (1.0f - G_hi * (1.0f - x)) * g;
+    bs2b->a1_hi = -x * g;
+} /* init */
+
+
+/* Exported functions.
+ * See descriptions in "bs2b.h"
+ */
+
+void bs2b_set_params(struct bs2b *bs2b, int level, int srate)
+{
+    /* Store the level and a sample rate clamped to at least 1, then
+     * recompute the filter coefficients. */
+    bs2b->level = level;
+    bs2b->srate = std::max(srate, 1);
+    init(bs2b);
+} /* bs2b_set_params */
+
+int bs2b_get_level(struct bs2b *bs2b)
+{
+    return bs2b->level; /* the crossfeed level currently stored in the state */
+} /* bs2b_get_level */
+
+int bs2b_get_srate(struct bs2b *bs2b)
+{
+    return bs2b->srate; /* the sample rate currently stored in the state */
+} /* bs2b_get_srate */
+
+void bs2b_clear(struct bs2b *bs2b)
+{
+    for(auto &state : bs2b->history) state = bs2b::t_last_sample{}; /* reset every history entry */
+} /* bs2b_clear */
+
+void bs2b_cross_feed(struct bs2b *bs2b, float *Left, float *Right, size_t SamplesToDo)
+{
+    /* Work in fixed-size chunks so the filtered samples fit in local arrays. */
+    constexpr size_t ChunkSize{128};
+    float lfiltered[ChunkSize][2];
+    float rfiltered[ChunkSize][2];
+
+    const float lo_a0{bs2b->a0_lo};
+    const float lo_b1{bs2b->b1_lo};
+    const float hi_a0{bs2b->a0_hi};
+    const float hi_a1{bs2b->a1_hi};
+    const float hi_b1{bs2b->b1_hi};
+
+    /* Filters 'count' input samples, writing the lo result to output[i][0]
+     * and the hi result to output[i][1]. The filter state is read from and
+     * written back to 'state' so it carries over between chunks and calls.
+     */
+    const auto run_filters = [lo_a0,lo_b1,hi_a0,hi_a1,hi_b1](const float *input, auto &state,
+        float (*output)[2], const size_t count) noexcept
+    {
+        float z_lo{state.lo};
+        float z_hi{state.hi};
+        for(size_t i{0};i < count;++i)
+        {
+            output[i][0] = lo_a0*input[i] + z_lo;
+            z_lo = lo_b1*output[i][0];
+
+            output[i][1] = hi_a0*input[i] + z_hi;
+            z_hi = hi_a1*input[i] + hi_b1*output[i][1];
+        }
+        state.lo = z_lo;
+        state.hi = z_hi;
+    };
+
+    size_t remaining{SamplesToDo};
+    while(remaining > 0)
+    {
+        const size_t todo{std::min(ChunkSize, remaining)};
+        run_filters(Left, bs2b->history[0], lfiltered, todo);
+        run_filters(Right, bs2b->history[1], rfiltered, todo);
+
+        /* Crossfeed: each side's hi result plus the other side's lo result. */
+        for(size_t i{0};i < todo;++i)
+            Left[i] = lfiltered[i][1] + rfiltered[i][0];
+        for(size_t i{0};i < todo;++i)
+            Right[i] = rfiltered[i][1] + lfiltered[i][0];
+        Left += todo;
+        Right += todo;
+        remaining -= todo;
+    }
+} /* bs2b_cross_feed */
diff --git a/core/bs2b.h b/core/bs2b.h
new file mode 100644
index 00000000..4d0b9dd8
--- /dev/null
+++ b/core/bs2b.h
@@ -0,0 +1,89 @@
+/*-
+ * Copyright (c) 2005 Boris Mikhaylov
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef CORE_BS2B_H
+#define CORE_BS2B_H
+
+#include "almalloc.h"
+
+/* Number of crossfeed levels */
+#define BS2B_CLEVELS           3
+
+/* Normal crossfeed levels */
+#define BS2B_HIGH_CLEVEL       3
+#define BS2B_MIDDLE_CLEVEL     2
+#define BS2B_LOW_CLEVEL        1
+
+/* Easy crossfeed levels. Each one is the matching normal level offset by
+ * BS2B_CLEVELS. The replacement lists are parenthesized so the macros expand
+ * to a single operand when used inside a larger expression.
+ */
+#define BS2B_HIGH_ECLEVEL      (BS2B_HIGH_CLEVEL    + BS2B_CLEVELS)
+#define BS2B_MIDDLE_ECLEVEL    (BS2B_MIDDLE_CLEVEL  + BS2B_CLEVELS)
+#define BS2B_LOW_ECLEVEL       (BS2B_LOW_CLEVEL     + BS2B_CLEVELS)
+
+/* Default crossfeed levels */
+#define BS2B_DEFAULT_CLEVEL    BS2B_HIGH_ECLEVEL
+/* Default sample rate (Hz) */
+#define BS2B_DEFAULT_SRATE     44100
+
+/* Crossfeed filter state: the configured level and sample rate, the
+ * coefficients of the lowpass and highboost filters, and the filter history
+ * carried between processing calls for each of the two channels.
+ */
+struct bs2b {
+    int level;  /* Crossfeed level */
+    int srate;   /* Sample rate (Hz) */
+
+    /* Lowpass IIR filter coefficients */
+    float a0_lo;
+    float b1_lo;
+
+    /* Highboost IIR filter coefficients */
+    float a0_hi;
+    float a1_hi;
+    float b1_hi;
+
+    /* Buffer of filter history
+     * [0] - first channel, [1] - second channel
+     */
+    struct t_last_sample {
+        float lo; /* State carried over for the lowpass filter */
+        float hi; /* State carried over for the highboost filter */
+    } history[2];
+
+    /* NOTE(review): presumably declares class-specific new/delete operators;
+     * the macro is expected to come from almalloc.h - confirm there.
+     */
+    DEF_NEWDEL(bs2b)
+};
+
+/* Clear buffers and set new coefficients with new crossfeed level and sample
+ * rate values.
+ * level - crossfeed level, one of the *LEVEL values.
+ * srate - sample rate in Hz.
+ */
+void bs2b_set_params(bs2b *bs2b, int level, int srate);
+
+/* Return current crossfeed level value */
+int bs2b_get_level(bs2b *bs2b);
+
+/* Return current sample rate value */
+int bs2b_get_srate(bs2b *bs2b);
+
+/* Clear the filter history of both channels */
+void bs2b_clear(bs2b *bs2b);
+
+/* Apply the crossfeed filter in place to SamplesToDo samples of the Left and
+ * Right buffers. The filter history stored in bs2b is read and updated, so
+ * consecutive calls continue the same filter state.
+ */
+void bs2b_cross_feed(bs2b *bs2b, float *Left, float *Right, size_t SamplesToDo);
+
+#endif /* CORE_BS2B_H */
diff --git a/core/bsinc_defs.h b/core/bsinc_defs.h
new file mode 100644
index 00000000..01bd3c29
--- /dev/null
+++ b/core/bsinc_defs.h
@@ -0,0 +1,12 @@
+#ifndef CORE_BSINC_DEFS_H
+#define CORE_BSINC_DEFS_H
+
+/* The number of distinct scale and phase intervals within the bsinc filter
+ * tables. Each count is two to the power of the matching bit width (16 scale
+ * intervals and 32 phase intervals with the values below).
+ */
+constexpr unsigned int BSincScaleBits{4};
+constexpr unsigned int BSincScaleCount{1 << BSincScaleBits};
+constexpr unsigned int BSincPhaseBits{5};
+constexpr unsigned int BSincPhaseCount{1 << BSincPhaseBits};
+
+#endif /* CORE_BSINC_DEFS_H */
diff --git a/core/bsinc_tables.cpp b/core/bsinc_tables.cpp
new file mode 100644
index 00000000..693645f4
--- /dev/null
+++ b/core/bsinc_tables.cpp
@@ -0,0 +1,295 @@
+
+#include "bsinc_tables.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+
+#include "alnumbers.h"
+#include "core/mixer/defs.h"
+
+
+namespace {
+
+using uint = unsigned int;
+
+
+/* The normalized cardinal sine (sinc) function.
+ *
+ *   sinc(x) = { 1,                   x = 0
+ *             { sin(pi x) / (pi x),  otherwise.
+ *
+ * Any x within machine epsilon of zero is treated as zero. A NaN input fails
+ * both range comparisons and therefore also yields 1.
+ */
+constexpr double Sinc(const double x)
+{
+    constexpr double epsilon{std::numeric_limits<double>::epsilon()};
+    const bool nonzero{x > epsilon || x < -epsilon};
+    if(nonzero)
+    {
+        const auto pix = al::numbers::pi*x;
+        return std::sin(pix) / pix;
+    }
+    return 1.0;
+}
+
+/* The zero-order modified Bessel function of the first kind, used for the
+ * Kaiser window.
+ *
+ *   I_0(x) = sum_{k=0}^inf (1 / k!)^2 (x / 2)^(2 k)
+ *          = sum_{k=0}^inf ((x / 2)^k / k!)^2
+ *
+ * The series is accumulated term by term until adding another term no longer
+ * changes the running total.
+ */
+constexpr double BesselI_0(const double x) noexcept
+{
+    const double half_x{x / 2.0};
+
+    /* The k=0 term is 1, so both the term and the total start from it. */
+    double term{1.0};
+    double total{1.0};
+    for(int k{1};;++k)
+    {
+        /* Each term is the previous one scaled by ((x/2) / k)^2. */
+        const double ratio{half_x / k};
+        term *= ratio * ratio;
+
+        const double previous{total};
+        total += term;
+        if(total == previous)
+            break;
+    }
+
+    return total;
+}
+
+/* Calculate a Kaiser window from the given beta value and a normalized k
+ * [-1, 1].
+ *
+ *   w(k) = { I_0(B sqrt(1 - k^2)) / I_0(B),  -1 <= k <= 1
+ *          { 0,                              elsewhere.
+ *
+ * Where k can be calculated as:
+ *
+ *   k = i / l,         where -l <= i <= l.
+ *
+ * or:
+ *
+ *   k = 2 i / M - 1,   where 0 <= i <= M.
+ */
+constexpr double Kaiser(const double beta, const double k, const double besseli_0_beta)
+{
+    /* Outside [-1, 1] the window is zero. A NaN k fails the range test and is
+     * handled the same way.
+     */
+    const bool in_window{k >= -1.0 && k <= 1.0};
+    if(in_window)
+    {
+        /* besseli_0_beta is the caller-supplied I_0(beta) denominator. */
+        const double numerator{BesselI_0(beta * std::sqrt(1.0 - k*k))};
+        return numerator / besseli_0_beta;
+    }
+    return 0.0;
+}
+
+/* Calculates the (normalized frequency) transition width of the Kaiser window
+ * for a filter of the given order. Rejection is in dB.
+ */
+constexpr double CalcKaiserWidth(const double rejection, const uint order) noexcept
+{
+    /* Rejections that aren't above 21.19dB (including NaN) use a fixed
+     * numerator. This enforces a minimum rejection of just above 21.18dB.
+     */
+    if(!(rejection > 21.19))
+        return 5.79 / (al::numbers::pi*2.0 * order);
+    return (rejection - 7.95) / (2.285 * al::numbers::pi*2.0 * order);
+}
+
+/* Calculates the beta value of the Kaiser window. Rejection is in dB. Three
+ * ranges are handled: above 50dB, from 21dB to 50dB, and everything else
+ * (including NaN), which gives a beta of 0.
+ */
+constexpr double CalcKaiserBeta(const double rejection)
+{
+    if(rejection > 50.0)
+        return 0.1102 * (rejection-8.7);
+    if(!(rejection >= 21.0))
+        return 0.0;
+
+    const double excess{rejection-21.0};
+    return (0.5842 * std::pow(excess, 0.4)) + (0.07886 * excess);
+}
+
+
+/* Compile-time description of one bsinc filter family, derived from a
+ * stop-band rejection (in dB) and a filter order.
+ */
+struct BSincHeader {
+    double width{};          /* Kaiser transition width (CalcKaiserWidth). */
+    double beta{};           /* Kaiser beta (CalcKaiserBeta). */
+    double scaleBase{};      /* Half of the transition width. */
+    double scaleRange{};     /* 1 - scaleBase. */
+    double besseli_0_beta{}; /* Precomputed I_0(beta), the Kaiser denominator. */
+
+    /* Half the number of sample points used at each scale index. */
+    uint a[BSincScaleCount]{};
+    /* Number of floats needed to store the generated table for all scales. */
+    uint total_size{};
+
+    constexpr BSincHeader(uint Rejection, uint Order) noexcept
+    {
+        width = CalcKaiserWidth(Rejection, Order);
+        beta = CalcKaiserBeta(Rejection);
+        scaleBase = width / 2.0;
+        scaleRange = 1.0 - scaleBase;
+        besseli_0_beta = BesselI_0(beta);
+
+        uint num_points{Order+1};
+        for(uint si{0};si < BSincScaleCount;++si)
+        {
+            /* The scale for index si runs from just above scaleBase up to
+             * exactly 1.0 at the last index.
+             */
+            const double scale{scaleBase + (scaleRange * (si+1) / BSincScaleCount)};
+            /* Smaller scales get more points, capped at num_points. */
+            const uint a_{std::min(static_cast<uint>(num_points / 2.0 / scale), num_points)};
+            const uint m{2 * a_};
+
+            a[si] = a_;
+            /* Four sets of values are stored per phase, each holding m points
+             * rounded up to a multiple of 4.
+             */
+            total_size += 4 * BSincPhaseCount * ((m+3) & ~3u);
+        }
+    }
+};
+
+/* 11th and 23rd order filters (12 and 24-point respectively) with a 60dB drop
+ * at nyquist. Each filter will scale up the order when downsampling, to 23rd
+ * and 47th order respectively. The constructor arguments are the rejection in
+ * dB followed by the filter order.
+ */
+constexpr BSincHeader bsinc12_hdr{60, 11};
+constexpr BSincHeader bsinc24_hdr{60, 23};
+
+
+/* NOTE: GCC 5 has an issue with BSincHeader objects being in an anonymous
+ * namespace while also being used as non-type template parameters.
+ */
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 6
+
+/* The number of sample points is double the a value (rounded up to a multiple
+ * of 4), and scale index 0 includes the doubling for downsampling. bsinc24 is
+ * currently the highest quality filter, and will use the most sample points.
+ */
+constexpr uint BSincPointsMax{(bsinc24_hdr.a[0]*2 + 3) & ~3u};
+static_assert(BSincPointsMax <= MaxResamplerPadding, "MaxResamplerPadding is too small");
+
+/* GCC 5 variant: templated on the table size only, with the header passed to
+ * the constructor and kept as a reference member.
+ */
+template<size_t total_size>
+struct BSincFilterArray {
+    alignas(16) std::array<float, total_size> mTable;
+    const BSincHeader &hdr;
+
+    BSincFilterArray(const BSincHeader &hdr_) : hdr{hdr_}
+    {
+#else
+/* Default variant: templated directly on the header object, so the table size
+ * and the maximum point count are taken from it at compile time.
+ */
+template<const BSincHeader &hdr>
+struct BSincFilterArray {
+    alignas(16) std::array<float, hdr.total_size> mTable{};
+
+    BSincFilterArray()
+    {
+        constexpr uint BSincPointsMax{(hdr.a[0]*2 + 3) & ~3u};
+        static_assert(BSincPointsMax <= MaxResamplerPadding, "MaxResamplerPadding is too small");
+#endif
+        /* Temporary double-precision table indexed [scale][phase][point]. It
+         * is heap-allocated and value-initialized, so any point that isn't
+         * written below stays zero.
+         */
+        using filter_type = double[BSincPhaseCount+1][BSincPointsMax];
+        auto filter = std::make_unique<filter_type[]>(BSincScaleCount);
+
+        /* Calculate the Kaiser-windowed Sinc filter coefficients for each
+         * scale and phase index.
+         */
+        for(uint si{0};si < BSincScaleCount;++si)
+        {
+            const uint m{hdr.a[si] * 2};
+            /* Offset that centers this scale's m points within a row of
+             * BSincPointsMax entries.
+             */
+            const size_t o{(BSincPointsMax-m) / 2};
+            const double scale{hdr.scaleBase + (hdr.scaleRange * (si+1) / BSincScaleCount)};
+            const double cutoff{scale - (hdr.scaleBase * std::max(1.0, scale*2.0))};
+            const auto a = static_cast<double>(hdr.a[si]);
+            const double l{a - 1.0/BSincPhaseCount};
+
+            /* Do one extra phase index so that the phase delta has a proper
+             * target for its last index.
+             */
+            for(uint pi{0};pi <= BSincPhaseCount;++pi)
+            {
+                const double phase{std::floor(l) + (pi/double{BSincPhaseCount})};
+
+                for(uint i{0};i < m;++i)
+                {
+                    const double x{i - phase};
+                    filter[si][pi][o+i] = Kaiser(hdr.beta, x/l, hdr.besseli_0_beta) * cutoff *
+                        Sinc(cutoff*x);
+                }
+            }
+        }
+
+        /* Flatten the temporary table into mTable. For each scale the layout
+         * is: per phase, m coefficients followed by m phase deltas; then per
+         * phase, m scale deltas followed by m scale-phase deltas.
+         */
+        size_t idx{0};
+        for(size_t si{0};si < BSincScaleCount;++si)
+        {
+            /* Here m is the point count rounded up to a multiple of 4, and o
+             * is recomputed from that rounded count.
+             */
+            const size_t m{((hdr.a[si]*2) + 3) & ~3u};
+            const size_t o{(BSincPointsMax-m) / 2};
+
+            /* Write out each phase index's filter and phase delta for this
+             * quality scale.
+             */
+            for(size_t pi{0};pi < BSincPhaseCount;++pi)
+            {
+                for(size_t i{0};i < m;++i)
+                    mTable[idx++] = static_cast<float>(filter[si][pi][o+i]);
+
+                /* Linear interpolation between phases is simplified by pre-
+                 * calculating the delta (b - a) in: x = a + f (b - a)
+                 */
+                for(size_t i{0};i < m;++i)
+                {
+                    const double phDelta{filter[si][pi+1][o+i] - filter[si][pi][o+i]};
+                    mTable[idx++] = static_cast<float>(phDelta);
+                }
+            }
+            /* Calculate and write out each phase index's filter quality scale
+             * deltas. The last scale index doesn't have any scale or scale-
+             * phase deltas.
+             */
+            if(si == BSincScaleCount-1)
+            {
+                for(size_t i{0};i < BSincPhaseCount*m*2;++i)
+                    mTable[idx++] = 0.0f;
+            }
+            else for(size_t pi{0};pi < BSincPhaseCount;++pi)
+            {
+                /* Linear interpolation between scales is also simplified.
+                 *
+                 * Given a difference in the number of points between scales,
+                 * the destination points will be 0, thus: x = a + f (-a)
+                 */
+                for(size_t i{0};i < m;++i)
+                {
+                    const double scDelta{filter[si+1][pi][o+i] - filter[si][pi][o+i]};
+                    mTable[idx++] = static_cast<float>(scDelta);
+                }
+
+                /* This last simplification is done to complete the bilinear
+                 * equation for the combination of phase and scale.
+                 */
+                for(size_t i{0};i < m;++i)
+                {
+                    const double spDelta{(filter[si+1][pi+1][o+i] - filter[si+1][pi][o+i]) -
+                        (filter[si][pi+1][o+i] - filter[si][pi][o+i])};
+                    mTable[idx++] = static_cast<float>(spDelta);
+                }
+            }
+        }
+        /* Everything written must exactly fill the size the header computed. */
+        assert(idx == hdr.total_size);
+    }
+
+    constexpr const BSincHeader &getHeader() const noexcept { return hdr; }
+    constexpr const float *getTable() const noexcept { return &mTable.front(); }
+};
+
+/* The generated filter tables. They are built by the BSincFilterArray
+ * constructor, so they are filled in during dynamic initialization rather
+ * than at compile time. The two branches match the two BSincFilterArray
+ * variants.
+ */
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 6
+const BSincFilterArray<bsinc12_hdr.total_size> bsinc12_filter{bsinc12_hdr};
+const BSincFilterArray<bsinc24_hdr.total_size> bsinc24_filter{bsinc24_hdr};
+#else
+const BSincFilterArray<bsinc12_hdr> bsinc12_filter{};
+const BSincFilterArray<bsinc24_hdr> bsinc24_filter{};
+#endif
+
+/* Builds the BSincTable descriptor for a generated filter array: the scale
+ * parameters, the per-scale point counts (rounded up to a multiple of 4), the
+ * offset of each scale's data within the table, and the table pointer itself.
+ */
+template<typename T>
+constexpr BSincTable GenerateBSincTable(const T &filter)
+{
+    const BSincHeader &header = filter.getHeader();
+
+    BSincTable table{};
+    table.scaleBase = static_cast<float>(header.scaleBase);
+    /* The range is stored as its reciprocal. */
+    table.scaleRange = static_cast<float>(1.0 / header.scaleRange);
+
+    /* Each scale occupies 4 sets of BSincPhaseCount*m values, so the offsets
+     * are a running sum over the preceding scales.
+     */
+    uint offset{0};
+    for(size_t si{0};si < BSincScaleCount;++si)
+    {
+        const uint points{((header.a[si]*2) + 3) & ~3u};
+        table.m[si] = points;
+        table.filterOffset[si] = offset;
+        offset += points*4*BSincPhaseCount;
+    }
+
+    table.Tab = filter.getTable();
+    return table;
+}
+
+} // namespace
+
+/* Table descriptors for the 12-point and 24-point bsinc filters. Each one
+ * points into the matching filter array defined in this file.
+ */
+const BSincTable gBSinc12{GenerateBSincTable(bsinc12_filter)};
+const BSincTable gBSinc24{GenerateBSincTable(bsinc24_filter)};
diff --git a/core/bsinc_tables.h b/core/bsinc_tables.h
new file mode 100644
index 00000000..aca4b274
--- /dev/null
+++ b/core/bsinc_tables.h
@@ -0,0 +1,17 @@
+#ifndef CORE_BSINC_TABLES_H
+#define CORE_BSINC_TABLES_H
+
+#include "bsinc_defs.h"
+
+
+/* Describes a generated bsinc filter table. */
+struct BSincTable {
+    /* scaleBase is the header's scale base; scaleRange is filled in by the
+     * table generator as the reciprocal of the header's scale range.
+     */
+    float scaleBase, scaleRange;
+    /* Number of points per scale index, rounded up to a multiple of 4. */
+    unsigned int m[BSincScaleCount];
+    /* Offset, in floats, of each scale index's data within Tab. */
+    unsigned int filterOffset[BSincScaleCount];
+    /* The filter data itself; not owned by this struct. */
+    const float *Tab;
+};
+
+/* Descriptors for the 12-point and 24-point bsinc filters. */
+extern const BSincTable gBSinc12;
+extern const BSincTable gBSinc24;
+
+#endif /* CORE_BSINC_TABLES_H */
diff --git a/core/buffer_storage.cpp b/core/buffer_storage.cpp
new file mode 100644
index 00000000..98ca2c1b
--- /dev/null
+++ b/core/buffer_storage.cpp
@@ -0,0 +1,81 @@
+
+#include "config.h"
+
+#include "buffer_storage.h"
+
+#include <stdint.h>
+
+
+/* Returns a printable name for a sample type. A value that matches none of
+ * the enumerators yields "<internal error>".
+ */
+const char *NameFromFormat(FmtType type) noexcept
+{
+    const char *name{"<internal error>"};
+    switch(type)
+    {
+    case FmtUByte:   name = "UInt8"; break;
+    case FmtShort:   name = "Int16"; break;
+    case FmtFloat:   name = "Float"; break;
+    case FmtDouble:  name = "Double"; break;
+    case FmtMulaw:   name = "muLaw"; break;
+    case FmtAlaw:    name = "aLaw"; break;
+    case FmtIMA4:    name = "IMA4 ADPCM"; break;
+    case FmtMSADPCM: name = "MS ADPCM"; break;
+    }
+    return name;
+}
+
+/* Returns a printable name for a channel configuration. A value that matches
+ * none of the enumerators yields "<internal error>".
+ */
+const char *NameFromFormat(FmtChannels channels) noexcept
+{
+    const char *name{"<internal error>"};
+    switch(channels)
+    {
+    case FmtMono:        name = "Mono"; break;
+    case FmtStereo:      name = "Stereo"; break;
+    case FmtRear:        name = "Rear"; break;
+    case FmtQuad:        name = "Quadraphonic"; break;
+    case FmtX51:         name = "Surround 5.1"; break;
+    case FmtX61:         name = "Surround 6.1"; break;
+    case FmtX71:         name = "Surround 7.1"; break;
+    case FmtBFormat2D:   name = "B-Format 2D"; break;
+    case FmtBFormat3D:   name = "B-Format 3D"; break;
+    case FmtUHJ2:        name = "UHJ2"; break;
+    case FmtUHJ3:        name = "UHJ3"; break;
+    case FmtUHJ4:        name = "UHJ4"; break;
+    case FmtSuperStereo: name = "Super Stereo"; break;
+    }
+    return name;
+}
+
+/* Returns the size in bytes of one sample of the given type. The two ADPCM
+ * types, and any unrecognized value, yield 0.
+ */
+uint BytesFromFmt(FmtType type) noexcept
+{
+    switch(type)
+    {
+    /* Types stored as one byte per sample. */
+    case FmtUByte:
+    case FmtMulaw:
+    case FmtAlaw:
+        return sizeof(uint8_t);
+
+    case FmtShort:
+        return sizeof(int16_t);
+    case FmtFloat:
+        return sizeof(float);
+    case FmtDouble:
+        return sizeof(double);
+
+    /* No per-sample byte size is reported for the ADPCM types. */
+    case FmtIMA4:
+    case FmtMSADPCM:
+        break;
+    }
+    return 0;
+}
+
+/* Returns the number of channels for a channel configuration. ambiorder only
+ * affects the two B-Format configurations. An unrecognized value yields 0.
+ */
+uint ChannelsFromFmt(FmtChannels chans, uint ambiorder) noexcept
+{
+    switch(chans)
+    {
+    case FmtMono:
+        return 1;
+
+    case FmtStereo:
+    case FmtRear:
+    case FmtUHJ2:
+    case FmtSuperStereo:
+        return 2;
+
+    case FmtUHJ3:
+        return 3;
+
+    case FmtQuad:
+    case FmtUHJ4:
+        return 4;
+
+    case FmtX51:
+        return 6;
+    case FmtX61:
+        return 7;
+    case FmtX71:
+        return 8;
+
+    /* 2D B-Format uses 2*order+1 channels, and 3D uses (order+1)^2. */
+    case FmtBFormat2D:
+        return (ambiorder*2) + 1;
+    case FmtBFormat3D:
+        return (ambiorder+1) * (ambiorder+1);
+    }
+    return 0;
+}
diff --git a/core/buffer_storage.h b/core/buffer_storage.h
new file mode 100644
index 00000000..282d5b53
--- /dev/null
+++ b/core/buffer_storage.h
@@ -0,0 +1,115 @@
+#ifndef CORE_BUFFER_STORAGE_H
+#define CORE_BUFFER_STORAGE_H
+
+#include <atomic>
+
+#include "albyte.h"
+#include "alnumeric.h"
+#include "alspan.h"
+#include "ambidefs.h"
+
+
+using uint = unsigned int;
+
+/* Storable formats. The per-sample sizes noted here are the ones reported by
+ * BytesFromFmt.
+ */
+enum FmtType : unsigned char {
+    FmtUByte,   /* sizeof(uint8_t) */
+    FmtShort,   /* sizeof(int16_t) */
+    FmtFloat,   /* sizeof(float) */
+    FmtDouble,  /* sizeof(double) */
+    FmtMulaw,   /* sizeof(uint8_t) */
+    FmtAlaw,    /* sizeof(uint8_t) */
+    FmtIMA4,    /* BytesFromFmt reports 0 */
+    FmtMSADPCM, /* BytesFromFmt reports 0 */
+};
+/* Storable channel configurations. ChannelsFromFmt gives the channel count
+ * for each, which for the B-Format entries depends on the ambisonic order.
+ */
+enum FmtChannels : unsigned char {
+    FmtMono,
+    FmtStereo,
+    FmtRear,
+    FmtQuad,
+    FmtX51, /* (WFX order) */
+    FmtX61, /* (WFX order) */
+    FmtX71, /* (WFX order) */
+    FmtBFormat2D,
+    FmtBFormat3D,
+    FmtUHJ2, /* 2-channel UHJ, aka "BHJ", stereo-compatible */
+    FmtUHJ3, /* 3-channel UHJ, aka "THJ" */
+    FmtUHJ4, /* 4-channel UHJ, aka "PHJ" */
+    FmtSuperStereo, /* Stereo processed with Super Stereo. */
+};
+
+/* Channel ordering used by stored ambisonic data. */
+enum class AmbiLayout : unsigned char {
+    FuMa,
+    ACN,
+};
+/* Channel scaling (normalization) used by stored ambisonic data. */
+enum class AmbiScaling : unsigned char {
+    FuMa,
+    SN3D,
+    N3D,
+    UHJ,
+};
+
+/* Printable names for a sample type or channel configuration. */
+const char *NameFromFormat(FmtType type) noexcept;
+const char *NameFromFormat(FmtChannels channels) noexcept;
+
+/* Size in bytes of one sample of the given type (0 for the ADPCM types). */
+uint BytesFromFmt(FmtType type) noexcept;
+/* Number of channels for the given configuration; ambiorder is only used for
+ * the B-Format configurations.
+ */
+uint ChannelsFromFmt(FmtChannels chans, uint ambiorder) noexcept;
+/* Size in bytes of one sample frame: the channel count times the per-sample
+ * byte size.
+ */
+inline uint FrameSizeFromFmt(FmtChannels chans, FmtType type, uint ambiorder) noexcept
+{
+    const uint channels{ChannelsFromFmt(chans, ambiorder)};
+    const uint bytes{BytesFromFmt(type)};
+    return channels * bytes;
+}
+
+/* True for the two native B-Format channel configurations. */
+constexpr bool IsBFormat(FmtChannels chans) noexcept
+{
+    return (chans == FmtBFormat3D) || (chans == FmtBFormat2D);
+}
+
+/* Super Stereo is considered part of the UHJ family here: it goes through
+ * similar processing as UHJ, both result in a B-Format signal, and it needs
+ * the same consideration as BHJ (a three channel result from only two input
+ * channels).
+ */
+constexpr bool IsUHJ(FmtChannels chans) noexcept
+{
+    return (chans == FmtSuperStereo)
+        || (chans == FmtUHJ4)
+        || (chans == FmtUHJ3)
+        || (chans == FmtUHJ2);
+}
+
+/** A format is ambisonic when it is a UHJ-family format or a B-Format one. */
+constexpr bool IsAmbisonic(FmtChannels chans) noexcept
+{
+    return IsUHJ(chans) || IsBFormat(chans);
+}
+
+/* True for the ambisonic formats treated as 2D: 2D B-Format, 2- and 3-channel
+ * UHJ, and Super Stereo. FmtUHJ4 and FmtBFormat3D are not in this set.
+ */
+constexpr bool Is2DAmbisonic(FmtChannels chans) noexcept
+{
+    return (chans == FmtSuperStereo)
+        || (chans == FmtUHJ3)
+        || (chans == FmtUHJ2)
+        || (chans == FmtBFormat2D);
+}
+
+
+/* Signature of a buffer callback: two opaque pointers and an int, returning
+ * an int. NOTE(review): presumably (user data, output buffer, byte count) ->
+ * bytes written - confirm against the code that invokes it.
+ */
+using CallbackType = int(*)(void*, void*, int);
+
+/* Describes a block of stored audio: where the data is, how it is formatted,
+ * and (for ambisonic data) its layout, scaling and order.
+ */
+struct BufferStorage {
+    CallbackType mCallback{nullptr};
+    void *mUserData{nullptr};
+
+    /* View of the sample data; this struct doesn't manage its lifetime. */
+    al::span<al::byte> mData;
+
+    uint mSampleRate{0u};
+    FmtChannels mChannels{FmtMono};
+    FmtType mType{FmtShort};
+    uint mSampleLen{0u};
+    /* Only read here for the two ADPCM types, see blockSizeFromFmt. */
+    uint mBlockAlign{0u};
+
+    AmbiLayout mAmbiLayout{AmbiLayout::FuMa};
+    AmbiScaling mAmbiScaling{AmbiScaling::FuMa};
+    uint mAmbiOrder{0u};
+
+    /* Convenience wrappers applying the free functions to this buffer's own
+     * type, channel configuration and ambisonic order.
+     */
+    inline uint bytesFromFmt() const noexcept { return BytesFromFmt(mType); }
+    inline uint channelsFromFmt() const noexcept
+    { return ChannelsFromFmt(mChannels, mAmbiOrder); }
+    inline uint frameSizeFromFmt() const noexcept { return channelsFromFmt() * bytesFromFmt(); }
+
+    /* Size in bytes of one storage block. For the ADPCM types this is derived
+     * from mBlockAlign and the channel count; every other type uses the plain
+     * frame size.
+     * NOTE(review): the ADPCM formulas subtract from the unsigned mBlockAlign,
+     * so they presumably rely on it being at least 1 (IMA4) or 2 (MSADPCM) -
+     * confirm where mBlockAlign is set.
+     */
+    inline uint blockSizeFromFmt() const noexcept
+    {
+        if(mType == FmtIMA4) return ((mBlockAlign-1)/2 + 4) * channelsFromFmt();
+        if(mType == FmtMSADPCM) return ((mBlockAlign-2)/2 + 7) * channelsFromFmt();
+        return frameSizeFromFmt();
+    }
+
+    inline bool isBFormat() const noexcept { return IsBFormat(mChannels); }
+};
+
+#endif /* CORE_BUFFER_STORAGE_H */
diff --git a/core/bufferline.h b/core/bufferline.h
new file mode 100644
index 00000000..8b445f3f
--- /dev/null
+++ b/core/bufferline.h
@@ -0,0 +1,17 @@
+#ifndef CORE_BUFFERLINE_H
+#define CORE_BUFFERLINE_H
+
+#include <array>
+
+#include "alspan.h"
+
+/* Size for temporary storage of buffer data, in floats. Larger values need
+ * more memory and are harder on cache, while smaller values may need more
+ * iterations for mixing.
+ */
+constexpr int BufferLineSize{1024};
+
+/* One line of BufferLineSize floats, as owning storage and as a fixed-extent
+ * view respectively.
+ */
+using FloatBufferLine = std::array<float,BufferLineSize>;
+using FloatBufferSpan = al::span<float,BufferLineSize>;
+
+#endif /* CORE_BUFFERLINE_H */
diff --git a/core/context.cpp b/core/context.cpp
new file mode 100644
index 00000000..d68d8327
--- /dev/null
+++ b/core/context.cpp
@@ -0,0 +1,164 @@
+
+#include "config.h"
+
+#include <cassert>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+
+#include "async_event.h"
+#include "context.h"
+#include "device.h"
+#include "effectslot.h"
+#include "logging.h"
+#include "ringbuffer.h"
+#include "voice.h"
+#include "voice_change.h"
+
+
+/* When the standard library provides is_always_lock_free (a C++17 feature),
+ * require at compile time that the atomic event bitset is lock-free.
+ */
+#ifdef __cpp_lib_atomic_is_always_lock_free
+static_assert(std::atomic<ContextBase::AsyncEventBitset>::is_always_lock_free, "atomic<bitset> isn't lock-free");
+#endif
+
+ContextBase::ContextBase(DeviceBase *device)
+    : mDevice{device}
+{
+    /* Runtime (assert-enabled builds only) check that the atomic bitset of
+     * enabled events is lock-free.
+     */
+    assert(mEnabledEvts.is_lock_free());
+}
+
+/* Releases everything the context still owns: the pending and free property
+ * objects, the active effect slot array, the voice array, and any events that
+ * were queued but never consumed.
+ */
+ContextBase::~ContextBase()
+{
+    /* Free the pending context update, if there is one, followed by every
+     * entry on the free list. count is only used for the trace message.
+     */
+    size_t count{0};
+    ContextProps *cprops{mParams.ContextUpdate.exchange(nullptr, std::memory_order_relaxed)};
+    if(cprops)
+    {
+        ++count;
+        delete cprops;
+    }
+    cprops = mFreeContextProps.exchange(nullptr, std::memory_order_acquire);
+    while(cprops)
+    {
+        /* Take ownership so the node is deleted at the end of the iteration,
+         * after its next pointer has been read.
+         */
+        std::unique_ptr<ContextProps> old{cprops};
+        cprops = old->next.load(std::memory_order_relaxed);
+        ++count;
+    }
+    TRACE("Freed %zu context property object%s\n", count, (count==1)?"":"s");
+
+    /* Same walk for the free list of effect slot property objects. */
+    count = 0;
+    EffectSlotProps *eprops{mFreeEffectslotProps.exchange(nullptr, std::memory_order_acquire)};
+    while(eprops)
+    {
+        std::unique_ptr<EffectSlotProps> old{eprops};
+        eprops = old->next.load(std::memory_order_relaxed);
+        ++count;
+    }
+    TRACE("Freed %zu AuxiliaryEffectSlot property object%s\n", count, (count==1)?"":"s");
+
+    if(EffectSlotArray *curarray{mActiveAuxSlots.exchange(nullptr, std::memory_order_relaxed)})
+    {
+        /* NOTE(review): this destroys size() elements starting at end(), i.e.
+         * past the array's nominal range. Presumably the array is allocated
+         * with that much extra storage constructed beyond end() - confirm
+         * against the code that creates the array.
+         */
+        al::destroy_n(curarray->end(), curarray->size());
+        delete curarray;
+    }
+
+    delete mVoices.exchange(nullptr, std::memory_order_relaxed);
+
+    if(mAsyncEvents)
+    {
+        /* Destroy any events still readable from the ring buffer. The read
+         * vector describes the readable data as up to two segments. After
+         * destroying them, advance the read position past them.
+         */
+        count = 0;
+        auto evt_vec = mAsyncEvents->getReadVector();
+        if(evt_vec.first.len > 0)
+        {
+            al::destroy_n(reinterpret_cast<AsyncEvent*>(evt_vec.first.buf), evt_vec.first.len);
+            count += evt_vec.first.len;
+        }
+        if(evt_vec.second.len > 0)
+        {
+            al::destroy_n(reinterpret_cast<AsyncEvent*>(evt_vec.second.buf), evt_vec.second.len);
+            count += evt_vec.second.len;
+        }
+        if(count > 0)
+            TRACE("Destructed %zu orphaned event%s\n", count, (count==1)?"":"s");
+        mAsyncEvents->readAdvance(count);
+    }
+}
+
+
+/* Allocates another cluster of VoiceChange objects, chains them together, and
+ * makes the new cluster the head of the free section: its last element links
+ * to the previous tail, and the tail then moves to the cluster's first
+ * element.
+ */
+void ContextBase::allocVoiceChanges()
+{
+    constexpr size_t ChangesPerCluster{128};
+
+    auto changes = std::make_unique<VoiceChange[]>(ChangesPerCluster);
+    VoiceChange *const first{changes.get()};
+    VoiceChange *const last{first + (ChangesPerCluster-1)};
+
+    /* Link each element to the one after it. */
+    for(VoiceChange *cur{first};cur != last;++cur)
+        cur->mNext.store(cur+1, std::memory_order_relaxed);
+    last->mNext.store(mVoiceChangeTail, std::memory_order_relaxed);
+
+    /* Moving the unique_ptr into the vector doesn't change the address of
+     * the elements, so first still refers to the stored cluster.
+     */
+    mVoiceChangeClusters.emplace_back(std::move(changes));
+    mVoiceChangeTail = first;
+}
+
+/* Allocates another cluster of voice property items, chains them together,
+ * and atomically pushes the whole chain onto the front of the free list.
+ */
+void ContextBase::allocVoiceProps()
+{
+    constexpr size_t PropsPerCluster{32};
+
+    TRACE("Increasing allocated voice properties to %zu\n",
+        (mVoicePropClusters.size()+1) * PropsPerCluster);
+
+    auto props = std::make_unique<VoicePropsItem[]>(PropsPerCluster);
+    for(size_t i{0};i+1 < PropsPerCluster;++i)
+        props[i].next.store(std::addressof(props[i+1]), std::memory_order_relaxed);
+    mVoicePropClusters.emplace_back(std::move(props));
+
+    VoicePropsItem *const first{mVoicePropClusters.back().get()};
+    VoicePropsItem &last = mVoicePropClusters.back()[PropsPerCluster-1];
+
+    /* Point the chain's last item at the current head and try to install the
+     * chain's first item as the new head. A failed exchange reloads the head,
+     * so the link is refreshed before each retry.
+     */
+    VoicePropsItem *head{mFreeVoiceProps.load(std::memory_order_acquire)};
+    do {
+        last.next.store(head, std::memory_order_relaxed);
+    } while(!mFreeVoiceProps.compare_exchange_weak(head, first, std::memory_order_acq_rel,
+        std::memory_order_acquire));
+}
+
+/* Grows the voice pool by at least addcount voices (rounded up to whole
+ * clusters) and publishes a new array of pointers to every voice. Throws
+ * std::runtime_error if the total would become too large.
+ */
+void ContextBase::allocVoices(size_t addcount)
+{
+    constexpr size_t VoicesPerCluster{32};
+    /* Convert element count to cluster count. */
+    const size_t newclusters{(addcount+(VoicesPerCluster-1)) / VoicesPerCluster};
+
+    if(newclusters >= std::numeric_limits<int>::max()/VoicesPerCluster - mVoiceClusters.size())
+        throw std::runtime_error{"Allocating too many voices"};
+    const size_t totalcount{(mVoiceClusters.size()+newclusters) * VoicesPerCluster};
+    TRACE("Increasing allocated voices to %zu\n", totalcount);
+
+    /* The pointer array is allocated before the clusters are added. */
+    auto newarray = VoiceArray::Create(totalcount);
+    for(size_t added{0};added < newclusters;++added)
+        mVoiceClusters.emplace_back(std::make_unique<Voice[]>(VoicesPerCluster));
+
+    /* Fill the array with the address of every voice, in cluster order. */
+    auto voice_iter = newarray->begin();
+    for(VoiceCluster &cluster : mVoiceClusters)
+    {
+        Voice *const voices{cluster.get()};
+        for(size_t i{0};i < VoicesPerCluster;++i)
+        {
+            *voice_iter = voices + i;
+            ++voice_iter;
+        }
+    }
+
+    /* Publish the new array. An old array is only deleted after waiting on
+     * the device's mix, so it isn't freed while a mix is using it.
+     */
+    VoiceArray *oldvoices{mVoices.exchange(newarray.release(), std::memory_order_acq_rel)};
+    if(!oldvoices)
+        return;
+    mDevice->waitForMix();
+    delete oldvoices;
+}
+
+
+/* Returns the first effect slot that isn't marked in use. If every existing
+ * slot is in use, another cluster is allocated and the search is repeated.
+ * Throws std::runtime_error if the number of slots would become too large.
+ */
+EffectSlot *ContextBase::getEffectSlot()
+{
+    for(EffectSlotCluster &cluster : mEffectSlotClusters)
+    {
+        EffectSlot *const slots{cluster.get()};
+        for(size_t i{0};i < EffectSlotClusterSize;++i)
+        {
+            if(slots[i].InUse)
+                continue;
+            return slots + i;
+        }
+    }
+
+    if(1 >= std::numeric_limits<int>::max()/EffectSlotClusterSize - mEffectSlotClusters.size())
+        throw std::runtime_error{"Allocating too many effect slots"};
+    const size_t totalcount{(mEffectSlotClusters.size()+1) * EffectSlotClusterSize};
+    TRACE("Increasing allocated effect slots to %zu\n", totalcount);
+
+    mEffectSlotClusters.emplace_back(std::make_unique<EffectSlot[]>(EffectSlotClusterSize));
+    /* Search again, now that a new cluster is available. */
+    return getEffectSlot();
+}
diff --git a/core/context.h b/core/context.h
new file mode 100644
index 00000000..9723eac3
--- /dev/null
+++ b/core/context.h
@@ -0,0 +1,171 @@
+#ifndef CORE_CONTEXT_H
+#define CORE_CONTEXT_H
+
+#include <array>
+#include <atomic>
+#include <bitset>
+#include <cstddef>
+#include <memory>
+#include <thread>
+
+#include "almalloc.h"
+#include "alspan.h"
+#include "async_event.h"
+#include "atomic.h"
+#include "bufferline.h"
+#include "threads.h"
+#include "vecmat.h"
+#include "vector.h"
+
+struct DeviceBase;
+struct EffectSlot;
+struct EffectSlotProps;
+struct RingBuffer;
+struct Voice;
+struct VoiceChange;
+struct VoicePropsItem;
+
+using uint = unsigned int;
+
+
+/* Initial value of ContextParams::SpeedOfSound. */
+constexpr float SpeedOfSoundMetersPerSec{343.3f};
+
+/* Initial value of ContextParams::AirAbsorptionGainHF. */
+constexpr float AirAbsorbGainHF{0.99426f}; /* -0.05dB */
+
+/* Selects the distance model. Each of Inverse, Linear and Exponent has a
+ * plain and a clamped variant; Disable selects none of them.
+ */
+enum class DistanceModel : unsigned char {
+    Disable,
+    Inverse, InverseClamped,
+    Linear, LinearClamped,
+    Exponent, ExponentClamped,
+
+    /* The model used when nothing else is chosen. */
+    Default = InverseClamped
+};
+
+
+/* A snapshot of context property values. Objects of this type are handed
+ * over through ContextParams::ContextUpdate and recycled through a free list
+ * linked by the next member.
+ */
+struct ContextProps {
+    std::array<float,3> Position;
+    std::array<float,3> Velocity;
+    std::array<float,3> OrientAt;
+    std::array<float,3> OrientUp;
+    float Gain;
+    float MetersPerUnit;
+    float AirAbsorptionGainHF;
+
+    float DopplerFactor;
+    float DopplerVelocity;
+    float SpeedOfSound;
+    bool SourceDistanceModel;
+    DistanceModel mDistanceModel;
+
+    /* Next object when this one is on a free list. */
+    std::atomic<ContextProps*> next;
+
+    /* NOTE(review): presumably declares class-specific new/delete operators;
+     * the macro is expected to come from almalloc.h - confirm there.
+     */
+    DEF_NEWDEL(ContextProps)
+};
+
+/* The context parameter values currently in effect, together with the
+ * hand-off pointer for the next pending update.
+ */
+struct ContextParams {
+    /* Pointer to the most recent property values that are awaiting an update. */
+    std::atomic<ContextProps*> ContextUpdate{nullptr};
+
+    alu::Vector Position{};
+    alu::Matrix Matrix{alu::Matrix::Identity()};
+    alu::Vector Velocity{};
+
+    float Gain{1.0f};
+    float MetersPerUnit{1.0f};
+    float AirAbsorptionGainHF{AirAbsorbGainHF};
+
+    float DopplerFactor{1.0f};
+    float SpeedOfSound{SpeedOfSoundMetersPerSec}; /* in units per sec! */
+
+    bool SourceDistanceModel{false};
+    /* Value-initialized, which corresponds to DistanceModel::Disable. */
+    DistanceModel mDistanceModel{};
+};
+
+/* Core state of a context: the device it belongs to, the current parameters,
+ * the voice and effect slot storage, and the queues and free lists used to
+ * pass property updates and events around. Not copyable.
+ */
+struct ContextBase {
+    DeviceBase *const mDevice;
+
+    /* Counter for the pre-mixing updates, in 31.1 fixed point (lowest bit
+     * indicates if updates are currently happening).
+     */
+    RefCount mUpdateCount{0u};
+    std::atomic<bool> mHoldUpdates{false};
+    std::atomic<bool> mStopVoicesOnDisconnect{true};
+
+    float mGainBoost{1.0f};
+
+    /* Linked lists of unused property containers, free to use for future
+     * updates.
+     */
+    std::atomic<ContextProps*> mFreeContextProps{nullptr};
+    std::atomic<VoicePropsItem*> mFreeVoiceProps{nullptr};
+    std::atomic<EffectSlotProps*> mFreeEffectslotProps{nullptr};
+
+    /* The voice change tail is the beginning of the "free" elements, up to and
+     * *excluding* the current. If tail==current, there's no free elements and
+     * new ones need to be allocated. The current voice change is the element
+     * last processed, and any after are pending.
+     */
+    VoiceChange *mVoiceChangeTail{};
+    std::atomic<VoiceChange*> mCurrentVoiceChange{};
+
+    /* Each call allocates one more cluster and links it into the matching
+     * list.
+     */
+    void allocVoiceChanges();
+    void allocVoiceProps();
+
+
+    ContextParams mParams;
+
+    /* Array of pointers to every allocated voice, and the number of entries
+     * at its front that the span accessors expose.
+     */
+    using VoiceArray = al::FlexArray<Voice*>;
+    std::atomic<VoiceArray*> mVoices{};
+    std::atomic<size_t> mActiveVoiceCount{};
+
+    /* Adds at least addcount voices and replaces mVoices with a larger array. */
+    void allocVoices(size_t addcount);
+    /* Both accessors dereference mVoices without a null check, and differ
+     * only in the memory order of their loads.
+     */
+    al::span<Voice*> getVoicesSpan() const noexcept
+    {
+        return {mVoices.load(std::memory_order_relaxed)->data(),
+            mActiveVoiceCount.load(std::memory_order_relaxed)};
+    }
+    al::span<Voice*> getVoicesSpanAcquired() const noexcept
+    {
+        return {mVoices.load(std::memory_order_acquire)->data(),
+            mActiveVoiceCount.load(std::memory_order_acquire)};
+    }
+
+
+    using EffectSlotArray = al::FlexArray<EffectSlot*>;
+    std::atomic<EffectSlotArray*> mActiveAuxSlots{nullptr};
+
+    /* Asynchronous event delivery: the thread, its wake-up semaphore, the
+     * event queue, and the set of event types currently enabled.
+     * NOTE(review): roles inferred from the member names - confirm against
+     * the event thread implementation.
+     */
+    std::thread mEventThread;
+    al::semaphore mEventSem;
+    std::unique_ptr<RingBuffer> mAsyncEvents;
+    using AsyncEventBitset = std::bitset<AsyncEvent::UserEventCount>;
+    std::atomic<AsyncEventBitset> mEnabledEvts{0u};
+
+    /* Asynchronous voice change actions are processed as a linked list of
+     * VoiceChange objects by the mixer, which is atomically appended to.
+     * However, to avoid allocating each object individually, they're allocated
+     * in clusters that are stored in a vector for easy automatic cleanup.
+     */
+    using VoiceChangeCluster = std::unique_ptr<VoiceChange[]>;
+    al::vector<VoiceChangeCluster> mVoiceChangeClusters;
+
+    using VoiceCluster = std::unique_ptr<Voice[]>;
+    al::vector<VoiceCluster> mVoiceClusters;
+
+    using VoicePropsCluster = std::unique_ptr<VoicePropsItem[]>;
+    al::vector<VoicePropsCluster> mVoicePropClusters;
+
+
+    /* Effect slots are allocated EffectSlotClusterSize at a time.
+     * getEffectSlot returns a slot that isn't marked in use, allocating a new
+     * cluster when none is available.
+     */
+    static constexpr size_t EffectSlotClusterSize{4};
+    EffectSlot *getEffectSlot();
+
+    using EffectSlotCluster = std::unique_ptr<EffectSlot[]>;
+    al::vector<EffectSlotCluster> mEffectSlotClusters;
+
+
+    ContextBase(DeviceBase *device);
+    ContextBase(const ContextBase&) = delete;
+    ContextBase& operator=(const ContextBase&) = delete;
+    ~ContextBase();
+};
+
+#endif /* CORE_CONTEXT_H */
diff --git a/core/converter.cpp b/core/converter.cpp
new file mode 100644
index 00000000..a5141448
--- /dev/null
+++ b/core/converter.cpp
@@ -0,0 +1,346 @@
+
+#include "config.h"
+
+#include "converter.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <iterator>
+#include <limits.h>
+
+#include "albit.h"
+#include "albyte.h"
+#include "alnumeric.h"
+#include "fpu_ctrl.h"
+
+
+namespace {
+
+/* Upper bound on the resampling step, in source samples per output sample.
+ * SampleConverter::Create clamps the fixed-point increment to
+ * MaxPitch*MixerFracOne.
+ */
+constexpr uint MaxPitch{10};
+
+/* Compile-time sanity checks tying MaxPitch to the line buffer size and to
+ * the number of fractional bits used for the fixed-point sample position.
+ */
+static_assert((BufferLineSize-1)/MaxPitch > 0, "MaxPitch is too large for BufferLineSize!");
+static_assert((INT_MAX>>MixerFracBits)/MaxPitch > BufferLineSize,
+    "MaxPitch and/or BufferLineSize are too large for MixerFracBits!");
+
+/* Converts a single sample of device format T to float. Only the explicit
+ * specializations below have bodies; the primary template is deliberately
+ * left undefined (it should be marked =delete, but Clang 3.8.1 chokes on that
+ * given the inline specializations).
+ */
+template<DevFmtType T>
+inline float LoadSample(DevFmtType_t<T> val) noexcept;
+
+/* Signed integer formats are scaled by the reciprocal of 2^(bits-1). */
+template<>
+inline float LoadSample<DevFmtByte>(DevFmtType_t<DevFmtByte> val) noexcept
+{
+    constexpr float scale{1.0f/128.0f};
+    return val * scale;
+}
+template<>
+inline float LoadSample<DevFmtShort>(DevFmtType_t<DevFmtShort> val) noexcept
+{
+    constexpr float scale{1.0f/32768.0f};
+    return val * scale;
+}
+template<>
+inline float LoadSample<DevFmtInt>(DevFmtType_t<DevFmtInt> val) noexcept
+{
+    constexpr float scale{1.0f/2147483648.0f};
+    return static_cast<float>(val) * scale;
+}
+/* Float input is passed through untouched. */
+template<>
+inline float LoadSample<DevFmtFloat>(DevFmtType_t<DevFmtFloat> val) noexcept
+{
+    return val;
+}
+
+/* Unsigned formats are re-centered around 0 and then handled by the matching
+ * signed specialization.
+ */
+template<>
+inline float LoadSample<DevFmtUByte>(DevFmtType_t<DevFmtUByte> val) noexcept
+{
+    const auto centered = static_cast<int8_t>(val - 128);
+    return LoadSample<DevFmtByte>(centered);
+}
+template<>
+inline float LoadSample<DevFmtUShort>(DevFmtType_t<DevFmtUShort> val) noexcept
+{
+    const auto centered = static_cast<int16_t>(val - 32768);
+    return LoadSample<DevFmtShort>(centered);
+}
+template<>
+inline float LoadSample<DevFmtUInt>(DevFmtType_t<DevFmtUInt> val) noexcept
+{
+    const auto centered = static_cast<int32_t>(val - 2147483648u);
+    return LoadSample<DevFmtInt>(centered);
+}
+
+
+/* Reads `samples` values of format T from src, taking every srcstep-th
+ * element, and writes them to dst as consecutive floats.
+ */
+template<DevFmtType T>
+inline void LoadSampleArray(float *RESTRICT dst, const void *src, const size_t srcstep,
+    const size_t samples) noexcept
+{
+    using SampleType = DevFmtType_t<T>;
+    const auto *input = static_cast<const SampleType*>(src);
+    for(size_t idx{0u};idx < samples;++idx)
+        dst[idx] = LoadSample<T>(input[idx*srcstep]);
+}
+
+/* Runtime dispatch to the LoadSampleArray instantiation matching srctype.
+ * Does nothing if srctype is not one of the enumerated formats.
+ */
+void LoadSamples(float *dst, const void *src, const size_t srcstep, const DevFmtType srctype,
+    const size_t samples) noexcept
+{
+    switch(srctype)
+    {
+    case DevFmtByte:
+        LoadSampleArray<DevFmtByte>(dst, src, srcstep, samples);
+        break;
+    case DevFmtUByte:
+        LoadSampleArray<DevFmtUByte>(dst, src, srcstep, samples);
+        break;
+    case DevFmtShort:
+        LoadSampleArray<DevFmtShort>(dst, src, srcstep, samples);
+        break;
+    case DevFmtUShort:
+        LoadSampleArray<DevFmtUShort>(dst, src, srcstep, samples);
+        break;
+    case DevFmtInt:
+        LoadSampleArray<DevFmtInt>(dst, src, srcstep, samples);
+        break;
+    case DevFmtUInt:
+        LoadSampleArray<DevFmtUInt>(dst, src, srcstep, samples);
+        break;
+    case DevFmtFloat:
+        LoadSampleArray<DevFmtFloat>(dst, src, srcstep, samples);
+        break;
+    }
+}
+
+
+/* Converts a float sample to device format T. As with LoadSample, only the
+ * explicit specializations below have bodies.
+ */
+template<DevFmtType T>
+inline DevFmtType_t<T> StoreSample(float) noexcept;
+
+/* Float output is passed through untouched. */
+template<>
+inline float StoreSample<DevFmtFloat>(float val) noexcept
+{
+    return val;
+}
+/* Signed integer outputs are scaled by 2^(bits-1) and clamped to the target
+ * range before rounding. The 32-bit upper clamp, 2147483520, is the largest
+ * float below 2^31.
+ */
+template<>
+inline int32_t StoreSample<DevFmtInt>(float val) noexcept
+{
+    const float scaled{clampf(val*2147483648.0f, -2147483648.0f, 2147483520.0f)};
+    return fastf2i(scaled);
+}
+template<>
+inline int16_t StoreSample<DevFmtShort>(float val) noexcept
+{
+    const float scaled{clampf(val*32768.0f, -32768.0f, 32767.0f)};
+    return static_cast<int16_t>(fastf2i(scaled));
+}
+template<>
+inline int8_t StoreSample<DevFmtByte>(float val) noexcept
+{
+    const float scaled{clampf(val*128.0f, -128.0f, 127.0f)};
+    return static_cast<int8_t>(fastf2i(scaled));
+}
+
+/* Unsigned outputs reuse the signed conversion and add the mid-point bias. */
+template<>
+inline uint32_t StoreSample<DevFmtUInt>(float val) noexcept
+{
+    const auto biased = static_cast<uint32_t>(StoreSample<DevFmtInt>(val));
+    return biased + 2147483648u;
+}
+template<>
+inline uint16_t StoreSample<DevFmtUShort>(float val) noexcept
+{
+    const int16_t centered{StoreSample<DevFmtShort>(val)};
+    return static_cast<uint16_t>(centered + 32768);
+}
+template<>
+inline uint8_t StoreSample<DevFmtUByte>(float val) noexcept
+{
+    const int8_t centered{StoreSample<DevFmtByte>(val)};
+    return static_cast<uint8_t>(centered + 128);
+}
+
+/* Converts `samples` consecutive floats from src to format T and writes them
+ * to every dststep-th element of dst.
+ */
+template<DevFmtType T>
+inline void StoreSampleArray(void *dst, const float *RESTRICT src, const size_t dststep,
+    const size_t samples) noexcept
+{
+    using SampleType = DevFmtType_t<T>;
+    auto *output = static_cast<SampleType*>(dst);
+    for(size_t idx{0u};idx < samples;++idx)
+        output[idx*dststep] = StoreSample<T>(src[idx]);
+}
+
+
+/* Runtime dispatch to the StoreSampleArray instantiation matching dsttype.
+ * Does nothing if dsttype is not one of the enumerated formats.
+ */
+void StoreSamples(void *dst, const float *src, const size_t dststep, const DevFmtType dsttype,
+    const size_t samples) noexcept
+{
+    switch(dsttype)
+    {
+    case DevFmtByte:
+        StoreSampleArray<DevFmtByte>(dst, src, dststep, samples);
+        break;
+    case DevFmtUByte:
+        StoreSampleArray<DevFmtUByte>(dst, src, dststep, samples);
+        break;
+    case DevFmtShort:
+        StoreSampleArray<DevFmtShort>(dst, src, dststep, samples);
+        break;
+    case DevFmtUShort:
+        StoreSampleArray<DevFmtUShort>(dst, src, dststep, samples);
+        break;
+    case DevFmtInt:
+        StoreSampleArray<DevFmtInt>(dst, src, dststep, samples);
+        break;
+    case DevFmtUInt:
+        StoreSampleArray<DevFmtUInt>(dst, src, dststep, samples);
+        break;
+    case DevFmtFloat:
+        StoreSampleArray<DevFmtFloat>(dst, src, dststep, samples);
+        break;
+    }
+}
+
+
+/* Expands `frames` mono samples of format T into interleaved stereo floats,
+ * writing the same value (input scaled by 0.707106781187) to both channels.
+ */
+template<DevFmtType T>
+void Mono2Stereo(float *RESTRICT dst, const void *src, const size_t frames) noexcept
+{
+    const auto *input = static_cast<const DevFmtType_t<T>*>(src);
+    for(size_t i{0u};i < frames;++i)
+    {
+        const float sample{LoadSample<T>(input[i]) * 0.707106781187f};
+        dst[i*2 + 0] = sample;
+        dst[i*2 + 1] = sample;
+    }
+}
+
+/* Mixes the channels selected by chanmask down to mono. Bit N of chanmask
+ * selects channel N of each `step`-wide input frame; the selected channels
+ * are summed per frame and the sum is multiplied by scale.
+ */
+template<DevFmtType T>
+void Multi2Mono(uint chanmask, const size_t step, const float scale, float *RESTRICT dst,
+    const void *src, const size_t frames) noexcept
+{
+    const auto *input = static_cast<const DevFmtType_t<T>*>(src);
+    std::fill_n(dst, frames, 0.0f);
+
+    /* Walk the mask from the lowest bit up, stopping once no set bits remain. */
+    size_t chanidx{0};
+    while(chanmask)
+    {
+        if((chanmask&1)) LIKELY
+        {
+            for(size_t i{0u};i < frames;++i)
+                dst[i] += LoadSample<T>(input[i*step + chanidx]);
+        }
+        chanmask >>= 1;
+        ++chanidx;
+    }
+
+    std::transform(dst, dst+frames, dst,
+        [scale](const float sample) noexcept -> float { return sample * scale; });
+}
+
+} // namespace
+
+/* Creates a converter for numchans channels that converts samples from
+ * srcType to dstType and resamples from srcRate to dstRate using the given
+ * resampler. Returns null if numchans, srcRate, or dstRate is 0.
+ */
+SampleConverterPtr SampleConverter::Create(DevFmtType srcType, DevFmtType dstType, size_t numchans,
+    uint srcRate, uint dstRate, Resampler resampler)
+{
+    if(numchans < 1 || srcRate < 1 || dstRate < 1)
+        return nullptr;
+
+    SampleConverterPtr converter{new(FamCount(numchans)) SampleConverter{numchans}};
+    converter->mSrcType = srcType;
+    converter->mDstType = dstType;
+    converter->mSrcTypeSize = BytesFromDevFmt(srcType);
+    converter->mDstTypeSize = BytesFromDevFmt(dstType);
+
+    /* Start with a full set of silent history samples for every channel. */
+    converter->mSrcPrepCount = MaxResamplerPadding;
+    converter->mFracOffset = 0;
+    for(auto &chan : converter->mChan)
+    {
+        const al::span<float> buffer{chan.PrevSamples};
+        std::fill(buffer.begin(), buffer.end(), 0.0f);
+    }
+
+    /* Have to set the mixer FPU mode since that's what the resampler code expects. */
+    FPUCtl mixer_mode{};
+    /* Fixed-point step per output sample: the rounded rate ratio, limited to
+     * MaxPitch and to a minimum of 1.
+     */
+    auto step = static_cast<uint>(
+        mind(srcRate*double{MixerFracOne}/dstRate + 0.5, MaxPitch*MixerFracOne));
+    converter->mIncrement = maxu(step, 1);
+    /* A step of exactly MixerFracOne needs no interpolation, so a plain copy
+     * is installed instead of a resampler.
+     */
+    if(converter->mIncrement == MixerFracOne)
+        converter->mResample = [](const InterpState*, const float *RESTRICT src, uint, const uint,
+            const al::span<float> dst) { std::copy_n(src, dst.size(), dst.begin()); };
+    else
+        converter->mResample = PrepareResampler(resampler, converter->mIncrement,
+            &converter->mState);
+
+    return converter;
+}
+
+/* Returns how many output frames convert() could produce if given srcframes
+ * more input frames, based on the currently stored history and fractional
+ * offset. The result is at most INT_MAX.
+ */
+uint SampleConverter::availableOut(uint srcframes) const
+{
+    /* No output samples if there's no input samples. */
+    if(srcframes < 1)
+        return 0;
+
+    const uint prepcount{mSrcPrepCount};
+    /* Not enough input samples to generate an output sample. */
+    if(prepcount < MaxResamplerPadding && MaxResamplerPadding - prepcount >= srcframes)
+        return 0;
+
+    /* Whole input frames available past the resampler padding, converted to
+     * fixed-point and reduced by the fraction already consumed.
+     */
+    const uint64_t usableframes{uint64_t{prepcount} + srcframes - MaxResamplerPadding};
+    const uint64_t usablefrac{(usableframes << MixerFracBits) - mFracOffset};
+
+    /* If we have a full prep, we can generate at least one sample. */
+    const uint64_t outcount{clampu64((usablefrac + mIncrement-1)/mIncrement, 1,
+        std::numeric_limits<int>::max())};
+    return static_cast<uint>(outcount);
+}
+
+/* Converts and resamples interleaved input frames into interleaved output
+ * frames.
+ *
+ * src:       in/out pointer to the input data; advanced past the frames that
+ *            were consumed.
+ * srcframes: in/out count of available input frames; reduced to the number
+ *            left unconsumed.
+ * dst:       output buffer with room for at least dstframes frames.
+ * dstframes: maximum number of output frames to produce.
+ *
+ * Returns the number of output frames written. Input that is too short to
+ * produce any output is stored in the per-channel history and reported as
+ * consumed.
+ */
+uint SampleConverter::convert(const void **src, uint *srcframes, void *dst, uint dstframes)
+{
+    const uint SrcFrameSize{static_cast<uint>(mChan.size()) * mSrcTypeSize};
+    const uint DstFrameSize{static_cast<uint>(mChan.size()) * mDstTypeSize};
+    const uint increment{mIncrement};
+    auto SamplesIn = static_cast<const al::byte*>(*src);
+    uint NumSrcSamples{*srcframes};
+
+    FPUCtl mixer_mode{};
+    uint pos{0};
+    while(pos < dstframes && NumSrcSamples > 0)
+    {
+        const uint prepcount{mSrcPrepCount};
+        const uint readable{minu(NumSrcSamples, BufferLineSize - prepcount)};
+
+        if(prepcount < MaxResamplerPadding && MaxResamplerPadding-prepcount >= readable)
+        {
+            /* Not enough input samples to generate an output sample. Store
+             * what we're given for later.
+             */
+            for(size_t chan{0u};chan < mChan.size();chan++)
+                LoadSamples(&mChan[chan].PrevSamples[prepcount], SamplesIn + mSrcTypeSize*chan,
+                    mChan.size(), mSrcType, readable);
+
+            mSrcPrepCount = prepcount + readable;
+            NumSrcSamples = 0;
+            break;
+        }
+
+        float *RESTRICT SrcData{mSrcSamples};
+        float *RESTRICT DstData{mDstSamples};
+        uint DataPosFrac{mFracOffset};
+        uint64_t DataSize64{prepcount};
+        DataSize64 += readable;
+        DataSize64 -= MaxResamplerPadding;
+        DataSize64 <<= MixerFracBits;
+        DataSize64 -= DataPosFrac;
+
+        /* If we have a full prep, we can generate at least one sample. */
+        auto DstSize = static_cast<uint>(
+            clampu64((DataSize64 + increment-1)/increment, 1, BufferLineSize));
+        DstSize = minu(DstSize, dstframes-pos);
+
+        const uint DataPosEnd{DstSize*increment + DataPosFrac};
+        const uint SrcDataEnd{DataPosEnd>>MixerFracBits};
+
+        assert(prepcount+readable >= SrcDataEnd);
+        const uint nextprep{minu(prepcount + readable - SrcDataEnd, MaxResamplerPadding)};
+
+        for(size_t chan{0u};chan < mChan.size();chan++)
+        {
+            const al::byte *SrcSamples{SamplesIn + mSrcTypeSize*chan};
+            al::byte *DstSamples = static_cast<al::byte*>(dst) + mDstTypeSize*chan;
+
+            /* Load the previous samples into the source data first, then the
+             * new samples from the input buffer.
+             */
+            std::copy_n(mChan[chan].PrevSamples, prepcount, SrcData);
+            LoadSamples(SrcData + prepcount, SrcSamples, mChan.size(), mSrcType, readable);
+
+            /* Store as many prep samples for next time as possible, given the
+             * number of output samples being generated.
+             */
+            std::copy_n(SrcData+SrcDataEnd, nextprep, mChan[chan].PrevSamples);
+            std::fill(std::begin(mChan[chan].PrevSamples)+nextprep,
+                std::end(mChan[chan].PrevSamples), 0.0f);
+
+            /* Now resample, and store the result in the output buffer. */
+            mResample(&mState, SrcData+MaxResamplerEdge, DataPosFrac, increment,
+                {DstData, DstSize});
+
+            StoreSamples(DstSamples, DstData, mChan.size(), mDstType, DstSize);
+        }
+
+        /* Update the number of prep samples still available, as well as the
+         * fractional offset.
+         */
+        mSrcPrepCount = nextprep;
+        mFracOffset = DataPosEnd & MixerFracMask;
+
+        /* Update the src and dst pointers in case there's still more to do. */
+        const uint srcread{minu(NumSrcSamples, SrcDataEnd + mSrcPrepCount - prepcount)};
+        SamplesIn += SrcFrameSize*srcread;
+        NumSrcSamples -= srcread;
+
+        dst = static_cast<al::byte*>(dst) + DstFrameSize*DstSize;
+        pos += DstSize;
+    }
+
+    *src = SamplesIn;
+    *srcframes = NumSrcSamples;
+
+    return pos;
+}
+
+
+/* Converts `frames` input frames of type mSrcType to float in dst. A mono
+ * target mixes down the channels selected by mChanMask, scaled by
+ * sqrt(1/number of selected channels). A stereo target with only channel 0
+ * selected duplicates the mono input to both channels. Any other combination
+ * leaves dst untouched.
+ */
+void ChannelConverter::convert(const void *src, float *dst, uint frames) const
+{
+    if(mDstChans == DevFmtMono)
+    {
+        const float scale{std::sqrt(1.0f / static_cast<float>(al::popcount(mChanMask)))};
+        switch(mSrcType)
+        {
+        case DevFmtByte:
+            Multi2Mono<DevFmtByte>(mChanMask, mSrcStep, scale, dst, src, frames);
+            break;
+        case DevFmtUByte:
+            Multi2Mono<DevFmtUByte>(mChanMask, mSrcStep, scale, dst, src, frames);
+            break;
+        case DevFmtShort:
+            Multi2Mono<DevFmtShort>(mChanMask, mSrcStep, scale, dst, src, frames);
+            break;
+        case DevFmtUShort:
+            Multi2Mono<DevFmtUShort>(mChanMask, mSrcStep, scale, dst, src, frames);
+            break;
+        case DevFmtInt:
+            Multi2Mono<DevFmtInt>(mChanMask, mSrcStep, scale, dst, src, frames);
+            break;
+        case DevFmtUInt:
+            Multi2Mono<DevFmtUInt>(mChanMask, mSrcStep, scale, dst, src, frames);
+            break;
+        case DevFmtFloat:
+            Multi2Mono<DevFmtFloat>(mChanMask, mSrcStep, scale, dst, src, frames);
+            break;
+        }
+        return;
+    }
+
+    if(mChanMask == 0x1 && mDstChans == DevFmtStereo)
+    {
+        switch(mSrcType)
+        {
+        case DevFmtByte: Mono2Stereo<DevFmtByte>(dst, src, frames); break;
+        case DevFmtUByte: Mono2Stereo<DevFmtUByte>(dst, src, frames); break;
+        case DevFmtShort: Mono2Stereo<DevFmtShort>(dst, src, frames); break;
+        case DevFmtUShort: Mono2Stereo<DevFmtUShort>(dst, src, frames); break;
+        case DevFmtInt: Mono2Stereo<DevFmtInt>(dst, src, frames); break;
+        case DevFmtUInt: Mono2Stereo<DevFmtUInt>(dst, src, frames); break;
+        case DevFmtFloat: Mono2Stereo<DevFmtFloat>(dst, src, frames); break;
+        }
+    }
+}
diff --git a/core/converter.h b/core/converter.h
new file mode 100644
index 00000000..01becea2
--- /dev/null
+++ b/core/converter.h
@@ -0,0 +1,66 @@
+#ifndef CORE_CONVERTER_H
+#define CORE_CONVERTER_H
+
+#include <chrono>
+#include <cstddef>
+#include <memory>
+
+#include "almalloc.h"
+#include "devformat.h"
+#include "mixer/defs.h"
+
+using uint = unsigned int;
+
+
+/* Converts interleaved audio between sample types and sample rates. Instances
+ * are made with Create(), which allocates the trailing per-channel storage.
+ */
+struct SampleConverter {
+    /* Input and output sample formats, and their sizes in bytes. */
+    DevFmtType mSrcType{};
+    DevFmtType mDstType{};
+    uint mSrcTypeSize{};
+    uint mDstTypeSize{};
+
+    /* Number of history samples currently held in each channel's PrevSamples. */
+    uint mSrcPrepCount{};
+
+    /* Fixed-point fractional read position carried between convert() calls,
+     * and the fixed-point step applied per output sample.
+     */
+    uint mFracOffset{};
+    uint mIncrement{};
+    InterpState mState{};
+    ResamplerFunc mResample{};
+
+    /* Scratch buffers used by convert() for one channel at a time. */
+    alignas(16) float mSrcSamples[BufferLineSize]{};
+    alignas(16) float mDstSamples[BufferLineSize]{};
+
+    /* Per-channel input history kept from one convert() call to the next. */
+    struct ChanSamples {
+        alignas(16) float PrevSamples[MaxResamplerPadding];
+    };
+    al::FlexArray<ChanSamples> mChan;
+
+    SampleConverter(size_t numchans) : mChan{numchans} { }
+
+    uint convert(const void **src, uint *srcframes, void *dst, uint dstframes);
+    uint availableOut(uint srcframes) const;
+
+    /* Duration type counting in units of 1/MixerFracOne of a sample. */
+    using SampleOffset = std::chrono::duration<int64_t, std::ratio<1,MixerFracOne>>;
+    /* Returns the stored history length minus MaxResamplerEdge, in fixed-point
+     * sample units, plus the current fractional offset.
+     */
+    SampleOffset currentInputDelay() const noexcept
+    {
+        const int64_t prep{int64_t{mSrcPrepCount} - MaxResamplerEdge};
+        return SampleOffset{(prep<<MixerFracBits) + mFracOffset};
+    }
+
+    static std::unique_ptr<SampleConverter> Create(DevFmtType srcType, DevFmtType dstType,
+        size_t numchans, uint srcRate, uint dstRate, Resampler resampler);
+
+    DEF_FAM_NEWDEL(SampleConverter, mChan)
+};
+using SampleConverterPtr = std::unique_ptr<SampleConverter>;
+
+/* Describes a channel down-/up-mix from interleaved input to float output. */
+struct ChannelConverter {
+    /* Sample format of the input. */
+    DevFmtType mSrcType{};
+    /* Number of samples per input frame (the stride between frames). */
+    uint mSrcStep{};
+    /* Bitmask of input channels to use; bit N selects channel N of a frame. */
+    uint mChanMask{};
+    /* Target channel configuration; convert() handles mono and stereo. */
+    DevFmtChannels mDstChans{};
+
+    /* True when any input channel is selected. */
+    bool is_active() const noexcept { return mChanMask != 0; }
+
+    void convert(const void *src, float *dst, uint frames) const;
+};
+
+#endif /* CORE_CONVERTER_H */
diff --git a/core/cpu_caps.cpp b/core/cpu_caps.cpp
new file mode 100644
index 00000000..d4b4d86c
--- /dev/null
+++ b/core/cpu_caps.cpp
@@ -0,0 +1,141 @@
+
+#include "config.h"
+
+#include "cpu_caps.h"
+
+#if defined(_WIN32) && (defined(_M_ARM) || defined(_M_ARM64))
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#ifndef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_NEON_INSTRUCTIONS_AVAILABLE 19
+#endif
+#endif
+
+#if defined(HAVE_CPUID_H)
+#include <cpuid.h>
+#elif defined(HAVE_INTRIN_H)
+#include <intrin.h>
+#endif
+
+#include <array>
+#include <cctype>
+#include <string>
+
+
+int CPUCapFlags{0};
+
+namespace {
+
+/* get_cpuid(f) returns the four registers produced by CPUID function f as an
+ * array in EAX, EBX, ECX, EDX order. It is only defined (along with
+ * CAN_GET_CPUID) for x86 targets where one of the two mechanisms below is
+ * available.
+ */
+#if defined(HAVE_GCC_GET_CPUID) \
+    && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64))
+/* Variant built on __get_cpuid from <cpuid.h>. The array is zero-initialized,
+ * and the return value of __get_cpuid is not checked.
+ */
+using reg_type = unsigned int;
+inline std::array<reg_type,4> get_cpuid(unsigned int f)
+{
+    std::array<reg_type,4> ret{};
+    __get_cpuid(f, ret.data(), &ret[1], &ret[2], &ret[3]);
+    return ret;
+}
+#define CAN_GET_CPUID
+#elif defined(HAVE_CPUID_INTRINSIC) \
+    && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64))
+/* Variant built on the __cpuid intrinsic, which works with signed registers. */
+using reg_type = int;
+inline std::array<reg_type,4> get_cpuid(unsigned int f)
+{
+    std::array<reg_type,4> ret{};
+    (__cpuid)(ret.data(), f);
+    return ret;
+}
+#define CAN_GET_CPUID
+#endif
+
+} // namespace
+
+/* Queries the host CPU for its vendor string, brand name, and the SIMD
+ * capability flags (CPU_CAP_*).
+ *
+ * When CPUID is usable, the information comes from it, and nullopt is
+ * returned if CPUID reports a maximum basic function of 0. Otherwise the
+ * SSE capabilities are assumed from the build-time HAVE_SSE* settings. NEON
+ * is reported from compile-time or OS information when HAVE_NEON is set.
+ */
+al::optional<CPUInfo> GetCPUInfo()
+{
+    CPUInfo ret;
+
+#ifdef CAN_GET_CPUID
+    auto cpuregs = get_cpuid(0);
+    if(cpuregs[0] == 0)
+        return al::nullopt;
+
+    const reg_type maxfunc{cpuregs[0]};
+
+    /* The vendor ID is reported by function 0 in EBX, EDX, ECX (in that
+     * order). It has to be captured here, before cpuregs is overwritten by
+     * the extended function query, since those registers are not guaranteed
+     * to hold the vendor ID for function 0x80000000.
+     */
+    ret.mVendor.append(reinterpret_cast<char*>(&cpuregs[1]), 4);
+    ret.mVendor.append(reinterpret_cast<char*>(&cpuregs[3]), 4);
+    ret.mVendor.append(reinterpret_cast<char*>(&cpuregs[2]), 4);
+
+    cpuregs = get_cpuid(0x80000000);
+    const reg_type maxextfunc{cpuregs[0]};
+
+    /* std::isspace requires a value representable as unsigned char, so the
+     * raw register bytes are converted before being tested.
+     */
+    const auto is_space = [](const char c) noexcept -> bool
+    { return std::isspace(static_cast<unsigned char>(c)) != 0; };
+    const auto both_space = [is_space](const char c0, const char c1) noexcept -> bool
+    { return is_space(c0) && is_space(c1); };
+
+    /* Drop embedded NULs, collapse runs of whitespace to one character, and
+     * trim a single leading/trailing whitespace character.
+     */
+    auto iter_end = std::remove(ret.mVendor.begin(), ret.mVendor.end(), '\0');
+    iter_end = std::unique(ret.mVendor.begin(), iter_end, both_space);
+    ret.mVendor.erase(iter_end, ret.mVendor.end());
+    if(!ret.mVendor.empty() && is_space(ret.mVendor.back()))
+        ret.mVendor.pop_back();
+    if(!ret.mVendor.empty() && is_space(ret.mVendor.front()))
+        ret.mVendor.erase(ret.mVendor.begin());
+
+    if(maxextfunc >= 0x80000004)
+    {
+        /* The brand string is spread over three extended functions, 16 bytes
+         * each. It gets the same cleanup as the vendor string.
+         */
+        cpuregs = get_cpuid(0x80000002);
+        ret.mName.append(reinterpret_cast<char*>(cpuregs.data()), 16);
+        cpuregs = get_cpuid(0x80000003);
+        ret.mName.append(reinterpret_cast<char*>(cpuregs.data()), 16);
+        cpuregs = get_cpuid(0x80000004);
+        ret.mName.append(reinterpret_cast<char*>(cpuregs.data()), 16);
+        iter_end = std::remove(ret.mName.begin(), ret.mName.end(), '\0');
+        iter_end = std::unique(ret.mName.begin(), iter_end, both_space);
+        ret.mName.erase(iter_end, ret.mName.end());
+        if(!ret.mName.empty() && is_space(ret.mName.back()))
+            ret.mName.pop_back();
+        if(!ret.mName.empty() && is_space(ret.mName.front()))
+            ret.mName.erase(ret.mName.begin());
+    }
+
+    if(maxfunc >= 1)
+    {
+        /* Each SSE level is only reported if the previous level is too. */
+        cpuregs = get_cpuid(1);
+        if((cpuregs[3]&(1<<25)))
+            ret.mCaps |= CPU_CAP_SSE;
+        if((ret.mCaps&CPU_CAP_SSE) && (cpuregs[3]&(1<<26)))
+            ret.mCaps |= CPU_CAP_SSE2;
+        if((ret.mCaps&CPU_CAP_SSE2) && (cpuregs[2]&(1<<0)))
+            ret.mCaps |= CPU_CAP_SSE3;
+        if((ret.mCaps&CPU_CAP_SSE3) && (cpuregs[2]&(1<<19)))
+            ret.mCaps |= CPU_CAP_SSE4_1;
+    }
+
+#else
+
+    /* Assume support for whatever's supported if we can't check for it */
+#if defined(HAVE_SSE4_1)
+#warning "Assuming SSE 4.1 run-time support!"
+    ret.mCaps |= CPU_CAP_SSE | CPU_CAP_SSE2 | CPU_CAP_SSE3 | CPU_CAP_SSE4_1;
+#elif defined(HAVE_SSE3)
+#warning "Assuming SSE 3 run-time support!"
+    ret.mCaps |= CPU_CAP_SSE | CPU_CAP_SSE2 | CPU_CAP_SSE3;
+#elif defined(HAVE_SSE2)
+#warning "Assuming SSE 2 run-time support!"
+    ret.mCaps |= CPU_CAP_SSE | CPU_CAP_SSE2;
+#elif defined(HAVE_SSE)
+#warning "Assuming SSE run-time support!"
+    ret.mCaps |= CPU_CAP_SSE;
+#endif
+#endif /* CAN_GET_CPUID */
+
+#ifdef HAVE_NEON
+#ifdef __ARM_NEON
+    ret.mCaps |= CPU_CAP_NEON;
+#elif defined(_WIN32) && (defined(_M_ARM) || defined(_M_ARM64))
+    if(IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
+        ret.mCaps |= CPU_CAP_NEON;
+#else
+#warning "Assuming NEON run-time support!"
+    ret.mCaps |= CPU_CAP_NEON;
+#endif
+#endif
+
+    return ret;
+}
diff --git a/core/cpu_caps.h b/core/cpu_caps.h
new file mode 100644
index 00000000..ffd671d0
--- /dev/null
+++ b/core/cpu_caps.h
@@ -0,0 +1,26 @@
+#ifndef CORE_CPU_CAPS_H
+#define CORE_CPU_CAPS_H
+
+#include <string>
+
+#include "aloptional.h"
+
+
+extern int CPUCapFlags;
+/* Bit flags describing supported SIMD instruction sets. They are combined in
+ * CPUInfo::mCaps by GetCPUInfo.
+ */
+enum {
+    CPU_CAP_SSE    = 1<<0,
+    CPU_CAP_SSE2   = 1<<1,
+    CPU_CAP_SSE3   = 1<<2,
+    CPU_CAP_SSE4_1 = 1<<3,
+    CPU_CAP_NEON   = 1<<4,
+};
+
+/* Result of GetCPUInfo. */
+struct CPUInfo {
+    /* Vendor identification string; left empty when CPUID isn't available. */
+    std::string mVendor;
+    /* Processor brand string; left empty when it can't be queried. */
+    std::string mName;
+    /* Bitwise OR of CPU_CAP_* flags. */
+    int mCaps{0};
+};
+
+al::optional<CPUInfo> GetCPUInfo();
+
+#endif /* CORE_CPU_CAPS_H */
diff --git a/core/cubic_defs.h b/core/cubic_defs.h
new file mode 100644
index 00000000..33751c97
--- /dev/null
+++ b/core/cubic_defs.h
@@ -0,0 +1,13 @@
+#ifndef CORE_CUBIC_DEFS_H
+#define CORE_CUBIC_DEFS_H
+
+/* The number of distinct phase intervals within the cubic filter tables. */
+constexpr unsigned int CubicPhaseBits{5};
+constexpr unsigned int CubicPhaseCount{1 << CubicPhaseBits};
+
+/* Filter data for a single phase of a 4-point cubic filter table. */
+struct CubicCoefficients {
+    /* The four filter coefficients for this phase. */
+    float mCoeffs[4];
+    /* Per-coefficient difference to the next phase's coefficients. */
+    float mDeltas[4];
+};
+
+#endif /* CORE_CUBIC_DEFS_H */
diff --git a/core/cubic_tables.cpp b/core/cubic_tables.cpp
new file mode 100644
index 00000000..73ec6b3f
--- /dev/null
+++ b/core/cubic_tables.cpp
@@ -0,0 +1,59 @@
+
+#include "cubic_tables.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+
+#include "alnumbers.h"
+#include "core/mixer/defs.h"
+
+
+namespace {
+
+using uint = unsigned int;
+
+/* Compile-time generated table of cubic spline coefficients, one entry per
+ * phase, along with the per-coefficient step to the following phase.
+ */
+struct SplineFilterArray {
+    alignas(16) CubicCoefficients mTable[CubicPhaseCount]{};
+
+    constexpr SplineFilterArray()
+    {
+        /* First pass: evaluate the four spline polynomials at each phase's
+         * fractional position, mu = phase/CubicPhaseCount.
+         */
+        for(size_t phase{0};phase < CubicPhaseCount;++phase)
+        {
+            const double mu{static_cast<double>(phase) / CubicPhaseCount};
+            const double mu2{mu*mu};
+            const double mu3{mu2*mu};
+            mTable[phase].mCoeffs[0] = static_cast<float>(-0.5*mu3 +      mu2 + -0.5*mu);
+            mTable[phase].mCoeffs[1] = static_cast<float>( 1.5*mu3 + -2.5*mu2           + 1.0);
+            mTable[phase].mCoeffs[2] = static_cast<float>(-1.5*mu3 +  2.0*mu2 +  0.5*mu);
+            mTable[phase].mCoeffs[3] = static_cast<float>( 0.5*mu3 + -0.5*mu2);
+        }
+
+        /* Second pass: each delta is the next phase's coefficient minus this
+         * phase's coefficient.
+         */
+        for(size_t phase{0};phase < CubicPhaseCount-1;++phase)
+        {
+            for(size_t tap{0};tap < 4;++tap)
+            {
+                mTable[phase].mDeltas[tap] = mTable[phase+1].mCoeffs[tap]
+                    - mTable[phase].mCoeffs[tap];
+            }
+        }
+
+        /* The final phase has no table entry after it; its deltas lead to the
+         * coefficient set {0, 0, 1, 0}.
+         */
+        const size_t last{CubicPhaseCount - 1};
+        mTable[last].mDeltas[0] = -mTable[last].mCoeffs[0];
+        mTable[last].mDeltas[1] = -mTable[last].mCoeffs[1];
+        mTable[last].mDeltas[2] = 1.0f - mTable[last].mCoeffs[2];
+        mTable[last].mDeltas[3] = -mTable[last].mCoeffs[3];
+    }
+
+    /* Provides the table as a span. */
+    constexpr auto getTable() const noexcept
+    {
+        return al::as_span(mTable);
+    }
+};
+
+constexpr SplineFilterArray SplineFilter{};
+
+} // namespace
+
+const CubicTable gCubicSpline{SplineFilter.getTable()};
diff --git a/core/cubic_tables.h b/core/cubic_tables.h
new file mode 100644
index 00000000..88097ae2
--- /dev/null
+++ b/core/cubic_tables.h
@@ -0,0 +1,17 @@
+#ifndef CORE_CUBIC_TABLES_H
+#define CORE_CUBIC_TABLES_H
+
+#include "alspan.h"
+#include "cubic_defs.h"
+
+
+/* Non-owning view of a cubic filter table with one entry per phase. */
+struct CubicTable {
+    al::span<const CubicCoefficients,CubicPhaseCount> Tab;
+};
+
+/* A Catmull-Rom spline. The spline passes through the center two samples,
+ * ensuring no discontinuity while moving through a series of samples.
+ */
+extern const CubicTable gCubicSpline;
+
+#endif /* CORE_CUBIC_TABLES_H */
diff --git a/core/dbus_wrap.cpp b/core/dbus_wrap.cpp
new file mode 100644
index 00000000..7f221706
--- /dev/null
+++ b/core/dbus_wrap.cpp
@@ -0,0 +1,46 @@
+
+#include "config.h"
+
+#include "dbus_wrap.h"
+
+#ifdef HAVE_DYNLOAD
+
+#include <mutex>
+#include <type_traits>
+
+#include "logging.h"
+
+
+void *dbus_handle{nullptr};
+#define DECL_FUNC(x) decltype(p##x) p##x{};
+DBUS_FUNCTIONS(DECL_FUNC)
+#undef DECL_FUNC
+
+/* Loads the D-Bus shared library and resolves every function listed in
+ * DBUS_FUNCTIONS into its p<name> pointer. On success dbus_handle is left
+ * non-null. If the library or any single function fails to load, a warning is
+ * logged and dbus_handle ends up null.
+ */
+void PrepareDBus()
+{
+    static constexpr char libname[] = "libdbus-1.so.3";
+
+    /* Looks up `name` in the loaded library and stores it in f, cast to f's
+     * own function pointer type.
+     */
+    auto load_func = [](auto &f, const char *name) -> void
+    { f = reinterpret_cast<std::remove_reference_t<decltype(f)>>(GetSymbol(dbus_handle, name)); };
+    /* Resolves one function; on failure the library is closed, the handle is
+     * reset, and PrepareDBus returns early.
+     */
+#define LOAD_FUNC(x) do {                         \
+    load_func(p##x, #x);                          \
+    if(!p##x)                                     \
+    {                                             \
+        WARN("Failed to load function %s\n", #x); \
+        CloseLib(dbus_handle);                    \
+        dbus_handle = nullptr;                    \
+        return;                                   \
+    }                                             \
+} while(0);
+
+    dbus_handle = LoadLib(libname);
+    if(!dbus_handle)
+    {
+        WARN("Failed to load %s\n", libname);
+        return;
+    }
+
+DBUS_FUNCTIONS(LOAD_FUNC)
+#undef LOAD_FUNC
+}
+#endif
diff --git a/core/dbus_wrap.h b/core/dbus_wrap.h
new file mode 100644
index 00000000..09eaacf9
--- /dev/null
+++ b/core/dbus_wrap.h
@@ -0,0 +1,87 @@
+#ifndef CORE_DBUS_WRAP_H
+#define CORE_DBUS_WRAP_H
+
+#include <memory>
+
+#include <dbus/dbus.h>
+
+#include "dynload.h"
+
+#ifdef HAVE_DYNLOAD
+
+#include <mutex>
+
+#define DBUS_FUNCTIONS(MAGIC) \
+MAGIC(dbus_error_init) \
+MAGIC(dbus_error_free) \
+MAGIC(dbus_bus_get) \
+MAGIC(dbus_connection_set_exit_on_disconnect) \
+MAGIC(dbus_connection_unref) \
+MAGIC(dbus_connection_send_with_reply_and_block) \
+MAGIC(dbus_message_unref) \
+MAGIC(dbus_message_new_method_call) \
+MAGIC(dbus_message_append_args) \
+MAGIC(dbus_message_iter_init) \
+MAGIC(dbus_message_iter_next) \
+MAGIC(dbus_message_iter_recurse) \
+MAGIC(dbus_message_iter_get_arg_type) \
+MAGIC(dbus_message_iter_get_basic) \
+MAGIC(dbus_set_error_from_message)
+
+extern void *dbus_handle;
+#define DECL_FUNC(x) extern decltype(x) *p##x;
+DBUS_FUNCTIONS(DECL_FUNC)
+#undef DECL_FUNC
+
+#ifndef IN_IDE_PARSER
+#define dbus_error_init (*pdbus_error_init)
+#define dbus_error_free (*pdbus_error_free)
+#define dbus_bus_get (*pdbus_bus_get)
+#define dbus_connection_set_exit_on_disconnect (*pdbus_connection_set_exit_on_disconnect)
+#define dbus_connection_unref (*pdbus_connection_unref)
+#define dbus_connection_send_with_reply_and_block (*pdbus_connection_send_with_reply_and_block)
+#define dbus_message_unref (*pdbus_message_unref)
+#define dbus_message_new_method_call (*pdbus_message_new_method_call)
+#define dbus_message_append_args (*pdbus_message_append_args)
+#define dbus_message_iter_init (*pdbus_message_iter_init)
+#define dbus_message_iter_next (*pdbus_message_iter_next)
+#define dbus_message_iter_recurse (*pdbus_message_iter_recurse)
+#define dbus_message_iter_get_arg_type (*pdbus_message_iter_get_arg_type)
+#define dbus_message_iter_get_basic (*pdbus_message_iter_get_basic)
+#define dbus_set_error_from_message (*pdbus_set_error_from_message)
+#endif
+
+void PrepareDBus();
+
+/* Makes sure PrepareDBus has run exactly once, then returns dbus_handle,
+ * which is non-null only if the library and all its functions were loaded.
+ */
+inline auto HasDBus()
+{
+    static std::once_flag prepare_flag{};
+    std::call_once(prepare_flag, PrepareDBus);
+    return dbus_handle;
+}
+
+#else
+
+constexpr bool HasDBus() noexcept { return true; }
+#endif /* HAVE_DYNLOAD */
+
+
+namespace dbus {
+
+/* Scoped wrapper around a DBusError: initializes it on construction and
+ * frees it on destruction.
+ *
+ * The wrapper is neither copyable nor movable. An implicitly generated copy
+ * would duplicate the DBusError by value and both objects would then call
+ * dbus_error_free on the same error data.
+ */
+struct Error {
+    Error() { dbus_error_init(&mError); }
+    Error(const Error&) = delete;
+    Error(Error&&) = delete;
+    ~Error() { dbus_error_free(&mError); }
+
+    Error& operator=(const Error&) = delete;
+    Error& operator=(Error&&) = delete;
+
+    /* Member access to the wrapped DBusError. */
+    DBusError* operator->() { return &mError; }
+    /* Direct access to the wrapped DBusError, e.g. to pass to D-Bus calls. */
+    DBusError &get() { return mError; }
+private:
+    DBusError mError{};
+};
+
+/* Deleter that releases a DBusConnection reference with
+ * dbus_connection_unref.
+ */
+struct ConnectionDeleter {
+    void operator()(DBusConnection *c) { dbus_connection_unref(c); }
+};
+/* Owning DBusConnection pointer that unrefs the connection when destroyed. */
+using ConnectionPtr = std::unique_ptr<DBusConnection,ConnectionDeleter>;
+
+} // namespace dbus
+
+#endif /* CORE_DBUS_WRAP_H */
diff --git a/core/devformat.cpp b/core/devformat.cpp
new file mode 100644
index 00000000..acdabc4f
--- /dev/null
+++ b/core/devformat.cpp
@@ -0,0 +1,67 @@
+
+#include "config.h"
+
+#include "devformat.h"
+
+
+/* Returns the size in bytes of one sample of the given type, or 0 for a value
+ * outside the enumeration.
+ */
+uint BytesFromDevFmt(DevFmtType type) noexcept
+{
+    /* The signed and unsigned variant of each width share the same size. */
+    switch(type)
+    {
+    case DevFmtByte:
+    case DevFmtUByte:
+        return sizeof(int8_t);
+
+    case DevFmtShort:
+    case DevFmtUShort:
+        return sizeof(int16_t);
+
+    case DevFmtInt:
+    case DevFmtUInt:
+        return sizeof(int32_t);
+
+    case DevFmtFloat:
+        return sizeof(float);
+    }
+    return 0;
+}
+/* Returns the number of channels in the given configuration, or 0 for a value
+ * outside the enumeration. ambiorder is only used for DevFmtAmbi3D, which has
+ * (ambiorder+1)^2 channels.
+ */
+uint ChannelsFromDevFmt(DevFmtChannels chans, uint ambiorder) noexcept
+{
+    switch(chans)
+    {
+    case DevFmtMono:
+        return 1;
+    case DevFmtStereo:
+        return 2;
+    case DevFmtQuad:
+        return 4;
+    case DevFmtX51:
+        return 6;
+    case DevFmtX61:
+        return 7;
+    case DevFmtX71:
+    case DevFmtX3D71:
+        return 8;
+    case DevFmtX714:
+        return 12;
+    case DevFmtAmbi3D:
+        {
+            const uint side{ambiorder + 1};
+            return side * side;
+        }
+    }
+    return 0;
+}
+
+/* Returns a human-readable name for the sample type, or "(unknown type)" for
+ * a value outside the enumeration.
+ */
+const char *DevFmtTypeString(DevFmtType type) noexcept
+{
+    const char *name{"(unknown type)"};
+    switch(type)
+    {
+    case DevFmtByte: name = "Int8"; break;
+    case DevFmtUByte: name = "UInt8"; break;
+    case DevFmtShort: name = "Int16"; break;
+    case DevFmtUShort: name = "UInt16"; break;
+    case DevFmtInt: name = "Int32"; break;
+    case DevFmtUInt: name = "UInt32"; break;
+    case DevFmtFloat: name = "Float32"; break;
+    }
+    return name;
+}
+/* Returns a human-readable name for the channel configuration, or
+ * "(unknown channels)" for a value outside the enumeration.
+ */
+const char *DevFmtChannelsString(DevFmtChannels chans) noexcept
+{
+    const char *name{"(unknown channels)"};
+    switch(chans)
+    {
+    case DevFmtMono: name = "Mono"; break;
+    case DevFmtStereo: name = "Stereo"; break;
+    case DevFmtQuad: name = "Quadraphonic"; break;
+    case DevFmtX51: name = "5.1 Surround"; break;
+    case DevFmtX61: name = "6.1 Surround"; break;
+    case DevFmtX71: name = "7.1 Surround"; break;
+    case DevFmtX714: name = "7.1.4 Surround"; break;
+    case DevFmtX3D71: name = "3D7.1 Surround"; break;
+    case DevFmtAmbi3D: name = "Ambisonic 3D"; break;
+    }
+    return name;
+}
diff --git a/core/devformat.h b/core/devformat.h
new file mode 100644
index 00000000..485826a3
--- /dev/null
+++ b/core/devformat.h
@@ -0,0 +1,122 @@
+#ifndef CORE_DEVFORMAT_H
+#define CORE_DEVFORMAT_H
+
+#include <cstdint>
+
+
+using uint = unsigned int;
+
+/* Identifiers for individual channels. The values are consecutive starting at
+ * 0, so MaxChannels equals the number of identifiers that precede it.
+ */
+enum Channel : unsigned char {
+    FrontLeft = 0,
+    FrontRight,
+    FrontCenter,
+    LFE,
+    BackLeft,
+    BackRight,
+    BackCenter,
+    SideLeft,
+    SideRight,
+
+    TopCenter,
+    TopFrontLeft,
+    TopFrontCenter,
+    TopFrontRight,
+    TopBackLeft,
+    TopBackCenter,
+    TopBackRight,
+
+    Aux0,
+    Aux1,
+    Aux2,
+    Aux3,
+    Aux4,
+    Aux5,
+    Aux6,
+    Aux7,
+    Aux8,
+    Aux9,
+    Aux10,
+    Aux11,
+    Aux12,
+    Aux13,
+    Aux14,
+    Aux15,
+
+    MaxChannels
+};
+
+
+/* Device formats */
+/* Sample storage types. DevFmtTypeTraits maps each value to its C++ type, and
+ * BytesFromDevFmt gives its size.
+ */
+enum DevFmtType : unsigned char {
+    DevFmtByte,
+    DevFmtUByte,
+    DevFmtShort,
+    DevFmtUShort,
+    DevFmtInt,
+    DevFmtUInt,
+    DevFmtFloat,
+
+    DevFmtTypeDefault = DevFmtFloat
+};
+/* Channel configurations. ChannelsFromDevFmt gives the channel count of each
+ * value; for DevFmtAmbi3D the count depends on the ambisonic order.
+ */
+enum DevFmtChannels : unsigned char {
+    DevFmtMono,
+    DevFmtStereo,
+    DevFmtQuad,
+    DevFmtX51,
+    DevFmtX61,
+    DevFmtX71,
+    DevFmtX714,
+    DevFmtX3D71,
+    DevFmtAmbi3D,
+
+    DevFmtChannelsDefault = DevFmtStereo
+};
+#define MAX_OUTPUT_CHANNELS  16
+
+/* DevFmtType traits, providing the type, etc given a DevFmtType. The primary
+ * template is empty, so only the specialized values below expose a Type.
+ */
+template<DevFmtType T>
+struct DevFmtTypeTraits { };
+
+template<>
+struct DevFmtTypeTraits<DevFmtByte> { using Type = int8_t; };
+template<>
+struct DevFmtTypeTraits<DevFmtUByte> { using Type = uint8_t; };
+template<>
+struct DevFmtTypeTraits<DevFmtShort> { using Type = int16_t; };
+template<>
+struct DevFmtTypeTraits<DevFmtUShort> { using Type = uint16_t; };
+template<>
+struct DevFmtTypeTraits<DevFmtInt> { using Type = int32_t; };
+template<>
+struct DevFmtTypeTraits<DevFmtUInt> { using Type = uint32_t; };
+template<>
+struct DevFmtTypeTraits<DevFmtFloat> { using Type = float; };
+
+/* Shorthand for the C++ type that stores one sample of DevFmtType T. */
+template<DevFmtType T>
+using DevFmtType_t = typename DevFmtTypeTraits<T>::Type;
+
+
+uint BytesFromDevFmt(DevFmtType type) noexcept;
+uint ChannelsFromDevFmt(DevFmtChannels chans, uint ambiorder) noexcept;
+/* Returns the size in bytes of one frame: the channel count of chans (using
+ * ambiorder where relevant) multiplied by the sample size of type.
+ */
+inline uint FrameSizeFromDevFmt(DevFmtChannels chans, DevFmtType type, uint ambiorder) noexcept
+{
+    const uint numchans{ChannelsFromDevFmt(chans, ambiorder)};
+    const uint samplesize{BytesFromDevFmt(type)};
+    return numchans * samplesize;
+}
+
+const char *DevFmtTypeString(DevFmtType type) noexcept;
+const char *DevFmtChannelsString(DevFmtChannels chans) noexcept;
+
+/* Ambisonic layout selection; ACN is the default. Presumably this is the
+ * channel ordering convention for ambisonic device formats -- TODO confirm
+ * against the users of this enum.
+ */
+enum class DevAmbiLayout : bool {
+    FuMa,
+    ACN,
+
+    Default = ACN
+};
+
+/* Ambisonic scaling selection; SN3D is the default. Presumably this is the
+ * normalization convention for ambisonic device formats -- TODO confirm
+ * against the users of this enum.
+ */
+enum class DevAmbiScaling : unsigned char {
+    FuMa,
+    SN3D,
+    N3D,
+
+    Default = SN3D
+};
+
+#endif /* CORE_DEVFORMAT_H */
diff --git a/core/device.cpp b/core/device.cpp
new file mode 100644
index 00000000..2766c5e4
--- /dev/null
+++ b/core/device.cpp
@@ -0,0 +1,23 @@
+
+#include "config.h"
+
+#include "bformatdec.h"
+#include "bs2b.h"
+#include "device.h"
+#include "front_stablizer.h"
+#include "hrtf.h"
+#include "mastering.h"
+
+
+al::FlexArray<ContextBase*> DeviceBase::sEmptyContextArray{0u}; /* Zero-length array shared by all devices. */
+
+
+DeviceBase::DeviceBase(DeviceType type)
+    : Type{type}, mContexts{&sEmptyContextArray} /* begin on the shared empty array, no allocation */
+{ }
+
+DeviceBase::~DeviceBase()
+{
+    auto *const owned = mContexts.exchange(nullptr, std::memory_order_relaxed);
+    if(&sEmptyContextArray != owned) delete owned; /* the shared empty array is never freed */
+}
diff --git a/core/device.h b/core/device.h
new file mode 100644
index 00000000..9aaf7adb
--- /dev/null
+++ b/core/device.h
@@ -0,0 +1,345 @@
+#ifndef CORE_DEVICE_H
+#define CORE_DEVICE_H
+
+#include <stddef.h>
+
+#include <array>
+#include <atomic>
+#include <bitset>
+#include <chrono>
+#include <memory>
+#include <mutex>
+#include <string>
+
+#include "almalloc.h"
+#include "alspan.h"
+#include "ambidefs.h"
+#include "atomic.h"
+#include "bufferline.h"
+#include "devformat.h"
+#include "filters/nfc.h"
+#include "intrusive_ptr.h"
+#include "mixer/hrtfdefs.h"
+#include "opthelpers.h"
+#include "resampler_limits.h"
+#include "uhjfilter.h"
+#include "vector.h"
+
+class BFormatDec;
+struct bs2b;
+struct Compressor;
+struct ContextBase;
+struct DirectHrtfState;
+struct HrtfStore;
+
+using uint = unsigned int;
+
+
+#define MIN_OUTPUT_RATE      8000 /* Lowest output rate, in Hz. */
+#define MAX_OUTPUT_RATE      192000 /* Highest output rate, in Hz. */
+#define DEFAULT_OUTPUT_RATE  48000 /* Hz; 960 samples at this rate give the 20ms noted below. */
+
+#define DEFAULT_UPDATE_SIZE  960 /* 20ms */
+#define DEFAULT_NUM_UPDATES  3 /* Presumably the default number of updates held in the buffer; verify. */
+
+
+enum class DeviceType : unsigned char { /* Kind of device; held in the const member DeviceBase::Type. */
+    Playback,
+    Capture,
+    Loopback
+};
+
+
+enum class RenderMode : unsigned char { /* Rendering mode; DeviceBase::mRenderMode starts as Normal. */
+    Normal,
+    Pairwise,
+    Hrtf
+};
+
+enum class StereoEncoding : unsigned char { /* Choices for how stereo output is encoded. */
+    Basic,
+    Uhj,
+    Hrtf,
+
+    Default = Basic /* Alias of Basic, not a separate value. */
+};
+
+
+struct InputRemixMap {
+    struct TargetMix { Channel channel; float mix; }; /* A target channel and its mix factor. */
+
+    Channel channel; /* Presumably the input channel being remixed; verify against users. */
+    al::span<const TargetMix> targets; /* The TargetMix entries associated with `channel`. */
+};
+
+
+struct DistanceComp { /* Per-channel gain and delay data for speaker distance compensation. */
+    /* Maximum delay in samples for speaker distance compensation. */
+    static constexpr uint MaxDelay{1024};
+
+    struct ChanData {
+        float Gain{1.0f}; /* Unity gain by default. */
+        uint Length{0u}; /* Valid range is [0...MaxDelay). */
+        float *Buffer{nullptr}; /* Presumably points into mSamples; verify against the code that sets it. */
+    };
+
+    std::array<ChanData,MAX_OUTPUT_CHANNELS> mChannels;
+    al::FlexArray<float,16> mSamples; /* Sample storage, sized by the constructor's count. */
+    /* Explicit, so a bare size_t is never silently converted into a DistanceComp. */
+    explicit DistanceComp(size_t count) : mSamples{count} { }
+
+    static std::unique_ptr<DistanceComp> Create(size_t numsamples)
+    { return std::unique_ptr<DistanceComp>{new(FamCount(numsamples)) DistanceComp{numsamples}}; }
+
+    DEF_FAM_NEWDEL(DistanceComp, mSamples)
+};
+
+
+constexpr uint InvalidChannelIndex{~0u}; /* All bits set; marks a channel that has no output index. */
+
+struct BFChannelConfig { /* Element type of MixParams::AmbiMap. */
+    float Scale; /* Multiplied by the base gain in MixParams::setAmbiMixParams. */
+    uint Index; /* Compared between two maps in MixParams::setAmbiMixParams to pair channels. */
+};
+
+struct MixParams {
+    /* Per-channel ambisonic mapping (scale and index) for mixing to Buffer. */
+    std::array<BFChannelConfig,MaxAmbiChannels> AmbiMap{};
+
+    al::span<FloatBufferLine> Buffer;
+
+    /**
+     * Helper for identity/pass-through ambisonic panning. The input mix is
+     * expected to be a 3D ACN/N3D ambisonic buffer. For every input channel
+     * [0...inmix.Buffer.size()), func(input, output, gain) is called, where
+     * output is the first channel of this mix with the same AmbiMap index
+     * and gain is that channel's Scale times gainbase. With no match, the
+     * output is InvalidChannelIndex and the gain is 0.
+     */
+    template<typename F>
+    void setAmbiMixParams(const MixParams &inmix, const float gainbase, F func) const
+    {
+        const size_t srcCount{inmix.Buffer.size()};
+        const size_t outCount{Buffer.size()};
+        for(size_t src{0};src < srcCount;++src)
+        {
+            const uint wanted{inmix.AmbiMap[src].Index};
+            uint target{InvalidChannelIndex};
+            float scaled{0.0f};
+
+            /* Stop at the first output channel carrying the wanted index. */
+            size_t dst{0};
+            while(dst < outCount && AmbiMap[dst].Index != wanted)
+                ++dst;
+            if(dst < outCount)
+            {
+                target = static_cast<uint>(dst);
+                scaled = AmbiMap[dst].Scale * gainbase;
+            }
+            func(src, target, scaled);
+        }
+    }
+};
+
+struct RealMixParams { /* Parameters for the "real" output; type of DeviceBase::RealOut. */
+    al::span<const InputRemixMap> RemixMap;
+    std::array<uint,MaxChannels> ChannelIndex{}; /* Indexed by Channel; read by DeviceBase::channelIdxByName. */
+
+    al::span<FloatBufferLine> Buffer;
+};
+
+using AmbiRotateMatrix = std::array<std::array<float,MaxAmbiChannels>,MaxAmbiChannels>; /* Square float matrix. */
+
+enum { /* Bit positions within DeviceBase::Flags. */
+    // Frequency was requested by the app or config file
+    FrequencyRequest,
+    // Channel configuration was requested by the app or config file
+    ChannelsRequest,
+    // Sample type was requested by the config file
+    SampleTypeRequest,
+
+    // Specifies if the DSP is paused at user request
+    DevicePaused,
+    // Specifies if the device is currently running
+    DeviceRunning,
+
+    // Specifies if the output plays directly on/in ears (headphones, headset,
+    // ear buds, etc).
+    DirectEar,
+
+    DeviceFlagsCount /* Number of flags above; sizes the DeviceBase::Flags bitset. */
+};
+
+struct DeviceBase { /* Device state: output format, mixing buffers, and post-processing objects. Not copyable. */
+    /* To avoid extraneous allocations, a 0-sized FlexArray<ContextBase*> is
+     * defined globally as a sharable object.
+     */
+    static al::FlexArray<ContextBase*> sEmptyContextArray;
+
+    std::atomic<bool> Connected{true};
+    const DeviceType Type{}; /* Set once by the constructor. */
+
+    uint Frequency{};
+    uint UpdateSize{};
+    uint BufferSize{};
+
+    DevFmtChannels FmtChans{};
+    DevFmtType FmtType{};
+    uint mAmbiOrder{0};
+    float mXOverFreq{400.0f};
+    /* If the main device mix is horizontal/2D only. */
+    bool m2DMixing{false};
+    /* For DevFmtAmbi* output only, specifies the channel order and
+     * normalization.
+     */
+    DevAmbiLayout mAmbiLayout{DevAmbiLayout::Default};
+    DevAmbiScaling mAmbiScale{DevAmbiScaling::Default};
+
+    std::string DeviceName;
+
+    // Device flags (bit positions come from the unnamed flag enum ending in DeviceFlagsCount)
+    std::bitset<DeviceFlagsCount> Flags{};
+
+    uint NumAuxSends{};
+
+    /* Rendering mode. */
+    RenderMode mRenderMode{RenderMode::Normal};
+
+    /* The average speaker distance as determined by the ambdec configuration,
+     * HRTF data set, or the NFC-HOA reference delay. Only used for NFC.
+     */
+    float AvgSpeakerDist{0.0f};
+
+    /* The default NFC filter. Not used directly, but is pre-initialized with
+     * the control distance from AvgSpeakerDist.
+     */
+    NfcFilter mNFCtrlFilter{};
+
+    uint SamplesDone{0u};
+    std::chrono::nanoseconds ClockBase{0};
+    std::chrono::nanoseconds FixedLatency{0};
+
+    AmbiRotateMatrix mAmbiRotateMatrix{};
+    AmbiRotateMatrix mAmbiRotateMatrix2{};
+
+    /* Temp storage used for mixer processing. */
+    static constexpr size_t MixerLineSize{BufferLineSize + DecoderBase::sMaxPadding};
+    static constexpr size_t MixerChannelsMax{16};
+    using MixerBufferLine = std::array<float,MixerLineSize>;
+    alignas(16) std::array<MixerBufferLine,MixerChannelsMax> mSampleData;
+    alignas(16) std::array<float,MixerLineSize+MaxResamplerPadding> mResampleData;
+
+    alignas(16) float FilteredData[BufferLineSize];
+    union { /* The two buffers below share storage, so only one can hold data at a time. */
+        alignas(16) float HrtfSourceData[BufferLineSize + HrtfHistoryLength];
+        alignas(16) float NfcSampleData[BufferLineSize];
+    };
+
+    /* Persistent storage for HRTF mixing. */
+    alignas(16) float2 HrtfAccumData[BufferLineSize + HrirLength];
+
+    /* Mixing buffer used by the Dry mix and Real output. */
+    al::vector<FloatBufferLine, 16> MixBuffer;
+
+    /* The "dry" path corresponds to the main output. */
+    MixParams Dry;
+    uint NumChannelsPerOrder[MaxAmbiOrder+1]{};
+
+    /* "Real" output, which will be written to the device buffer. May alias the
+     * dry buffer.
+     */
+    RealMixParams RealOut;
+
+    /* HRTF state and info */
+    std::unique_ptr<DirectHrtfState> mHrtfState;
+    al::intrusive_ptr<HrtfStore> mHrtf;
+    uint mIrSize{0};
+
+    /* Ambisonic-to-UHJ encoder */
+    std::unique_ptr<UhjEncoderBase> mUhjEncoder;
+
+    /* Ambisonic decoder for speakers */
+    std::unique_ptr<BFormatDec> AmbiDecoder;
+
+    /* Stereo-to-binaural filter */
+    std::unique_ptr<bs2b> Bs2b;
+
+    using PostProc = void(DeviceBase::*)(const size_t SamplesToDo);
+    PostProc PostProcess{nullptr}; /* Invoked by postProcess() when non-null. */
+
+    std::unique_ptr<Compressor> Limiter;
+
+    /* Delay buffers used to compensate for speaker distances. */
+    std::unique_ptr<DistanceComp> ChannelDelays;
+
+    /* Dithering control. */
+    float DitherDepth{0.0f};
+    uint DitherSeed{0u};
+
+    /* Running count of the mixer invocations, in 31.1 fixed point. This
+     * actually increments *twice* when mixing, first at the start and then at
+     * the end, so the bottom bit indicates if the device is currently mixing
+     * and the upper bits indicates how many mixes have been done.
+     */
+    RefCount MixCount{0u};
+
+    // Contexts created on this device. The constructor points this at sEmptyContextArray.
+    std::atomic<al::FlexArray<ContextBase*>*> mContexts{nullptr};
+
+
+    DeviceBase(DeviceType type);
+    DeviceBase(const DeviceBase&) = delete;
+    DeviceBase& operator=(const DeviceBase&) = delete;
+    ~DeviceBase(); /* Frees the context array unless it is the shared sEmptyContextArray. */
+
+    uint bytesFromFmt() const noexcept { return BytesFromDevFmt(FmtType); }
+    uint channelsFromFmt() const noexcept { return ChannelsFromDevFmt(FmtChans, mAmbiOrder); }
+    uint frameSizeFromFmt() const noexcept { return bytesFromFmt() * channelsFromFmt(); }
+
+    uint waitForMix() const noexcept /* Spins while MixCount's low bit is set; returns the count then read. */
+    {
+        uint refcount;
+        while((refcount=MixCount.load(std::memory_order_acquire))&1) {
+        }
+        return refcount;
+    }
+
+    void ProcessHrtf(const size_t SamplesToDo);
+    void ProcessAmbiDec(const size_t SamplesToDo);
+    void ProcessAmbiDecStablized(const size_t SamplesToDo);
+    void ProcessUhj(const size_t SamplesToDo);
+    void ProcessBs2b(const size_t SamplesToDo);
+
+    inline void postProcess(const size_t SamplesToDo) /* Does nothing when PostProcess is null. */
+    { if(PostProcess) LIKELY (this->*PostProcess)(SamplesToDo); }
+
+    void renderSamples(const al::span<float*> outBuffers, const uint numSamples);
+    void renderSamples(void *outBuffer, const uint numSamples, const size_t frameStep);
+
+    /* Caller must lock the device state, and the mixer must not be running. */
+#ifdef __USE_MINGW_ANSI_STDIO
+    [[gnu::format(gnu_printf,2,3)]]
+#else
+    [[gnu::format(printf,2,3)]]
+#endif
+    void handleDisconnect(const char *msg, ...);
+
+    /**
+     * Returns the index for the given channel name (e.g. FrontCenter), or
+     * InvalidChannelIndex if it doesn't exist.
+     */
+    uint channelIdxByName(Channel chan) const noexcept
+    { return RealOut.ChannelIndex[chan]; }
+
+    DISABLE_ALLOC()
+
+private:
+    uint renderSamples(const uint numSamples);
+};
+
+/* Must be at most 15 characters (16 including the terminating null) for
+ * compatibility with pthread_setname_np limitations. */
+#define MIXER_THREAD_NAME "alsoft-mixer"
+
+#define RECORD_THREAD_NAME "alsoft-record"
+
+#endif /* CORE_DEVICE_H */
diff --git a/core/effects/base.h b/core/effects/base.h
new file mode 100644
index 00000000..4ee19f37
--- /dev/null
+++ b/core/effects/base.h
@@ -0,0 +1,197 @@
+#ifndef CORE_EFFECTS_BASE_H
+#define CORE_EFFECTS_BASE_H
+
+#include <stddef.h>
+
+#include "albyte.h"
+#include "almalloc.h"
+#include "alspan.h"
+#include "atomic.h"
+#include "core/bufferline.h"
+#include "intrusive_ptr.h"
+
+struct BufferStorage;
+struct ContextBase;
+struct DeviceBase;
+struct EffectSlot;
+struct MixParams;
+struct RealMixParams;
+
+
+/** Target gain for the reverb decay feedback reaching the decay time. */
+constexpr float ReverbDecayGain{0.001f}; /* -60 dB */
+/* Upper limits for the reverb delay properties; presumably in seconds - TODO confirm. */
+constexpr float ReverbMaxReflectionsDelay{0.3f};
+constexpr float ReverbMaxLateReverbDelay{0.1f};
+
+enum class ChorusWaveform { /* Waveform choices; type of the Waveform field of EffectProps::Chorus. */
+    Sinusoid,
+    Triangle
+};
+
+constexpr float ChorusMaxDelay{0.016f}; /* Delay limits below are presumably in seconds - TODO confirm. */
+constexpr float FlangerMaxDelay{0.004f};
+
+constexpr float EchoMaxDelay{0.207f};
+constexpr float EchoMaxLRDelay{0.404f};
+
+enum class FShifterDirection { /* Type of LeftDirection and RightDirection in EffectProps::Fshifter. */
+    Down,
+    Up,
+    Off
+};
+
+enum class ModulatorWaveform { /* Type of the Waveform field of EffectProps::Modulator. */
+    Sinusoid,
+    Sawtooth,
+    Square
+};
+
+enum class VMorpherPhenome { /* NOTE(review): spelled "Phenome" here, but the EffectProps fields say "Phoneme". */
+    A, E, I, O, U,
+    AA, AE, AH, AO, EH, ER, IH, IY, UH, UW,
+    B, D, F, G, J, K, L, M, N, P, R, S, T, V, Z
+};
+
+enum class VMorpherWaveform { /* Type of the Waveform field of EffectProps::Vmorpher. */
+    Sinusoid,
+    Triangle,
+    Sawtooth
+};
+
+union EffectProps { /* Effect parameters; a union, so the members share storage and one is meaningful at a time. */
+    struct {
+        float Density;
+        float Diffusion;
+        float Gain;
+        float GainHF;
+        float GainLF;
+        float DecayTime;
+        float DecayHFRatio;
+        float DecayLFRatio;
+        float ReflectionsGain;
+        float ReflectionsDelay;
+        float ReflectionsPan[3];
+        float LateReverbGain;
+        float LateReverbDelay;
+        float LateReverbPan[3];
+        float EchoTime;
+        float EchoDepth;
+        float ModulationTime;
+        float ModulationDepth;
+        float AirAbsorptionGainHF;
+        float HFReference;
+        float LFReference;
+        float RoomRolloffFactor;
+        bool DecayHFLimit;
+    } Reverb;
+
+    struct {
+        float AttackTime;
+        float ReleaseTime;
+        float Resonance;
+        float PeakGain;
+    } Autowah;
+
+    struct {
+        ChorusWaveform Waveform;
+        int Phase;
+        float Rate;
+        float Depth;
+        float Feedback;
+        float Delay;
+    } Chorus; /* Also Flanger */
+
+    struct {
+        bool OnOff;
+    } Compressor;
+
+    struct {
+        float Edge;
+        float Gain;
+        float LowpassCutoff;
+        float EQCenter;
+        float EQBandwidth;
+    } Distortion;
+
+    struct {
+        float Delay;
+        float LRDelay;
+
+        float Damping;
+        float Feedback;
+
+        float Spread;
+    } Echo;
+
+    struct {
+        float LowCutoff;
+        float LowGain;
+        float Mid1Center;
+        float Mid1Gain;
+        float Mid1Width;
+        float Mid2Center;
+        float Mid2Gain;
+        float Mid2Width;
+        float HighCutoff;
+        float HighGain;
+    } Equalizer;
+
+    struct {
+        float Frequency;
+        FShifterDirection LeftDirection;
+        FShifterDirection RightDirection;
+    } Fshifter;
+
+    struct {
+        float Frequency;
+        float HighPassCutoff;
+        ModulatorWaveform Waveform;
+    } Modulator;
+
+    struct {
+        int CoarseTune;
+        int FineTune;
+    } Pshifter;
+
+    struct {
+        float Rate;
+        VMorpherPhenome PhonemeA;
+        VMorpherPhenome PhonemeB;
+        int PhonemeACoarseTuning;
+        int PhonemeBCoarseTuning;
+        VMorpherWaveform Waveform;
+    } Vmorpher;
+
+    struct {
+        float Gain;
+    } Dedicated; /* Presumably serves both dedicated effect slot types; verify. */
+};
+
+
+struct EffectTarget { /* Pair of output descriptions handed to EffectState::update. */
+    MixParams *Main;
+    RealMixParams *RealOut;
+};
+
+struct EffectState : public al::intrusive_ref<EffectState> { /* Abstract interface implemented by each effect. */
+    al::span<FloatBufferLine> mOutTarget;
+
+
+    virtual ~EffectState() = default;
+
+    virtual void deviceUpdate(const DeviceBase *device, const BufferStorage *buffer) = 0;
+    virtual void update(const ContextBase *context, const EffectSlot *slot,
+        const EffectProps *props, const EffectTarget target) = 0;
+    virtual void process(const size_t samplesToDo, const al::span<const FloatBufferLine> samplesIn,
+        const al::span<FloatBufferLine> samplesOut) = 0;
+};
+
+
+struct EffectStateFactory { /* Abstract factory for EffectState objects. */
+    virtual ~EffectStateFactory() = default;
+
+    virtual al::intrusive_ptr<EffectState> create() = 0; /* Returns a reference-counted state object. */
+};
+
+#endif /* CORE_EFFECTS_BASE_H */
diff --git a/core/effectslot.cpp b/core/effectslot.cpp
new file mode 100644
index 00000000..db8aa078
--- /dev/null
+++ b/core/effectslot.cpp
@@ -0,0 +1,19 @@
+
+#include "config.h"
+
+#include "effectslot.h"
+
+#include <stddef.h>
+
+#include "almalloc.h"
+#include "context.h"
+
+
+EffectSlotArray *EffectSlot::CreatePtrArray(size_t count) noexcept
+{
+    /* Room for twice the requested pointers is allocated; the extra is scratch space for
+     * the mixer's sorted list. NOTE(review): the al_calloc result is used without a check. */
+    const auto storage = EffectSlotArray::Sizeof(count*2);
+    void *const raw{al_calloc(alignof(EffectSlotArray), storage)};
+    return al::construct_at(static_cast<EffectSlotArray*>(raw), count);
+}
diff --git a/core/effectslot.h b/core/effectslot.h
new file mode 100644
index 00000000..2624ae5f
--- /dev/null
+++ b/core/effectslot.h
@@ -0,0 +1,89 @@
+#ifndef CORE_EFFECTSLOT_H
+#define CORE_EFFECTSLOT_H
+
+#include <atomic>
+
+#include "almalloc.h"
+#include "device.h"
+#include "effects/base.h"
+#include "intrusive_ptr.h"
+
+struct EffectSlot;
+struct WetBuffer;
+
+using EffectSlotArray = al::FlexArray<EffectSlot*>; /* Array of slot pointers; see EffectSlot::CreatePtrArray. */
+
+
+enum class EffectSlotType : unsigned char { /* Type of EffectSlot::EffectType and EffectSlotProps::Type. */
+    None,
+    Reverb,
+    Chorus,
+    Distortion,
+    Echo,
+    Flanger,
+    FrequencyShifter,
+    VocalMorpher,
+    PitchShifter,
+    RingModulator,
+    Autowah,
+    Compressor,
+    Equalizer,
+    EAXReverb,
+    DedicatedLFE,
+    DedicatedDialog,
+    Convolution
+};
+
+struct EffectSlotProps { /* A set of slot properties; EffectSlot::Update holds a pointer to one. */
+    float Gain;
+    bool  AuxSendAuto;
+    EffectSlot *Target;
+
+    EffectSlotType Type;
+    EffectProps Props;
+
+    al::intrusive_ptr<EffectState> State;
+
+    std::atomic<EffectSlotProps*> next; /* Link to another props object; presumably a lock-free list - verify. */
+
+    DEF_NEWDEL(EffectSlotProps)
+};
+
+
+struct EffectSlot { /* One effect slot: its current parameters, effect state, and wet mix buffer. */
+    bool InUse{false};
+
+    std::atomic<EffectSlotProps*> Update{nullptr}; /* Null until a props object is stored here. */
+
+    /* Wet buffer configuration is ACN channel order with N3D scaling.
+     * Consequently, effects that only want to work with mono input can use
+     * channel 0 by itself. Effects that want multichannel can process the
+     * ambisonics signal and make a B-Format source pan.
+     */
+    MixParams Wet;
+
+    float Gain{1.0f};
+    bool  AuxSendAuto{true};
+    EffectSlot *Target{nullptr};
+
+    EffectSlotType EffectType{EffectSlotType::None};
+    EffectProps mEffectProps{};
+    al::intrusive_ptr<EffectState> mEffectState;
+
+    float RoomRolloff{0.0f}; /* Added to the source's room rolloff, not multiplied. */
+    float DecayTime{0.0f};
+    float DecayLFRatio{0.0f};
+    float DecayHFRatio{0.0f};
+    bool DecayHFLimit{false};
+    float AirAbsorptionGainHF{1.0f};
+
+    /* Mixing buffer used by the Wet mix. */
+    al::vector<FloatBufferLine,16> mWetBuffer;
+
+
+    static EffectSlotArray *CreatePtrArray(size_t count) noexcept; /* Allocates room for count*2 pointers. */
+
+    DEF_NEWDEL(EffectSlot)
+};
+
+#endif /* CORE_EFFECTSLOT_H */
diff --git a/core/except.cpp b/core/except.cpp
new file mode 100644
index 00000000..45fd4eb5
--- /dev/null
+++ b/core/except.cpp
@@ -0,0 +1,30 @@
+
+#include "config.h"
+
+#include "except.h"
+
+#include <cstdio>
+#include <cstdarg>
+
+#include "opthelpers.h"
+
+
+namespace al {
+
+base_exception::~base_exception() = default; /* Defaulted here, out of line, rather than in the header. */
+
+void base_exception::setMessage(const char* msg, std::va_list args)
+{
+    std::va_list argcopy; /* the measuring pass below consumes args, so format from a copy */
+    va_copy(argcopy, args);
+    const int needed{std::vsnprintf(nullptr, 0, msg, args)};
+    if(needed > 0) LIKELY
+    {
+        mMessage.resize(static_cast<size_t>(needed) + 1);
+        std::vsnprintf(&mMessage[0], mMessage.size(), msg, argcopy);
+        mMessage.pop_back(); /* drop the terminator that vsnprintf wrote */
+    }
+    va_end(argcopy);
+}
+
+} // namespace al
diff --git a/core/except.h b/core/except.h
new file mode 100644
index 00000000..0e28e9df
--- /dev/null
+++ b/core/except.h
@@ -0,0 +1,31 @@
+#ifndef CORE_EXCEPT_H
+#define CORE_EXCEPT_H
+
+#include <cstdarg>
+#include <exception>
+#include <string>
+#include <utility>
+
+
+namespace al {
+
+class base_exception : public std::exception { /* Exception base that stores a formatted message. */
+    std::string mMessage;
+
+protected:
+    base_exception() = default;
+    virtual ~base_exception();
+
+    void setMessage(const char *msg, std::va_list args); /* Formats msg with args into mMessage. */
+
+public:
+    const char *what() const noexcept override { return mMessage.c_str(); }
+};
+
+} // namespace al
+
+#define START_API_FUNC try /* Expands to `try`; pairs with END_API_FUNC. */
+
+#define END_API_FUNC catch(...) { std::terminate(); } /* Any exception reaching this handler terminates. */
+
+#endif /* CORE_EXCEPT_H */
diff --git a/core/filters/biquad.cpp b/core/filters/biquad.cpp
new file mode 100644
index 00000000..a0a62eb8
--- /dev/null
+++ b/core/filters/biquad.cpp
@@ -0,0 +1,168 @@
+
+#include "config.h"
+
+#include "biquad.h"
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+
+#include "alnumbers.h"
+#include "opthelpers.h"
+
+
+template<typename Real>
+void BiquadFilterR<Real>::setParams(BiquadType type, Real f0norm, Real gain, Real rcpQ)
+{
+    /* Derives the normalized coefficients for `type`. HACK: Limit gain to
+     * -100dB. Callers should already clamp to a minimum of 0.001, or use a
+     * range that stays above 0.126, yet lower gains seem to arrive from some
+     * callers. This needs to be investigated.
+     */
+    gain = std::max(gain, Real(0.00001));
+
+    const Real w0{al::numbers::pi_v<Real>*2.0f * f0norm}; /* 2*pi times the normalized frequency. */
+    const Real sin_w0{std::sin(w0)};
+    const Real cos_w0{std::cos(w0)};
+    const Real alpha{sin_w0/2.0f * rcpQ}; /* sin(w0)/2 scaled by the caller-supplied 1/Q. */
+
+    Real sqrtgain_alpha_2;
+    Real a[3]{ 1.0f, 0.0f, 0.0f }; /* With these defaults, an unmatched type yields a pass-through. */
+    Real b[3]{ 1.0f, 0.0f, 0.0f };
+
+    /* Calculate filter coefficients depending on filter type */
+    switch(type)
+    {
+        case BiquadType::HighShelf:
+            sqrtgain_alpha_2 = 2.0f * std::sqrt(gain) * alpha;
+            b[0] =       gain*((gain+1.0f) + (gain-1.0f)*cos_w0 + sqrtgain_alpha_2);
+            b[1] = -2.0f*gain*((gain-1.0f) + (gain+1.0f)*cos_w0                   );
+            b[2] =       gain*((gain+1.0f) + (gain-1.0f)*cos_w0 - sqrtgain_alpha_2);
+            a[0] =             (gain+1.0f) - (gain-1.0f)*cos_w0 + sqrtgain_alpha_2;
+            a[1] =  2.0f*     ((gain-1.0f) - (gain+1.0f)*cos_w0                   );
+            a[2] =             (gain+1.0f) - (gain-1.0f)*cos_w0 - sqrtgain_alpha_2;
+            break;
+        case BiquadType::LowShelf:
+            sqrtgain_alpha_2 = 2.0f * std::sqrt(gain) * alpha;
+            b[0] =       gain*((gain+1.0f) - (gain-1.0f)*cos_w0 + sqrtgain_alpha_2);
+            b[1] =  2.0f*gain*((gain-1.0f) - (gain+1.0f)*cos_w0                   );
+            b[2] =       gain*((gain+1.0f) - (gain-1.0f)*cos_w0 - sqrtgain_alpha_2);
+            a[0] =             (gain+1.0f) + (gain-1.0f)*cos_w0 + sqrtgain_alpha_2;
+            a[1] = -2.0f*     ((gain-1.0f) + (gain+1.0f)*cos_w0                   );
+            a[2] =             (gain+1.0f) + (gain-1.0f)*cos_w0 - sqrtgain_alpha_2;
+            break;
+        case BiquadType::Peaking:
+            b[0] =  1.0f + alpha * gain;
+            b[1] = -2.0f * cos_w0;
+            b[2] =  1.0f - alpha * gain;
+            a[0] =  1.0f + alpha / gain;
+            a[1] = -2.0f * cos_w0;
+            a[2] =  1.0f - alpha / gain;
+            break;
+
+        case BiquadType::LowPass: /* This and the remaining cases do not use gain. */
+            b[0] = (1.0f - cos_w0) / 2.0f;
+            b[1] =  1.0f - cos_w0;
+            b[2] = (1.0f - cos_w0) / 2.0f;
+            a[0] =  1.0f + alpha;
+            a[1] = -2.0f * cos_w0;
+            a[2] =  1.0f - alpha;
+            break;
+        case BiquadType::HighPass:
+            b[0] =  (1.0f + cos_w0) / 2.0f;
+            b[1] = -(1.0f + cos_w0);
+            b[2] =  (1.0f + cos_w0) / 2.0f;
+            a[0] =   1.0f + alpha;
+            a[1] =  -2.0f * cos_w0;
+            a[2] =   1.0f - alpha;
+            break;
+        case BiquadType::BandPass:
+            b[0] =  alpha;
+            b[1] =  0.0f;
+            b[2] = -alpha;
+            a[0] =  1.0f + alpha;
+            a[1] = -2.0f * cos_w0;
+            a[2] =  1.0f - alpha;
+            break;
+    }
+
+    mA1 = a[1] / a[0]; /* Every stored coefficient is divided by a[0]. */
+    mA2 = a[2] / a[0];
+    mB0 = b[0] / a[0];
+    mB1 = b[1] / a[0];
+    mB2 = b[2] / a[0];
+}
+
+template<typename Real>
+void BiquadFilterR<Real>::process(const al::span<const Real> src, Real *dst)
+{
+    /* Work on local copies of the coefficients and delay components. */
+    const Real num0{mB0}, num1{mB1}, num2{mB2};
+    const Real den1{mA1}, den2{mA2};
+    Real state1{mZ1}, state2{mZ2};
+
+    /* The filter runs in Transposed Direct Form II: each output is the
+     * scaled input plus the first delay component, after which both delay
+     * components are advanced. Only two state values are kept, rather than
+     * the four-sample input/output history of Direct Form I, and the form
+     * suits floating-point, which favors summing similarly-sized values
+     * while being less bothered by overflow.
+     *
+     * See: http://www.earlevel.com/main/2003/02/28/biquads/
+     */
+    for(auto iter = src.cbegin();iter != src.cend();++iter)
+    {
+        const Real sample{*iter};
+        const Real result{sample*num0 + state1};
+        state1 = sample*num1 - result*den1 + state2;
+        state2 = sample*num2 - result*den2;
+        *dst = result;
+        ++dst;
+    }
+
+    /* Persist the delay components for the next call. */
+    mZ1 = state1;
+    mZ2 = state2;
+}
+
+template<typename Real>
+void BiquadFilterR<Real>::dualProcess(BiquadFilterR &other, const al::span<const Real> src,
+    Real *dst)
+{
+    /* Runs two cascaded filters in one pass: stage A is this filter, stage B
+     * is `other`, and each output of A is the input of B.
+     */
+    const Real numA0{mB0}, numA1{mB1}, numA2{mB2};
+    const Real denA1{mA1}, denA2{mA2};
+    const Real numB0{other.mB0}, numB1{other.mB1}, numB2{other.mB2};
+    const Real denB1{other.mA1}, denB2{other.mA2};
+
+    Real stateA1{mZ1}, stateA2{mZ2};
+    Real stateB1{other.mZ1}, stateB2{other.mZ2};
+
+    for(auto iter = src.cbegin();iter != src.cend();++iter)
+    {
+        /* First stage: run the input sample through this filter. */
+        const Real sample{*iter};
+        const Real middle{sample*numA0 + stateA1};
+        stateA1 = sample*numA1 - middle*denA1 + stateA2;
+        stateA2 = sample*numA2 - middle*denA2;
+
+        /* Second stage: run the intermediate result through `other`. */
+        const Real result{middle*numB0 + stateB1};
+        stateB1 = middle*numB1 - result*denB1 + stateB2;
+        stateB2 = middle*numB2 - result*denB2;
+
+        *dst = result;
+        ++dst;
+    }
+
+    /* Store the advanced delay components back into both filters. */
+    mZ1 = stateA1;
+    mZ2 = stateA2;
+    other.mZ1 = stateB1;
+    other.mZ2 = stateB2;
+}
+
+template class BiquadFilterR<float>; /* Explicit instantiations of the members defined in this file. */
+template class BiquadFilterR<double>;
diff --git a/core/filters/biquad.h b/core/filters/biquad.h
new file mode 100644
index 00000000..75a4009b
--- /dev/null
+++ b/core/filters/biquad.h
@@ -0,0 +1,144 @@
+#ifndef CORE_FILTERS_BIQUAD_H
+#define CORE_FILTERS_BIQUAD_H
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <utility>
+
+#include "alnumbers.h"
+#include "alspan.h"
+
+
+/* Filters implementation is based on the "Cookbook formulae for audio
+ * EQ biquad filter coefficients" by Robert Bristow-Johnson
+ * http://www.musicdsp.org/files/Audio-EQ-Cookbook.txt
+ */
+/* Implementation note: For the shelf and peaking filters, the specified gain
+ * is for the centerpoint of the transition band. This better fits EFX filter
+ * behavior, which expects the shelf's reference frequency to reach the given
+ * gain. To set the gain for the shelf or peak itself, use the square root of
+ * the desired linear gain (or halve the dB gain).
+ */
+
+enum class BiquadType {
+    /** High-shelf filter, serving as the EFX-style low-pass; takes a gain and reference frequency. */
+    HighShelf,
+    /** Low-shelf filter, serving as the EFX-style high-pass; takes a gain and reference frequency. */
+    LowShelf,
+    /** Peaking filter, specifying a gain and reference frequency. */
+    Peaking,
+
+    /** Low-pass cut-off filter, specifying a cut-off frequency. */
+    LowPass,
+    /** High-pass cut-off filter, specifying a cut-off frequency. */
+    HighPass,
+    /** Band-pass filter, specifying a center frequency. */
+    BandPass,
+};
+
+template<typename Real>
+class BiquadFilterR { /* Second-order (biquad) filter whose stored coefficients are already divided by a0. */
+    /* The two delay components of the transposed direct form II structure. */
+    Real mZ1{0}, mZ2{0};
+    /* Transfer function coefficients "b" (numerator) */
+    Real mB0{1}, mB1{0}, mB2{0};
+    /* Transfer function coefficients "a" (denominator; a0 is pre-applied). */
+    Real mA1{0}, mA2{0};
+
+    void setParams(BiquadType type, Real f0norm, Real gain, Real rcpQ);
+
+    /**
+     * Calculates the rcpQ (i.e. 1/Q) coefficient for shelving filters, using
+     * the reference gain and shelf slope parameter.
+     * \param gain 0 < gain
+     * \param slope 0 < slope <= 1
+     */
+    static Real rcpQFromSlope(Real gain, Real slope)
+    { return std::sqrt((gain + Real{1}/gain)*(Real{1}/slope - Real{1}) + Real{2}); }
+
+    /**
+     * Calculates the rcpQ (i.e. 1/Q) coefficient for filters, using the
+     * normalized reference frequency and bandwidth.
+     * \param f0norm 0 < f0norm < 0.5.
+     * \param bandwidth 0 < bandwidth
+     */
+    static Real rcpQFromBandwidth(Real f0norm, Real bandwidth)
+    {
+        const Real w0{al::numbers::pi_v<Real>*Real{2} * f0norm};
+        return 2.0f*std::sinh(std::log(Real{2})/Real{2}*bandwidth*w0/std::sin(w0));
+    }
+
+public:
+    void clear() noexcept { mZ1 = mZ2 = Real{0}; } /* Resets the delay components; coefficients are kept. */
+
+    /**
+     * Sets the filter state for the specified filter type and its parameters.
+     *
+     * \param type The type of filter to apply.
+     * \param f0norm The normalized reference frequency (ref / sample_rate).
+     * This is the center point for the Shelf, Peaking, and BandPass filter
+     * types, or the cutoff frequency for the LowPass and HighPass filter
+     * types.
+     * \param gain The gain for the reference frequency response. Only used by
+     * the Shelf and Peaking filter types.
+     * \param slope Slope steepness of the transition band.
+     */
+    void setParamsFromSlope(BiquadType type, Real f0norm, Real gain, Real slope)
+    {
+        gain = std::max<Real>(gain, 0.001f); /* Limit -60dB */
+        setParams(type, f0norm, gain, rcpQFromSlope(gain, slope));
+    }
+
+    /**
+     * Sets the filter state for the specified filter type and its parameters.
+     *
+     * \param type The type of filter to apply.
+     * \param f0norm The normalized reference frequency (ref / sample_rate).
+     * This is the center point for the Shelf, Peaking, and BandPass filter
+     * types, or the cutoff frequency for the LowPass and HighPass filter
+     * types.
+     * \param gain The gain for the reference frequency response. Only used by
+     * the Shelf and Peaking filter types.
+     * \param bandwidth Normalized bandwidth of the transition band.
+     */
+    void setParamsFromBandwidth(BiquadType type, Real f0norm, Real gain, Real bandwidth)
+    { setParams(type, f0norm, gain, rcpQFromBandwidth(f0norm, bandwidth)); } /* No -60dB gain clamp on this path. */
+
+    void copyParamsFrom(const BiquadFilterR &other) /* Copies coefficients only; mZ1/mZ2 are left as they are. */
+    {
+        mB0 = other.mB0;
+        mB1 = other.mB1;
+        mB2 = other.mB2;
+        mA1 = other.mA1;
+        mA2 = other.mA2;
+    }
+
+    void process(const al::span<const Real> src, Real *dst);
+    /** Processes this filter and the other at the same time. */
+    void dualProcess(BiquadFilterR &other, const al::span<const Real> src, Real *dst);
+
+    /* Rather hacky. It's just here to support "manual" processing. */
+    std::pair<Real,Real> getComponents() const noexcept { return {mZ1, mZ2}; }
+    void setComponents(Real z1, Real z2) noexcept { mZ1 = z1; mZ2 = z2; }
+    Real processOne(const Real in, Real &z1, Real &z2) const noexcept /* Updates the caller's z1/z2, not mZ1/mZ2. */
+    {
+        const Real out{in*mB0 + z1};
+        z1 = in*mB1 - out*mA1 + z2;
+        z2 = in*mB2 - out*mA2;
+        return out;
+    }
+};
+
+template<typename Real>
+struct DualBiquadR { /* Refers to two filters so they can be processed together through one call. */
+    BiquadFilterR<Real> &f0, &f1;
+
+    void process(const al::span<const Real> src, Real *dst)
+    { f0.dualProcess(f1, src, dst); } /* Forwards to f0.dualProcess with f1 as the other filter. */
+};
+
+using BiquadFilter = BiquadFilterR<float>; /* Single-precision aliases. */
+using DualBiquad = DualBiquadR<float>;
+
+#endif /* CORE_FILTERS_BIQUAD_H */
diff --git a/core/filters/nfc.cpp b/core/filters/nfc.cpp
new file mode 100644
index 00000000..aa64c613
--- /dev/null
+++ b/core/filters/nfc.cpp
@@ -0,0 +1,367 @@
+
+#include "config.h"
+
+#include "nfc.h"
+
+#include <algorithm>
+
+#include "opthelpers.h"
+
+
+/* Near-field control filters are the basis for handling the near-field effect.
+ * The near-field effect is a bass-boost present in the directional components
+ * of a recorded signal, created as a result of the wavefront curvature (itself
+ * a function of sound distance). Proper reproduction dictates this be
+ * compensated for using a bass-cut given the playback speaker distance, to
+ * avoid excessive bass in the playback.
+ *
+ * For real-time rendered audio, emulating the near-field effect based on the
+ * sound source's distance, and subsequently compensating for it at output
+ * based on the speaker distances, can create a more realistic perception of
+ * sound distance beyond a simple 1/r attenuation.
+ *
+ * These filters do just that. Each one applies a low-shelf filter, created as
+ * the combination of a bass-boost for a given sound source distance (near-
+ * field emulation) along with a bass-cut for a given control/speaker distance
+ * (near-field compensation).
+ *
+ * Note that it is necessary to apply a cut along with the boost, since the
+ * boost alone is unstable in higher-order ambisonics as it causes an infinite
+ * DC gain (even first-order ambisonics requires there to be no DC offset for
+ * the boost to work). Consequently, ambisonics requires a control parameter to
+ * be used to avoid an unstable boost-only filter. NFC-HOA defines this control
+ * as a reference delay, calculated with:
+ *
+ * reference_delay = control_distance / speed_of_sound
+ *
+ * This means w0 (for input) or w1 (for output) should be set to:
+ *
+ * wN = 1 / (reference_delay * sample_rate)
+ *
+ * when dealing with NFC-HOA content. For FOA input content, which does not
+ * specify a reference_delay variable, w0 should be set to 0 to apply only
+ * near-field compensation for output. It's important that w1 be a finite,
+ * positive, non-0 value or else the bass-boost will become unstable again.
+ * Also, w0 should not be too large compared to w1, to avoid excessively loud
+ * low frequencies.
+ */
+
+namespace {
+
+constexpr float B[5][4] = { /* Row N holds the terms for the order-N filter; row 0 is not referenced in this file. */
+    {    0.0f                             },
+    {    1.0f                             }, /* one first-order term */
+    {    3.0f,     3.0f                   }, /* r and r*r terms of one second-order section */
+    { 3.6778f,  6.4595f, 2.3222f          }, /* a second-order section (r, r*r), then a first-order term */
+    { 4.2076f, 11.4877f, 5.7924f, 9.1401f } /* two second-order sections (r, r*r each) */
+};
+
+NfcFilter1 NfcFilterCreate1(const float w0, const float w1) noexcept
+{
+    /* Builds the first-order section: w1 sets the bass-cut (a1 and
+     * base_gain), w0 sets the bass-boost (b1 and gain). The state z[] is
+     * zeroed by the value-initialization.
+     */
+    NfcFilter1 nfc{};
+
+    /* Bass-cut side. */
+    const float cut_b0{B[1][0] * (0.5f * w1)};
+    const float cut_g{1.0f + cut_b0};
+
+    nfc.base_gain = 1.0f / cut_g;
+    nfc.a1 = 2.0f * cut_b0 / cut_g;
+
+    /* Bass-boost side; its gain is expressed relative to the cut's gain. */
+    const float boost_b0{B[1][0] * (0.5f * w0)};
+    const float boost_g{1.0f + boost_b0};
+
+    nfc.gain = nfc.base_gain * boost_g;
+    nfc.b1 = 2.0f * boost_b0 / boost_g;
+
+    return nfc;
+}
+
+void NfcFilterAdjust1(NfcFilter1 *nfc, const float w0) noexcept
+{
+    /* Recomputes only the boost (gain, b1); base_gain and a1 are kept. */
+    NfcFilter1 &filt = *nfc;
+    const float boost_b0{B[1][0] * (0.5f * w0)};
+    const float boost_g{1.0f + boost_b0};
+    filt.gain = filt.base_gain * boost_g;
+    filt.b1 = 2.0f * boost_b0 / boost_g;
+}
+
+
+NfcFilter2 NfcFilterCreate2(const float w0, const float w1) noexcept
+{
+    /* Second-order section: w1 gives the cut (a1, a2, base_gain) and w0 the
+     * boost (b1, b2, gain). The state z[] starts zeroed. */
+    NfcFilter2 nfc{};
+
+    /* Bass-cut side. */
+    const float cut_r{0.5f * w1};
+    const float cut_b0{B[2][0] * cut_r};
+    const float cut_b1{B[2][1] * cut_r * cut_r};
+    const float cut_g{1.0f + cut_b0 + cut_b1};
+
+    nfc.base_gain = 1.0f / cut_g;
+    nfc.a1 = (2.0f*cut_b0 + 4.0f*cut_b1) / cut_g;
+    nfc.a2 = 4.0f * cut_b1 / cut_g;
+
+    /* Bass-boost side. */
+    const float boost_r{0.5f * w0};
+    const float boost_b0{B[2][0] * boost_r};
+    const float boost_b1{B[2][1] * boost_r * boost_r};
+    const float boost_g{1.0f + boost_b0 + boost_b1};
+
+    nfc.gain = nfc.base_gain * boost_g;
+    nfc.b1 = (2.0f*boost_b0 + 4.0f*boost_b1) / boost_g;
+    nfc.b2 = 4.0f * boost_b1 / boost_g;
+
+    return nfc;
+}
+
+void NfcFilterAdjust2(NfcFilter2 *nfc, const float w0) noexcept
+{
+    /* Recomputes only the boost (gain, b1, b2); the cut terms are kept. */
+    const float boost_r{0.5f * w0};
+    const float boost_b0{B[2][0] * boost_r};
+    const float boost_b1{B[2][1] * boost_r * boost_r};
+    const float boost_g{1.0f + boost_b0 + boost_b1};
+    nfc->gain = nfc->base_gain * boost_g;
+    nfc->b1 = (2.0f*boost_b0 + 4.0f*boost_b1) / boost_g;
+    nfc->b2 = 4.0f * boost_b1 / boost_g;
+}
+
+
+NfcFilter3 NfcFilterCreate3(const float w0, const float w1) noexcept
+{
+    /* Third order = one second-order section (a1/a2, b1/b2) followed by one
+     * first-order section (a3, b3). w1 drives the cut, w0 the boost.
+     */
+    NfcFilter3 nfc{};
+
+    /* Bass-cut side. */
+    const float cut_r{0.5f * w1};
+    const float cut_s0{B[3][0] * cut_r};
+    const float cut_s1{B[3][1] * cut_r * cut_r};
+    const float cut_f0{B[3][2] * cut_r};
+    const float cut_gs{1.0f + cut_s0 + cut_s1};
+    const float cut_gf{1.0f + cut_f0};
+
+    nfc.base_gain = 1.0f / (cut_gs * cut_gf);
+    nfc.a1 = (2.0f*cut_s0 + 4.0f*cut_s1) / cut_gs;
+    nfc.a2 = 4.0f * cut_s1 / cut_gs;
+    nfc.a3 = 2.0f * cut_f0 / cut_gf;
+
+    /* Bass-boost side. */
+    const float boost_r{0.5f * w0};
+    const float boost_s0{B[3][0] * boost_r};
+    const float boost_s1{B[3][1] * boost_r * boost_r};
+    const float boost_f0{B[3][2] * boost_r};
+    const float boost_gs{1.0f + boost_s0 + boost_s1};
+    const float boost_gf{1.0f + boost_f0};
+
+    nfc.gain = nfc.base_gain * (boost_gs * boost_gf);
+    nfc.b1 = (2.0f*boost_s0 + 4.0f*boost_s1) / boost_gs;
+    nfc.b2 = 4.0f * boost_s1 / boost_gs;
+    nfc.b3 = 2.0f * boost_f0 / boost_gf;
+
+    return nfc;
+}
+
+void NfcFilterAdjust3(NfcFilter3 *nfc, const float w0) noexcept
+{
+    /* Recomputes only the boost (gain, b1..b3); the cut terms are kept. */
+    const float boost_r{0.5f * w0};
+    const float boost_s0{B[3][0] * boost_r};
+    const float boost_s1{B[3][1] * boost_r * boost_r};
+    const float boost_f0{B[3][2] * boost_r};
+    const float boost_gs{1.0f + boost_s0 + boost_s1};
+    const float boost_gf{1.0f + boost_f0};
+    nfc->gain = nfc->base_gain * (boost_gs * boost_gf);
+    nfc->b1 = (2.0f*boost_s0 + 4.0f*boost_s1) / boost_gs;
+    nfc->b2 = 4.0f * boost_s1 / boost_gs;
+    nfc->b3 = 2.0f * boost_f0 / boost_gf;
+}
+
+
+NfcFilter4 NfcFilterCreate4(const float w0, const float w1) noexcept
+{
+    /* Fourth order = two second-order sections: "a" uses a1/a2 and b1/b2,
+     * "b" uses a3/a4 and b3/b4. w1 drives the cut, w0 the boost.
+     */
+    NfcFilter4 nfc{};
+
+    /* Bass-cut side. */
+    const float cut_r{0.5f * w1};
+    const float cut_sa0{B[4][0] * cut_r};
+    const float cut_sa1{B[4][1] * cut_r * cut_r};
+    const float cut_sb0{B[4][2] * cut_r};
+    const float cut_sb1{B[4][3] * cut_r * cut_r};
+    const float cut_ga{1.0f + cut_sa0 + cut_sa1};
+    const float cut_gb{1.0f + cut_sb0 + cut_sb1};
+
+    nfc.base_gain = 1.0f / (cut_ga * cut_gb);
+    nfc.a1 = (2.0f*cut_sa0 + 4.0f*cut_sa1) / cut_ga;
+    nfc.a2 = 4.0f * cut_sa1 / cut_ga;
+    nfc.a3 = (2.0f*cut_sb0 + 4.0f*cut_sb1) / cut_gb;
+    nfc.a4 = 4.0f * cut_sb1 / cut_gb;
+
+    /* Bass-boost side. */
+    const float boost_r{0.5f * w0};
+    const float boost_sa0{B[4][0] * boost_r};
+    const float boost_sa1{B[4][1] * boost_r * boost_r};
+    const float boost_sb0{B[4][2] * boost_r};
+    const float boost_sb1{B[4][3] * boost_r * boost_r};
+    const float boost_ga{1.0f + boost_sa0 + boost_sa1};
+    const float boost_gb{1.0f + boost_sb0 + boost_sb1};
+
+    nfc.gain = nfc.base_gain * (boost_ga * boost_gb);
+    nfc.b1 = (2.0f*boost_sa0 + 4.0f*boost_sa1) / boost_ga;
+    nfc.b2 = 4.0f * boost_sa1 / boost_ga;
+    nfc.b3 = (2.0f*boost_sb0 + 4.0f*boost_sb1) / boost_gb;
+    nfc.b4 = 4.0f * boost_sb1 / boost_gb;
+
+    return nfc;
+}
+
+void NfcFilterAdjust4(NfcFilter4 *nfc, const float w0) noexcept
+{
+    /* Recomputes only the boost (gain, b1..b4); the cut terms are kept. */
+    const float boost_r{0.5f * w0};
+    const float boost_sa0{B[4][0] * boost_r};
+    const float boost_sa1{B[4][1] * boost_r * boost_r};
+    const float boost_sb0{B[4][2] * boost_r};
+    const float boost_sb1{B[4][3] * boost_r * boost_r};
+    const float boost_ga{1.0f + boost_sa0 + boost_sa1};
+    const float boost_gb{1.0f + boost_sb0 + boost_sb1};
+    nfc->gain = nfc->base_gain * (boost_ga * boost_gb);
+    nfc->b1 = (2.0f*boost_sa0 + 4.0f*boost_sa1) / boost_ga;
+    nfc->b2 = 4.0f * boost_sa1 / boost_ga;
+    nfc->b3 = (2.0f*boost_sb0 + 4.0f*boost_sb1) / boost_gb;
+    nfc->b4 = 4.0f * boost_sb1 / boost_gb;
+}
+
+} // namespace
+
+void NfcFilter::init(const float w1) noexcept
+{   /* w0 = 0 here: no boost (b* = 0, gain = base_gain), just the w1 cut. */
+    fourth = NfcFilterCreate4(0.0f, w1);
+    third = NfcFilterCreate3(0.0f, w1);
+    second = NfcFilterCreate2(0.0f, w1);
+    first = NfcFilterCreate1(0.0f, w1);
+}
+
+void NfcFilter::adjust(const float w0) noexcept
+{   /* Applies the new w0 to every order's section; the w1 cut is untouched. */
+    NfcFilterAdjust4(&fourth, w0);
+    NfcFilterAdjust3(&third, w0);
+    NfcFilterAdjust2(&second, w0);
+    NfcFilterAdjust1(&first, w0);
+}
+
+
+void NfcFilter::process1(const al::span<const float> src, float *RESTRICT dst)
+{
+    /* First-order section over src, one output per input written to dst.
+     * Looped explicitly: each sample needs the state left by the previous
+     * one, and std::transform does not promise in-order application.
+     */
+    const float gain{first.gain}, b1{first.b1}, a1{first.a1};
+    float z1{first.z[0]};
+    for(const float in : src)
+    {
+        const float y{in*gain - a1*z1};
+        *(dst++) = y + b1*z1;
+        z1 += y;
+    }
+    first.z[0] = z1;
+}
+
+void NfcFilter::process2(const al::span<const float> src, float *RESTRICT dst)
+{
+    /* Second-order section over src, one output per input written to dst.
+     * Looped explicitly: each sample needs the state left by the previous
+     * one, and std::transform does not promise in-order application.
+     */
+    const float gain{second.gain};
+    const float b1{second.b1}, b2{second.b2};
+    const float a1{second.a1}, a2{second.a2};
+    float z1{second.z[0]}, z2{second.z[1]};
+
+    for(const float in : src)
+    {
+        const float y{in*gain - a1*z1 - a2*z2};
+        *(dst++) = y + b1*z1 + b2*z2;
+        z2 += z1;
+        z1 += y;
+    }
+    second.z[0] = z1;
+    second.z[1] = z2;
+}
+
+void NfcFilter::process3(const al::span<const float> src, float *RESTRICT dst)
+{
+    /* Third-order filter over src, one output per input written to dst.
+     * Looped explicitly: each sample needs the state left by the previous
+     * one, and std::transform does not promise in-order application.
+     */
+    const float gain{third.gain};
+    const float b1{third.b1}, b2{third.b2}, b3{third.b3};
+    const float a1{third.a1}, a2{third.a2}, a3{third.a3};
+    float z1{third.z[0]}, z2{third.z[1]}, z3{third.z[2]};
+
+    for(const float in : src)
+    {
+        /* Second-order section; the input gain is applied here only. */
+        float y{in*gain - a1*z1 - a2*z2};
+        const float mid{y + b1*z1 + b2*z2};
+        z2 += z1;
+        z1 += y;
+
+        /* First-order section, fed by the section above. */
+        y = mid - a3*z3;
+        *(dst++) = y + b3*z3;
+        z3 += y;
+    }
+
+    third.z[0] = z1;
+    third.z[1] = z2;
+    third.z[2] = z3;
+}
+
+void NfcFilter::process4(const al::span<const float> src, float *RESTRICT dst)
+{
+    /* Fourth-order filter over src, one output per input written to dst.
+     * Looped explicitly: each sample needs the state left by the previous
+     * one, and std::transform does not promise in-order application.
+     */
+    const float gain{fourth.gain};
+    const float b1{fourth.b1}, b2{fourth.b2};
+    const float b3{fourth.b3}, b4{fourth.b4};
+    const float a1{fourth.a1}, a2{fourth.a2};
+    const float a3{fourth.a3}, a4{fourth.a4};
+    float z1{fourth.z[0]}, z2{fourth.z[1]};
+    float z3{fourth.z[2]}, z4{fourth.z[3]};
+
+    for(const float in : src)
+    {
+        /* First second-order section; the input gain is applied here only. */
+        float y{in*gain - a1*z1 - a2*z2};
+        const float mid{y + b1*z1 + b2*z2};
+        z2 += z1;
+        z1 += y;
+
+        /* Second second-order section, fed by the section above. */
+        y = mid - a3*z3 - a4*z4;
+        *(dst++) = y + b3*z3 + b4*z4;
+        z4 += z3;
+        z3 += y;
+    }
+
+    fourth.z[0] = z1;
+    fourth.z[1] = z2;
+    fourth.z[2] = z3;
+    fourth.z[3] = z4;
+}
diff --git a/core/filters/nfc.h b/core/filters/nfc.h
new file mode 100644
index 00000000..33f67a5f
--- /dev/null
+++ b/core/filters/nfc.h
@@ -0,0 +1,63 @@
+#ifndef CORE_FILTERS_NFC_H
+#define CORE_FILTERS_NFC_H
+
+#include <cstddef>
+
+#include "alspan.h"
+
+
+struct NfcFilter1 { /* First-order near-field control section. */
+    float base_gain, gain; /* gain of the cut alone; combined cut and boost gain applied to the input */
+    float b1, a1; /* boost (feed-forward) and cut (feedback) coefficients */
+    float z[1]; /* filter state carried between process calls */
+};
+struct NfcFilter2 { /* Second-order near-field control section. */
+    float base_gain, gain; /* gain of the cut alone; combined cut and boost gain applied to the input */
+    float b1, b2, a1, a2; /* boost (feed-forward) b*, cut (feedback) a* */
+    float z[2]; /* filter state carried between process calls */
+};
+struct NfcFilter3 { /* Third order: a second-order section (b1,b2,a1,a2) then a first-order one (b3,a3). */
+    float base_gain, gain; /* gain of the cut alone; combined cut and boost gain applied to the input */
+    float b1, b2, b3, a1, a2, a3; /* boost (feed-forward) b*, cut (feedback) a* */
+    float z[3]; /* z[0],z[1] belong to the second-order section, z[2] to the first-order one */
+};
+struct NfcFilter4 { /* Fourth order: two second-order sections (b1,b2,a1,a2) and (b3,b4,a3,a4). */
+    float base_gain, gain; /* gain of the cut alone; combined cut and boost gain applied to the input */
+    float b1, b2, b3, b4, a1, a2, a3, a4; /* boost (feed-forward) b*, cut (feedback) a* */
+    float z[4]; /* z[0],z[1] belong to the first section, z[2],z[3] to the second */
+};
+
+class NfcFilter { /* Holds one near-field control filter per ambisonic order, 1 through 4. */
+    NfcFilter1 first;
+    NfcFilter2 second;
+    NfcFilter3 third;
+    NfcFilter4 fourth;
+
+public:
+    /* NOTE:
+     * w0 = speed_of_sound / (source_distance * sample_rate);
+     * w1 = speed_of_sound / (control_distance * sample_rate);
+     *
+     * Generally speaking, the control distance should be approximately the
+     * average speaker distance, or based on the reference delay if outputting
+     * NFC-HOA. It must not be negative, 0, or infinite. The source distance
+     * should not be too small relative to the control distance.
+     */
+
+    void init(const float w1) noexcept; /* Rebuilds all four filters for w1 with w0 = 0, zeroing their state. */
+    void adjust(const float w0) noexcept; /* Updates the w0-dependent terms of all four filters. */
+
+    /* Near-field control filter for first-order ambisonic channels (1-3). */
+    void process1(const al::span<const float> src, float *RESTRICT dst);
+
+    /* Near-field control filter for second-order ambisonic channels (4-8). */
+    void process2(const al::span<const float> src, float *RESTRICT dst);
+
+    /* Near-field control filter for third-order ambisonic channels (9-15). */
+    void process3(const al::span<const float> src, float *RESTRICT dst);
+
+    /* Near-field control filter for fourth-order ambisonic channels (16-24). */
+    void process4(const al::span<const float> src, float *RESTRICT dst);
+};
+
+#endif /* CORE_FILTERS_NFC_H */
diff --git a/core/filters/splitter.cpp b/core/filters/splitter.cpp
new file mode 100644
index 00000000..983ba36f
--- /dev/null
+++ b/core/filters/splitter.cpp
@@ -0,0 +1,179 @@
+
+#include "config.h"
+
+#include "splitter.h"
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+
+#include "alnumbers.h"
+#include "opthelpers.h"
+
+
+template<typename Real>
+void BandSplitterR<Real>::init(Real f0norm)
+{
+    /* Derives mCoeff from the normalized frequency f0norm and clears the
+     * filter history.
+     */
+    const Real omega{f0norm * (al::numbers::pi_v<Real>*2)};
+    const Real cosw{std::cos(omega)};
+    /* Only divide by cos(w) when it is safely positive; otherwise use cos(w) * -0.5. */
+    mCoeff = (cosw > std::numeric_limits<float>::epsilon()) ? (std::sin(omega) - 1.0f) / cosw
+        : (cosw * -0.5f);
+
+    mLpZ1 = mLpZ2 = mApZ1 = 0.0f;
+}
+
+template<typename Real>
+void BandSplitterR<Real>::process(const al::span<const Real> input, Real *hpout, Real *lpout)
+{
+    /* Splits input into a low band (lpout) and a high band (hpout), one
+     * sample each per input sample. An explicit loop is used because the
+     * state must advance in order, which std::transform does not promise.
+     */
+    const Real ap_coeff{mCoeff};
+    const Real lp_coeff{mCoeff*0.5f + 0.5f};
+    Real lp_z1{mLpZ1}, lp_z2{mLpZ2}, ap_z1{mApZ1};
+    for(const Real in : input)
+    {
+        /* Low-pass: two identical stages in series. */
+        Real d{(in - lp_z1) * lp_coeff};
+        Real lp_y{lp_z1 + d};
+        lp_z1 = lp_y + d;
+
+        d = (lp_y - lp_z2) * lp_coeff;
+        lp_y = lp_z2 + d;
+        lp_z2 = lp_y + d;
+        *(lpout++) = lp_y;
+
+        /* All-pass sample processing. */
+        const Real ap_y{in*ap_coeff + ap_z1};
+        ap_z1 = in - ap_y*ap_coeff;
+
+        /* High-pass generated from removing low-passed output. */
+        *(hpout++) = ap_y - lp_y;
+    }
+    mLpZ1 = lp_z1;
+    mLpZ2 = lp_z2;
+    mApZ1 = ap_z1;
+}
+
+template<typename Real>
+void BandSplitterR<Real>::processHfScale(const al::span<const Real> input, Real *RESTRICT output,
+    const Real hfscale)
+{
+    /* Writes the input with its high band scaled by hfscale to output. The
+     * samples are walked with an explicit loop since the filter state must
+     * advance in order, which std::transform does not promise.
+     */
+    const Real ap_coeff{mCoeff};
+    const Real lp_coeff{mCoeff*0.5f + 0.5f};
+    Real lp_z1{mLpZ1}, lp_z2{mLpZ2}, ap_z1{mApZ1};
+    for(const Real in : input)
+    {
+        /* Low-pass: two identical stages in series. */
+        Real d{(in - lp_z1) * lp_coeff};
+        Real lp_y{lp_z1 + d};
+        lp_z1 = lp_y + d;
+
+        d = (lp_y - lp_z2) * lp_coeff;
+        lp_y = lp_z2 + d;
+        lp_z2 = lp_y + d;
+
+        /* All-pass sample processing. */
+        const Real ap_y{in*ap_coeff + ap_z1};
+        ap_z1 = in - ap_y*ap_coeff;
+
+        /* The high band (all-pass minus low-pass) is scaled, then the low band is added back. */
+        *(output++) = (ap_y-lp_y)*hfscale + lp_y;
+    }
+
+    mLpZ1 = lp_z1;
+    mLpZ2 = lp_z2;
+    mApZ1 = ap_z1;
+}
+
+template<typename Real>
+void BandSplitterR<Real>::processHfScale(const al::span<Real> samples, const Real hfscale)
+{
+    /* In-place variant: each sample is replaced by itself with the high band
+     * scaled by hfscale. An explicit loop is used since the state must
+     * advance in order, which std::transform does not promise.
+     */
+    const Real ap_coeff{mCoeff};
+    const Real lp_coeff{mCoeff*0.5f + 0.5f};
+    Real lp_z1{mLpZ1}, lp_z2{mLpZ2}, ap_z1{mApZ1};
+    for(Real &smp : samples)
+    {
+        const Real in{smp};
+        /* Low-pass: two identical stages in series. */
+        Real d{(in - lp_z1) * lp_coeff};
+        Real lp_y{lp_z1 + d};
+        lp_z1 = lp_y + d;
+
+        d = (lp_y - lp_z2) * lp_coeff;
+        lp_y = lp_z2 + d;
+        lp_z2 = lp_y + d;
+
+        /* All-pass sample processing. */
+        const Real ap_y{in*ap_coeff + ap_z1};
+        ap_z1 = in - ap_y*ap_coeff;
+
+        /* The high band (all-pass minus low-pass) is scaled, then the low band is added back. */
+        smp = (ap_y-lp_y)*hfscale + lp_y;
+    }
+    mLpZ1 = lp_z1;
+    mLpZ2 = lp_z2;
+    mApZ1 = ap_z1;
+}
+
+template<typename Real>
+void BandSplitterR<Real>::processScale(const al::span<Real> samples, const Real hfscale, const Real lfscale)
+{
+    /* In place: scales the high band by hfscale and the low band by lfscale.
+     * Looped explicitly, as std::transform does not promise in-order calls. */
+    const Real ap_coeff{mCoeff};
+    const Real lp_coeff{mCoeff*0.5f + 0.5f};
+    Real lp_z1{mLpZ1}, lp_z2{mLpZ2}, ap_z1{mApZ1};
+    for(Real &smp : samples)
+    {
+        const Real in{smp};
+        Real d{(in - lp_z1) * lp_coeff};
+        Real lp_y{lp_z1 + d};
+        lp_z1 = lp_y + d;
+
+        d = (lp_y - lp_z2) * lp_coeff;
+        lp_y = lp_z2 + d;
+        lp_z2 = lp_y + d;
+
+        const Real ap_y{in*ap_coeff + ap_z1};
+        ap_z1 = in - ap_y*ap_coeff;
+
+        /* Apply separate factors to the high and low frequencies. */
+        smp = (ap_y-lp_y)*hfscale + lp_y*lfscale;
+    }
+    mLpZ1 = lp_z1;
+    mLpZ2 = lp_z2;
+    mApZ1 = ap_z1;
+}
+
+template<typename Real>
+void BandSplitterR<Real>::processAllPass(const al::span<Real> samples)
+{
+    /* In-place all-pass; looped since std::transform promises no call order. */
+    const Real coeff{mCoeff};
+    Real z1{mApZ1};
+    for(Real &smp : samples)
+    {
+        const Real out{smp*coeff + z1};
+        z1 = smp - out*coeff;
+        smp = out;
+    }
+    mApZ1 = z1;
+}
+
+
+template class BandSplitterR<float>; /* Explicit instantiations; the member definitions live in this file. */
+template class BandSplitterR<double>;
diff --git a/core/filters/splitter.h b/core/filters/splitter.h
new file mode 100644
index 00000000..e853eb38
--- /dev/null
+++ b/core/filters/splitter.h
@@ -0,0 +1,40 @@
+#ifndef CORE_FILTERS_SPLITTER_H
+#define CORE_FILTERS_SPLITTER_H
+
+#include <cstddef>
+
+#include "alspan.h"
+
+
+/* Band splitter. Splits a signal into two phase-matching frequency bands. */
+template<typename Real>
+class BandSplitterR {
+    Real mCoeff{0.0f}; /* All-pass coefficient; the low-pass stages use mCoeff*0.5 + 0.5. */
+    Real mLpZ1{0.0f}; /* State of the first low-pass stage. */
+    Real mLpZ2{0.0f}; /* State of the second low-pass stage. */
+    Real mApZ1{0.0f}; /* State of the all-pass. */
+
+public:
+    BandSplitterR() = default;
+    BandSplitterR(const BandSplitterR&) = default;
+    BandSplitterR(Real f0norm) { init(f0norm); }
+    BandSplitterR& operator=(const BandSplitterR&) = default;
+
+    void init(Real f0norm); /* Sets the coefficient for f0norm and zeroes the state. */
+    void clear() noexcept { mLpZ1 = mLpZ2 = mApZ1 = 0.0f; } /* Zeroes the state, keeping the coefficient. */
+    void process(const al::span<const Real> input, Real *hpout, Real *lpout);
+    /* Writes low + high*hfscale for each input sample to output. */
+    void processHfScale(const al::span<const Real> input, Real *output, const Real hfscale);
+    /* In-place variants: low + high*hfscale, and high*hfscale + low*lfscale. */
+    void processHfScale(const al::span<Real> samples, const Real hfscale);
+    void processScale(const al::span<Real> samples, const Real hfscale, const Real lfscale);
+
+    /**
+     * The all-pass portion of the band splitter. Applies the same phase shift
+     * without splitting or scaling the signal.
+     */
+    void processAllPass(const al::span<Real> samples);
+};
+using BandSplitter = BandSplitterR<float>;
+
+#endif /* CORE_FILTERS_SPLITTER_H */
diff --git a/core/fmt_traits.cpp b/core/fmt_traits.cpp
new file mode 100644
index 00000000..054d8766
--- /dev/null
+++ b/core/fmt_traits.cpp
@@ -0,0 +1,79 @@
+
+#include "config.h"
+
+#include "fmt_traits.h"
+
+
+namespace al {
+
+const int16_t muLawDecompressionTable[256] = { /* Indexed by the 8-bit mu-law code; yields a signed 16-bit sample. */
+    -32124,-31100,-30076,-29052,-28028,-27004,-25980,-24956,
+    -23932,-22908,-21884,-20860,-19836,-18812,-17788,-16764,
+    -15996,-15484,-14972,-14460,-13948,-13436,-12924,-12412,
+    -11900,-11388,-10876,-10364, -9852, -9340, -8828, -8316,
+     -7932, -7676, -7420, -7164, -6908, -6652, -6396, -6140,
+     -5884, -5628, -5372, -5116, -4860, -4604, -4348, -4092,
+     -3900, -3772, -3644, -3516, -3388, -3260, -3132, -3004,
+     -2876, -2748, -2620, -2492, -2364, -2236, -2108, -1980,
+     -1884, -1820, -1756, -1692, -1628, -1564, -1500, -1436,
+     -1372, -1308, -1244, -1180, -1116, -1052,  -988,  -924,
+      -876,  -844,  -812,  -780,  -748,  -716,  -684,  -652,
+      -620,  -588,  -556,  -524,  -492,  -460,  -428,  -396,
+      -372,  -356,  -340,  -324,  -308,  -292,  -276,  -260,
+      -244,  -228,  -212,  -196,  -180,  -164,  -148,  -132,
+      -120,  -112,  -104,   -96,   -88,   -80,   -72,   -64,
+       -56,   -48,   -40,   -32,   -24,   -16,    -8,     0,
+     32124, 31100, 30076, 29052, 28028, 27004, 25980, 24956,
+     23932, 22908, 21884, 20860, 19836, 18812, 17788, 16764,
+     15996, 15484, 14972, 14460, 13948, 13436, 12924, 12412,
+     11900, 11388, 10876, 10364,  9852,  9340,  8828,  8316,
+      7932,  7676,  7420,  7164,  6908,  6652,  6396,  6140,
+      5884,  5628,  5372,  5116,  4860,  4604,  4348,  4092,
+      3900,  3772,  3644,  3516,  3388,  3260,  3132,  3004,
+      2876,  2748,  2620,  2492,  2364,  2236,  2108,  1980,
+      1884,  1820,  1756,  1692,  1628,  1564,  1500,  1436,
+      1372,  1308,  1244,  1180,  1116,  1052,   988,   924,
+       876,   844,   812,   780,   748,   716,   684,   652,
+       620,   588,   556,   524,   492,   460,   428,   396,
+       372,   356,   340,   324,   308,   292,   276,   260,
+       244,   228,   212,   196,   180,   164,   148,   132,
+       120,   112,   104,    96,    88,    80,    72,    64,
+        56,    48,    40,    32,    24,    16,     8,     0
+};
+
+const int16_t aLawDecompressionTable[256] = { /* Indexed by the 8-bit A-law code; yields a signed 16-bit sample. */
+     -5504, -5248, -6016, -5760, -4480, -4224, -4992, -4736,
+     -7552, -7296, -8064, -7808, -6528, -6272, -7040, -6784,
+     -2752, -2624, -3008, -2880, -2240, -2112, -2496, -2368,
+     -3776, -3648, -4032, -3904, -3264, -3136, -3520, -3392,
+    -22016,-20992,-24064,-23040,-17920,-16896,-19968,-18944,
+    -30208,-29184,-32256,-31232,-26112,-25088,-28160,-27136,
+    -11008,-10496,-12032,-11520, -8960, -8448, -9984, -9472,
+    -15104,-14592,-16128,-15616,-13056,-12544,-14080,-13568,
+      -344,  -328,  -376,  -360,  -280,  -264,  -312,  -296,
+      -472,  -456,  -504,  -488,  -408,  -392,  -440,  -424,
+       -88,   -72,  -120,  -104,   -24,    -8,   -56,   -40,
+      -216,  -200,  -248,  -232,  -152,  -136,  -184,  -168,
+     -1376, -1312, -1504, -1440, -1120, -1056, -1248, -1184,
+     -1888, -1824, -2016, -1952, -1632, -1568, -1760, -1696,
+      -688,  -656,  -752,  -720,  -560,  -528,  -624,  -592,
+      -944,  -912, -1008,  -976,  -816,  -784,  -880,  -848,
+      5504,  5248,  6016,  5760,  4480,  4224,  4992,  4736,
+      7552,  7296,  8064,  7808,  6528,  6272,  7040,  6784,
+      2752,  2624,  3008,  2880,  2240,  2112,  2496,  2368,
+      3776,  3648,  4032,  3904,  3264,  3136,  3520,  3392,
+     22016, 20992, 24064, 23040, 17920, 16896, 19968, 18944,
+     30208, 29184, 32256, 31232, 26112, 25088, 28160, 27136,
+     11008, 10496, 12032, 11520,  8960,  8448,  9984,  9472,
+     15104, 14592, 16128, 15616, 13056, 12544, 14080, 13568,
+       344,   328,   376,   360,   280,   264,   312,   296,
+       472,   456,   504,   488,   408,   392,   440,   424,
+        88,    72,   120,   104,    24,     8,    56,    40,
+       216,   200,   248,   232,   152,   136,   184,   168,
+      1376,  1312,  1504,  1440,  1120,  1056,  1248,  1184,
+      1888,  1824,  2016,  1952,  1632,  1568,  1760,  1696,
+       688,   656,   752,   720,   560,   528,   624,   592,
+       944,   912,  1008,   976,   816,   784,   880,   848
+};
+
+} // namespace al
diff --git a/core/fmt_traits.h b/core/fmt_traits.h
new file mode 100644
index 00000000..f797f836
--- /dev/null
+++ b/core/fmt_traits.h
@@ -0,0 +1,81 @@
+#ifndef CORE_FMT_TRAITS_H
+#define CORE_FMT_TRAITS_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "albyte.h"
+#include "buffer_storage.h"
+
+
+namespace al {
+
+extern const int16_t muLawDecompressionTable[256];
+extern const int16_t aLawDecompressionTable[256];
+
+
+template<FmtType T>
+struct FmtTypeTraits { }; /* Primary template is empty; the specializations below supply Type and to(). */
+
+template<>
+struct FmtTypeTraits<FmtUByte> {
+    using Type = uint8_t;
+    /* Unsigned 8-bit: scaled by 1/128 and offset so 128 maps to 0. */
+    template<typename OutT>
+    static constexpr OutT to(const Type val) noexcept
+    { return OutT{1.0/128.0}*val - OutT{1.0}; }
+};
+template<>
+struct FmtTypeTraits<FmtShort> {
+    using Type = int16_t;
+    /* Signed 16-bit: scaled by 1/32768. */
+    template<typename OutT>
+    static constexpr OutT to(const Type val) noexcept { return OutT{1.0/32768.0}*val; }
+};
+template<>
+struct FmtTypeTraits<FmtFloat> {
+    using Type = float;
+    /* Float samples are passed through unscaled. */
+    template<typename OutT>
+    static constexpr OutT to(const Type val) noexcept { return val; }
+};
+template<>
+struct FmtTypeTraits<FmtDouble> {
+    using Type = double;
+    /* Double samples are cast to the output type, unscaled. */
+    template<typename OutT>
+    static constexpr OutT to(const Type val) noexcept { return static_cast<OutT>(val); }
+};
+template<>
+struct FmtTypeTraits<FmtMulaw> {
+    using Type = uint8_t;
+    /* mu-law: table lookup to 16-bit, then scaled by 1/32768. */
+    template<typename OutT>
+    static constexpr OutT to(const Type val) noexcept
+    { return OutT{1.0/32768.0} * muLawDecompressionTable[val]; }
+};
+template<>
+struct FmtTypeTraits<FmtAlaw> {
+    using Type = uint8_t;
+    /* A-law: table lookup to 16-bit, then scaled by 1/32768. */
+    template<typename OutT>
+    static constexpr OutT to(const Type val) noexcept
+    { return OutT{1.0/32768.0} * aLawDecompressionTable[val]; }
+};
+
+
+template<FmtType SrcType, typename DstT>
+inline void LoadSampleArray(DstT *RESTRICT dst, const al::byte *src, const size_t srcstep,
+    const size_t samples) noexcept
+{
+    /* Converts `samples` values, read srcstep elements apart in src, into dst. */
+    using Traits = FmtTypeTraits<SrcType>;
+    using Sample = typename Traits::Type;
+    const Sample *RESTRICT in{reinterpret_cast<const Sample*>(src)};
+    for(size_t n{0u};n < samples;++n)
+        dst[n] = Traits::template to<DstT>(in[n*srcstep]);
+}
+
+} // namespace al
+
+#endif /* CORE_FMT_TRAITS_H */
diff --git a/core/fpu_ctrl.cpp b/core/fpu_ctrl.cpp
new file mode 100644
index 00000000..0cf0d6e7
--- /dev/null
+++ b/core/fpu_ctrl.cpp
@@ -0,0 +1,61 @@
+
+#include "config.h"
+
+#include "fpu_ctrl.h"
+
+#ifdef HAVE_INTRIN_H
+#include <intrin.h>
+#endif
+#ifdef HAVE_SSE_INTRINSICS
+#include <emmintrin.h>
+#ifndef _MM_DENORMALS_ZERO_MASK
+/* Some headers seem to be missing these? */
+#define _MM_DENORMALS_ZERO_MASK 0x0040u
+#define _MM_DENORMALS_ZERO_ON 0x0040u
+#endif
+#endif
+
+#include "cpu_caps.h"
+
+
+void FPUCtl::enter() noexcept
+{
+    if(this->in_mode) return; /* Already entered; keep the state saved by the first call. */
+    /* Save the SSE control/status value, then turn on flush-to-zero and denormals-are-zero. */
+#if defined(HAVE_SSE_INTRINSICS)
+    this->sse_state = _mm_getcsr();
+    unsigned int sseState{this->sse_state};
+    sseState &= ~(_MM_FLUSH_ZERO_MASK | _MM_DENORMALS_ZERO_MASK);
+    sseState |= _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON;
+    _mm_setcsr(sseState);
+
+#elif defined(__GNUC__) && defined(HAVE_SSE)
+
+    if((CPUCapFlags&CPU_CAP_SSE)) /* Runtime capability check; without SSE nothing is changed. */
+    {
+        __asm__ __volatile__("stmxcsr %0" : "=m" (*&this->sse_state));
+        unsigned int sseState{this->sse_state};
+        sseState |= 0x8000; /* set flush-to-zero */
+        if((CPUCapFlags&CPU_CAP_SSE2))
+            sseState |= 0x0040; /* set denormals-are-zero */
+        __asm__ __volatile__("ldmxcsr %0" : : "m" (*&sseState));
+    }
+#endif
+
+    this->in_mode = true; /* Set on every build path, including when neither branch above is compiled. */
+}
+
+void FPUCtl::leave() noexcept
+{
+    if(!this->in_mode) return; /* Nothing to restore unless enter() ran first. */
+
+#if defined(HAVE_SSE_INTRINSICS)
+    _mm_setcsr(this->sse_state); /* Put back the value saved by enter(). */
+
+#elif defined(__GNUC__) && defined(HAVE_SSE)
+
+    if((CPUCapFlags&CPU_CAP_SSE))
+        __asm__ __volatile__("ldmxcsr %0" : : "m" (*&this->sse_state));
+#endif
+    this->in_mode = false;
+}
diff --git a/core/fpu_ctrl.h b/core/fpu_ctrl.h
new file mode 100644
index 00000000..9554313a
--- /dev/null
+++ b/core/fpu_ctrl.h
@@ -0,0 +1,21 @@
+#ifndef CORE_FPU_CTRL_H
+#define CORE_FPU_CTRL_H
+
+class FPUCtl { /* Scoped guard: calls enter() on construction and leave() on destruction if still entered. */
+#if defined(HAVE_SSE_INTRINSICS) || (defined(__GNUC__) && defined(HAVE_SSE))
+    unsigned int sse_state{}; /* Control/status value saved by enter() for leave() to restore. */
+#endif
+    bool in_mode{}; /* True between enter() and leave(). */
+
+public:
+    FPUCtl() noexcept { enter(); in_mode = true; }
+    ~FPUCtl() { if(in_mode) leave(); }
+
+    FPUCtl(const FPUCtl&) = delete;
+    FPUCtl& operator=(const FPUCtl&) = delete;
+
+    void enter() noexcept; /* No-op when already entered. */
+    void leave() noexcept; /* No-op when not entered. */
+};
+
+#endif /* CORE_FPU_CTRL_H */
diff --git a/core/front_stablizer.h b/core/front_stablizer.h
new file mode 100644
index 00000000..6825111a
--- /dev/null
+++ b/core/front_stablizer.h
@@ -0,0 +1,31 @@
+#ifndef CORE_FRONT_STABLIZER_H
+#define CORE_FRONT_STABLIZER_H
+
+#include <array>
+#include <memory>
+
+#include "almalloc.h"
+#include "bufferline.h"
+#include "filters/splitter.h"
+
+
+struct FrontStablizer {
+    FrontStablizer(size_t numchans) : ChannelFilters{numchans} { }
+    /* Line-sized float work buffers, 16-byte aligned. */
+    alignas(16) std::array<float,BufferLineSize> MidDirect{};
+    alignas(16) std::array<float,BufferLineSize> Side{};
+    alignas(16) std::array<float,BufferLineSize> Temp{};
+    /* Band splitter with buffers for its two output bands (presumably low/high of the mid signal - confirm at the use site). */
+    BandSplitter MidFilter;
+    alignas(16) FloatBufferLine MidLF{};
+    alignas(16) FloatBufferLine MidHF{};
+    /* Trailing array of numchans splitters; NOTE(review): assumes al::FlexArray is a flexible array member - confirm in almalloc.h. */
+    al::FlexArray<BandSplitter,16> ChannelFilters;
+    /* Allocates with room for numchans trailing filters; use this rather than a plain new. */
+    static std::unique_ptr<FrontStablizer> Create(size_t numchans)
+    { return std::unique_ptr<FrontStablizer>{new(FamCount(numchans)) FrontStablizer{numchans}}; }
+
+    DEF_FAM_NEWDEL(FrontStablizer, ChannelFilters)
+};
+
+#endif /* CORE_FRONT_STABLIZER_H */
diff --git a/core/helpers.cpp b/core/helpers.cpp
new file mode 100644
index 00000000..99cf009c
--- /dev/null
+++ b/core/helpers.cpp
@@ -0,0 +1,569 @@
+
+#include "config.h"
+
+#include "helpers.h"
+
+#include <algorithm>
+#include <cerrno>
+#include <cstdarg>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <mutex>
+#include <limits>
+#include <string>
+#include <tuple>
+
+#include "almalloc.h"
+#include "alfstream.h"
+#include "alnumeric.h"
+#include "aloptional.h"
+#include "alspan.h"
+#include "alstring.h"
+#include "logging.h"
+#include "strutils.h"
+#include "vector.h"
+
+
+/* Mixing thread priority level */
+int RTPrioLevel{1};
+
+/* Allow reducing the process's RTTime limit for RTKit. */
+bool AllowRTTimeLimit{true};
+
+
+#ifdef _WIN32
+
+#include <shlobj.h>
+
+/* Returns the directory and file name of the running executable, each
+ * converted with wstr_to_utf8. The result is computed on the first call and
+ * cached in a function-local static. If the module file name can't be
+ * retrieved, an error is logged and an empty pair is cached and returned.
+ */
+const PathNamePair &GetProcBinary()
+{
+    static al::optional<PathNamePair> procbin;
+    if(procbin) return *procbin;
+
+    /* A returned length equal to the buffer size is treated as a truncated
+     * result, so the buffer is doubled until the returned length is smaller.
+     */
+    auto fullpath = al::vector<WCHAR>(256);
+    DWORD len{GetModuleFileNameW(nullptr, fullpath.data(), static_cast<DWORD>(fullpath.size()))};
+    while(len == fullpath.size())
+    {
+        fullpath.resize(fullpath.size() << 1);
+        len = GetModuleFileNameW(nullptr, fullpath.data(), static_cast<DWORD>(fullpath.size()));
+    }
+    if(len == 0)
+    {
+        ERR("Failed to get process name: error %lu\n", GetLastError());
+        procbin.emplace();
+        return *procbin;
+    }
+
+    /* Trim to the reported length and make sure the last element is a null
+     * terminator, since the buffer is used as a C string below.
+     */
+    fullpath.resize(len);
+    if(fullpath.back() != 0)
+        fullpath.push_back(0);
+
+    /* Normalize separators, then look for the last backslash. The reverse
+     * search starts one past rbegin() to skip the terminator.
+     */
+    std::replace(fullpath.begin(), fullpath.end(), '/', '\\');
+    auto sep = std::find(fullpath.rbegin()+1, fullpath.rend(), '\\');
+    if(sep != fullpath.rend())
+    {
+        /* Overwrite the separator with a terminator so the buffer start is
+         * the directory string; sep.base() points just past the separator,
+         * at the start of the file name.
+         */
+        *sep = 0;
+        procbin.emplace(wstr_to_utf8(fullpath.data()), wstr_to_utf8(al::to_address(sep.base())));
+    }
+    else
+        procbin.emplace(std::string{}, wstr_to_utf8(fullpath.data()));
+
+    TRACE("Got binary: %s, %s\n", procbin->path.c_str(), procbin->fname.c_str());
+    return *procbin;
+}
+
+namespace {
+
+/* Appends to results the full path of every entry in the directory `path`
+ * that matches the wildcard "*<ext>". The entries added by this call are
+ * sorted among themselves; anything already in results is left in place.
+ */
+void DirectorySearch(const char *path, const char *ext, al::vector<std::string> *const results)
+{
+    const std::string pattern{std::string{path} + "\\*" + ext};
+    TRACE("Searching %s\n", pattern.c_str());
+
+    const std::wstring wpattern{utf8_to_wstr(pattern.c_str())};
+    WIN32_FIND_DATAW finddata;
+    const HANDLE findhdl{FindFirstFileW(wpattern.c_str(), &finddata)};
+    if(findhdl == INVALID_HANDLE_VALUE)
+        return;
+
+    /* Remember where this call's entries start so only they get sorted. */
+    const auto firstnew = results->size();
+
+    /* The first match is already in finddata; keep going until there are no
+     * more matches.
+     */
+    for(BOOL more{TRUE};more;more = FindNextFileW(findhdl, &finddata))
+    {
+        results->emplace_back(path);
+        std::string &fullname = results->back();
+        fullname += '\\';
+        fullname += wstr_to_utf8(finddata.cFileName);
+    }
+    FindClose(findhdl);
+
+    std::string *const newbegin{results->data() + firstnew};
+    std::string *const newend{results->data() + results->size()};
+    std::sort(newbegin, newend);
+    for(const std::string *iter{newbegin};iter != newend;++iter)
+        TRACE(" got %s\n", iter->c_str());
+}
+
+} // namespace
+
+/* Collects the data files whose names end in `ext`.
+ *
+ * If subdir is an absolute path (a drive-letter path such as "C:\dir", or a
+ * path starting with "\\?\"), only that directory is searched. Otherwise the
+ * search covers, in order:
+ *  - the app-local directory: ALSOFT_LOCAL_PATH if set, else the current
+ *    working directory, else ".";
+ *  - `subdir` under the CSIDL_APPDATA folder;
+ *  - `subdir` under the CSIDL_COMMON_APPDATA folder.
+ *
+ * Returns the matching file paths in search order, with the matches from
+ * each directory sorted among themselves. Calls are serialized by an
+ * internal mutex.
+ */
+al::vector<std::string> SearchDataFiles(const char *ext, const char *subdir)
+{
+    auto is_slash = [](int c) noexcept -> int { return (c == '\\' || c == '/'); };
+    /* Drops one trailing path separator, if present. Empty strings are left
+     * alone since calling back() on them is undefined.
+     */
+    auto strip_trailing_slash = [is_slash](std::string &str) -> void
+    {
+        if(!str.empty() && is_slash(str.back()))
+            str.pop_back();
+    };
+
+    static std::mutex search_lock;
+    std::lock_guard<std::mutex> _{search_lock};
+
+    /* If the path is absolute, use it directly. The character is converted
+     * to unsigned char first, since passing a negative char value (any
+     * non-ASCII byte where char is signed) to isalpha is undefined.
+     */
+    al::vector<std::string> results;
+    if(isalpha(static_cast<unsigned char>(subdir[0])) && subdir[1] == ':' && is_slash(subdir[2]))
+    {
+        std::string path{subdir};
+        std::replace(path.begin(), path.end(), '/', '\\');
+        DirectorySearch(path.c_str(), ext, &results);
+        return results;
+    }
+    if(subdir[0] == '\\' && subdir[1] == '\\' && subdir[2] == '?' && subdir[3] == '\\')
+    {
+        DirectorySearch(subdir, ext, &results);
+        return results;
+    }
+
+    std::string path;
+
+    /* Search the app-local directory. */
+    if(auto localpath = al::getenv(L"ALSOFT_LOCAL_PATH"))
+    {
+        path = wstr_to_utf8(localpath->c_str());
+        strip_trailing_slash(path);
+    }
+    else if(WCHAR *cwdbuf{_wgetcwd(nullptr, 0)})
+    {
+        path = wstr_to_utf8(cwdbuf);
+        strip_trailing_slash(path);
+        free(cwdbuf);
+    }
+    else
+        path = ".";
+    std::replace(path.begin(), path.end(), '/', '\\');
+    DirectorySearch(path.c_str(), ext, &results);
+
+    /* Search the local and global data dirs. */
+    static const int ids[2]{ CSIDL_APPDATA, CSIDL_COMMON_APPDATA };
+    for(int id : ids)
+    {
+        WCHAR buffer[MAX_PATH];
+        if(SHGetSpecialFolderPathW(nullptr, buffer, id, FALSE) == FALSE)
+            continue;
+
+        path = wstr_to_utf8(buffer);
+        /* Nothing usable to search if the folder name came back empty. */
+        if(path.empty())
+            continue;
+        if(!is_slash(path.back()))
+            path += '\\';
+        path += subdir;
+        std::replace(path.begin(), path.end(), '/', '\\');
+
+        DirectorySearch(path.c_str(), ext, &results);
+    }
+
+    return results;
+}
+
+/* Raises the calling thread to time-critical priority when RTPrioLevel is
+ * positive, logging an error if the request is refused.
+ */
+void SetRTPriority(void)
+{
+    if(RTPrioLevel <= 0)
+        return;
+
+    const auto success = SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_TIME_CRITICAL);
+    if(!success)
+        ERR("Failed to set priority level for thread\n");
+}
+
+#else
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <dirent.h>
+#ifdef __FreeBSD__
+#include <sys/sysctl.h>
+#endif
+#ifdef __HAIKU__
+#include <FindDirectory.h>
+#endif
+#ifdef HAVE_PROC_PIDPATH
+#include <libproc.h>
+#endif
+#if defined(HAVE_PTHREAD_SETSCHEDPARAM) && !defined(__OpenBSD__)
+#include <pthread.h>
+#include <sched.h>
+#endif
+#ifdef HAVE_RTKIT
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "dbus_wrap.h"
+#include "rtkit.h"
+#ifndef RLIMIT_RTTIME
+#define RLIMIT_RTTIME 15
+#endif
+#endif
+
+/* Returns the directory and file name of the running executable. The result
+ * is computed on the first call and cached in a function-local static.
+ *
+ * Several platform-specific lookups are tried in turn, each one only if the
+ * previous ones left the path empty: the kern.proc.pathname sysctl on
+ * FreeBSD, proc_pidpath where available, find_path on Haiku, and finally
+ * reading a set of /proc symlinks. If every lookup fails, both strings of
+ * the returned pair are empty.
+ */
+const PathNamePair &GetProcBinary()
+{
+    static al::optional<PathNamePair> procbin;
+    if(procbin) return *procbin;
+
+    al::vector<char> pathname;
+#ifdef __FreeBSD__
+    size_t pathlen{};
+    int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 };
+    /* The first call only queries the required length. */
+    if(sysctl(mib, 4, nullptr, &pathlen, nullptr, 0) == -1)
+        WARN("Failed to sysctl kern.proc.pathname: %s\n", strerror(errno));
+    else
+    {
+        pathname.resize(pathlen + 1);
+        /* If fetching the name fails, drop the zero-filled buffer so the
+         * fallbacks below still get a chance to run.
+         */
+        if(sysctl(mib, 4, pathname.data(), &pathlen, nullptr, 0) == -1)
+        {
+            WARN("Failed to sysctl kern.proc.pathname: %s\n", strerror(errno));
+            pathname.clear();
+        }
+        else
+            pathname.resize(pathlen);
+    }
+#endif
+#ifdef HAVE_PROC_PIDPATH
+    if(pathname.empty())
+    {
+        char procpath[PROC_PIDPATHINFO_MAXSIZE]{};
+        const pid_t pid{getpid()};
+        if(proc_pidpath(pid, procpath, sizeof(procpath)) < 1)
+            ERR("proc_pidpath(%d, ...) failed: %s\n", pid, strerror(errno));
+        else
+            pathname.insert(pathname.end(), procpath, procpath+strlen(procpath));
+    }
+#endif
+#ifdef __HAIKU__
+    if(pathname.empty())
+    {
+        char procpath[PATH_MAX];
+        if(find_path(B_APP_IMAGE_SYMBOL, B_FIND_PATH_IMAGE_PATH, nullptr, procpath, sizeof(procpath)) == B_OK)
+            pathname.insert(pathname.end(), procpath, procpath+strlen(procpath));
+    }
+#endif
+#ifndef __SWITCH__
+    if(pathname.empty())
+    {
+        static const char SelfLinkNames[][32]{
+            "/proc/self/exe",
+            "/proc/self/file",
+            "/proc/curproc/exe",
+            "/proc/curproc/file"
+        };
+
+        pathname.resize(256);
+
+        /* Use the first link that either reads successfully or fails for a
+         * reason other than not existing.
+         */
+        const char *selfname{};
+        ssize_t len{};
+        for(const char *name : SelfLinkNames)
+        {
+            selfname = name;
+            len = readlink(selfname, pathname.data(), pathname.size());
+            if(len >= 0 || errno != ENOENT) break;
+        }
+
+        /* A result that exactly fills the buffer may have been truncated, so
+         * retry with a larger buffer until it doesn't.
+         */
+        while(len > 0 && static_cast<size_t>(len) == pathname.size())
+        {
+            pathname.resize(pathname.size() << 1);
+            len = readlink(selfname, pathname.data(), pathname.size());
+        }
+        if(len <= 0)
+        {
+            WARN("Failed to readlink %s: %s\n", selfname, strerror(errno));
+            len = 0;
+        }
+
+        pathname.resize(static_cast<size_t>(len));
+    }
+#endif
+    /* Strip any trailing null terminators left by the lookups above. */
+    while(!pathname.empty() && pathname.back() == 0)
+        pathname.pop_back();
+
+    /* Split at the last '/'. sep.base() points just past the separator, so
+     * the directory part ends one element before it.
+     */
+    auto sep = std::find(pathname.crbegin(), pathname.crend(), '/');
+    if(sep != pathname.crend())
+        procbin.emplace(std::string(pathname.cbegin(), sep.base()-1),
+            std::string(sep.base(), pathname.cend()));
+    else
+        procbin.emplace(std::string{}, std::string(pathname.cbegin(), pathname.cend()));
+
+    TRACE("Got binary: \"%s\", \"%s\"\n", procbin->path.c_str(), procbin->fname.c_str());
+    return *procbin;
+}
+
+namespace {
+
+/* Appends to results the full path of every entry in the directory `path`
+ * whose name ends with `ext` (compared case-insensitively) and has at least
+ * one character before it. The entries added by this call are sorted among
+ * themselves; anything already in results is left in place.
+ */
+void DirectorySearch(const char *path, const char *ext, al::vector<std::string> *const results)
+{
+    TRACE("Searching %s for *%s\n", path, ext);
+    DIR *dirhdl{opendir(path)};
+    if(dirhdl == nullptr)
+        return;
+
+    /* Remember where this call's entries start so only they get sorted. */
+    const auto firstnew = results->size();
+    const size_t extlen{strlen(ext)};
+
+    while(const struct dirent *entry{readdir(dirhdl)})
+    {
+        const char *const fname{entry->d_name};
+        if(strcmp(fname, ".") == 0 || strcmp(fname, "..") == 0)
+            continue;
+
+        /* Skip names too short to have the extension, or not ending in it. */
+        const size_t namelen{strlen(fname)};
+        if(namelen <= extlen || al::strcasecmp(fname+namelen-extlen, ext) != 0)
+            continue;
+
+        results->emplace_back(path);
+        std::string &fullname = results->back();
+        if(fullname.back() != '/')
+            fullname.push_back('/');
+        fullname += fname;
+    }
+    closedir(dirhdl);
+
+    std::string *const newbegin{results->data() + firstnew};
+    std::string *const newend{results->data() + results->size()};
+    std::sort(newbegin, newend);
+    for(const std::string *iter{newbegin};iter != newend;++iter)
+        TRACE(" got %s\n", iter->c_str());
+}
+
+} // namespace
+
+/* Collects the data files whose names end in `ext`.
+ *
+ * If subdir starts with '/', only that directory is searched. Otherwise the
+ * search covers, in order:
+ *  - the app-local directory: ALSOFT_LOCAL_PATH if set, else the current
+ *    working directory, else ".";
+ *  - `subdir` under XDG_DATA_HOME if set, else under $HOME/.local/share;
+ *  - `subdir` under each entry of XDG_DATA_DIRS (defaulting to
+ *    "/usr/local/share/:/usr/share/");
+ *  - `subdir` under ALSOFT_INSTALL_DATADIR, when that is defined at build
+ *    time.
+ *
+ * Returns the matching file paths in search order, with the matches from
+ * each directory sorted among themselves. Calls are serialized by an
+ * internal mutex.
+ */
+al::vector<std::string> SearchDataFiles(const char *ext, const char *subdir)
+{
+    static std::mutex search_lock;
+    std::lock_guard<std::mutex> _{search_lock};
+
+    al::vector<std::string> results;
+    if(subdir[0] == '/')
+    {
+        DirectorySearch(subdir, ext, &results);
+        return results;
+    }
+
+    /* Search the app-local directory. */
+    if(auto localpath = al::getenv("ALSOFT_LOCAL_PATH"))
+        DirectorySearch(localpath->c_str(), ext, &results);
+    else
+    {
+        /* Grow the buffer while getcwd reports it as too small (ERANGE). Any
+         * other failure empties the buffer, which selects the "." fallback.
+         */
+        al::vector<char> cwdbuf(256);
+        while(!getcwd(cwdbuf.data(), cwdbuf.size()))
+        {
+            if(errno != ERANGE)
+            {
+                cwdbuf.clear();
+                break;
+            }
+            cwdbuf.resize(cwdbuf.size() << 1);
+        }
+        if(cwdbuf.empty())
+            DirectorySearch(".", ext, &results);
+        else
+        {
+            DirectorySearch(cwdbuf.data(), ext, &results);
+            cwdbuf.clear();
+        }
+    }
+
+    // Search local data dir
+    if(auto datapath = al::getenv("XDG_DATA_HOME"))
+    {
+        std::string &path = *datapath;
+        if(path.back() != '/')
+            path += '/';
+        path += subdir;
+        DirectorySearch(path.c_str(), ext, &results);
+    }
+    else if(auto homepath = al::getenv("HOME"))
+    {
+        std::string &path = *homepath;
+        if(path.back() == '/')
+            path.pop_back();
+        path += "/.local/share/";
+        path += subdir;
+        DirectorySearch(path.c_str(), ext, &results);
+    }
+
+    // Search global data dirs
+    std::string datadirs{al::getenv("XDG_DATA_DIRS").value_or("/usr/local/share/:/usr/share/")};
+
+    /* Walk the colon-separated list. When a colon is found, nextpos is moved
+     * past it for the next round; otherwise it stays npos, which ends the
+     * loop after the final entry.
+     */
+    size_t curpos{0u};
+    while(curpos < datadirs.size())
+    {
+        size_t nextpos{datadirs.find(':', curpos)};
+
+        std::string path{(nextpos != std::string::npos) ?
+            datadirs.substr(curpos, nextpos++ - curpos) : datadirs.substr(curpos)};
+        curpos = nextpos;
+
+        /* Empty entries (e.g. from "::") are skipped. */
+        if(path.empty()) continue;
+        if(path.back() != '/')
+            path += '/';
+        path += subdir;
+
+        DirectorySearch(path.c_str(), ext, &results);
+    }
+
+#ifdef ALSOFT_INSTALL_DATADIR
+    // Search the installation data directory
+    {
+        std::string path{ALSOFT_INSTALL_DATADIR};
+        if(!path.empty())
+        {
+            if(path.back() != '/')
+                path += '/';
+            path += subdir;
+            DirectorySearch(path.c_str(), ext, &results);
+        }
+    }
+#endif
+
+    return results;
+}
+
+namespace {
+
+/* Tries to switch the calling thread to SCHED_RR scheduling with the given
+ * priority, clamped to the allowed range. Returns true on success; on
+ * failure a warning is logged and false is returned. On builds without
+ * usable pthread scheduling support this always fails with ENOTSUP.
+ */
+bool SetRTPriorityPthread(int prio)
+{
+    /* Stays ENOTSUP when the scheduling calls aren't compiled in. */
+    int err{ENOTSUP};
+#if defined(HAVE_PTHREAD_SETSCHEDPARAM) && !defined(__OpenBSD__)
+    /* Get the min and max priority for SCHED_RR. Limit the max priority to
+     * half, for now, to ensure the thread can't take the highest priority and
+     * go rogue.
+     */
+    int rtmin{sched_get_priority_min(SCHED_RR)};
+    int rtmax{sched_get_priority_max(SCHED_RR)};
+    rtmax = (rtmax-rtmin)/2 + rtmin;
+
+    struct sched_param param{};
+    param.sched_priority = clampi(prio, rtmin, rtmax);
+    /* When SCHED_RESET_ON_FORK is available, try with it first and only fall
+     * back to plain SCHED_RR if that combination is rejected as invalid.
+     * Without it, the plain SCHED_RR call below runs unconditionally.
+     */
+#ifdef SCHED_RESET_ON_FORK
+    err = pthread_setschedparam(pthread_self(), SCHED_RR|SCHED_RESET_ON_FORK, &param);
+    if(err == EINVAL)
+#endif
+        err = pthread_setschedparam(pthread_self(), SCHED_RR, &param);
+    if(err == 0) return true;
+
+#else
+
+    std::ignore = prio;
+#endif
+    WARN("pthread_setschedparam failed: %s (%d)\n", std::strerror(err), err);
+    return false;
+}
+
+/* Tries to raise the calling thread's priority through RTKit over the D-Bus
+ * system bus. Real-time scheduling is requested first, when RTKit reports a
+ * positive maximum real-time priority; on Linux only, a high-priority
+ * (negative niceness) request is the fallback. Returns true if either request
+ * succeeds. Returns false, after logging, if D-Bus or RTKit is unavailable
+ * or both requests fail. Without HAVE_RTKIT this always returns false.
+ */
+bool SetRTPriorityRTKit(int prio)
+{
+#ifdef HAVE_RTKIT
+    if(!HasDBus())
+    {
+        WARN("D-Bus not available\n");
+        return false;
+    }
+    dbus::Error error;
+    dbus::ConnectionPtr conn{dbus_bus_get(DBUS_BUS_SYSTEM, &error.get())};
+    if(!conn)
+    {
+        WARN("D-Bus connection failed with %s: %s\n", error->name, error->message);
+        return false;
+    }
+
+    /* Don't stupidly exit if the connection dies while doing this. */
+    dbus_connection_set_exit_on_disconnect(conn.get(), false);
+
+    /* Only -ENOENT from the first query is treated as fatal here; other
+     * results fall through to the requests below.
+     */
+    int nicemin{};
+    int err{rtkit_get_min_nice_level(conn.get(), &nicemin)};
+    if(err == -ENOENT)
+    {
+        err = std::abs(err);
+        ERR("Could not query RTKit: %s (%d)\n", std::strerror(err), err);
+        return false;
+    }
+    int rtmax{rtkit_get_max_realtime_priority(conn.get())};
+    TRACE("Maximum real-time priority: %d, minimum niceness: %d\n", rtmax, nicemin);
+
+    /* Lowers the process's RLIMIT_RTTIME hard limit (and the soft limit with
+     * it, if needed) to the maximum RTKit reports, when the current hard
+     * limit is above that. Returns 0 on success or a positive errno-style
+     * value on failure. A non-positive RTKit result is returned as its
+     * absolute value without touching the limits.
+     */
+    auto limit_rttime = [](DBusConnection *c) -> int
+    {
+        using ulonglong = unsigned long long;
+        long long maxrttime{rtkit_get_rttime_usec_max(c)};
+        if(maxrttime <= 0) return static_cast<int>(std::abs(maxrttime));
+        const ulonglong umaxtime{static_cast<ulonglong>(maxrttime)};
+
+        struct rlimit rlim{};
+        if(getrlimit(RLIMIT_RTTIME, &rlim) != 0)
+            return errno;
+
+        TRACE("RTTime max: %llu (hard: %llu, soft: %llu)\n", umaxtime,
+            static_cast<ulonglong>(rlim.rlim_max), static_cast<ulonglong>(rlim.rlim_cur));
+        if(rlim.rlim_max > umaxtime)
+        {
+            rlim.rlim_max = static_cast<rlim_t>(std::min<ulonglong>(umaxtime,
+                std::numeric_limits<rlim_t>::max()));
+            rlim.rlim_cur = std::min(rlim.rlim_cur, rlim.rlim_max);
+            if(setrlimit(RLIMIT_RTTIME, &rlim) != 0)
+                return errno;
+        }
+        return 0;
+    };
+    if(rtmax > 0)
+    {
+        /* A failure to adjust the limit is only warned about; the real-time
+         * request is still attempted.
+         */
+        if(AllowRTTimeLimit)
+        {
+            err = limit_rttime(conn.get());
+            if(err != 0)
+                WARN("Failed to set RLIMIT_RTTIME for RTKit: %s (%d)\n",
+                    std::strerror(err), err);
+        }
+
+        /* Limit the maximum real-time priority to half. */
+        rtmax = (rtmax+1)/2;
+        prio = clampi(prio, 1, rtmax);
+
+        TRACE("Making real-time with priority %d (max: %d)\n", prio, rtmax);
+        err = rtkit_make_realtime(conn.get(), 0, prio);
+        if(err == 0) return true;
+
+        err = std::abs(err);
+        WARN("Failed to set real-time priority: %s (%d)\n", std::strerror(err), err);
+    }
+    /* Don't try to set the niceness for non-Linux systems. Standard POSIX has
+     * niceness as a per-process attribute, while the intent here is for the
+     * audio processing thread only to get a priority boost. Currently only
+     * Linux is known to have per-thread niceness.
+     */
+#ifdef __linux__
+    if(nicemin < 0)
+    {
+        TRACE("Making high priority with niceness %d\n", nicemin);
+        err = rtkit_make_high_priority(conn.get(), 0, nicemin);
+        if(err == 0) return true;
+
+        err = std::abs(err);
+        WARN("Failed to set high priority: %s (%d)\n", std::strerror(err), err);
+    }
+#endif /* __linux__ */
+
+#else
+
+    std::ignore = prio;
+    WARN("D-Bus not supported\n");
+#endif
+    return false;
+}
+
+} // namespace
+
+/* Attempts to give the calling thread real-time priority when RTPrioLevel is
+ * positive. Setting it directly through pthreads is tried first, and RTKit
+ * is only asked if that fails.
+ */
+void SetRTPriority()
+{
+    if(RTPrioLevel > 0)
+    {
+        if(!SetRTPriorityPthread(RTPrioLevel))
+            SetRTPriorityRTKit(RTPrioLevel);
+    }
+}
+
+#endif
diff --git a/core/helpers.h b/core/helpers.h
new file mode 100644
index 00000000..f0bfcf1b
--- /dev/null
+++ b/core/helpers.h
@@ -0,0 +1,18 @@
+#ifndef CORE_HELPERS_H
+#define CORE_HELPERS_H
+
+#include <string>
+
+#include "vector.h"
+
+
+/* The directory (path) and file name (fname) of a file, stored separately. */
+struct PathNamePair { std::string path, fname; };
+/* Returns the directory and file name of the running executable. Both
+ * strings may be empty if they couldn't be determined.
+ */
+const PathNamePair &GetProcBinary(void);
+
+/* Requested priority level for SetRTPriority. Values of 0 or less make
+ * SetRTPriority do nothing.
+ */
+extern int RTPrioLevel;
+/* Allow reducing the process's RTTime limit for RTKit. */
+extern bool AllowRTTimeLimit;
+/* Tries to raise the priority of the calling thread according to
+ * RTPrioLevel. Failures are logged and not reported to the caller.
+ */
+void SetRTPriority(void);
+
+/* Returns the paths of data files whose names end with `match`, searched
+ * for in `subdir` under a set of platform-specific locations, or in `subdir`
+ * alone when it is an absolute path.
+ */
+al::vector<std::string> SearchDataFiles(const char *match, const char *subdir);
+
+#endif /* CORE_HELPERS_H */
diff --git a/core/hrtf.cpp b/core/hrtf.cpp
new file mode 100644
index 00000000..d5c7573a
--- /dev/null
+++ b/core/hrtf.cpp
@@ -0,0 +1,1473 @@
+
+#include "config.h"
+
+#include "hrtf.h"
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cctype>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iterator>
+#include <memory>
+#include <mutex>
+#include <numeric>
+#include <type_traits>
+#include <utility>
+
+#include "albit.h"
+#include "albyte.h"
+#include "alfstream.h"
+#include "almalloc.h"
+#include "alnumbers.h"
+#include "alnumeric.h"
+#include "aloptional.h"
+#include "alspan.h"
+#include "ambidefs.h"
+#include "filters/splitter.h"
+#include "helpers.h"
+#include "logging.h"
+#include "mixer/hrtfdefs.h"
+#include "opthelpers.h"
+#include "polyphase_resampler.h"
+#include "vector.h"
+
+
+namespace {
+
+/* A pair of strings describing one HRTF entry. Presumably mDispName is the
+ * name shown to the user and mFilename the file it's loaded from; confirm
+ * against the enumeration code.
+ */
+struct HrtfEntry {
+    std::string mDispName;
+    std::string mFilename;
+
+    /* GCC warns when it tries to inline this. */
+    ~HrtfEntry();
+};
+/* Defined out of line for the reason noted on the declaration. */
+HrtfEntry::~HrtfEntry() = default;
+
+/* Associates a file name with the HrtfStore it owns. Movable but not
+ * copyable, since the store is held by unique_ptr.
+ */
+struct LoadedHrtf {
+    std::string mFilename;
+    std::unique_ptr<HrtfStore> mEntry;
+
+    /* Forwards both arguments straight into the members, so either may be
+     * passed as an lvalue or an rvalue of any type the member can be
+     * constructed from.
+     */
+    template<typename T, typename U>
+    LoadedHrtf(T&& name, U&& entry)
+        : mFilename{std::forward<T>(name)}, mEntry{std::forward<U>(entry)}
+    { }
+    LoadedHrtf(LoadedHrtf&&) = default;
+    /* GCC warns when it tries to inline this. */
+    ~LoadedHrtf();
+
+    LoadedHrtf& operator=(LoadedHrtf&&) = default;
+};
+/* Defined out of line for the reason noted on the declaration. */
+LoadedHrtf::~LoadedHrtf() = default;
+
+
+/* Data set limits must be the same as or more flexible than those defined in
+ * the makemhr utility.
+ */
+constexpr uint MinFdCount{1};
+constexpr uint MaxFdCount{16};
+
+constexpr uint MinFdDistance{50};
+constexpr uint MaxFdDistance{2500};
+
+constexpr uint MinEvCount{5};
+constexpr uint MaxEvCount{181};
+
+constexpr uint MinAzCount{1};
+constexpr uint MaxAzCount{255};
+
+/* Largest HRIR delay allowed, one less than the HRTF history length. */
+constexpr uint MaxHrirDelay{HrtfHistoryLength - 1};
+
+/* HRIR delays are handled as fixed-point values with this many fractional
+ * bits. FracOne is the fixed-point value of 1, and FracHalf the value of 0.5
+ * (used for rounding to the nearest whole delay).
+ */
+constexpr uint HrirDelayFracBits{2};
+constexpr uint HrirDelayFracOne{1 << HrirDelayFracBits};
+constexpr uint HrirDelayFracHalf{HrirDelayFracOne >> 1};
+
+/* The largest fixed-point delay has to stay below 256, presumably so it can
+ * be stored in a single byte.
+ */
+static_assert(MaxHrirDelay*HrirDelayFracOne < 256, "MAX_HRIR_DELAY or DELAY_FRAC too large");
+
+constexpr char magicMarker00[8]{'M','i','n','P','H','R','0','0'};
+constexpr char magicMarker01[8]{'M','i','n','P','H','R','0','1'};
+constexpr char magicMarker02[8]{'M','i','n','P','H','R','0','2'};
+constexpr char magicMarker03[8]{'M','i','n','P','H','R','0','3'};
+
+/* First value for pass-through coefficients (remaining are 0), used for omni-
+ * directional sounds. */
+constexpr auto PassthruCoeff = static_cast<float>(1.0/al::numbers::sqrt2);
+
+std::mutex LoadedHrtfLock;
+al::vector<LoadedHrtf> LoadedHrtfs;
+
+std::mutex EnumeratedHrtfLock;
+al::vector<HrtfEntry> EnumeratedHrtfs;
+
+
+/* A read-only stream buffer over a caller-provided range of characters. The
+ * whole range is set as the get area up front, so there's never more data to
+ * fetch, and seeking just moves the get pointer within the range. The range
+ * isn't copied, so it has to outlive the buffer.
+ */
+class databuf final : public std::streambuf {
+    /* Everything is already in the get area; running out of it means EOF. */
+    int_type underflow() override
+    { return traits_type::eof(); }
+
+    /* Moves the read position by offset relative to whence. Only input-only
+     * seeks are accepted. Returns the new position from the start of the
+     * range, or traits_type::eof() if the mode is unsupported or the target
+     * would fall outside the range.
+     */
+    pos_type seekoff(off_type offset, std::ios_base::seekdir whence, std::ios_base::openmode mode) override
+    {
+        if((mode&std::ios_base::out) || !(mode&std::ios_base::in))
+            return traits_type::eof();
+
+        char_type *cur;
+        switch(whence)
+        {
+            case std::ios_base::beg:
+                /* Must land within [start, end]. */
+                if(offset < 0 || offset > egptr()-eback())
+                    return traits_type::eof();
+                cur = eback() + offset;
+                break;
+
+            case std::ios_base::cur:
+                /* Can't move forward past the end, or back before the
+                 * start.
+                 */
+                if((offset >= 0 && offset > egptr()-gptr()) ||
+                   (offset < 0 && -offset > gptr()-eback()))
+                    return traits_type::eof();
+                cur = gptr() + offset;
+                break;
+
+            case std::ios_base::end:
+                /* Offsets from the end must be zero or negative, and no
+                 * larger than the range itself.
+                 */
+                if(offset > 0 || -offset > egptr()-eback())
+                    return traits_type::eof();
+                cur = egptr() + offset;
+                break;
+
+            default:
+                return traits_type::eof();
+        }
+
+        setg(eback(), cur, egptr());
+        return cur - eback();
+    }
+
+    /* Moves the read position to the absolute position pos, under the same
+     * mode and range rules as seekoff. Returns pos on success, or
+     * traits_type::eof() on failure.
+     */
+    pos_type seekpos(pos_type pos, std::ios_base::openmode mode) override
+    {
+        // Simplified version of seekoff
+        if((mode&std::ios_base::out) || !(mode&std::ios_base::in))
+            return traits_type::eof();
+
+        if(pos < 0 || pos > egptr()-eback())
+            return traits_type::eof();
+
+        setg(eback(), eback() + static_cast<size_t>(pos), egptr());
+        return pos;
+    }
+
+public:
+    /* Sets [start_, end_) as the readable range, with the read position at
+     * the start. The const_casts are only there to satisfy setg's signature;
+     * nothing in this class writes through the pointers.
+     */
+    databuf(const char_type *start_, const char_type *end_) noexcept
+    {
+        setg(const_cast<char_type*>(start_), const_cast<char_type*>(start_),
+             const_cast<char_type*>(end_));
+    }
+};
+
+/* An input stream reading from a caller-provided range of characters, using
+ * a databuf as its buffer. The range has to outlive the stream.
+ */
+class idstream final : public std::istream {
+    databuf mStreamBuf;
+
+public:
+    /* The istream base is constructed before mStreamBuf exists, so it's
+     * given a null buffer first and pointed at the real one with init() once
+     * the member has been constructed.
+     */
+    idstream(const char *start_, const char *end_)
+      : std::istream{nullptr}, mStreamBuf{start_, end_}
+    { init(&mStreamBuf); }
+};
+
+
+/* An index paired with the fractional amount to blend toward the next one. */
+struct IdxBlend { uint idx; float blend; };
+/* Maps a polar elevation in radians to an elevation index. The returned
+ * index is between 0 and (evcount - 1), and the blend is the distance of the
+ * scaled elevation from its truncated index.
+ */
+IdxBlend CalcEvIndex(uint evcount, float ev)
+{
+    const uint lastidx{evcount - 1};
+    const float evpos{(al::numbers::pi_v<float>*0.5f + ev) * static_cast<float>(lastidx) *
+        al::numbers::inv_pi_v<float>};
+    const uint evidx{float2uint(evpos)};
+
+    return IdxBlend{minu(evidx, lastidx), evpos - static_cast<float>(evidx)};
+}
+
+/* Maps a polar azimuth in radians to an azimuth index. The returned index is
+ * between 0 and (azcount - 1), wrapping around, and the blend is the
+ * distance of the scaled azimuth from its truncated index.
+ */
+IdxBlend CalcAzIndex(uint azcount, float az)
+{
+    const float azpos{(al::numbers::pi_v<float>*2.0f + az) * static_cast<float>(azcount) *
+        (al::numbers::inv_pi_v<float>*0.5f)};
+    const uint azidx{float2uint(azpos)};
+
+    return IdxBlend{azidx % azcount, azpos - static_cast<float>(azidx)};
+}
+
+} // namespace
+
+
+/* Calculates static HRIR coefficients and delays for the given polar elevation
+ * and azimuth in radians. The coefficients are normalized.
+ *
+ * distance selects the field to use: the first field whose distance is less
+ * than or equal to it, or the last field if there's no such match. spread
+ * scales the directional response down, from full strength at 0 to none at
+ * 2*pi, with the remainder going to a pass-through coefficient. The results
+ * are written to coeffs and delays (one delay per ear).
+ */
+void HrtfStore::getCoeffs(float elevation, float azimuth, float distance, float spread,
+    HrirArray &coeffs, const al::span<uint,2> delays)
+{
+    const float dirfact{1.0f - (al::numbers::inv_pi_v<float>/2.0f * spread)};
+
+    /* Find the field to use. ebase accumulates the elevation counts of the
+     * fields that get skipped, giving the index of the chosen field's first
+     * elevation in mElev. The last field isn't tested, so it's what's used
+     * when nothing before it matches.
+     */
+    size_t ebase{0};
+    auto match_field = [&ebase,distance](const Field &field) noexcept -> bool
+    {
+        if(distance >= field.distance)
+            return true;
+        ebase += field.evCount;
+        return false;
+    };
+    auto field = std::find_if(mFields.begin(), mFields.end()-1, match_field);
+
+    /* Calculate the elevation indices. */
+    const auto elev0 = CalcEvIndex(field->evCount, elevation);
+    const size_t elev1_idx{minu(elev0.idx+1, field->evCount-1)};
+    const size_t ir0offset{mElev[ebase + elev0.idx].irOffset};
+    const size_t ir1offset{mElev[ebase + elev1_idx].irOffset};
+
+    /* Calculate azimuth indices. */
+    const auto az0 = CalcAzIndex(mElev[ebase + elev0.idx].azCount, azimuth);
+    const auto az1 = CalcAzIndex(mElev[ebase + elev1_idx].azCount, azimuth);
+
+    /* Calculate the HRIR indices to blend. */
+    const size_t idx[4]{
+        ir0offset + az0.idx,
+        ir0offset + ((az0.idx+1) % mElev[ebase + elev0.idx].azCount),
+        ir1offset + az1.idx,
+        ir1offset + ((az1.idx+1) % mElev[ebase + elev1_idx].azCount)
+    };
+
+    /* Calculate bilinear blending weights, attenuated according to the
+     * directional panning factor.
+     */
+    const float blend[4]{
+        (1.0f-elev0.blend) * (1.0f-az0.blend) * dirfact,
+        (1.0f-elev0.blend) * (     az0.blend) * dirfact,
+        (     elev0.blend) * (1.0f-az1.blend) * dirfact,
+        (     elev0.blend) * (     az1.blend) * dirfact
+    };
+
+    /* Calculate the blended HRIR delays. */
+    float d{mDelays[idx[0]][0]*blend[0] + mDelays[idx[1]][0]*blend[1] + mDelays[idx[2]][0]*blend[2]
+        + mDelays[idx[3]][0]*blend[3]};
+    /* The stored delays are fixed-point; scale down to whole samples. */
+    delays[0] = fastf2u(d * float{1.0f/HrirDelayFracOne});
+    d = mDelays[idx[0]][1]*blend[0] + mDelays[idx[1]][1]*blend[1] + mDelays[idx[2]][1]*blend[2]
+        + mDelays[idx[3]][1]*blend[3];
+    delays[1] = fastf2u(d * float{1.0f/HrirDelayFracOne});
+
+    /* Calculate the blended HRIR coefficients. */
+    /* The output is treated as a flat array of interleaved left/right
+     * floats. The first pair holds the pass-through amount and the rest
+     * starts at zero, then each of the four HRIRs is accumulated on top.
+     */
+    float *coeffout{al::assume_aligned<16>(coeffs[0].data())};
+    coeffout[0] = PassthruCoeff * (1.0f-dirfact);
+    coeffout[1] = PassthruCoeff * (1.0f-dirfact);
+    std::fill_n(coeffout+2, size_t{HrirLength-1}*2, 0.0f);
+    for(size_t c{0};c < 4;c++)
+    {
+        const float *srccoeffs{al::assume_aligned<16>(mCoeffs[idx[c]][0].data())};
+        const float mult{blend[c]};
+        auto blend_coeffs = [mult](const float src, const float coeff) noexcept -> float
+        { return src*mult + coeff; };
+        std::transform(srccoeffs, srccoeffs + HrirLength*2, coeffout, coeffout, blend_coeffs);
+    }
+}
+
+
+/* Allocates and constructs a DirectHrtfState with storage for num_chans
+ * channels. FamCount(num_chans) is passed to the class's placement operator
+ * new.
+ */
+std::unique_ptr<DirectHrtfState> DirectHrtfState::Create(size_t num_chans)
+{
+    auto *state = new(FamCount(num_chans)) DirectHrtfState{num_chans};
+    return std::unique_ptr<DirectHrtfState>{state};
+}
+
+/* Builds the per-channel HRIR filters from the given HRTF data set.
+ *
+ * For each point in AmbiPoints, the closest HRIR in the data set's first
+ * field is looked up, and its response is accumulated into every channel
+ * scaled by that channel's entry in the point's AmbiMatrix row.
+ *
+ *  Hrtf            - the data set to take HRIRs and delays from.
+ *  irSize          - the HRIR length to account for; the stored length also
+ *                    includes the largest remaining delay, and is capped at
+ *                    HrirLength.
+ *  perHrirMin      - if true, each HRIR's delays are reduced by the smaller
+ *                    of its own left/right delay; otherwise by the smallest
+ *                    delay of all the selected HRIRs.
+ *  AmbiPoints      - the directions to sample.
+ *  AmbiMatrix      - one row per point, with one gain per channel.
+ *  XOverFreq       - crossover frequency for the band splitters, normalized
+ *                    here by the data set's sample rate.
+ *  AmbiOrderHFGain - per-order scale stored as each channel's mHfScale.
+ */
+void DirectHrtfState::build(const HrtfStore *Hrtf, const uint irSize, const bool perHrirMin,
+    const al::span<const AngularPoint> AmbiPoints, const float (*AmbiMatrix)[MaxAmbiChannels],
+    const float XOverFreq, const al::span<const float,MaxAmbiOrder+1> AmbiOrderHFGain)
+{
+    using double2 = std::array<double,2>;
+    struct ImpulseResponse {
+        const ConstHrirSpan hrir;
+        uint ldelay, rdelay;
+    };
+
+    /* Initialize one splitter, then copy it to every channel. */
+    const double xover_norm{double{XOverFreq} / Hrtf->mSampleRate};
+    mChannels[0].mSplitter.init(static_cast<float>(xover_norm));
+    for(size_t i{0};i < mChannels.size();++i)
+    {
+        const size_t order{AmbiIndex::OrderFromChannel()[i]};
+        mChannels[i].mSplitter = mChannels[0].mSplitter;
+        mChannels[i].mHfScale = AmbiOrderHFGain[order];
+    }
+
+    uint min_delay{HrtfHistoryLength*HrirDelayFracOne}, max_delay{0};
+    al::vector<ImpulseResponse> impres; impres.reserve(AmbiPoints.size());
+    auto calc_res = [Hrtf,&max_delay,&min_delay](const AngularPoint &pt) -> ImpulseResponse
+    {
+        auto &field = Hrtf->mFields[0];
+        const auto elev0 = CalcEvIndex(field.evCount, pt.Elev.value);
+        const size_t elev1_idx{minu(elev0.idx+1, field.evCount-1)};
+        const size_t ir0offset{Hrtf->mElev[elev0.idx].irOffset};
+        const size_t ir1offset{Hrtf->mElev[elev1_idx].irOffset};
+
+        const auto az0 = CalcAzIndex(Hrtf->mElev[elev0.idx].azCount, pt.Azim.value);
+        const auto az1 = CalcAzIndex(Hrtf->mElev[elev1_idx].azCount, pt.Azim.value);
+
+        const size_t idx[4]{
+            ir0offset + az0.idx,
+            ir0offset + ((az0.idx+1) % Hrtf->mElev[elev0.idx].azCount),
+            ir1offset + az1.idx,
+            ir1offset + ((az1.idx+1) % Hrtf->mElev[elev1_idx].azCount)
+        };
+
+        /* The largest blend factor serves as the closest HRIR. The azimuth
+         * blend has to come from the same elevation row that gets selected,
+         * since the two rows can have different azimuth counts and so
+         * different blend factors for the same angle.
+         */
+        const bool use_elev1{elev0.blend >= 0.5f};
+        const float azblend{use_elev1 ? az1.blend : az0.blend};
+        const size_t irOffset{idx[(use_elev1 ? 2u : 0u) + ((azblend >= 0.5f) ? 1u : 0u)]};
+        ImpulseResponse res{Hrtf->mCoeffs[irOffset],
+            Hrtf->mDelays[irOffset][0], Hrtf->mDelays[irOffset][1]};
+
+        min_delay = minu(min_delay, minu(res.ldelay, res.rdelay));
+        max_delay = maxu(max_delay, maxu(res.ldelay, res.rdelay));
+
+        return res;
+    };
+    std::transform(AmbiPoints.begin(), AmbiPoints.end(), std::back_inserter(impres), calc_res);
+    /* Rounds a fixed-point delay to the nearest whole sample. */
+    auto hrir_delay_round = [](const uint d) noexcept -> uint
+    { return (d+HrirDelayFracHalf) >> HrirDelayFracBits; };
+
+    TRACE("Min delay: %.2f, max delay: %.2f, FIR length: %u\n",
+        min_delay/double{HrirDelayFracOne}, max_delay/double{HrirDelayFracOne}, irSize);
+
+    /* Accumulate in double precision, converting to float once at the end. */
+    auto tmpres = al::vector<std::array<double2,HrirLength>>(mChannels.size());
+    max_delay = 0;
+    for(size_t c{0u};c < AmbiPoints.size();++c)
+    {
+        const ConstHrirSpan hrir{impres[c].hrir};
+        const uint base_delay{perHrirMin ? minu(impres[c].ldelay, impres[c].rdelay) : min_delay};
+        const uint ldelay{hrir_delay_round(impres[c].ldelay - base_delay)};
+        const uint rdelay{hrir_delay_round(impres[c].rdelay - base_delay)};
+        max_delay = maxu(max_delay, maxu(impres[c].ldelay, impres[c].rdelay) - base_delay);
+
+        /* Only as many samples as fit after the larger delay get written.
+         * This depends on the point alone, so it's computed once per point.
+         */
+        const size_t numirs{HrirLength - maxz(ldelay, rdelay)};
+        for(size_t i{0u};i < mChannels.size();++i)
+        {
+            const double mult{AmbiMatrix[c][i]};
+            size_t lidx{ldelay}, ridx{rdelay};
+            for(size_t j{0};j < numirs;++j)
+            {
+                tmpres[i][lidx++][0] += hrir[j][0] * mult;
+                tmpres[i][ridx++][1] += hrir[j][1] * mult;
+            }
+        }
+    }
+    impres.clear();
+
+    for(size_t i{0u};i < mChannels.size();++i)
+    {
+        auto copy_arr = [](const double2 &in) noexcept -> float2
+        { return float2{{static_cast<float>(in[0]), static_cast<float>(in[1])}}; };
+        std::transform(tmpres[i].cbegin(), tmpres[i].cend(), mChannels[i].mCoeffs.begin(),
+            copy_arr);
+    }
+    tmpres.clear();
+
+    const uint max_length{minu(hrir_delay_round(max_delay) + irSize, HrirLength)};
+    TRACE("New max delay: %.2f, FIR length: %u\n", max_delay/double{HrirDelayFracOne},
+        max_length);
+    mIrSize = max_length;
+}
+
+
+namespace {
+
+/* Creates an HrtfStore holding copies of the given data. The store and all
+ * of its arrays (fields, elevations, coefficients, delays) are placed in a
+ * single allocation, with the arrays following the struct itself.
+ *
+ *  rate, irSize - stored as the sample rate and IR size.
+ *  fields       - field descriptions to copy.
+ *  elevs        - elevation descriptions to copy. The last one is used to
+ *                 work out the total HRIR count, so this must not be empty.
+ *  coeffs       - HRIR coefficients; one entry per HRIR is read.
+ *  delays       - HRIR delays; one entry per HRIR is read.
+ *  filename     - only used in the out-of-memory error message.
+ *
+ * Returns the new store with a reference count of 1, or null if allocation
+ * failed. Throws std::runtime_error if the computed layout doesn't match the
+ * allocated size.
+ */
+std::unique_ptr<HrtfStore> CreateHrtfStore(uint rate, uint8_t irSize,
+    const al::span<const HrtfStore::Field> fields,
+    const al::span<const HrtfStore::Elevation> elevs, const HrirArray *coeffs,
+    const ubyte2 *delays, const char *filename)
+{
+    /* The last elevation's offset plus its azimuth count is the total
+     * number of HRIRs.
+     */
+    const size_t irCount{size_t{elevs.back().azCount} + elevs.back().irOffset};
+    /* Work out the total size needed, keeping each array aligned. */
+    size_t total{sizeof(HrtfStore)};
+    total  = RoundUp(total, alignof(HrtfStore::Field)); /* Align for field infos */
+    total += sizeof(std::declval<HrtfStore&>().mFields[0])*fields.size();
+    total  = RoundUp(total, alignof(HrtfStore::Elevation)); /* Align for elevation infos */
+    total += sizeof(std::declval<HrtfStore&>().mElev[0])*elevs.size();
+    total  = RoundUp(total, 16); /* Align for coefficients using SIMD */
+    total += sizeof(std::declval<HrtfStore&>().mCoeffs[0])*irCount;
+    total += sizeof(std::declval<HrtfStore&>().mDelays[0])*irCount;
+
+    std::unique_ptr<HrtfStore> Hrtf{};
+    if(void *ptr{al_calloc(16, total)})
+    {
+        Hrtf.reset(al::construct_at(static_cast<HrtfStore*>(ptr)));
+        InitRef(Hrtf->mRef, 1u);
+        Hrtf->mSampleRate = rate;
+        Hrtf->mIrSize = irSize;
+
+        /* Set up pointers to storage following the main HRTF struct. */
+        /* This has to mirror the size calculation above step for step. */
+        char *base = reinterpret_cast<char*>(Hrtf.get());
+        size_t offset{sizeof(HrtfStore)};
+
+        offset = RoundUp(offset, alignof(HrtfStore::Field)); /* Align for field infos */
+        auto field_ = reinterpret_cast<HrtfStore::Field*>(base + offset);
+        offset += sizeof(field_[0])*fields.size();
+
+        offset = RoundUp(offset, alignof(HrtfStore::Elevation)); /* Align for elevation infos */
+        auto elev_ = reinterpret_cast<HrtfStore::Elevation*>(base + offset);
+        offset += sizeof(elev_[0])*elevs.size();
+
+        offset = RoundUp(offset, 16); /* Align for coefficients using SIMD */
+        auto coeffs_ = reinterpret_cast<HrirArray*>(base + offset);
+        offset += sizeof(coeffs_[0])*irCount;
+
+        auto delays_ = reinterpret_cast<ubyte2*>(base + offset);
+        offset += sizeof(delays_[0])*irCount;
+
+        /* Sanity check that the two layout calculations agree. */
+        if(offset != total)
+            throw std::runtime_error{"HrtfStore allocation size mismatch"};
+
+        /* Copy input data to storage. */
+        std::uninitialized_copy(fields.cbegin(), fields.cend(), field_);
+        std::uninitialized_copy(elevs.cbegin(), elevs.cend(), elev_);
+        std::uninitialized_copy_n(coeffs, irCount, coeffs_);
+        std::uninitialized_copy_n(delays, irCount, delays_);
+
+        /* Finally, assign the storage pointers. */
+        Hrtf->mFields = al::as_span(field_, fields.size());
+        Hrtf->mElev = elev_;
+        Hrtf->mCoeffs = coeffs_;
+        Hrtf->mDelays = delays_;
+    }
+    else
+        ERR("Out of memory allocating storage for %s.\n", filename);
+
+    return Hrtf;
+}
+
+/* Fills in the right-ear (index 1) coefficients and delays from the left-ear
+ * (index 0) data. Within each elevation, azimuth j's left-ear response is
+ * written as the right-ear response of azimuth (azCount-j) modulo azCount.
+ */
+void MirrorLeftHrirs(const al::span<const HrtfStore::Elevation> elevs, HrirArray *coeffs,
+    ubyte2 *delays)
+{
+    for(const HrtfStore::Elevation &ev : elevs)
+    {
+        const size_t first{ev.irOffset};
+        const size_t count{ev.azCount};
+        for(size_t az{0};az < count;++az)
+        {
+            /* Azimuth 0 maps onto itself, every other one onto count-az. */
+            const size_t mirrored{(az == 0) ? size_t{0} : count-az};
+
+            const HrirArray &left = coeffs[first + az];
+            HrirArray &right = coeffs[first + mirrored];
+            const size_t irSize{right.size()};
+            for(size_t s{0};s < irSize;++s)
+                right[s][1] = left[s][0];
+
+            delays[first + mirrored][1] = delays[first + az][0];
+        }
+    }
+}
+
+
+/* Sign-extends a value that holds a num_bits-wide two's complement number in
+ * its low bits (with the higher bits clear) to the full width of the signed
+ * type T. This overload is selected for signed types that are wider than
+ * num_bits.
+ */
+template<size_t num_bits, typename T>
+constexpr std::enable_if_t<std::is_signed<T>::value && num_bits < sizeof(T)*8,
+T> fixsign(T value) noexcept
+{
+    static_assert(num_bits > 0, "num_bits must be greater than 0");
+    /* Shift in T (after integer promotion) rather than as an unsigned int, so
+     * the sign bit is valid for types wider than unsigned int too. Since
+     * num_bits is less than the bit width of T, the shift never reaches T's
+     * own sign bit.
+     */
+    constexpr auto signbit = static_cast<T>(T{1} << (num_bits-1));
+    /* Flipping the sign bit and then subtracting it leaves non-negative
+     * values as-is and moves values with the bit set into the negative range.
+     */
+    return static_cast<T>((value^signbit) - signbit);
+}
+
+/* No adjustment is needed for unsigned types, or when num_bits already covers
+ * the whole type; the value is returned unchanged.
+ */
+template<size_t num_bits, typename T>
+constexpr std::enable_if_t<!std::is_signed<T>::value || num_bits == sizeof(T)*8,
+T> fixsign(T value) noexcept
+{ return value; }
+
+/* Reads a num_bits-wide little-endian integer from the stream on
+ * little-endian hosts, where the bytes can be read straight into the low
+ * bytes of the result. Returns EOF converted to T if the read fails.
+ */
+template<typename T, size_t num_bits=sizeof(T)*8>
+inline std::enable_if_t<al::endian::native == al::endian::little,
+T> readle(std::istream &data)
+{
+    static_assert((num_bits&7) == 0, "num_bits must be a multiple of 8");
+    static_assert(num_bits <= sizeof(T)*8, "num_bits is too large for the type");
+
+    constexpr auto byte_count = static_cast<std::streamsize>(num_bits/8);
+
+    T value{};
+    data.read(reinterpret_cast<char*>(&value), byte_count);
+    if(data.fail())
+        return static_cast<T>(EOF);
+
+    /* Sign-extend when fewer bits than the type holds were read. */
+    return fixsign<num_bits>(value);
+}
+
+/* Reads a num_bits-wide little-endian integer from the stream on big-endian
+ * hosts. The bytes are read into a zeroed, T-sized staging buffer and then
+ * copied into the result in reverse order. Returns EOF converted to T if the
+ * read fails.
+ */
+template<typename T, size_t num_bits=sizeof(T)*8>
+inline std::enable_if_t<al::endian::native == al::endian::big,
+T> readle(std::istream &data)
+{
+    static_assert((num_bits&7) == 0, "num_bits must be a multiple of 8");
+    static_assert(num_bits <= sizeof(T)*8, "num_bits is too large for the type");
+
+    constexpr auto byte_count = static_cast<std::streamsize>(num_bits/8);
+
+    al::byte staging[sizeof(T)]{};
+    data.read(reinterpret_cast<char*>(staging), byte_count);
+    if(data.fail())
+        return static_cast<T>(EOF);
+
+    /* Reversing the whole buffer puts the unread (zero) bytes at the most
+     * significant end of the value.
+     */
+    T value{};
+    std::reverse_copy(std::begin(staging), std::end(staging),
+        reinterpret_cast<al::byte*>(&value));
+
+    /* Sign-extend when fewer bits than the type holds were read. */
+    return fixsign<num_bits>(value);
+}
+
+/* Single bytes have no byte order, so they're fetched directly from the
+ * stream. A failed get() yields its end-of-file value converted to uint8_t.
+ */
+template<>
+inline uint8_t readle<uint8_t,8>(std::istream &data)
+{
+    const std::istream::int_type ch{data.get()};
+    return static_cast<uint8_t>(ch);
+}
+
+
+/* Loads a format v0 data set from the stream, which is expected to be
+ * positioned just past the magic marker. The header holds the sample rate
+ * (32-bit), the total IR count (16-bit), the IR size (16-bit), and the
+ * elevation count (8-bit), followed by a 16-bit IR offset per elevation, the
+ * 16-bit left-ear samples for every IR, and an 8-bit left-ear delay for every
+ * IR. The right ear is produced by mirroring the left, and the set is stored
+ * as a single field. Returns null, after logging an error, if reading fails
+ * or a value is out of range. filename is only used for log messages.
+ */
+std::unique_ptr<HrtfStore> LoadHrtf00(std::istream &data, const char *filename)
+{
+    uint rate{readle<uint32_t>(data)};
+    ushort irCount{readle<uint16_t>(data)};
+    ushort irSize{readle<uint16_t>(data)};
+    ubyte evCount{readle<uint8_t>(data)};
+    if(!data || data.eof())
+    {
+        ERR("Failed reading %s\n", filename);
+        return nullptr;
+    }
+
+    if(irSize < MinIrLength || irSize > HrirLength)
+    {
+        ERR("Unsupported HRIR size, irSize=%d (%d to %d)\n", irSize, MinIrLength, HrirLength);
+        return nullptr;
+    }
+    if(evCount < MinEvCount || evCount > MaxEvCount)
+    {
+        ERR("Unsupported elevation count: evCount=%d (%d to %d)\n",
+            evCount, MinEvCount, MaxEvCount);
+        return nullptr;
+    }
+
+    /* This format stores the starting IR offset of each elevation, rather
+     * than the number of azimuths.
+     */
+    auto elevs = al::vector<HrtfStore::Elevation>(evCount);
+    for(auto &elev : elevs)
+        elev.irOffset = readle<uint16_t>(data);
+    if(!data || data.eof())
+    {
+        ERR("Failed reading %s\n", filename);
+        return nullptr;
+    }
+    /* The offsets must be strictly increasing, and the last one must leave
+     * room for at least one IR.
+     */
+    for(size_t i{1};i < evCount;i++)
+    {
+        if(elevs[i].irOffset <= elevs[i-1].irOffset)
+        {
+            ERR("Invalid evOffset: evOffset[%zu]=%d (last=%d)\n", i, elevs[i].irOffset,
+                elevs[i-1].irOffset);
+            return nullptr;
+        }
+    }
+    if(irCount <= elevs.back().irOffset)
+    {
+        ERR("Invalid evOffset: evOffset[%zu]=%d (irCount=%d)\n",
+            elevs.size()-1, elevs.back().irOffset, irCount);
+        return nullptr;
+    }
+
+    /* Derive each elevation's azimuth count from the distance to the next
+     * elevation's offset (or to the total IR count for the last one).
+     */
+    for(size_t i{1};i < evCount;i++)
+    {
+        elevs[i-1].azCount = static_cast<ushort>(elevs[i].irOffset - elevs[i-1].irOffset);
+        if(elevs[i-1].azCount < MinAzCount || elevs[i-1].azCount > MaxAzCount)
+        {
+            ERR("Unsupported azimuth count: azCount[%zu]=%d (%d to %d)\n",
+                i-1, elevs[i-1].azCount, MinAzCount, MaxAzCount);
+            return nullptr;
+        }
+    }
+    elevs.back().azCount = static_cast<ushort>(irCount - elevs.back().irOffset);
+    if(elevs.back().azCount < MinAzCount || elevs.back().azCount > MaxAzCount)
+    {
+        ERR("Unsupported azimuth count: azCount[%zu]=%d (%d to %d)\n",
+            elevs.size()-1, elevs.back().azCount, MinAzCount, MaxAzCount);
+        return nullptr;
+    }
+
+    /* Only the first irSize samples of each response are read; the remainder
+     * keeps its initial value. Samples are scaled from 16-bit integers.
+     */
+    auto coeffs = al::vector<HrirArray>(irCount, HrirArray{});
+    auto delays = al::vector<ubyte2>(irCount);
+    for(auto &hrir : coeffs)
+    {
+        for(auto &val : al::span<float2>{hrir.data(), irSize})
+            val[0] = readle<int16_t>(data) / 32768.0f;
+    }
+    for(auto &val : delays)
+        val[0] = readle<uint8_t>(data);
+    if(!data || data.eof())
+    {
+        ERR("Failed reading %s\n", filename);
+        return nullptr;
+    }
+    for(size_t i{0};i < irCount;i++)
+    {
+        if(delays[i][0] > MaxHrirDelay)
+        {
+            ERR("Invalid delays[%zu]: %d (%d)\n", i, delays[i][0], MaxHrirDelay);
+            return nullptr;
+        }
+        /* The file holds whole-sample delays; shift them up to make room for
+         * the fractional bits.
+         */
+        delays[i][0] <<= HrirDelayFracBits;
+    }
+
+    /* Mirror the left ear responses to the right ear. */
+    MirrorLeftHrirs({elevs.data(), elevs.size()}, coeffs.data(), delays.data());
+
+    const HrtfStore::Field field[1]{{0.0f, evCount}};
+    return CreateHrtfStore(rate, static_cast<uint8_t>(irSize), field, {elevs.data(), elevs.size()},
+        coeffs.data(), delays.data(), filename);
+}
+
+/* Loads a format v1 data set from the stream, which is expected to be
+ * positioned just past the magic marker. The header holds the sample rate
+ * (32-bit), the IR size (8-bit), and the elevation count (8-bit), followed by
+ * an 8-bit azimuth count per elevation, the 16-bit left-ear samples for every
+ * IR, and an 8-bit left-ear delay for every IR. The right ear is produced by
+ * mirroring the left, and the set is stored as a single field. Returns null,
+ * after logging an error, if reading fails or a value is out of range.
+ * filename is only used for log messages.
+ */
+std::unique_ptr<HrtfStore> LoadHrtf01(std::istream &data, const char *filename)
+{
+    uint rate{readle<uint32_t>(data)};
+    uint8_t irSize{readle<uint8_t>(data)};
+    ubyte evCount{readle<uint8_t>(data)};
+    if(!data || data.eof())
+    {
+        ERR("Failed reading %s\n", filename);
+        return nullptr;
+    }
+
+    if(irSize < MinIrLength || irSize > HrirLength)
+    {
+        ERR("Unsupported HRIR size, irSize=%d (%d to %d)\n", irSize, MinIrLength, HrirLength);
+        return nullptr;
+    }
+    if(evCount < MinEvCount || evCount > MaxEvCount)
+    {
+        ERR("Unsupported elevation count: evCount=%d (%d to %d)\n",
+            evCount, MinEvCount, MaxEvCount);
+        return nullptr;
+    }
+
+    auto elevs = al::vector<HrtfStore::Elevation>(evCount);
+    for(auto &elev : elevs)
+        elev.azCount = readle<uint8_t>(data);
+    if(!data || data.eof())
+    {
+        ERR("Failed reading %s\n", filename);
+        return nullptr;
+    }
+    for(size_t i{0};i < evCount;++i)
+    {
+        if(elevs[i].azCount < MinAzCount || elevs[i].azCount > MaxAzCount)
+        {
+            ERR("Unsupported azimuth count: azCount[%zu]=%d (%d to %d)\n", i, elevs[i].azCount,
+                MinAzCount, MaxAzCount);
+            return nullptr;
+        }
+    }
+
+    /* Each elevation's IRs start where the previous elevation's end; the
+     * total IR count is where the last elevation's end.
+     */
+    elevs[0].irOffset = 0;
+    for(size_t i{1};i < evCount;i++)
+        elevs[i].irOffset = static_cast<ushort>(elevs[i-1].irOffset + elevs[i-1].azCount);
+    const ushort irCount{static_cast<ushort>(elevs.back().irOffset + elevs.back().azCount)};
+
+    /* Only the first irSize samples of each response are read; the remainder
+     * keeps its initial value. Samples are scaled from 16-bit integers.
+     */
+    auto coeffs = al::vector<HrirArray>(irCount, HrirArray{});
+    auto delays = al::vector<ubyte2>(irCount);
+    for(auto &hrir : coeffs)
+    {
+        for(auto &val : al::span<float2>{hrir.data(), irSize})
+            val[0] = readle<int16_t>(data) / 32768.0f;
+    }
+    for(auto &val : delays)
+        val[0] = readle<uint8_t>(data);
+    if(!data || data.eof())
+    {
+        ERR("Failed reading %s\n", filename);
+        return nullptr;
+    }
+    for(size_t i{0};i < irCount;i++)
+    {
+        if(delays[i][0] > MaxHrirDelay)
+        {
+            ERR("Invalid delays[%zu]: %d (%d)\n", i, delays[i][0], MaxHrirDelay);
+            return nullptr;
+        }
+        /* The file holds whole-sample delays; shift them up to make room for
+         * the fractional bits.
+         */
+        delays[i][0] <<= HrirDelayFracBits;
+    }
+
+    /* Mirror the left ear responses to the right ear. */
+    MirrorLeftHrirs({elevs.data(), elevs.size()}, coeffs.data(), delays.data());
+
+    const HrtfStore::Field field[1]{{0.0f, evCount}};
+    return CreateHrtfStore(rate, irSize, field, {elevs.data(), elevs.size()}, coeffs.data(),
+        delays.data(), filename);
+}
+
+/* Loads a format v2 data set from the stream, which is expected to be
+ * positioned just past the magic marker. The header holds the sample rate
+ * (32-bit), sample type, channel type, IR size, and field count (8-bit each).
+ * Each field then provides its distance in millimeters (16-bit), its
+ * elevation count (8-bit), and an 8-bit azimuth count per elevation. After
+ * that come the samples for every IR (16- or 24-bit, left-only or interleaved
+ * left/right) and the 8-bit delays for every IR. Left-only data is mirrored
+ * to the right ear. Fields must be given in order of increasing distance;
+ * with more than one field, the field order (and the elevations, IRs, and
+ * delays belonging to each field) is reversed before being stored. Returns
+ * null, after logging an error, if reading fails or a value is out of range.
+ * filename is only used for log messages.
+ */
+std::unique_ptr<HrtfStore> LoadHrtf02(std::istream &data, const char *filename)
+{
+    constexpr ubyte SampleType_S16{0};
+    constexpr ubyte SampleType_S24{1};
+    constexpr ubyte ChanType_LeftOnly{0};
+    constexpr ubyte ChanType_LeftRight{1};
+
+    uint rate{readle<uint32_t>(data)};
+    ubyte sampleType{readle<uint8_t>(data)};
+    ubyte channelType{readle<uint8_t>(data)};
+    uint8_t irSize{readle<uint8_t>(data)};
+    ubyte fdCount{readle<uint8_t>(data)};
+    if(!data || data.eof())
+    {
+        ERR("Failed reading %s\n", filename);
+        return nullptr;
+    }
+
+    if(sampleType > SampleType_S24)
+    {
+        ERR("Unsupported sample type: %d\n", sampleType);
+        return nullptr;
+    }
+    if(channelType > ChanType_LeftRight)
+    {
+        ERR("Unsupported channel type: %d\n", channelType);
+        return nullptr;
+    }
+
+    if(irSize < MinIrLength || irSize > HrirLength)
+    {
+        ERR("Unsupported HRIR size, irSize=%d (%d to %d)\n", irSize, MinIrLength, HrirLength);
+        return nullptr;
+    }
+    if(fdCount < 1 || fdCount > MaxFdCount)
+    {
+        ERR("Unsupported number of field-depths: fdCount=%d (%d to %d)\n", fdCount, MinFdCount,
+            MaxFdCount);
+        return nullptr;
+    }
+
+    /* Read each field's header. The elevations of all fields are collected
+     * into one flat list, in field order.
+     */
+    auto fields = al::vector<HrtfStore::Field>(fdCount);
+    auto elevs = al::vector<HrtfStore::Elevation>{};
+    for(size_t f{0};f < fdCount;f++)
+    {
+        const ushort distance{readle<uint16_t>(data)};
+        const ubyte evCount{readle<uint8_t>(data)};
+        if(!data || data.eof())
+        {
+            ERR("Failed reading %s\n", filename);
+            return nullptr;
+        }
+
+        if(distance < MinFdDistance || distance > MaxFdDistance)
+        {
+            ERR("Unsupported field distance[%zu]=%d (%d to %d millimeters)\n", f, distance,
+                MinFdDistance, MaxFdDistance);
+            return nullptr;
+        }
+        if(evCount < MinEvCount || evCount > MaxEvCount)
+        {
+            ERR("Unsupported elevation count: evCount[%zu]=%d (%d to %d)\n", f, evCount,
+                MinEvCount, MaxEvCount);
+            return nullptr;
+        }
+
+        /* Distances are stored in millimeters, and kept in meters. */
+        fields[f].distance = distance / 1000.0f;
+        fields[f].evCount = evCount;
+        if(f > 0 && fields[f].distance <= fields[f-1].distance)
+        {
+            ERR("Field distance[%zu] is not after previous (%f > %f)\n", f, fields[f].distance,
+                fields[f-1].distance);
+            return nullptr;
+        }
+
+        const size_t ebase{elevs.size()};
+        elevs.resize(ebase + evCount);
+        for(auto &elev : al::span<HrtfStore::Elevation>(elevs.data()+ebase, evCount))
+            elev.azCount = readle<uint8_t>(data);
+        if(!data || data.eof())
+        {
+            ERR("Failed reading %s\n", filename);
+            return nullptr;
+        }
+
+        for(size_t e{0};e < evCount;e++)
+        {
+            if(elevs[ebase+e].azCount < MinAzCount || elevs[ebase+e].azCount > MaxAzCount)
+            {
+                ERR("Unsupported azimuth count: azCount[%zu][%zu]=%d (%d to %d)\n", f, e,
+                    elevs[ebase+e].azCount, MinAzCount, MaxAzCount);
+                return nullptr;
+            }
+        }
+    }
+
+    /* The IR offsets and the total IR count below are held as ushort. Sum the
+     * azimuth counts at full width first, and reject the file if the total
+     * wouldn't survive the conversion. Otherwise the offsets would wrap and
+     * end up indexing past the coefficient and delay storage, which is sized
+     * from the (wrapped) total.
+     */
+    const size_t irSum{std::accumulate(elevs.cbegin(), elevs.cend(), size_t{0},
+        [](const size_t count, const HrtfStore::Elevation &elev) noexcept -> size_t
+        { return count + elev.azCount; })};
+    if(irSum != static_cast<size_t>(static_cast<ushort>(irSum)))
+    {
+        ERR("Unsupported total HRIR count in %s: %zu\n", filename, irSum);
+        return nullptr;
+    }
+
+    /* Turn the azimuth counts into running IR offsets. */
+    elevs[0].irOffset = 0;
+    std::partial_sum(elevs.cbegin(), elevs.cend(), elevs.begin(),
+        [](const HrtfStore::Elevation &last, const HrtfStore::Elevation &cur)
+            -> HrtfStore::Elevation
+        {
+            return HrtfStore::Elevation{cur.azCount,
+                static_cast<ushort>(last.azCount + last.irOffset)};
+        });
+    const auto irTotal = static_cast<ushort>(elevs.back().azCount + elevs.back().irOffset);
+
+    /* Only the first irSize samples of each response are read; the remainder
+     * keeps its initial value.
+     */
+    auto coeffs = al::vector<HrirArray>(irTotal, HrirArray{});
+    auto delays = al::vector<ubyte2>(irTotal);
+    if(channelType == ChanType_LeftOnly)
+    {
+        if(sampleType == SampleType_S16)
+        {
+            for(auto &hrir : coeffs)
+            {
+                for(auto &val : al::span<float2>{hrir.data(), irSize})
+                    val[0] = readle<int16_t>(data) / 32768.0f;
+            }
+        }
+        else if(sampleType == SampleType_S24)
+        {
+            for(auto &hrir : coeffs)
+            {
+                for(auto &val : al::span<float2>{hrir.data(), irSize})
+                    val[0] = static_cast<float>(readle<int,24>(data)) / 8388608.0f;
+            }
+        }
+        for(auto &val : delays)
+            val[0] = readle<uint8_t>(data);
+        if(!data || data.eof())
+        {
+            ERR("Failed reading %s\n", filename);
+            return nullptr;
+        }
+        for(size_t i{0};i < irTotal;++i)
+        {
+            if(delays[i][0] > MaxHrirDelay)
+            {
+                ERR("Invalid delays[%zu][0]: %d (%d)\n", i, delays[i][0], MaxHrirDelay);
+                return nullptr;
+            }
+            /* The file holds whole-sample delays; shift them up to make room
+             * for the fractional bits.
+             */
+            delays[i][0] <<= HrirDelayFracBits;
+        }
+
+        /* Mirror the left ear responses to the right ear. */
+        MirrorLeftHrirs({elevs.data(), elevs.size()}, coeffs.data(), delays.data());
+    }
+    else if(channelType == ChanType_LeftRight)
+    {
+        /* Left and right samples are interleaved per sample index. */
+        if(sampleType == SampleType_S16)
+        {
+            for(auto &hrir : coeffs)
+            {
+                for(auto &val : al::span<float2>{hrir.data(), irSize})
+                {
+                    val[0] = readle<int16_t>(data) / 32768.0f;
+                    val[1] = readle<int16_t>(data) / 32768.0f;
+                }
+            }
+        }
+        else if(sampleType == SampleType_S24)
+        {
+            for(auto &hrir : coeffs)
+            {
+                for(auto &val : al::span<float2>{hrir.data(), irSize})
+                {
+                    val[0] = static_cast<float>(readle<int,24>(data)) / 8388608.0f;
+                    val[1] = static_cast<float>(readle<int,24>(data)) / 8388608.0f;
+                }
+            }
+        }
+        for(auto &val : delays)
+        {
+            val[0] = readle<uint8_t>(data);
+            val[1] = readle<uint8_t>(data);
+        }
+        if(!data || data.eof())
+        {
+            ERR("Failed reading %s\n", filename);
+            return nullptr;
+        }
+
+        for(size_t i{0};i < irTotal;++i)
+        {
+            if(delays[i][0] > MaxHrirDelay)
+            {
+                ERR("Invalid delays[%zu][0]: %d (%d)\n", i, delays[i][0], MaxHrirDelay);
+                return nullptr;
+            }
+            if(delays[i][1] > MaxHrirDelay)
+            {
+                ERR("Invalid delays[%zu][1]: %d (%d)\n", i, delays[i][1], MaxHrirDelay);
+                return nullptr;
+            }
+            delays[i][0] <<= HrirDelayFracBits;
+            delays[i][1] <<= HrirDelayFracBits;
+        }
+    }
+
+    if(fdCount > 1)
+    {
+        /* Reverse the field order, carrying each field's elevations, IRs, and
+         * delays along with it.
+         */
+        auto fields_ = al::vector<HrtfStore::Field>(fields.size());
+        auto elevs_ = al::vector<HrtfStore::Elevation>(elevs.size());
+        auto coeffs_ = al::vector<HrirArray>(coeffs.size());
+        auto delays_ = al::vector<ubyte2>(delays.size());
+
+        /* Simple reverse for the per-field elements. */
+        std::reverse_copy(fields.cbegin(), fields.cend(), fields_.begin());
+
+        /* Each field has a group of elevations, which each have an azimuth
+         * count. Reverse the order of the groups, keeping the relative order
+         * of per-group azimuth counts.
+         */
+        auto elevs__end = elevs_.end();
+        auto copy_azs = [&elevs,&elevs__end](const ptrdiff_t ebase, const HrtfStore::Field &field)
+            -> ptrdiff_t
+        {
+            auto elevs_src = elevs.begin()+ebase;
+            elevs__end = std::copy_backward(elevs_src, elevs_src+field.evCount, elevs__end);
+            return ebase + field.evCount;
+        };
+        /* accumulate is used to thread the running elevation base through
+         * the fields; its result isn't needed.
+         */
+        (void)std::accumulate(fields.cbegin(), fields.cend(), ptrdiff_t{0}, copy_azs);
+        assert(elevs_.begin() == elevs__end);
+
+        /* Reestablish the IR offset for each elevation index, given the new
+         * ordering of elevations.
+         */
+        elevs_[0].irOffset = 0;
+        std::partial_sum(elevs_.cbegin(), elevs_.cend(), elevs_.begin(),
+            [](const HrtfStore::Elevation &last, const HrtfStore::Elevation &cur)
+                -> HrtfStore::Elevation
+            {
+                return HrtfStore::Elevation{cur.azCount,
+                    static_cast<ushort>(last.azCount + last.irOffset)};
+            });
+
+        /* Reverse the order of each field's group of IRs. */
+        auto coeffs_end = coeffs_.end();
+        auto delays_end = delays_.end();
+        auto copy_irs = [&elevs,&coeffs,&delays,&coeffs_end,&delays_end](
+            const ptrdiff_t ebase, const HrtfStore::Field &field) -> ptrdiff_t
+        {
+            auto accum_az = [](int count, const HrtfStore::Elevation &elev) noexcept -> int
+            { return count + elev.azCount; };
+            const auto elevs_mid = elevs.cbegin() + ebase;
+            const auto elevs_end = elevs_mid + field.evCount;
+            /* abase is the index of this field's first IR in the original
+             * ordering, num_azs the number of IRs the field holds.
+             */
+            const int abase{std::accumulate(elevs.cbegin(), elevs_mid, 0, accum_az)};
+            const int num_azs{std::accumulate(elevs_mid, elevs_end, 0, accum_az)};
+
+            coeffs_end = std::copy_backward(coeffs.cbegin() + abase,
+                coeffs.cbegin() + (abase+num_azs), coeffs_end);
+            delays_end = std::copy_backward(delays.cbegin() + abase,
+                delays.cbegin() + (abase+num_azs), delays_end);
+
+            return ebase + field.evCount;
+        };
+        (void)std::accumulate(fields.cbegin(), fields.cend(), ptrdiff_t{0}, copy_irs);
+        assert(coeffs_.begin() == coeffs_end);
+        assert(delays_.begin() == delays_end);
+
+        fields = std::move(fields_);
+        elevs = std::move(elevs_);
+        coeffs = std::move(coeffs_);
+        delays = std::move(delays_);
+    }
+
+    return CreateHrtfStore(rate, irSize, {fields.data(), fields.size()},
+        {elevs.data(), elevs.size()}, coeffs.data(), delays.data(), filename);
+}
+
+/* Loads a format v3 data set from the stream, which is expected to be
+ * positioned just past the magic marker. The header holds the sample rate
+ * (32-bit), channel type, IR size, and field count (8-bit each). Each field
+ * then provides its distance in millimeters (16-bit), its elevation count
+ * (8-bit), and an 8-bit azimuth count per elevation. After that come the
+ * 24-bit samples for every IR (left-only or interleaved left/right) and the
+ * 8-bit delays for every IR. Unlike the older formats, a field's distance may
+ * not be greater than the previous field's, and the delays are stored as read
+ * (they're validated against MaxHrirDelay<<HrirDelayFracBits rather than
+ * being shifted). Left-only data is mirrored to the right ear. Returns null,
+ * after logging an error, if reading fails or a value is out of range.
+ * filename is only used for log messages.
+ */
+std::unique_ptr<HrtfStore> LoadHrtf03(std::istream &data, const char *filename)
+{
+    constexpr ubyte ChanType_LeftOnly{0};
+    constexpr ubyte ChanType_LeftRight{1};
+
+    uint rate{readle<uint32_t>(data)};
+    ubyte channelType{readle<uint8_t>(data)};
+    uint8_t irSize{readle<uint8_t>(data)};
+    ubyte fdCount{readle<uint8_t>(data)};
+    if(!data || data.eof())
+    {
+        ERR("Failed reading %s\n", filename);
+        return nullptr;
+    }
+
+    if(channelType > ChanType_LeftRight)
+    {
+        ERR("Unsupported channel type: %d\n", channelType);
+        return nullptr;
+    }
+
+    if(irSize < MinIrLength || irSize > HrirLength)
+    {
+        ERR("Unsupported HRIR size, irSize=%d (%d to %d)\n", irSize, MinIrLength, HrirLength);
+        return nullptr;
+    }
+    if(fdCount < 1 || fdCount > MaxFdCount)
+    {
+        ERR("Unsupported number of field-depths: fdCount=%d (%d to %d)\n", fdCount, MinFdCount,
+            MaxFdCount);
+        return nullptr;
+    }
+
+    /* Read each field's header. The elevations of all fields are collected
+     * into one flat list, in field order.
+     */
+    auto fields = al::vector<HrtfStore::Field>(fdCount);
+    auto elevs = al::vector<HrtfStore::Elevation>{};
+    for(size_t f{0};f < fdCount;f++)
+    {
+        const ushort distance{readle<uint16_t>(data)};
+        const ubyte evCount{readle<uint8_t>(data)};
+        if(!data || data.eof())
+        {
+            ERR("Failed reading %s\n", filename);
+            return nullptr;
+        }
+
+        if(distance < MinFdDistance || distance > MaxFdDistance)
+        {
+            ERR("Unsupported field distance[%zu]=%d (%d to %d millimeters)\n", f, distance,
+                MinFdDistance, MaxFdDistance);
+            return nullptr;
+        }
+        if(evCount < MinEvCount || evCount > MaxEvCount)
+        {
+            ERR("Unsupported elevation count: evCount[%zu]=%d (%d to %d)\n", f, evCount,
+                MinEvCount, MaxEvCount);
+            return nullptr;
+        }
+
+        /* Distances are stored in millimeters, and kept in meters. */
+        fields[f].distance = distance / 1000.0f;
+        fields[f].evCount = evCount;
+        if(f > 0 && fields[f].distance > fields[f-1].distance)
+        {
+            ERR("Field distance[%zu] is not before previous (%f <= %f)\n", f, fields[f].distance,
+                fields[f-1].distance);
+            return nullptr;
+        }
+
+        const size_t ebase{elevs.size()};
+        elevs.resize(ebase + evCount);
+        for(auto &elev : al::span<HrtfStore::Elevation>(elevs.data()+ebase, evCount))
+            elev.azCount = readle<uint8_t>(data);
+        if(!data || data.eof())
+        {
+            ERR("Failed reading %s\n", filename);
+            return nullptr;
+        }
+
+        for(size_t e{0};e < evCount;e++)
+        {
+            if(elevs[ebase+e].azCount < MinAzCount || elevs[ebase+e].azCount > MaxAzCount)
+            {
+                ERR("Unsupported azimuth count: azCount[%zu][%zu]=%d (%d to %d)\n", f, e,
+                    elevs[ebase+e].azCount, MinAzCount, MaxAzCount);
+                return nullptr;
+            }
+        }
+    }
+
+    /* The IR offsets and the total IR count below are held as ushort. Sum the
+     * azimuth counts at full width first, and reject the file if the total
+     * wouldn't survive the conversion. Otherwise the offsets would wrap and
+     * end up indexing past the coefficient and delay storage, which is sized
+     * from the (wrapped) total.
+     */
+    const size_t irSum{std::accumulate(elevs.cbegin(), elevs.cend(), size_t{0},
+        [](const size_t count, const HrtfStore::Elevation &elev) noexcept -> size_t
+        { return count + elev.azCount; })};
+    if(irSum != static_cast<size_t>(static_cast<ushort>(irSum)))
+    {
+        ERR("Unsupported total HRIR count in %s: %zu\n", filename, irSum);
+        return nullptr;
+    }
+
+    /* Turn the azimuth counts into running IR offsets. */
+    elevs[0].irOffset = 0;
+    std::partial_sum(elevs.cbegin(), elevs.cend(), elevs.begin(),
+        [](const HrtfStore::Elevation &last, const HrtfStore::Elevation &cur)
+            -> HrtfStore::Elevation
+        {
+            return HrtfStore::Elevation{cur.azCount,
+                static_cast<ushort>(last.azCount + last.irOffset)};
+        });
+    const auto irTotal = static_cast<ushort>(elevs.back().azCount + elevs.back().irOffset);
+
+    /* Only the first irSize samples of each response are read; the remainder
+     * keeps its initial value. Samples are scaled from 24-bit integers.
+     */
+    auto coeffs = al::vector<HrirArray>(irTotal, HrirArray{});
+    auto delays = al::vector<ubyte2>(irTotal);
+    if(channelType == ChanType_LeftOnly)
+    {
+        for(auto &hrir : coeffs)
+        {
+            for(auto &val : al::span<float2>{hrir.data(), irSize})
+                val[0] = static_cast<float>(readle<int,24>(data)) / 8388608.0f;
+        }
+        for(auto &val : delays)
+            val[0] = readle<uint8_t>(data);
+        if(!data || data.eof())
+        {
+            ERR("Failed reading %s\n", filename);
+            return nullptr;
+        }
+        for(size_t i{0};i < irTotal;++i)
+        {
+            if(delays[i][0] > MaxHrirDelay<<HrirDelayFracBits)
+            {
+                ERR("Invalid delays[%zu][0]: %f (%d)\n", i,
+                    delays[i][0] / float{HrirDelayFracOne}, MaxHrirDelay);
+                return nullptr;
+            }
+        }
+
+        /* Mirror the left ear responses to the right ear. */
+        MirrorLeftHrirs({elevs.data(), elevs.size()}, coeffs.data(), delays.data());
+    }
+    else if(channelType == ChanType_LeftRight)
+    {
+        /* Left and right samples are interleaved per sample index. */
+        for(auto &hrir : coeffs)
+        {
+            for(auto &val : al::span<float2>{hrir.data(), irSize})
+            {
+                val[0] = static_cast<float>(readle<int,24>(data)) / 8388608.0f;
+                val[1] = static_cast<float>(readle<int,24>(data)) / 8388608.0f;
+            }
+        }
+        for(auto &val : delays)
+        {
+            val[0] = readle<uint8_t>(data);
+            val[1] = readle<uint8_t>(data);
+        }
+        if(!data || data.eof())
+        {
+            ERR("Failed reading %s\n", filename);
+            return nullptr;
+        }
+
+        for(size_t i{0};i < irTotal;++i)
+        {
+            if(delays[i][0] > MaxHrirDelay<<HrirDelayFracBits)
+            {
+                ERR("Invalid delays[%zu][0]: %f (%d)\n", i,
+                    delays[i][0] / float{HrirDelayFracOne}, MaxHrirDelay);
+                return nullptr;
+            }
+            if(delays[i][1] > MaxHrirDelay<<HrirDelayFracBits)
+            {
+                ERR("Invalid delays[%zu][1]: %f (%d)\n", i,
+                    delays[i][1] / float{HrirDelayFracOne}, MaxHrirDelay);
+                return nullptr;
+            }
+        }
+    }
+
+    return CreateHrtfStore(rate, irSize, {fields.data(), fields.size()},
+        {elevs.data(), elevs.size()}, coeffs.data(), delays.data(), filename);
+}
+
+
+/* Reports whether an enumerated HRTF entry already uses the given display
+ * name.
+ */
+bool checkName(const std::string &name)
+{
+    for(const HrtfEntry &entry : EnumeratedHrtfs)
+    {
+        if(name == entry.mDispName)
+            return true;
+    }
+    return false;
+}
+
+/* Adds filename to the enumerated HRTF list unless it's already there. The
+ * display name is the file's base name with the extension removed; if that
+ * name is taken, " #2", " #3", and so on is appended until it's unique.
+ */
+void AddFileEntry(const std::string &filename)
+{
+    /* Nothing to do if this file has already been enumerated. */
+    for(const HrtfEntry &known : EnumeratedHrtfs)
+    {
+        if(known.mFilename == filename)
+        {
+            TRACE("Skipping duplicate file entry %s\n", filename.c_str());
+            return;
+        }
+    }
+
+    /* TODO: Get a human-readable name from the HRTF data (possibly coming in a
+     * format update). */
+
+    /* The name starts after the last '/', or after the last '\\' when there
+     * is no '/', or at the beginning when there is neither.
+     */
+    size_t start{filename.rfind('/')};
+    if(start == std::string::npos)
+        start = filename.rfind('\\');
+    start = (start == std::string::npos) ? size_t{0} : start+1;
+
+    /* Drop the extension, but only if the last '.' comes after the first
+     * character of the name.
+     */
+    const size_t dot{filename.rfind('.')};
+    const size_t length{(dot != std::string::npos && dot > start) ? dot-start
+        : std::string::npos};
+    const std::string basename{filename.substr(start, length)};
+
+    std::string newname{basename};
+    for(int suffix{2};checkName(newname);++suffix)
+    {
+        newname = basename;
+        newname += " #";
+        newname += std::to_string(suffix);
+    }
+
+    EnumeratedHrtfs.emplace_back(HrtfEntry{newname, filename});
+    const HrtfEntry &added = EnumeratedHrtfs.back();
+
+    TRACE("Adding file entry \"%s\"\n", added.mFilename.c_str());
+}
+
+/* Counterpart of AddFileEntry for built-in data sets. The entry is recorded
+ * under the pseudo-filename "!<residx>_<dispname>" and uses dispname as its
+ * display name; if that name is taken, " #2", " #3", and so on is appended
+ * until it's unique.
+ */
+void AddBuiltInEntry(const std::string &dispname, uint residx)
+{
+    const std::string filename{'!'+std::to_string(residx)+'_'+dispname};
+
+    /* Nothing to do if this resource has already been enumerated. */
+    for(const HrtfEntry &known : EnumeratedHrtfs)
+    {
+        if(known.mFilename == filename)
+        {
+            TRACE("Skipping duplicate file entry %s\n", filename.c_str());
+            return;
+        }
+    }
+
+    /* TODO: Get a human-readable name from the HRTF data (possibly coming in a
+     * format update). */
+
+    std::string newname{dispname};
+    for(int suffix{2};checkName(newname);++suffix)
+    {
+        newname = dispname;
+        newname += " #";
+        newname += std::to_string(suffix);
+    }
+
+    EnumeratedHrtfs.emplace_back(HrtfEntry{newname, filename});
+    const HrtfEntry &added = EnumeratedHrtfs.back();
+
+    TRACE("Adding built-in entry \"%s\"\n", added.mFilename.c_str());
+}
+
+
+/* Resource index of the default built-in HRTF data set. */
+#define IDR_DEFAULT_HRTF_MHR 1
+
+#ifdef ALSOFT_EMBED_HRTF_DATA
+
+/* The embedded data set, pulled in as a list of byte values. */
+constexpr unsigned char hrtf_default[]{
+#include "default_hrtf.txt"
+};
+
+/* Returns the bytes of the named built-in resource, or an empty span if the
+ * name isn't recognized.
+ */
+al::span<const char> GetResource(int name)
+{
+    if(name != IDR_DEFAULT_HRTF_MHR)
+        return {};
+    return {reinterpret_cast<const char*>(hrtf_default), sizeof(hrtf_default)};
+}
+
+#else
+
+/* Without embedded HRTF data there are no built-in resources, so every
+ * lookup yields an empty span.
+ */
+al::span<const char> GetResource(int /*name*/)
+{ return {}; }
+
+#endif
+
+} // namespace
+
+
+/* Rebuilds the list of enumerated HRTFs and returns their display names.
+ *
+ * pathopt, when set, is a comma-separated list of paths to search for .mhr
+ * files; whitespace around each element is ignored and empty elements are
+ * skipped. The default location ("openal/hrtf") and the built-in data set (if
+ * one is available) are included as well unless the list ends with an element
+ * that has no comma after it. The enumeration lock is held for the duration
+ * of the call.
+ */
+al::vector<std::string> EnumerateHrtf(al::optional<std::string> pathopt)
+{
+    std::lock_guard<std::mutex> _{EnumeratedHrtfLock};
+    EnumeratedHrtfs.clear();
+
+    /* isspace must not be given a negative value other than EOF, which a
+     * plain char can be for non-ASCII bytes in a path.
+     */
+    auto is_space = [](const char ch) noexcept -> bool
+    { return isspace(static_cast<unsigned char>(ch)) != 0; };
+
+    bool usedefaults{true};
+    if(pathopt)
+    {
+        const char *pathlist{pathopt->c_str()};
+        while(pathlist && *pathlist)
+        {
+            /* Skip leading whitespace and empty elements. */
+            while(is_space(*pathlist) || *pathlist == ',')
+                pathlist++;
+            if(*pathlist == '\0')
+                break;
+
+            const char *next{strchr(pathlist, ',')};
+            const char *end{nullptr};
+            if(next)
+                end = next++;
+            else
+            {
+                /* The final element isn't followed by a comma, so the
+                 * defaults aren't wanted.
+                 */
+                end = pathlist + strlen(pathlist);
+                usedefaults = false;
+            }
+
+            /* Trim trailing whitespace from the element. */
+            while(end != pathlist && is_space(*(end-1)))
+                --end;
+            if(end != pathlist)
+            {
+                const std::string pname{pathlist, end};
+                for(const auto &fname : SearchDataFiles(".mhr", pname.c_str()))
+                    AddFileEntry(fname);
+            }
+
+            pathlist = next;
+        }
+    }
+
+    if(usedefaults)
+    {
+        for(const auto &fname : SearchDataFiles(".mhr", "openal/hrtf"))
+            AddFileEntry(fname);
+
+        if(!GetResource(IDR_DEFAULT_HRTF_MHR).empty())
+            AddBuiltInEntry("Built-In HRTF", IDR_DEFAULT_HRTF_MHR);
+    }
+
+    al::vector<std::string> list;
+    list.reserve(EnumeratedHrtfs.size());
+    for(const auto &entry : EnumeratedHrtfs)
+        list.emplace_back(entry.mDispName);
+
+    return list;
+}
+
+/* Returns the HRTF data set enumerated under the display name `name`, set up
+ * for the sample rate `devrate`. An already loaded data set with a matching
+ * file name and sample rate is reused (with its reference count incremented).
+ * Otherwise the data is read from its embedded resource or file, resampled to
+ * `devrate` if its native rate differs, and added to the loaded list. Returns
+ * null if the name isn't enumerated, or the data can't be opened or parsed.
+ */
+HrtfStorePtr GetLoadedHrtf(const std::string &name, const uint devrate)
+{
+    /* Both locks are held until the function returns. The guards use plain
+     * names, since identifiers containing a double underscore are reserved.
+     */
+    std::lock_guard<std::mutex> enumlock{EnumeratedHrtfLock};
+    auto entry_iter = std::find_if(EnumeratedHrtfs.cbegin(), EnumeratedHrtfs.cend(),
+        [&name](const HrtfEntry &entry) -> bool { return entry.mDispName == name; });
+    if(entry_iter == EnumeratedHrtfs.cend())
+        return nullptr;
+    const std::string &fname = entry_iter->mFilename;
+
+    /* The loaded list is searched with lower_bound on the file name, so it's
+     * expected to be kept sorted by file name. Several entries may share a
+     * file name, one for each sample rate it was prepared for.
+     */
+    std::lock_guard<std::mutex> loadlock{LoadedHrtfLock};
+    auto hrtf_lt_fname = [](LoadedHrtf &hrtf, const std::string &filename) -> bool
+    { return hrtf.mFilename < filename; };
+    auto handle = std::lower_bound(LoadedHrtfs.begin(), LoadedHrtfs.end(), fname, hrtf_lt_fname);
+    while(handle != LoadedHrtfs.end() && handle->mFilename == fname)
+    {
+        HrtfStore *hrtf{handle->mEntry.get()};
+        if(hrtf && hrtf->mSampleRate == devrate)
+        {
+            hrtf->add_ref();
+            return HrtfStorePtr{hrtf};
+        }
+        ++handle;
+    }
+    /* handle is now one past the last entry for this file name (or at the
+     * insertion point if there were none), which is where a new entry goes.
+     */
+
+    TRACE("Loading %s...\n", fname.c_str());
+
+    /* A file name of the form "!<index>_..." refers to an embedded resource
+     * rather than a file on disk.
+     */
+    std::unique_ptr<std::istream> stream;
+    int residx{};
+    char ch{};
+    if(sscanf(fname.c_str(), "!%d%c", &residx, &ch) == 2 && ch == '_')
+    {
+        al::span<const char> res{GetResource(residx)};
+        if(res.empty())
+        {
+            /* residx is a signed int, so it's printed with %d. */
+            ERR("Could not get resource %d, %s\n", residx, name.c_str());
+            return nullptr;
+        }
+        stream = std::make_unique<idstream>(res.begin(), res.end());
+    }
+    else
+    {
+        auto fstr = std::make_unique<al::ifstream>(fname.c_str(), std::ios::binary);
+        if(!fstr->is_open())
+        {
+            ERR("Could not open %s\n", fname.c_str());
+            return nullptr;
+        }
+        stream = std::move(fstr);
+    }
+
+    /* Identify the data set format from the leading magic marker and hand the
+     * rest of the stream to the matching loader.
+     */
+    std::unique_ptr<HrtfStore> hrtf;
+    char magic[sizeof(magicMarker03)];
+    stream->read(magic, sizeof(magic));
+    if(stream->gcount() < static_cast<std::streamsize>(sizeof(magicMarker03)))
+    {
+        /* gcount() returns a signed std::streamsize; convert it to match the
+         * %zu conversion.
+         */
+        ERR("%s data is too short (%zu bytes)\n", name.c_str(),
+            static_cast<size_t>(stream->gcount()));
+    }
+    else if(memcmp(magic, magicMarker03, sizeof(magicMarker03)) == 0)
+    {
+        TRACE("Detected data set format v3\n");
+        hrtf = LoadHrtf03(*stream, name.c_str());
+    }
+    else if(memcmp(magic, magicMarker02, sizeof(magicMarker02)) == 0)
+    {
+        TRACE("Detected data set format v2\n");
+        hrtf = LoadHrtf02(*stream, name.c_str());
+    }
+    else if(memcmp(magic, magicMarker01, sizeof(magicMarker01)) == 0)
+    {
+        TRACE("Detected data set format v1\n");
+        hrtf = LoadHrtf01(*stream, name.c_str());
+    }
+    else if(memcmp(magic, magicMarker00, sizeof(magicMarker00)) == 0)
+    {
+        TRACE("Detected data set format v0\n");
+        hrtf = LoadHrtf00(*stream, name.c_str());
+    }
+    else
+        ERR("Invalid header in %s: \"%.8s\"\n", name.c_str(), magic);
+    stream.reset();
+
+    if(!hrtf)
+    {
+        ERR("Failed to load %s\n", name.c_str());
+        return nullptr;
+    }
+
+    if(hrtf->mSampleRate != devrate)
+    {
+        TRACE("Resampling HRTF %s (%uhz -> %uhz)\n", name.c_str(), hrtf->mSampleRate, devrate);
+
+        /* Calculate the last elevation's index and get the total IR count.
+         * NOTE(review): this assumes the loaders never produce a data set
+         * with zero elevations, otherwise the subtraction wraps.
+         */
+        const size_t lastEv{std::accumulate(hrtf->mFields.begin(), hrtf->mFields.end(), size_t{0},
+            [](const size_t curval, const HrtfStore::Field &field) noexcept -> size_t
+            { return curval + field.evCount; }
+        ) - 1};
+        const size_t irCount{size_t{hrtf->mElev[lastEv].irOffset} + hrtf->mElev[lastEv].azCount};
+
+        /* Resample all the IRs. Each ear's response is converted to double,
+         * run through the resampler, and written back in place.
+         */
+        std::array<std::array<double,HrirLength>,2> inout;
+        PPhaseResampler rs;
+        rs.init(hrtf->mSampleRate, devrate);
+        for(size_t i{0};i < irCount;++i)
+        {
+            /* The store exposes its coefficients as const; this newly loaded
+             * store hasn't been handed out yet, so they're modified in place.
+             */
+            HrirArray &coeffs = const_cast<HrirArray&>(hrtf->mCoeffs[i]);
+            for(size_t j{0};j < 2;++j)
+            {
+                std::transform(coeffs.cbegin(), coeffs.cend(), inout[0].begin(),
+                    [j](const float2 &in) noexcept -> double { return in[j]; });
+                rs.process(HrirLength, inout[0].data(), HrirLength, inout[1].data());
+                for(size_t k{0};k < HrirLength;++k)
+                    coeffs[k][j] = static_cast<float>(inout[1][k]);
+            }
+        }
+        rs = {};
+
+        /* Scale the delays for the new sample rate. The stored delays are
+         * divided by HrirDelayFracOne here, so new_delays and max_delay are
+         * comparable with MaxHrirDelay.
+         */
+        float max_delay{0.0f};
+        auto new_delays = al::vector<float2>(irCount);
+        const float rate_scale{static_cast<float>(devrate)/static_cast<float>(hrtf->mSampleRate)};
+        for(size_t i{0};i < irCount;++i)
+        {
+            for(size_t j{0};j < 2;++j)
+            {
+                const float new_delay{std::round(hrtf->mDelays[i][j] * rate_scale) /
+                    float{HrirDelayFracOne}};
+                max_delay = maxf(max_delay, new_delay);
+                new_delays[i][j] = new_delay;
+            }
+        }
+
+        /* If the new delays exceed the max, scale it down to fit (essentially
+         * shrinking the head radius; not ideal but better than a per-delay
+         * clamp).
+         */
+        float delay_scale{HrirDelayFracOne};
+        if(max_delay > MaxHrirDelay)
+        {
+            WARN("Resampled delay exceeds max (%.2f > %d)\n", max_delay, MaxHrirDelay);
+            delay_scale *= float{MaxHrirDelay} / max_delay;
+        }
+
+        /* Store the delays back, multiplied by delay_scale and rounded. */
+        for(size_t i{0};i < irCount;++i)
+        {
+            ubyte2 &delays = const_cast<ubyte2&>(hrtf->mDelays[i]);
+            for(size_t j{0};j < 2;++j)
+                delays[j] = static_cast<ubyte>(float2int(new_delays[i][j]*delay_scale + 0.5f));
+        }
+
+        /* Scale the IR size for the new sample rate and update the stored
+         * sample rate.
+         */
+        const float newIrSize{std::round(static_cast<float>(hrtf->mIrSize) * rate_scale)};
+        hrtf->mIrSize = static_cast<uint8_t>(minf(HrirLength, newIrSize));
+        hrtf->mSampleRate = devrate;
+    }
+
+    TRACE("Loaded HRTF %s for sample rate %uhz, %u-sample filter\n", name.c_str(),
+        hrtf->mSampleRate, hrtf->mIrSize);
+    handle = LoadedHrtfs.emplace(handle, fname, std::move(hrtf));
+
+    /* No add_ref() here, unlike the reuse path above.
+     * NOTE(review): presumably the loaders create the store already holding
+     * the reference being returned -- confirm against the loader code.
+     */
+    return HrtfStorePtr{handle->mEntry.get()};
+}
+
+
+/* Increments the store's reference count and traces the resulting count. */
+void HrtfStore::add_ref()
+{
+    const auto newcount = IncrementRef(mRef);
+    TRACE("HrtfStore %p increasing refcount to %u\n", static_cast<void*>(this), newcount);
+}
+
+/* Decrements the store's reference count and traces the resulting count. When
+ * the count reaches 0, every loaded HRTF that has no references left is
+ * released and dropped from the loaded list.
+ */
+void HrtfStore::dec_ref()
+{
+    const auto remaining = DecrementRef(mRef);
+    TRACE("HrtfStore %p decreasing refcount to %u\n", static_cast<void*>(this), remaining);
+    if(remaining != 0)
+        return;
+
+    std::lock_guard<std::mutex> loadlock{LoadedHrtfLock};
+
+    /* Sweep the whole list rather than just this store. The count is re-read
+     * under the lock for each entry, and an unused entry's store is released
+     * as it's marked for removal. That may include this object, so nothing
+     * here touches its members after the sweep starts.
+     */
+    const auto release_if_unused = [](LoadedHrtf &loaded) -> bool
+    {
+        HrtfStore *store{loaded.mEntry.get()};
+        if(!store || ReadRef(store->mRef) != 0)
+            return false;
+
+        TRACE("Unloading unused HRTF %s\n", loaded.mFilename.data());
+        loaded.mEntry = nullptr;
+        return true;
+    };
+    const auto unused_begin = std::remove_if(LoadedHrtfs.begin(), LoadedHrtfs.end(),
+        release_if_unused);
+    LoadedHrtfs.erase(unused_begin, LoadedHrtfs.end());
+}
diff --git a/core/hrtf.h b/core/hrtf.h
new file mode 100644
index 00000000..eb18682a
--- /dev/null
+++ b/core/hrtf.h
@@ -0,0 +1,89 @@
+#ifndef CORE_HRTF_H
+#define CORE_HRTF_H
+
+#include <array>
+#include <cstddef>
+#include <memory>
+#include <string>
+
+#include "almalloc.h"
+#include "aloptional.h"
+#include "alspan.h"
+#include "atomic.h"
+#include "ambidefs.h"
+#include "bufferline.h"
+#include "mixer/hrtfdefs.h"
+#include "intrusive_ptr.h"
+#include "vector.h"
+
+
+/* A loaded HRTF data set. Instances are reference counted through add_ref()
+ * and dec_ref(), and are handed out as HrtfStorePtr.
+ */
+struct HrtfStore {
+    /* Reference count, modified by add_ref() and dec_ref(). */
+    RefCount mRef;
+
+    /* Sample rate of the stored responses and the IR length in samples,
+     * packed into 24 and 8 bits respectively.
+     */
+    uint mSampleRate : 24;
+    uint mIrSize : 8;
+
+    struct Field {
+        float distance;
+        /* Number of elevations belonging to this field. */
+        ubyte evCount;
+    };
+    /* NOTE: Fields are stored *backwards*. field[0] is the farthest field, and
+     * field[fdCount-1] is the nearest.
+     */
+    al::span<const Field> mFields;
+
+    struct Elevation {
+        /* Number of azimuths (and so IRs) in this elevation. */
+        ushort azCount;
+        /* Index of this elevation's first IR in mCoeffs/mDelays. */
+        ushort irOffset;
+    };
+    /* Elevations for all fields, one after another. */
+    Elevation *mElev;
+    /* Per-IR coefficients and per-IR delay pairs, indexed by IR.
+     * NOTE(review): the delays look like fixed-point values with
+     * HrirDelayFracOne as the unit -- confirm against the loaders.
+     */
+    const HrirArray *mCoeffs;
+    const ubyte2 *mDelays;
+
+    /* Defined outside this header. NOTE(review): presumably fills coeffs and
+     * delays for the given direction, distance and spread -- confirm against
+     * the implementation.
+     */
+    void getCoeffs(float elevation, float azimuth, float distance, float spread, HrirArray &coeffs,
+        const al::span<uint,2> delays);
+
+    void add_ref();
+    void dec_ref();
+
+    DEF_PLACE_NEWDEL()
+};
+using HrtfStorePtr = al::intrusive_ptr<HrtfStore>;
+
+
+/* Distinct wrapper types for an elevation and an azimuth angle, so the two
+ * can't be mixed up when building an AngularPoint. As the names say, the
+ * values are in radians.
+ */
+struct EvRadians { float value; };
+struct AzRadians { float value; };
+/* A direction given as an elevation/azimuth pair. */
+struct AngularPoint {
+    EvRadians Elev;
+    AzRadians Azim;
+};
+
+
+/* HRTF filter state with one HrtfChannelState per channel. The channel states
+ * live in a trailing flexible array (see DEF_FAM_NEWDEL below), so instances
+ * are meant to be obtained from Create() rather than constructed directly.
+ */
+struct DirectHrtfState {
+    /* Scratch buffer of one buffer line. */
+    std::array<float,BufferLineSize> mTemp;
+
+    /* HRTF filter state for dry buffer content */
+    uint mIrSize{0};
+    al::FlexArray<HrtfChannelState> mChannels;
+
+    /* numchans sets the size of the trailing mChannels array. */
+    DirectHrtfState(size_t numchans) : mChannels{numchans} { }
+    /**
+     * Produces HRTF filter coefficients for decoding B-Format, given a set of
+     * virtual speaker positions, a matching decoding matrix, and per-order
+     * high-frequency gains for the decoder. The calculated impulse responses
+     * are ordered and scaled according to the matrix input.
+     */
+    void build(const HrtfStore *Hrtf, const uint irSize, const bool perHrirMin,
+        const al::span<const AngularPoint> AmbiPoints, const float (*AmbiMatrix)[MaxAmbiChannels],
+        const float XOverFreq, const al::span<const float,MaxAmbiOrder+1> AmbiOrderHFGain);
+
+    /* Allocates and constructs a state with num_chans channel entries. */
+    static std::unique_ptr<DirectHrtfState> Create(size_t num_chans);
+
+    DEF_FAM_NEWDEL(DirectHrtfState, mChannels)
+};
+
+
+/* Returns the display names of the enumerated HRTF data sets.
+ * NOTE(review): pathopt presumably supplies an optional search path -- confirm
+ * against the implementation.
+ */
+al::vector<std::string> EnumerateHrtf(al::optional<std::string> pathopt);
+/* Returns the HRTF enumerated under the display name `name`, prepared for the
+ * sample rate `devrate` (resampling it if needed), or null if it isn't
+ * enumerated or fails to load.
+ */
+HrtfStorePtr GetLoadedHrtf(const std::string &name, const uint devrate);
+
+#endif /* CORE_HRTF_H */
diff --git a/core/logging.cpp b/core/logging.cpp
new file mode 100644
index 00000000..34a95e5a
--- /dev/null
+++ b/core/logging.cpp
@@ -0,0 +1,89 @@
+
+#include "config.h"
+
+#include "logging.h"
+
+#include <cstdarg>
+#include <cstdio>
+#include <string>
+
+#include "alspan.h"
+#include "strutils.h"
+#include "vector.h"
+
+
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#elif defined(__ANDROID__)
+#include <android/log.h>
+#endif
+
+/* Formats a printf-style message, prefixed with "[ALSOFT] (xx) " according to
+ * level, and writes it to logfile when gLogLevel is at least level. On
+ * non-NDEBUG Windows builds the message is also sent to the debugger output,
+ * and on Android to the system log, regardless of gLogLevel.
+ */
+void al_print(LogLevel level, FILE *logfile, const char *fmt, ...)
+{
+    /* Kind of ugly since string literals are const char arrays with a size
+     * that includes the null terminator, which we want to exclude from the
+     * span.
+     */
+    auto prefix = al::as_span("[ALSOFT] (--) ").first<14>();
+    switch(level)
+    {
+    case LogLevel::Disable: break;
+    case LogLevel::Error: prefix = al::as_span("[ALSOFT] (EE) ").first<14>(); break;
+    case LogLevel::Warning: prefix = al::as_span("[ALSOFT] (WW) ").first<14>(); break;
+    case LogLevel::Trace: prefix = al::as_span("[ALSOFT] (II) ").first<14>(); break;
+    }
+
+    /* The message is first formatted into a fixed-size stack buffer. The
+     * dynamic buffer is only used if that turns out to be too small.
+     */
+    al::vector<char> dynmsg;
+    std::array<char,256> stcmsg{};
+
+    char *str{stcmsg.data()};
+    auto prefend1 = std::copy_n(prefix.begin(), prefix.size(), stcmsg.begin());
+    al::span<char> msg{prefend1, stcmsg.end()};
+
+    /* A va_list can't be reused after being consumed by vsnprintf, so a copy
+     * is taken up front for the possible second pass.
+     */
+    std::va_list args, args2;
+    va_start(args, fmt);
+    va_copy(args2, args);
+    const int msglen{std::vsnprintf(msg.data(), msg.size(), fmt, args)};
+    if(msglen >= 0 && static_cast<size_t>(msglen) >= msg.size()) UNLIKELY
+    {
+        /* The message was truncated. msglen is the length it needs without
+         * the terminator, so allocate room for the prefix, the message, and
+         * the null terminator, then format again.
+         */
+        dynmsg.resize(static_cast<size_t>(msglen)+prefix.size() + 1u);
+
+        str = dynmsg.data();
+        auto prefend2 = std::copy_n(prefix.begin(), prefix.size(), dynmsg.begin());
+        msg = {prefend2, dynmsg.end()};
+
+        std::vsnprintf(msg.data(), msg.size(), fmt, args2);
+    }
+    va_end(args2);
+    va_end(args);
+
+    if(gLogLevel >= level)
+    {
+        fputs(str, logfile);
+        fflush(logfile);
+    }
+#if defined(_WIN32) && !defined(NDEBUG)
+    /* OutputDebugStringW has no 'level' property to distinguish between
+     * informational, warning, or error debug messages. So only print them for
+     * non-Release builds.
+     */
+    std::wstring wstr{utf8_to_wstr(str)};
+    OutputDebugStringW(wstr.c_str());
+#elif defined(__ANDROID__)
+    /* Map the log level to the matching Android log priority. */
+    auto android_severity = [](LogLevel l) noexcept
+    {
+        switch(l)
+        {
+        case LogLevel::Trace: return ANDROID_LOG_DEBUG;
+        case LogLevel::Warning: return ANDROID_LOG_WARN;
+        case LogLevel::Error: return ANDROID_LOG_ERROR;
+        /* Should not happen. */
+        case LogLevel::Disable:
+            break;
+        }
+        return ANDROID_LOG_ERROR;
+    };
+    __android_log_print(android_severity(level), "openal", "%s", str);
+#endif
+}
diff --git a/core/logging.h b/core/logging.h
new file mode 100644
index 00000000..f4b6ab56
--- /dev/null
+++ b/core/logging.h
@@ -0,0 +1,51 @@
+#ifndef CORE_LOGGING_H
+#define CORE_LOGGING_H
+
+#include <stdio.h>
+
+#include "opthelpers.h"
+
+
+/* Log verbosity levels, ordered from least to most verbose. A message is
+ * written to the log file when gLogLevel is at least the message's level.
+ */
+enum class LogLevel {
+    Disable,
+    Error,
+    Warning,
+    Trace
+};
+/* The currently active log level. */
+extern LogLevel gLogLevel;
+
+/* The file the TRACE/WARN/ERR macros log to. */
+extern FILE *gLogFile;
+
+/* Formats and logs a printf-style message at the given level. The format
+ * attribute lets GCC-compatible compilers check the arguments against fmt
+ * (parameter 3, with the variadic arguments starting at parameter 4). The
+ * gnu_printf flavor is selected when MinGW's ANSI stdio is in use.
+ */
+#ifdef __USE_MINGW_ANSI_STDIO
+[[gnu::format(gnu_printf,3,4)]]
+#else
+[[gnu::format(printf,3,4)]]
+#endif
+void al_print(LogLevel level, FILE *logfile, const char *fmt, ...);
+
+/* Logging macros taking a printf-style format and arguments. On most targets
+ * they check gLogLevel first, so al_print is skipped entirely for messages
+ * that wouldn't be written.
+ */
+#if (!defined(_WIN32) || defined(NDEBUG)) && !defined(__ANDROID__)
+#define TRACE(...) do {                                                       \
+    if(gLogLevel >= LogLevel::Trace) UNLIKELY                                 \
+        al_print(LogLevel::Trace, gLogFile, __VA_ARGS__);                     \
+} while(0)
+
+#define WARN(...) do {                                                        \
+    if(gLogLevel >= LogLevel::Warning) UNLIKELY                               \
+        al_print(LogLevel::Warning, gLogFile, __VA_ARGS__);                   \
+} while(0)
+
+#define ERR(...) do {                                                         \
+    if(gLogLevel >= LogLevel::Error) UNLIKELY                                 \
+        al_print(LogLevel::Error, gLogFile, __VA_ARGS__);                     \
+} while(0)
+
+#else
+
+/* Debug Windows builds and Android always call al_print, which checks
+ * gLogLevel itself for the log file and also forwards the message to the
+ * platform's debug output or system log.
+ */
+#define TRACE(...) al_print(LogLevel::Trace, gLogFile, __VA_ARGS__)
+
+#define WARN(...) al_print(LogLevel::Warning, gLogFile, __VA_ARGS__)
+
+#define ERR(...) al_print(LogLevel::Error, gLogFile, __VA_ARGS__)
+#endif
+
+#endif /* CORE_LOGGING_H */
diff --git a/core/mastering.cpp b/core/mastering.cpp
new file mode 100644
index 00000000..97a4008e
--- /dev/null
+++ b/core/mastering.cpp
@@ -0,0 +1,439 @@
+
+#include "config.h"
+
+#include "mastering.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <new>
+
+#include "almalloc.h"
+#include "alnumeric.h"
+#include "alspan.h"
+#include "opthelpers.h"
+
+
+/* These structures assume BufferLineSize is a power of 2. */
+static_assert((BufferLineSize & (BufferLineSize-1)) == 0, "BufferLineSize is not a power of 2");
+
+/* State for the sliding window maximum used by the peak hold detector. The
+ * value and expiry arrays are used as ring buffers, with the indices wrapped
+ * using a BufferLineSize-1 mask.
+ */
+struct SlidingHold {
+    /* Candidate maxima and the sample index at which each one expires. */
+    alignas(16) float mValues[BufferLineSize];
+    uint mExpiries[BufferLineSize];
+    /* Index of the most recently stored candidate. */
+    uint mLowerIndex;
+    /* Index of the current maximum. */
+    uint mUpperIndex;
+    /* Hold length in samples, added to the sample index to get an expiry. */
+    uint mLength;
+};
+
+
+namespace {
+
+using namespace std::placeholders;
+
+/* This sliding hold follows the input level with an instant attack and a
+ * fixed duration hold before an instant release to the next highest level.
+ * It is a sliding window maximum (descending maxima) implementation based on
+ * Richard Harter's ascending minima algorithm available at:
+ *
+ *   http://www.richardhartersworld.com/cri/2001/slidingmin.html
+ *
+ * Feeds the value `in` for sample index `i` into the hold, and returns the
+ * current held maximum.
+ */
+float UpdateSlidingHold(SlidingHold *Hold, const uint i, const float in)
+{
+    static constexpr uint mask{BufferLineSize - 1};
+    const uint length{Hold->mLength};
+    float (&values)[BufferLineSize] = Hold->mValues;
+    uint (&expiries)[BufferLineSize] = Hold->mExpiries;
+    uint lowerIndex{Hold->mLowerIndex};
+    uint upperIndex{Hold->mUpperIndex};
+
+    /* The current maximum has reached its expiry, move on to the next
+     * candidate.
+     */
+    if(i >= expiries[upperIndex])
+        upperIndex = (upperIndex + 1) & mask;
+
+    if(in >= values[upperIndex])
+    {
+        /* The input is at least as large as the current maximum. It replaces
+         * the maximum and becomes the only candidate.
+         */
+        values[upperIndex] = in;
+        expiries[upperIndex] = i + length;
+        lowerIndex = upperIndex;
+    }
+    else
+    {
+        /* Scan backward from the newest candidate, wrapping from index 0 to
+         * the end of the array, for the first one that's larger than the
+         * input. The negated >= also stops on a NaN comparison.
+         */
+        do {
+            do {
+                if(!(in >= values[lowerIndex]))
+                    goto found_place;
+            } while(lowerIndex--);
+            lowerIndex = mask;
+        } while(true);
+    found_place:
+
+        /* Store the input just after that candidate, overwriting the smaller
+         * ones that followed it.
+         */
+        lowerIndex = (lowerIndex + 1) & mask;
+        values[lowerIndex] = in;
+        expiries[lowerIndex] = i + length;
+    }
+
+    Hold->mLowerIndex = lowerIndex;
+    Hold->mUpperIndex = upperIndex;
+
+    return values[upperIndex];
+}
+
+/* Rebases the expiries of the live candidates (from the upper index to the
+ * lower index, inclusive, following the wrap-around of the ring buffer) by
+ * subtracting n from each.
+ */
+void ShiftSlidingHold(SlidingHold *Hold, const uint n)
+{
+    uint *const expiries{std::begin(Hold->mExpiries)};
+    uint *const expiries_end{std::end(Hold->mExpiries)};
+    const auto rebase = [n](uint *first, uint *const last) noexcept -> void
+    {
+        for(;first != last;++first)
+            *first -= n;
+    };
+
+    uint *first{expiries + Hold->mUpperIndex};
+    uint *const newest{expiries + Hold->mLowerIndex};
+    if(newest < first)
+    {
+        /* The live range wraps around. Handle the tail of the array first,
+         * then continue from the start.
+         */
+        rebase(first, expiries_end);
+        first = expiries;
+    }
+    rebase(first, newest+1);
+}
+
+
+/* Links the channels for compression by writing, for each of the SamplesToDo
+ * samples, the largest absolute value found across all channels into the
+ * side-chain, starting at the look-ahead offset.
+ */
+void LinkChannels(Compressor *Comp, const uint SamplesToDo, const FloatBufferLine *OutBuffer)
+{
+    const size_t numChans{Comp->mNumChans};
+
+    ASSUME(SamplesToDo > 0);
+    ASSUME(numChans > 0);
+
+    /* Start from silence, then raise each entry to the channels' peaks. */
+    float *const sideChain{std::begin(Comp->mSideChain) + Comp->mLookAhead};
+    std::fill_n(sideChain, SamplesToDo, 0.0f);
+
+    for(size_t c{0};c < numChans;++c)
+    {
+        const float *RESTRICT buffer{al::assume_aligned<16>(OutBuffer[c].data())};
+        for(uint i{0};i < SamplesToDo;++i)
+            sideChain[i] = maxf(sideChain[i], std::fabs(buffer[i]));
+    }
+}
+
+/* Computes the squared crest factor of the control signal, which drives the
+ * basic automation of the attack/release times. As suggested by the paper,
+ * this is the ratio of an instantaneous squared peak detector to a squared
+ * RMS detector, both with 200ms release times.
+ */
+void CrestDetector(Compressor *Comp, const uint SamplesToDo)
+{
+    const float a_crest{Comp->mCrestCoeff};
+    float peakSq{Comp->mLastPeakSq};
+    float rmsSq{Comp->mLastRmsSq};
+
+    ASSUME(SamplesToDo > 0);
+
+    const float *const sideChain{std::begin(Comp->mSideChain) + Comp->mLookAhead};
+    float *const crestFactor{std::begin(Comp->mCrestFactor)};
+    for(uint i{0};i < SamplesToDo;++i)
+    {
+        /* Square the level, keeping it within a sane range. */
+        const float x_abs{sideChain[i]};
+        const float x2{clampf(x_abs * x_abs, 0.000001f, 1000000.0f)};
+
+        peakSq = maxf(x2, lerpf(x2, peakSq, a_crest));
+        rmsSq = lerpf(x2, rmsSq, a_crest);
+        crestFactor[i] = peakSq / rmsSq;
+    }
+
+    /* Keep the detector states for the next update. */
+    Comp->mLastPeakSq = peakSq;
+    Comp->mLastRmsSq = rmsSq;
+}
+
+/* The plain peak detector at the start of the side-chain. The linked absolute
+ * levels are converted in place to the log domain, where most of the
+ * side-chain's remaining operations are done.
+ */
+void PeakDetector(Compressor *Comp, const uint SamplesToDo)
+{
+    ASSUME(SamplesToDo > 0);
+
+    float *const sideChain{std::begin(Comp->mSideChain) + Comp->mLookAhead};
+    for(uint i{0};i < SamplesToDo;++i)
+    {
+        /* Clamp the minimum amplitude to near-zero before the logarithm. */
+        const float level{maxf(0.000001f, sideChain[i])};
+        sideChain[i] = std::log(level);
+    }
+}
+
+/* The peak detector variant with a hold, which lets it more solidly detect
+ * fast transients (best used when operating as a limiter). Each level is
+ * converted to the log domain and run through the sliding hold, and the held
+ * maximum replaces it in the side-chain.
+ */
+void PeakHoldDetector(Compressor *Comp, const uint SamplesToDo)
+{
+    ASSUME(SamplesToDo > 0);
+
+    SlidingHold *const hold{Comp->mHold};
+    float *const sideChain{std::begin(Comp->mSideChain) + Comp->mLookAhead};
+    for(uint i{0};i < SamplesToDo;++i)
+    {
+        const float x_G{std::log(maxf(0.000001f, sideChain[i]))};
+        sideChain[i] = UpdateSlidingHold(hold, i, x_G);
+    }
+
+    /* Rebase the hold's expiries so the next update can start from 0 again. */
+    ShiftSlidingHold(hold, SamplesToDo);
+}
+
+/* This is the heart of the feed-forward compressor.  It operates in the log
+ * domain (to better match human hearing) and can apply some basic automation
+ * to knee width, attack/release times, make-up/post gain, and clipping
+ * reduction.
+ *
+ * For each of the first SamplesToDo side-chain entries, the log-domain level
+ * found lookAhead entries further on is turned into a gain, which is written
+ * back to the entry as a linear multiplier.
+ */
+void GainCompressor(Compressor *Comp, const uint SamplesToDo)
+{
+    const bool autoKnee{Comp->mAuto.Knee};
+    const bool autoAttack{Comp->mAuto.Attack};
+    const bool autoRelease{Comp->mAuto.Release};
+    const bool autoPostGain{Comp->mAuto.PostGain};
+    const bool autoDeclip{Comp->mAuto.Declip};
+    const uint lookAhead{Comp->mLookAhead};
+    const float threshold{Comp->mThreshold};
+    const float slope{Comp->mSlope};
+    const float attack{Comp->mAttack};
+    const float release{Comp->mRelease};
+    const float c_est{Comp->mGainEstimate};
+    const float a_adp{Comp->mAdaptCoeff};
+    const float *crestFactor{Comp->mCrestFactor};
+    float postGain{Comp->mPostGain};
+    float knee{Comp->mKnee};
+    /* The fixed attack/release times and their smoothing coefficients. These
+     * are recalculated per-sample below when the matching automation is on.
+     */
+    float t_att{attack};
+    float t_rel{release - attack};
+    float a_att{std::exp(-1.0f / t_att)};
+    float a_rel{std::exp(-1.0f / t_rel)};
+    /* Detector and deviation states carried over from the last update. */
+    float y_1{Comp->mLastRelease};
+    float y_L{Comp->mLastAttack};
+    float c_dev{Comp->mLastGainDev};
+
+    ASSUME(SamplesToDo > 0);
+
+    for(float &sideChain : al::span<float>{Comp->mSideChain, SamplesToDo})
+    {
+        if(autoKnee)
+            knee = maxf(0.0f, 2.5f * (c_dev + c_est));
+        const float knee_h{0.5f * knee};
+
+        /* This is the gain computer.  It applies a static compression curve
+         * to the control signal.
+         */
+        const float x_over{std::addressof(sideChain)[lookAhead] - threshold};
+        const float y_G{
+            (x_over <= -knee_h) ? 0.0f :
+            (std::fabs(x_over) < knee_h) ? (x_over + knee_h) * (x_over + knee_h) / (2.0f * knee) :
+            x_over};
+
+        /* The crest factor shortens the automated attack/release times as it
+         * gets larger.
+         */
+        const float y2_crest{*(crestFactor++)};
+        if(autoAttack)
+        {
+            t_att = 2.0f*attack/y2_crest;
+            a_att = std::exp(-1.0f / t_att);
+        }
+        if(autoRelease)
+        {
+            t_rel = 2.0f*release/y2_crest - t_att;
+            a_rel = std::exp(-1.0f / t_rel);
+        }
+
+        /* Gain smoothing (ballistics) is done via a smooth decoupled peak
+         * detector.  The attack time is subtracted from the release time
+         * above to compensate for the chained operating mode.
+         */
+        const float x_L{-slope * y_G};
+        y_1 = maxf(x_L, lerpf(x_L, y_1, a_rel));
+        y_L = lerpf(y_1, y_L, a_att);
+
+        /* Knee width and make-up gain automation make use of a smoothed
+         * measurement of deviation between the control signal and estimate.
+         * The estimate is also used to bias the measurement to hot-start its
+         * average.
+         */
+        c_dev = lerpf(-(y_L+c_est), c_dev, a_adp);
+
+        if(autoPostGain)
+        {
+            /* Clipping reduction is only viable when make-up gain is being
+             * automated. It modifies the deviation to further attenuate the
+             * control signal when clipping is detected. The adaptation time
+             * is sufficiently long enough to suppress further clipping at the
+             * same output level.
+             */
+            if(autoDeclip)
+                c_dev = maxf(c_dev, sideChain - y_L - threshold - c_est);
+
+            postGain = -(c_dev + c_est);
+        }
+
+        /* Convert the log-domain gain back to a linear multiplier. */
+        sideChain = std::exp(postGain - y_L);
+    }
+
+    Comp->mLastRelease = y_1;
+    Comp->mLastAttack = y_L;
+    Comp->mLastGainDev = c_dev;
+}
+
+/* Combined with the hold time, a look-ahead delay can improve handling of
+ * fast transients by allowing the envelope time to converge prior to
+ * reaching the offending impulse.  This is best used when operating as a
+ * limiter.
+ *
+ * Delays each channel of OutBuffer by lookAhead samples in place, using the
+ * per-channel delay buffer to carry the newest lookAhead samples over to the
+ * next update.
+ */
+void SignalDelay(Compressor *Comp, const uint SamplesToDo, FloatBufferLine *OutBuffer)
+{
+    const size_t numChans{Comp->mNumChans};
+    const uint lookAhead{Comp->mLookAhead};
+
+    ASSUME(SamplesToDo > 0);
+    ASSUME(numChans > 0);
+    ASSUME(lookAhead > 0);
+
+    for(size_t c{0};c < numChans;c++)
+    {
+        float *inout{al::assume_aligned<16>(OutBuffer[c].data())};
+        float *delaybuf{al::assume_aligned<16>(Comp->mDelay[c].data())};
+
+        auto inout_end = inout + SamplesToDo;
+        if(SamplesToDo >= lookAhead) LIKELY
+        {
+            /* Move the newest lookAhead samples to the front of the buffer,
+             * then swap them with the delay buffer. The output then starts
+             * with the previously delayed samples followed by the older
+             * input, and the delay buffer holds the newest samples.
+             */
+            auto delay_end = std::rotate(inout, inout_end - lookAhead, inout_end);
+            std::swap_ranges(inout, delay_end, delaybuf);
+        }
+        else
+        {
+            /* Fewer samples than the delay length. Swap all the input with
+             * the oldest delayed samples, then rotate the delay buffer so
+             * the samples just stored come after the remaining older ones.
+             */
+            auto delay_start = std::swap_ranges(inout, inout_end, delaybuf);
+            std::rotate(delaybuf, delay_start, delaybuf + lookAhead);
+        }
+    }
+}
+
+} // namespace
+
+
+/* Creates a compressor with the given settings (see the parameter
+ * descriptions on the declaration). The compressor, its optional sliding hold,
+ * and its optional per-channel look-ahead delay lines are placed in a single
+ * zeroed, 16-byte aligned allocation. Throws std::bad_alloc if that
+ * allocation fails.
+ */
+std::unique_ptr<Compressor> Compressor::Create(const size_t NumChans, const float SampleRate,
+    const bool AutoKnee, const bool AutoAttack, const bool AutoRelease, const bool AutoPostGain,
+    const bool AutoDeclip, const float LookAheadTime, const float HoldTime, const float PreGainDb,
+    const float PostGainDb, const float ThresholdDb, const float Ratio, const float KneeDb,
+    const float AttackTime, const float ReleaseTime)
+{
+    /* Convert the look-ahead and hold times to sample counts, limited to what
+     * fits in a buffer line.
+     */
+    const auto lookAhead = static_cast<uint>(
+        clampf(std::round(LookAheadTime*SampleRate), 0.0f, BufferLineSize-1));
+    const auto hold = static_cast<uint>(
+        clampf(std::round(HoldTime*SampleRate), 0.0f, BufferLineSize-1));
+
+    /* The delay lines and the hold are only used with a look-ahead. */
+    size_t size{sizeof(Compressor)};
+    if(lookAhead > 0)
+    {
+        size += sizeof(*Compressor::mDelay) * NumChans;
+        /* The sliding hold implementation doesn't handle a length of 1. A 1-
+         * sample hold is useless anyway, it would only ever give back what was
+         * just given to it.
+         */
+        if(hold > 1)
+            size += sizeof(*Compressor::mHold);
+    }
+
+    /* Don't construct into a null pointer if the allocation failed.
+     * NOTE(review): assumes al_calloc reports failure by returning null; the
+     * check is harmless if it throws instead.
+     */
+    void *storage{al_calloc(16, size)};
+    if(!storage)
+        throw std::bad_alloc{};
+
+    auto Comp = CompressorPtr{al::construct_at(static_cast<Compressor*>(storage))};
+    Comp->mNumChans = NumChans;
+    Comp->mAuto.Knee = AutoKnee;
+    Comp->mAuto.Attack = AutoAttack;
+    Comp->mAuto.Release = AutoRelease;
+    Comp->mAuto.PostGain = AutoPostGain;
+    Comp->mAuto.Declip = AutoPostGain && AutoDeclip;
+    Comp->mLookAhead = lookAhead;
+    /* The pre-gain is stored as a linear multiplier, while the post-gain,
+     * threshold, and knee are stored as natural-log values, and the attack
+     * and release times as sample counts.
+     */
+    Comp->mPreGain = std::pow(10.0f, PreGainDb / 20.0f);
+    Comp->mPostGain = PostGainDb * std::log(10.0f) / 20.0f;
+    Comp->mThreshold = ThresholdDb * std::log(10.0f) / 20.0f;
+    Comp->mSlope = 1.0f / maxf(1.0f, Ratio) - 1.0f;
+    Comp->mKnee = maxf(0.0f, KneeDb * std::log(10.0f) / 20.0f);
+    Comp->mAttack = maxf(1.0f, AttackTime * SampleRate);
+    Comp->mRelease = maxf(1.0f, ReleaseTime * SampleRate);
+
+    /* Knee width automation actually treats the compressor as a limiter. By
+     * varying the knee width, it can effectively be seen as applying
+     * compression over a wide range of ratios.
+     */
+    if(AutoKnee)
+        Comp->mSlope = -1.0f;
+
+    if(lookAhead > 0)
+    {
+        /* The extra objects directly follow the compressor in the
+         * allocation: the hold first (when used), then the delay lines.
+         */
+        if(hold > 1)
+        {
+            Comp->mHold = al::construct_at(reinterpret_cast<SlidingHold*>(Comp.get() + 1));
+            Comp->mHold->mValues[0] = -std::numeric_limits<float>::infinity();
+            Comp->mHold->mExpiries[0] = hold;
+            Comp->mHold->mLength = hold;
+            Comp->mDelay = reinterpret_cast<FloatBufferLine*>(Comp->mHold + 1);
+        }
+        else
+            Comp->mDelay = reinterpret_cast<FloatBufferLine*>(Comp.get() + 1);
+        std::uninitialized_fill_n(Comp->mDelay, NumChans, FloatBufferLine{});
+    }
+
+    Comp->mCrestCoeff = std::exp(-1.0f / (0.200f * SampleRate)); // 200ms
+    Comp->mGainEstimate = Comp->mThreshold * -0.5f * Comp->mSlope;
+    Comp->mAdaptCoeff = std::exp(-1.0f / (2.0f * SampleRate)); // 2s
+
+    return Comp;
+}
+
+/* Destroys the optional hold and delay line objects, which live in the
+ * compressor's own allocation and so only need their destructors run.
+ */
+Compressor::~Compressor()
+{
+    if(mHold != nullptr)
+    {
+        al::destroy_at(mHold);
+        mHold = nullptr;
+    }
+    if(mDelay != nullptr)
+    {
+        al::destroy_n(mDelay, mNumChans);
+        mDelay = nullptr;
+    }
+}
+
+
+/* Runs the compressor over the first SamplesToDo samples of each channel in
+ * OutBuffer, in place. The pre-gain is applied, the side-chain detectors and
+ * gain computer are run, the signal is delayed to match the look-ahead (when
+ * used), and the resulting gains are applied.
+ */
+void Compressor::process(const uint SamplesToDo, FloatBufferLine *OutBuffer)
+{
+    const size_t numChans{mNumChans};
+
+    ASSUME(SamplesToDo > 0);
+    ASSUME(numChans > 0);
+
+    /* Skip the pre-gain pass when it wouldn't change anything. */
+    const float preGain{mPreGain};
+    if(preGain != 1.0f)
+    {
+        for(size_t c{0};c < numChans;++c)
+        {
+            float *samples{al::assume_aligned<16>(OutBuffer[c].data())};
+            for(uint i{0};i < SamplesToDo;++i)
+                samples[i] *= preGain;
+        }
+    }
+
+    LinkChannels(this, SamplesToDo, OutBuffer);
+
+    /* The crest factor is only needed for attack/release automation. */
+    if(mAuto.Attack || mAuto.Release)
+        CrestDetector(this, SamplesToDo);
+
+    if(mHold)
+        PeakHoldDetector(this, SamplesToDo);
+    else
+        PeakDetector(this, SamplesToDo);
+
+    GainCompressor(this, SamplesToDo);
+
+    if(mDelay)
+        SignalDelay(this, SamplesToDo, OutBuffer);
+
+    /* The start of the side-chain now holds the gain for each sample. */
+    const float *gains{al::assume_aligned<16>(&mSideChain[0])};
+    for(size_t c{0};c < numChans;++c)
+    {
+        float *samples{al::assume_aligned<16>(OutBuffer[c].data())};
+        for(uint i{0};i < SamplesToDo;++i)
+            samples[i] = gains[i] * samples[i];
+    }
+
+    /* Move the look-ahead portion of the side-chain to the front, ready for
+     * the next update.
+     */
+    float *const pending{std::begin(mSideChain) + SamplesToDo};
+    std::copy(pending, pending+mLookAhead, std::begin(mSideChain));
+}
diff --git a/core/mastering.h b/core/mastering.h
new file mode 100644
index 00000000..1a36937c
--- /dev/null
+++ b/core/mastering.h
@@ -0,0 +1,105 @@
+#ifndef CORE_MASTERING_H
+#define CORE_MASTERING_H
+
+#include <memory>
+
+#include "almalloc.h"
+#include "bufferline.h"
+
+/* Only used through a pointer in this header; defined in mastering.cpp. */
+struct SlidingHold;
+
+/* Shorthand for unsigned int, as used in the declarations below. */
+using uint = unsigned int;
+
+
+/* General topology and basic automation was based on the following paper:
+ *
+ *   D. Giannoulis, M. Massberg and J. D. Reiss,
+ *   "Parameter Automation in a Dynamic Range Compressor,"
+ *   Journal of the Audio Engineering Society, v61 (10), Oct. 2013
+ *
+ * Available (along with supplemental reading) at:
+ *
+ *   http://c4dm.eecs.qmul.ac.uk/audioengineering/compressors/
+ */
+/* A multichannel feed-forward compressor/limiter. Instances are made with
+ * Create(), which places the optional hold and delay line objects in the same
+ * allocation as the compressor itself.
+ */
+struct Compressor {
+    size_t mNumChans{0u};
+
+    /* Which parameters are automated. */
+    struct {
+        bool Knee : 1;
+        bool Attack : 1;
+        bool Release : 1;
+        bool PostGain : 1;
+        bool Declip : 1;
+    } mAuto{};
+
+    /* Look-ahead delay, in samples. */
+    uint mLookAhead{0};
+
+    /* The pre-gain is a linear multiplier. The post-gain, threshold, and knee
+     * are natural-log values.
+     */
+    float mPreGain{0.0f};
+    float mPostGain{0.0f};
+
+    float mThreshold{0.0f};
+    float mSlope{0.0f};
+    float mKnee{0.0f};
+
+    /* Attack and release times, in samples. */
+    float mAttack{0.0f};
+    float mRelease{0.0f};
+
+    /* The side-chain is twice the buffer line size, to hold the look-ahead
+     * samples on top of a full update's worth.
+     */
+    alignas(16) float mSideChain[2*BufferLineSize]{};
+    alignas(16) float mCrestFactor[BufferLineSize]{};
+
+    /* Optional peak hold state and per-channel delay lines. Null when not in
+     * use.
+     */
+    SlidingHold *mHold{nullptr};
+    FloatBufferLine *mDelay{nullptr};
+
+    float mCrestCoeff{0.0f};
+    float mGainEstimate{0.0f};
+    float mAdaptCoeff{0.0f};
+
+    /* Detector states carried over between process() calls. */
+    float mLastPeakSq{0.0f};
+    float mLastRmsSq{0.0f};
+    float mLastRelease{0.0f};
+    float mLastAttack{0.0f};
+    float mLastGainDev{0.0f};
+
+
+    ~Compressor();
+    /* Compresses the first SamplesToDo samples of each channel, in place. */
+    void process(const uint SamplesToDo, FloatBufferLine *OutBuffer);
+    /* Returns the look-ahead delay, in samples. */
+    int getLookAhead() const noexcept { return static_cast<int>(mLookAhead); }
+
+    DEF_PLACE_NEWDEL()
+
+    /**
+     * The compressor is initialized with the following settings:
+     *
+     * \param NumChans      Number of channels to process.
+     * \param SampleRate    Sample rate to process.
+     * \param AutoKnee      Whether to automate the knee width parameter.
+     * \param AutoAttack    Whether to automate the attack time parameter.
+     * \param AutoRelease   Whether to automate the release time parameter.
+     * \param AutoPostGain  Whether to automate the make-up (post) gain
+     *        parameter.
+     * \param AutoDeclip    Whether to automate clipping reduction. Ignored
+     *        when not automating make-up gain.
+     * \param LookAheadTime Look-ahead time (in seconds).
+     * \param HoldTime      Peak hold-time (in seconds).
+     * \param PreGainDb     Gain applied before detection (in dB).
+     * \param PostGainDb    Make-up gain applied after compression (in dB).
+     * \param ThresholdDb   Triggering threshold (in dB).
+     * \param Ratio         Compression ratio (x:1). Set to INFINITY for true
+     *        limiting. Ignored when automating knee width.
+     * \param KneeDb        Knee width (in dB). Ignored when automating knee
+     *        width.
+     * \param AttackTime    Attack time (in seconds). Acts as a maximum when
+     *        automating attack time.
+     * \param ReleaseTime   Release time (in seconds). Acts as a maximum when
+     *        automating release time.
+     */
+    static std::unique_ptr<Compressor> Create(const size_t NumChans, const float SampleRate,
+        const bool AutoKnee, const bool AutoAttack, const bool AutoRelease,
+        const bool AutoPostGain, const bool AutoDeclip, const float LookAheadTime,
+        const float HoldTime, const float PreGainDb, const float PostGainDb,
+        const float ThresholdDb, const float Ratio, const float KneeDb, const float AttackTime,
+        const float ReleaseTime);
+};
+using CompressorPtr = std::unique_ptr<Compressor>;
+
+#endif /* CORE_MASTERING_H */
diff --git a/core/mixer.cpp b/core/mixer.cpp
new file mode 100644
index 00000000..066c57bd
--- /dev/null
+++ b/core/mixer.cpp
@@ -0,0 +1,95 @@
+
+#include "config.h"
+
+#include "mixer.h"
+
+#include <cmath>
+
+#include "alnumbers.h"
+#include "devformat.h"
+#include "device.h"
+#include "mixer/defs.h"
+
+struct CTag;
+
+
+MixerOutFunc MixSamplesOut{Mix_<CTag>}; /* Multi-output mixer; starts as the CTag version. */
+MixerOneFunc MixSamplesOne{Mix_<CTag>}; /* Single-output mixer; starts as the CTag version. */
+
+
+/**
+ * Calculates the ambisonic encoder coefficients for a direction given as
+ * ambisonic Y, Z, and X components, widened by the given angular spread. The
+ * point-source coefficients are rescaled per ambisonic order according to
+ * the spread; a non-positive (or NaN) spread returns them unchanged.
+ * Channels 0 through 15 (up to third order) are affected.
+ */
+std::array<float,MaxAmbiChannels> CalcAmbiCoeffs(const float y, const float z, const float x,
+    const float spread)
+{
+    /* Begin with the point-source coefficients for this direction; a
+     * non-positive spread leaves them untouched.
+     */
+    std::array<float,MaxAmbiChannels> coeffs{CalcAmbiCoeffs(y, z, x)};
+    if(!(spread > 0.0f))
+        return coeffs;
+
+    /* Implement the spread by using a spherical source that subtends the
+     * angle spread. See:
+     * http://www.ppsloan.org/publications/StupidSH36.pdf - Appendix A3
+     *
+     * When adjusted for N3D normalization instead of SN3D, these
+     * calculations are:
+     *
+     * ZH0 = -sqrt(pi) * (-1+ca);
+     * ZH1 =  0.5*sqrt(pi) * sa*sa;
+     * ZH2 = -0.5*sqrt(pi) * ca*(-1+ca)*(ca+1);
+     * ZH3 = -0.125*sqrt(pi) * (-1+ca)*(ca+1)*(5*ca*ca - 1);
+     * ZH4 = -0.125*sqrt(pi) * ca*(-1+ca)*(ca+1)*(7*ca*ca - 3);
+     * ZH5 = -0.0625*sqrt(pi) * (-1+ca)*(ca+1)*(21*ca*ca*ca*ca - 14*ca*ca + 1);
+     *
+     * The gain of the source is compensated for size, so that the
+     * loudness doesn't depend on the spread. Thus:
+     *
+     * ZH0 = 1.0f;
+     * ZH1 = 0.5f * (ca+1.0f);
+     * ZH2 = 0.5f * (ca+1.0f)*ca;
+     * ZH3 = 0.125f * (ca+1.0f)*(5.0f*ca*ca - 1.0f);
+     * ZH4 = 0.125f * (ca+1.0f)*(7.0f*ca*ca - 3.0f)*ca;
+     * ZH5 = 0.0625f * (ca+1.0f)*(21.0f*ca*ca*ca*ca - 14.0f*ca*ca + 1.0f);
+     */
+    const float ca{std::cos(spread * 0.5f)};
+    /* Increase the source volume by up to +3dB for a full spread. */
+    const float scale{std::sqrt(1.0f + al::numbers::inv_pi_v<float>/2.0f*spread)};
+
+    /* One multiplier per ambisonic order, zeroth through third. */
+    const float order_scale[4]{
+        scale,
+        scale * 0.5f * (ca+1.f),
+        scale * 0.5f * (ca+1.f)*ca,
+        scale * 0.125f * (ca+1.f)*(5.f*ca*ca-1.f)
+    };
+
+    /* Order N owns the 2N+1 channels starting at index N*N. */
+    size_t chan{0};
+    for(size_t order{0};order < 4;++order)
+    {
+        const size_t order_end{(order+1) * (order+1)};
+        for(;chan < order_end;++chan)
+            coeffs[chan] *= order_scale[order];
+    }
+
+    return coeffs;
+}
+
+void ComputePanGains(const MixParams *mix, const float*RESTRICT coeffs, const float ingain,
+    const al::span<float,MaxAmbiChannels> gains)
+{
+    /* One gain per mixed output channel; any remaining entries are zeroed. */
+    auto chanmap = mix->AmbiMap.cbegin();
+    const auto mapend = chanmap + mix->Buffer.size();
+    auto out = gains.begin();
+    for(;chanmap != mapend;++chanmap,++out)
+        *out = chanmap->Scale * coeffs[chanmap->Index] * ingain;
+    std::fill(out, gains.end(), 0.0f);
+}
diff --git a/core/mixer.h b/core/mixer.h
new file mode 100644
index 00000000..aa7597bb
--- /dev/null
+++ b/core/mixer.h
@@ -0,0 +1,109 @@
+#ifndef CORE_MIXER_H
+#define CORE_MIXER_H
+
+#include <array>
+#include <cmath>
+#include <stddef.h>
+#include <type_traits>
+
+#include "alspan.h"
+#include "ambidefs.h"
+#include "bufferline.h"
+#include "devformat.h"
+
+struct MixParams;
+
+/* Mixer functions that handle one input and multiple output channels. */
+using MixerOutFunc = void(*)(const al::span<const float> InSamples,
+    const al::span<FloatBufferLine> OutBuffer, float *CurrentGains, const float *TargetGains,
+    const size_t Counter, const size_t OutPos);
+/* Holds the implementation in use; MixSamples() below just forwards to it. */
+extern MixerOutFunc MixSamplesOut;
+inline void MixSamples(const al::span<const float> InSamples,
+    const al::span<FloatBufferLine> OutBuffer, float *CurrentGains, const float *TargetGains,
+    const size_t Counter, const size_t OutPos)
+{ MixSamplesOut(InSamples, OutBuffer, CurrentGains, TargetGains, Counter, OutPos); }
+
+/* Mixer functions that handle one input and one output channel. */
+using MixerOneFunc = void(*)(const al::span<const float> InSamples, float *OutBuffer,
+    float &CurrentGain, const float TargetGain, const size_t Counter);
+/* Holds the implementation in use; MixSamples() below just forwards to it. */
+extern MixerOneFunc MixSamplesOne;
+inline void MixSamples(const al::span<const float> InSamples, float *OutBuffer, float &CurrentGain,
+    const float TargetGain, const size_t Counter)
+{ MixSamplesOne(InSamples, OutBuffer, CurrentGain, TargetGain, Counter); }
+
+
+/**
+ * Calculates ambisonic encoder coefficients using the X, Y, and Z direction
+ * components, which must represent a normalized (unit length) vector, and the
+ * spread is the angular width of the sound (0...tau).
+ *
+ * NOTE: The components use ambisonic coordinates. As a result:
+ *
+ * Ambisonic Y = OpenAL -X
+ * Ambisonic Z = OpenAL Y
+ * Ambisonic X = OpenAL -Z
+ *
+ * The components are ordered such that OpenAL's X, Y, and Z are the first,
+ * second, and third parameters respectively -- simply negate X and Z.
+ */
+std::array<float,MaxAmbiChannels> CalcAmbiCoeffs(const float y, const float z, const float x,
+    const float spread);
+
+/**
+ * CalcDirectionCoeffs
+ *
+ * Builds the ambisonic encoding coefficients for a sound arriving from the
+ * given OpenAL direction. dir has to be a unit-length vector, and spread is
+ * the angular width of the source (0...tau).
+ */
+inline std::array<float,MaxAmbiChannels> CalcDirectionCoeffs(const float (&dir)[3],
+    const float spread)
+{
+    const float ambi_y{-dir[0]}, ambi_z{dir[1]}, ambi_x{-dir[2]}; /* OpenAL -> ambisonic axes. */
+    return CalcAmbiCoeffs(ambi_y, ambi_z, ambi_x, spread);
+}
+
+/**
+ * CalcDirectionCoeffs
+ *
+ * Calculates ambisonic coefficients, without applying any spread, from an
+ * OpenAL direction vector. The vector must be normalized (unit length).
+ */
+constexpr std::array<float,MaxAmbiChannels> CalcDirectionCoeffs(const float (&dir)[3])
+{
+    /* Convert from OpenAL coords to Ambisonics. */
+    return CalcAmbiCoeffs(-dir[0], dir[1], -dir[2]);
+}
+
+/**
+ * CalcAngleCoeffs
+ *
+ * Calculates ambisonic coefficients from azimuth and elevation angles, both
+ * in radians, with positive values going right and up respectively.
+ */
+inline std::array<float,MaxAmbiChannels> CalcAngleCoeffs(const float azimuth,
+    const float elevation, const float spread)
+{
+    /* The components are computed directly as ambisonic Y, Z, and X. */
+    const float cos_elev{std::cos(elevation)};
+    const float ambi_y{-std::sin(azimuth) * cos_elev};
+    const float ambi_z{ std::sin(elevation)};
+    const float ambi_x{ std::cos(azimuth) * cos_elev};
+    return CalcAmbiCoeffs(ambi_y, ambi_z, ambi_x, spread);
+}
+
+
+/**
+ * ComputePanGains
+ *
+ * Computes panning gains using the given channel decoder coefficients and the
+ * pre-calculated direction or angle coefficients. For B-Format sources, the
+ * coeffs are a 'slice' of a transform matrix for the input channel, used to
+ * scale and orient the sound samples.
+ */
+void ComputePanGains(const MixParams *mix, const float*RESTRICT coeffs, const float ingain,
+    const al::span<float,MaxAmbiChannels> gains);
+
+#endif /* CORE_MIXER_H */
diff --git a/core/mixer/defs.h b/core/mixer/defs.h
new file mode 100644
index 00000000..48daca9b
--- /dev/null
+++ b/core/mixer/defs.h
@@ -0,0 +1,109 @@
+#ifndef CORE_MIXER_DEFS_H
+#define CORE_MIXER_DEFS_H
+
+#include <array>
+#include <stdlib.h>
+
+#include "alspan.h"
+#include "core/bufferline.h"
+#include "core/resampler_limits.h"
+
+struct CubicCoefficients;
+struct HrtfChannelState;
+struct HrtfFilter;
+struct MixHrtfFilter;
+
+using uint = unsigned int;
+using float2 = std::array<float,2>;
+
+
+constexpr int MixerFracBits{16}; /* Bits of sub-sample fraction in a resample position. */
+constexpr int MixerFracOne{1 << MixerFracBits}; /* Fixed-point value of one whole sample. */
+constexpr int MixerFracMask{MixerFracOne - 1}; /* Keeps only the fractional bits. */
+constexpr int MixerFracHalf{MixerFracOne >> 1}; /* Fixed-point value of half a sample. */
+
+constexpr float GainSilenceThreshold{0.00001f}; /* -100dB */
+
+
+enum class Resampler : uint8_t {
+    Point,
+    Linear,
+    Cubic,
+    FastBSinc12,
+    BSinc12,
+    FastBSinc24,
+    BSinc24,
+    /* Alias of the last enumerator, i.e. the highest valid value. */
+    Max = BSinc24
+};
+
+/* Interpolator state. Kind of a misnomer since the interpolator itself is
+ * stateless. This just keeps it from having to recompute scale-related
+ * mappings for every sample.
+ */
+struct BsincState {
+    float sf; /* Scale interpolation factor. */
+    uint m; /* Coefficient count (taps summed per output sample). */
+    uint l; /* Left coefficient offset; resamplers rewind the source by this much. */
+    /* Filter coefficients, followed by the phase, scale, and scale-phase
+     * delta coefficients. Starting at phase index 0, each subsequent phase
+     * index follows contiguously.
+     */
+    const float *filter;
+};
+
+struct CubicState {
+    /* Filter coefficients, and coefficient deltas. Starting at phase index 0,
+     * each subsequent phase index follows contiguously.
+     */
+    const CubicCoefficients *filter;
+};
+
+union InterpState {
+    CubicState cubic; /* Read by the cubic resamplers. */
+    BsincState bsinc; /* Read by the bsinc and fast bsinc resamplers. */
+};
+
+using ResamplerFunc = void(*)(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst);
+
+ResamplerFunc PrepareResampler(Resampler resampler, uint increment, InterpState *state);
+
+
+template<typename TypeTag, typename InstTag>
+void Resample_(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst);
+
+template<typename InstTag>
+void Mix_(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
+    float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos);
+template<typename InstTag>
+void Mix_(const al::span<const float> InSamples, float *OutBuffer, float &CurrentGain,
+    const float TargetGain, const size_t Counter);
+
+template<typename InstTag>
+void MixHrtf_(const float *InSamples, float2 *AccumSamples, const uint IrSize,
+    const MixHrtfFilter *hrtfparams, const size_t BufferSize);
+template<typename InstTag>
+void MixHrtfBlend_(const float *InSamples, float2 *AccumSamples, const uint IrSize,
+    const HrtfFilter *oldparams, const MixHrtfFilter *newparams, const size_t BufferSize);
+template<typename InstTag>
+void MixDirectHrtf_(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
+    const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples,
+    float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize);
+
+/* Vectorized resampler helper: fills per-lane source offsets and fractions. */
+template<size_t N>
+inline void InitPosArrays(uint frac, uint increment, uint (&frac_arr)[N], uint (&pos_arr)[N])
+{
+    for(uint pos{0u}, lane{0u};lane < N;++lane)
+    {
+        pos_arr[lane] = pos;
+        frac_arr[lane] = frac;
+        frac += increment;
+        pos += frac >> MixerFracBits;
+        frac &= MixerFracMask;
+    }
+}
+
+#endif /* CORE_MIXER_DEFS_H */
diff --git a/core/mixer/hrtfbase.h b/core/mixer/hrtfbase.h
new file mode 100644
index 00000000..36f88e49
--- /dev/null
+++ b/core/mixer/hrtfbase.h
@@ -0,0 +1,129 @@
+#ifndef CORE_MIXER_HRTFBASE_H
+#define CORE_MIXER_HRTFBASE_H
+
+#include <algorithm>
+#include <cmath>
+
+#include "almalloc.h"
+#include "hrtfdefs.h"
+#include "opthelpers.h"
+
+
+using uint = unsigned int;
+
+using ApplyCoeffsT = void(&)(float2 *RESTRICT Values, const size_t irSize,
+    const ConstHrirSpan Coeffs, const float left, const float right);
+
+template<ApplyCoeffsT ApplyCoeffs>
+inline void MixHrtfBase(const float *InSamples, float2 *RESTRICT AccumSamples, const size_t IrSize,
+    const MixHrtfFilter *hrtfparams, const size_t BufferSize)
+{
+    /* Mixes BufferSize input samples through one HRIR, applying the left and
+     * right delays and a gain that changes by GainStep on every sample.
+     */
+    ASSUME(BufferSize > 0);
+
+    const ConstHrirSpan hrir{hrtfparams->Coeffs};
+    const float basegain{hrtfparams->Gain};
+    const float gaininc{hrtfparams->GainStep};
+    /* Each ear reads the input offset by HrtfHistoryLength minus its delay. */
+    const float *leftsrc{InSamples + (HrtfHistoryLength - hrtfparams->Delay[0])};
+    const float *rightsrc{InSamples + (HrtfHistoryLength - hrtfparams->Delay[1])};
+    float ramp{0.0f};
+    for(size_t i{0u};i < BufferSize;++i)
+    {
+        const float g{basegain + gaininc*ramp};
+        ApplyCoeffs(AccumSamples+i, IrSize, hrir, leftsrc[i]*g, rightsrc[i]*g);
+        ramp += 1.0f;
+    }
+}
+
+template<ApplyCoeffsT ApplyCoeffs>
+inline void MixHrtfBlendBase(const float *InSamples, float2 *RESTRICT AccumSamples,
+    const size_t IrSize, const HrtfFilter *oldparams, const MixHrtfFilter *newparams,
+    const size_t BufferSize)
+{
+    ASSUME(BufferSize > 0);
+    /* Cross-fades two HRTF filters: the old one fades out as the new one fades in. */
+    const ConstHrirSpan OldCoeffs{oldparams->Coeffs};
+    const float oldGainStep{oldparams->Gain / static_cast<float>(BufferSize)};
+    const ConstHrirSpan NewCoeffs{newparams->Coeffs};
+    const float newGainStep{newparams->GainStep};
+    /* The old filter starts at its full gain and drops by one step per sample. */
+    if(oldparams->Gain > GainSilenceThreshold) LIKELY
+    {
+        size_t ldelay{HrtfHistoryLength - oldparams->Delay[0]};
+        size_t rdelay{HrtfHistoryLength - oldparams->Delay[1]};
+        auto stepcount = static_cast<float>(BufferSize);
+        for(size_t i{0u};i < BufferSize;++i)
+        {
+            const float g{oldGainStep*stepcount};
+            const float left{InSamples[ldelay++] * g};
+            const float right{InSamples[rdelay++] * g};
+            ApplyCoeffs(AccumSamples+i, IrSize, OldCoeffs, left, right);
+
+            stepcount -= 1.0f;
+        }
+    }
+    /* The new filter would have zero gain at sample 0, so it starts at sample 1. */
+    if(newGainStep*static_cast<float>(BufferSize) > GainSilenceThreshold) LIKELY
+    {
+        size_t ldelay{HrtfHistoryLength+1 - newparams->Delay[0]};
+        size_t rdelay{HrtfHistoryLength+1 - newparams->Delay[1]};
+        float stepcount{1.0f};
+        for(size_t i{1u};i < BufferSize;++i)
+        {
+            const float g{newGainStep*stepcount};
+            const float left{InSamples[ldelay++] * g};
+            const float right{InSamples[rdelay++] * g};
+            ApplyCoeffs(AccumSamples+i, IrSize, NewCoeffs, left, right);
+
+            stepcount += 1.0f;
+        }
+    }
+}
+
+template<ApplyCoeffsT ApplyCoeffs>
+inline void MixDirectHrtfBase(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
+    const al::span<const FloatBufferLine> InSamples, float2 *RESTRICT AccumSamples,
+    float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize)
+{
+    ASSUME(BufferSize > 0);
+    /* Filter each input channel into AccumSamples, then add the result to the outputs. */
+    for(const FloatBufferLine &input : InSamples)
+    {
+        /* For dual-band processing, the signal needs extra scaling applied to
+         * the high frequency response. The band-splitter applies this scaling
+         * with a consistent phase shift regardless of the scale amount.
+         */
+        ChanState->mSplitter.processHfScale({input.data(), BufferSize}, TempBuf,
+            ChanState->mHfScale);
+
+        /* Now apply the HRIR coefficients to this channel. */
+        const float *RESTRICT tempbuf{al::assume_aligned<16>(TempBuf)};
+        const ConstHrirSpan Coeffs{ChanState->mCoeffs};
+        for(size_t i{0u};i < BufferSize;++i)
+        {
+            const float insample{tempbuf[i]};
+            ApplyCoeffs(AccumSamples+i, IrSize, Coeffs, insample, insample);
+        }
+
+        ++ChanState;
+    }
+
+    /* Add the HRTF signal to the existing "direct" signal. */
+    float *RESTRICT left{al::assume_aligned<16>(LeftOut.data())};
+    float *RESTRICT right{al::assume_aligned<16>(RightOut.data())};
+    for(size_t i{0u};i < BufferSize;++i)
+        left[i]  += AccumSamples[i][0];
+    for(size_t i{0u};i < BufferSize;++i)
+        right[i] += AccumSamples[i][1];
+
+    /* Copy the new in-progress accumulation values (HrirLength of them) to
+     * the front and clear the BufferSize samples that follow, ready for the
+     * next mix.
+     */
+    auto accum_iter = std::copy_n(AccumSamples+BufferSize, HrirLength, AccumSamples);
+    std::fill_n(accum_iter, BufferSize, float2{});
+}
+
+#endif /* CORE_MIXER_HRTFBASE_H */
diff --git a/core/mixer/hrtfdefs.h b/core/mixer/hrtfdefs.h
new file mode 100644
index 00000000..3c903ed8
--- /dev/null
+++ b/core/mixer/hrtfdefs.h
@@ -0,0 +1,53 @@
+#ifndef CORE_MIXER_HRTFDEFS_H
+#define CORE_MIXER_HRTFDEFS_H
+
+#include <array>
+
+#include "alspan.h"
+#include "core/ambidefs.h"
+#include "core/bufferline.h"
+#include "core/filters/splitter.h"
+
+
+using float2 = std::array<float,2>;
+using ubyte = unsigned char;
+using ubyte2 = std::array<ubyte,2>;
+using ushort = unsigned short;
+using uint = unsigned int;
+using uint2 = std::array<uint,2>;
+
+constexpr uint HrtfHistoryBits{6};
+constexpr uint HrtfHistoryLength{1 << HrtfHistoryBits}; /* 64 samples. */
+constexpr uint HrtfHistoryMask{HrtfHistoryLength - 1};
+
+constexpr uint HrirBits{7};
+constexpr uint HrirLength{1 << HrirBits}; /* 128 entries per impulse response. */
+constexpr uint HrirMask{HrirLength - 1};
+
+constexpr uint MinIrLength{8}; /* ApplyCoeffs implementations assume at least this many. */
+
+using HrirArray = std::array<float2,HrirLength>;
+using HrirSpan = al::span<float2,HrirLength>;
+using ConstHrirSpan = al::span<const float2,HrirLength>;
+
+struct MixHrtfFilter {
+    const ConstHrirSpan Coeffs; /* Left/right impulse response pairs. */
+    uint2 Delay; /* Left ([0]) and right ([1]) delays, in samples. */
+    float Gain; /* Gain MixHrtfBase applies to the first sample. */
+    float GainStep; /* Amount the gain changes by on each following sample. */
+};
+
+struct HrtfFilter {
+    alignas(16) HrirArray Coeffs; /* Left/right impulse response pairs. */
+    uint2 Delay; /* Left ([0]) and right ([1]) delays, in samples. */
+    float Gain; /* Filter gain; the blend mixer fades out from this value. */
+};
+
+
+struct HrtfChannelState {
+    BandSplitter mSplitter; /* Applies mHfScale to the channel's high frequencies. */
+    float mHfScale{}; /* High frequency scale passed to the splitter. */
+    alignas(16) HrirArray mCoeffs{}; /* Impulse response applied to this input channel. */
+};
+
+#endif /* CORE_MIXER_HRTFDEFS_H */
diff --git a/core/mixer/mixer_c.cpp b/core/mixer/mixer_c.cpp
new file mode 100644
index 00000000..28a92ef7
--- /dev/null
+++ b/core/mixer/mixer_c.cpp
@@ -0,0 +1,218 @@
+#include "config.h"
+
+#include <cassert>
+#include <cmath>
+#include <limits>
+
+#include "alnumeric.h"
+#include "core/bsinc_defs.h"
+#include "core/cubic_defs.h"
+#include "defs.h"
+#include "hrtfbase.h"
+
+struct CTag;
+struct PointTag;
+struct LerpTag;
+struct CubicTag;
+struct BSincTag;
+struct FastBSincTag;
+
+
+namespace {
+
+constexpr uint BsincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
+constexpr uint BsincPhaseDiffOne{1 << BsincPhaseDiffBits};
+constexpr uint BsincPhaseDiffMask{BsincPhaseDiffOne - 1u};
+
+constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
+constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
+constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
+
+inline float do_point(const InterpState&, const float *RESTRICT vals, const uint)
+{ return vals[0]; } /* Uses the current sample as-is; the fraction is ignored. */
+inline float do_lerp(const InterpState&, const float *RESTRICT vals, const uint frac)
+{ return lerpf(vals[0], vals[1], static_cast<float>(frac)*(1.0f/MixerFracOne)); }
+inline float do_cubic(const InterpState &istate, const float *RESTRICT vals, const uint frac)
+{
+    /* Upper bits of frac select the filter phase; the lower bits scale its deltas. */
+    const CubicCoefficients &phase = istate.cubic.filter[frac >> CubicPhaseDiffBits];
+    const float mu{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
+    const float *RESTRICT base{al::assume_aligned<16>(phase.mCoeffs)};
+    const float *RESTRICT delta{al::assume_aligned<16>(phase.mDeltas)};
+
+    const float c0{base[0] + mu*delta[0]}, c1{base[1] + mu*delta[1]};
+    const float c2{base[2] + mu*delta[2]}, c3{base[3] + mu*delta[3]};
+    /* Weight the four taps with the interpolated coefficients. */
+    return c0*vals[0] + c1*vals[1] + c2*vals[2] + c3*vals[3];
+}
+inline float do_bsinc(const InterpState &istate, const float *RESTRICT vals, const uint frac)
+{
+    const float sf{istate.bsinc.sf};
+    const size_t m{istate.bsinc.m};
+    ASSUME(m > 0);
+
+    /* Split frac into a phase table index and the remainder below it. */
+    const uint phase{frac >> BsincPhaseDiffBits};
+    const float mu{static_cast<float>(frac&BsincPhaseDiffMask) * (1.0f/BsincPhaseDiffOne)};
+
+    /* Base, phase delta, scale delta, and scale-phase delta coefficients. */
+    const float *RESTRICT base{istate.bsinc.filter + m*phase*2};
+    const float *RESTRICT phdelta{base + m};
+    const float *RESTRICT scdelta{base + BSincPhaseCount*2*m};
+    const float *RESTRICT spdelta{scdelta + m};
+    float sum{0.0f};
+    for(size_t tap{0};tap < m;++tap)
+        sum += (base[tap] + sf*scdelta[tap] + mu*(phdelta[tap] + sf*spdelta[tap])) * vals[tap];
+    return sum;
+}
+inline float do_fastbsinc(const InterpState &istate, const float *RESTRICT vals, const uint frac)
+{
+    const size_t m{istate.bsinc.m};
+    ASSUME(m > 0);
+
+    /* Split frac into a phase table index and the remainder below it. */
+    const uint phase{frac >> BsincPhaseDiffBits};
+    const float mu{static_cast<float>(frac&BsincPhaseDiffMask) * (1.0f/BsincPhaseDiffOne)};
+
+    /* Only the base and phase delta coefficients are used; no scale terms. */
+    const float *RESTRICT base{istate.bsinc.filter + m*phase*2};
+    const float *RESTRICT delta{base + m};
+
+    float sum{0.0f};
+    for(size_t tap{0};tap < m;++tap)
+        sum += (base[tap] + mu*delta[tap]) * vals[tap];
+    return sum;
+}
+
+using SamplerT = float(&)(const InterpState&, const float*RESTRICT, const uint);
+template<SamplerT Sampler>
+void DoResample(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{
+    ASSUME(frac < MixerFracOne);
+    const InterpState istate{*state};
+    for(size_t i{0};i < dst.size();++i)
+    {
+        dst[i] = Sampler(istate, src, frac);
+        /* Advance by increment, carrying whole samples into src. */
+        frac += increment;
+        src  += frac>>MixerFracBits;
+        frac &= MixerFracMask;
+    }
+}
+
+inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs,
+    const float left, const float right)
+{
+    ASSUME(IrSize >= MinIrLength);
+
+    /* Adds the impulse response, scaled by each ear's input, onto Values. */
+    const float2 *coeff{Coeffs.data()};
+    for(float2 *acc{Values}, *const stop{Values+IrSize};acc != stop;++acc,++coeff)
+        *acc = float2{{(*acc)[0] + (*coeff)[0]*left, (*acc)[1] + (*coeff)[1]*right}};
+}
+
+force_inline void MixLine(const al::span<const float> InSamples, float *RESTRICT dst,
+    float &CurrentGain, const float TargetGain, const float delta, const size_t min_len,
+    size_t Counter)
+{
+    float gain{CurrentGain};
+    const float step{(TargetGain-gain) * delta};
+    /* Ramp toward TargetGain over the first min_len samples; a negligible step snaps to it. */
+    size_t pos{0};
+    if(!(std::abs(step) > std::numeric_limits<float>::epsilon()))
+        gain = TargetGain;
+    else
+    {
+        float step_count{0.0f};
+        for(;pos != min_len;++pos)
+        {
+            dst[pos] += InSamples[pos] * (gain + step*step_count);
+            step_count += 1.0f;
+        }
+        if(pos == Counter)
+            gain = TargetGain;
+        else
+            gain += step*step_count;
+    }
+    CurrentGain = gain;
+    /* Mix the remaining samples at the settled gain, unless it is below the silence threshold. */
+    if(!(std::abs(gain) > GainSilenceThreshold))
+        return;
+    for(;pos != InSamples.size();++pos)
+        dst[pos] += InSamples[pos] * gain;
+}
+
+} // namespace
+
+template<> /* Point resampler: each output takes the sample at the current position. */
+void Resample_<PointTag,CTag>(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ DoResample<do_point>(state, src, frac, increment, dst); }
+
+template<> /* Linear resampler: each output blends two neighboring samples via lerpf. */
+void Resample_<LerpTag,CTag>(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ DoResample<do_lerp>(state, src, frac, increment, dst); }
+
+template<> /* Cubic resampler: src-1 because the 4-tap filter starts one sample back. */
+void Resample_<CubicTag,CTag>(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ DoResample<do_cubic>(state, src-1, frac, increment, dst); }
+
+template<> /* BSinc resampler: the source is rewound by the filter's left offset. */
+void Resample_<BSincTag,CTag>(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ DoResample<do_bsinc>(state, src-state->bsinc.l, frac, increment, dst); }
+
+template<> /* Fast BSinc resampler: same rewind as BSinc, without the scale terms. */
+void Resample_<FastBSincTag,CTag>(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ DoResample<do_fastbsinc>(state, src-state->bsinc.l, frac, increment, dst); }
+
+
+template<> /* Plain C++ HRTF mixer, built on the scalar ApplyCoeffs in this file. */
+void MixHrtf_<CTag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
+    const MixHrtfFilter *hrtfparams, const size_t BufferSize)
+{ MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, BufferSize); }
+
+template<> /* Plain C++ HRTF cross-fade mixer, built on the scalar ApplyCoeffs in this file. */
+void MixHrtfBlend_<CTag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
+    const HrtfFilter *oldparams, const MixHrtfFilter *newparams, const size_t BufferSize)
+{
+    MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
+        BufferSize);
+}
+
+template<> /* Plain C++ direct HRTF mixer, built on the scalar ApplyCoeffs in this file. */
+void MixDirectHrtf_<CTag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
+    const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples,
+    float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize)
+{
+    MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
+        IrSize, BufferSize);
+}
+
+
+template<>
+void Mix_<CTag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
+    float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
+{
+    /* A zero Counter gives a zero step, so MixLine goes straight to the target gains. */
+    const auto min_len = minz(Counter, InSamples.size());
+    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
+    for(size_t chan{0};chan < OutBuffer.size();++chan)
+        MixLine(InSamples, al::assume_aligned<16>(OutBuffer[chan].data()+OutPos),
+            CurrentGains[chan], TargetGains[chan], delta, min_len, Counter);
+}
+
+template<>
+void Mix_<CTag>(const al::span<const float> InSamples, float *OutBuffer, float &CurrentGain,
+    const float TargetGain, const size_t Counter)
+{
+    /* Single-output form: the gain ramps over Counter samples, capped at the input length. */
+    const auto min_len = minz(Counter, InSamples.size());
+    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
+    float *RESTRICT dst{al::assume_aligned<16>(OutBuffer)};
+    MixLine(InSamples, dst, CurrentGain, TargetGain, delta, min_len, Counter);
+}
diff --git a/core/mixer/mixer_neon.cpp b/core/mixer/mixer_neon.cpp
new file mode 100644
index 00000000..ef2936b3
--- /dev/null
+++ b/core/mixer/mixer_neon.cpp
@@ -0,0 +1,362 @@
+#include "config.h"
+
+#include <arm_neon.h>
+
+#include <cmath>
+#include <limits>
+
+#include "alnumeric.h"
+#include "core/bsinc_defs.h"
+#include "core/cubic_defs.h"
+#include "defs.h"
+#include "hrtfbase.h"
+
+struct NEONTag;
+struct LerpTag;
+struct CubicTag;
+struct BSincTag;
+struct FastBSincTag;
+
+
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__ARM_NEON)
+#pragma GCC target("fpu=neon")
+#endif
+
+namespace {
+
+constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits};
+constexpr uint BSincPhaseDiffOne{1 << BSincPhaseDiffBits};
+constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};
+
+constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
+constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
+constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
+
+inline float32x4_t set_f4(float l0, float l1, float l2, float l3)
+{
+    /* Builds a vector holding l0 in lane 0 through l3 in lane 3 by loading
+     * the values from a small local array.
+     */
+    const float lanes[4]{l0, l1, l2, l3};
+    return vld1q_f32(lanes);
+}
+
+inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs,
+    const float left, const float right)
+{
+    /* Two consecutive float2 entries fill one vector, so the inputs are laid
+     * out as {left, right, left, right} to match.
+     */
+    const float32x2_t lr2{vset_lane_f32(right, vmov_n_f32(left), 1)};
+    const float32x4_t lr4{vcombine_f32(lr2, lr2)};
+
+    ASSUME(IrSize >= MinIrLength);
+    /* Each pass handles two taps, so an odd IrSize also touches the next tap. */
+    for(size_t tap{0};tap < IrSize;tap += 2)
+    {
+        float *acc{&Values[tap][0]};
+
+        float32x4_t vals{vld1q_f32(acc)};
+        const float32x4_t coefs{vld1q_f32(&Coeffs[tap][0])};
+        vals = vmlaq_f32(vals, coefs, lr4);
+        vst1q_f32(acc, vals);
+    }
+}
+
+force_inline void MixLine(const al::span<const float> InSamples, float *RESTRICT dst,
+    float &CurrentGain, const float TargetGain, const float delta, const size_t min_len,
+    const size_t aligned_len, size_t Counter)
+{
+    float gain{CurrentGain};
+    const float step{(TargetGain-gain) * delta};
+    /* Ramp toward TargetGain over the first min_len samples; a negligible step snaps to it. */
+    size_t pos{0};
+    if(!(std::abs(step) > std::numeric_limits<float>::epsilon()))
+        gain = TargetGain;
+    else
+    {
+        float step_count{0.0f};
+        /* Mix with applying gain steps in aligned multiples of 4. */
+        if(size_t todo{min_len >> 2})
+        {
+            const float32x4_t four4{vdupq_n_f32(4.0f)};
+            const float32x4_t step4{vdupq_n_f32(step)};
+            const float32x4_t gain4{vdupq_n_f32(gain)};
+            float32x4_t step_count4{vdupq_n_f32(0.0f)};
+            step_count4 = vsetq_lane_f32(1.0f, step_count4, 1);
+            step_count4 = vsetq_lane_f32(2.0f, step_count4, 2);
+            step_count4 = vsetq_lane_f32(3.0f, step_count4, 3);
+
+            do {
+                const float32x4_t val4 = vld1q_f32(&InSamples[pos]);
+                float32x4_t dry4 = vld1q_f32(&dst[pos]);
+                dry4 = vmlaq_f32(dry4, val4, vmlaq_f32(gain4, step4, step_count4));
+                step_count4 = vaddq_f32(step_count4, four4);
+                vst1q_f32(&dst[pos], dry4);
+                pos += 4;
+            } while(--todo);
+            /* NOTE: step_count4 now represents the next four counts after the
+             * last four mixed samples, so the lowest element represents the
+             * next step count to apply.
+             */
+            step_count = vgetq_lane_f32(step_count4, 0);
+        }
+        /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
+        for(size_t leftover{min_len&3};leftover;++pos,--leftover)
+        {
+            dst[pos] += InSamples[pos] * (gain + step*step_count);
+            step_count += 1.0f;
+        }
+        if(pos == Counter)
+            gain = TargetGain;
+        else
+            gain += step*step_count;
+
+        /* Mix until pos is aligned with 4 or the mix is done. */
+        for(size_t leftover{aligned_len&3};leftover;++pos,--leftover)
+            dst[pos] += InSamples[pos] * gain;
+    }
+    CurrentGain = gain;
+    /* Mix the rest at the settled gain (four at a time, then singly), unless it is silent. */
+    if(!(std::abs(gain) > GainSilenceThreshold))
+        return;
+    if(size_t todo{(InSamples.size()-pos) >> 2})
+    {
+        const float32x4_t gain4 = vdupq_n_f32(gain);
+        do {
+            const float32x4_t val4 = vld1q_f32(&InSamples[pos]);
+            float32x4_t dry4 = vld1q_f32(&dst[pos]);
+            dry4 = vmlaq_f32(dry4, val4, gain4);
+            vst1q_f32(&dst[pos], dry4);
+            pos += 4;
+        } while(--todo);
+    }
+    for(size_t leftover{(InSamples.size()-pos)&3};leftover;++pos,--leftover)
+        dst[pos] += InSamples[pos] * gain;
+}
+
+} // namespace
+
+template<>
+void Resample_<LerpTag,NEONTag>(const InterpState*, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{
+    ASSUME(frac < MixerFracOne);
+    /* Linearly interpolates four outputs per pass, then finishes the remainder singly. */
+    const int32x4_t increment4 = vdupq_n_s32(static_cast<int>(increment*4));
+    const float32x4_t fracOne4 = vdupq_n_f32(1.0f/MixerFracOne);
+    const int32x4_t fracMask4 = vdupq_n_s32(MixerFracMask);
+    alignas(16) uint pos_[4], frac_[4];
+    int32x4_t pos4, frac4;
+    /* Seed the source offset and fraction of the first four outputs. */
+    InitPosArrays(frac, increment, frac_, pos_);
+    frac4 = vld1q_s32(reinterpret_cast<int*>(frac_));
+    pos4 = vld1q_s32(reinterpret_cast<int*>(pos_));
+
+    auto dst_iter = dst.begin();
+    for(size_t todo{dst.size()>>2};todo;--todo)
+    {
+        const int pos0{vgetq_lane_s32(pos4, 0)};
+        const int pos1{vgetq_lane_s32(pos4, 1)};
+        const int pos2{vgetq_lane_s32(pos4, 2)};
+        const int pos3{vgetq_lane_s32(pos4, 3)};
+        const float32x4_t val1{set_f4(src[pos0], src[pos1], src[pos2], src[pos3])};
+        const float32x4_t val2{set_f4(src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1])};
+
+        /* val1 + (val2-val1)*mu */
+        const float32x4_t r0{vsubq_f32(val2, val1)};
+        const float32x4_t mu{vmulq_f32(vcvtq_f32_s32(frac4), fracOne4)};
+        const float32x4_t out{vmlaq_f32(val1, mu, r0)};
+
+        vst1q_f32(dst_iter, out);
+        dst_iter += 4;
+
+        frac4 = vaddq_s32(frac4, increment4);
+        pos4 = vaddq_s32(pos4, vshrq_n_s32(frac4, MixerFracBits));
+        frac4 = vandq_s32(frac4, fracMask4);
+    }
+    /* Handle the final 1 to 3 outputs, continuing from the first vector lane. */
+    if(size_t todo{dst.size()&3})
+    {
+        src += static_cast<uint>(vgetq_lane_s32(pos4, 0));
+        frac = static_cast<uint>(vgetq_lane_s32(frac4, 0));
+
+        do {
+            *(dst_iter++) = lerpf(src[0], src[1], static_cast<float>(frac) * (1.0f/MixerFracOne));
+
+            frac += increment;
+            src  += frac>>MixerFracBits;
+            frac &= MixerFracMask;
+        } while(--todo);
+    }
+}
+
+template<>
+void Resample_<CubicTag,NEONTag>(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ /* 4-tap cubic resampler (NEON): produces one output sample per loop pass. */
+    ASSUME(frac < MixerFracOne);
+
+    const CubicCoefficients *RESTRICT filter = al::assume_aligned<16>(state->cubic.filter);
+
+    src -= 1; /* the four taps start one sample before src as passed in */
+    for(float &out_sample : dst)
+    {
+        const uint pi{frac >> CubicPhaseDiffBits};
+        const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
+        const float32x4_t pf4{vdupq_n_f32(pf)};
+
+        /* Apply the phase interpolated filter: pi picks the table entry, pf scales its deltas. */
+
+        /* f = fil + pf*phd */
+        const float32x4_t f4 = vmlaq_f32(vld1q_f32(filter[pi].mCoeffs), pf4,
+            vld1q_f32(filter[pi].mDeltas));
+        /* r = f*src */
+        float32x4_t r4{vmulq_f32(f4, vld1q_f32(src))};
+        /* Horizontal sum: add the pair-swapped vector, then add the low and high halves. */
+        r4 = vaddq_f32(r4, vrev64q_f32(r4));
+        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);
+        /* Step the fixed-point position; whole samples move src, the rest stays in frac. */
+        frac += increment;
+        src  += frac>>MixerFracBits;
+        frac &= MixerFracMask;
+    }
+}
+
+template<>
+void Resample_<BSincTag,NEONTag>(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ /* BSinc resampler (NEON): an m-tap filter interpolated by scale factor and phase. */
+    const float *const filter{state->bsinc.filter};
+    const float32x4_t sf4{vdupq_n_f32(state->bsinc.sf)};
+    const size_t m{state->bsinc.m};
+    ASSUME(m > 0);
+    ASSUME(frac < MixerFracOne);
+
+    src -= state->bsinc.l; /* back up so the m taps are read from src[0] onward */
+    for(float &out_sample : dst)
+    {
+        // Calculate the phase index and factor.
+        const uint pi{frac >> BSincPhaseDiffBits};
+        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
+
+        // Apply the scale and phase interpolated filter.
+        float32x4_t r4{vdupq_n_f32(0.0f)};
+        {
+            const float32x4_t pf4{vdupq_n_f32(pf)};
+            const float *RESTRICT fil{filter + m*pi*2}; // base coefficients for phase pi
+            const float *RESTRICT phd{fil + m}; // weighted by pf
+            const float *RESTRICT scd{fil + BSincPhaseCount*2*m}; // weighted by sf
+            const float *RESTRICT spd{scd + m}; // weighted by pf*sf
+            size_t td{m >> 2}; // four taps per pass; the do/while needs m >= 4
+            size_t j{0u};
+
+            do {
+                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
+                const float32x4_t f4 = vmlaq_f32(
+                    vmlaq_f32(vld1q_f32(&fil[j]), sf4, vld1q_f32(&scd[j])),
+                    pf4, vmlaq_f32(vld1q_f32(&phd[j]), sf4, vld1q_f32(&spd[j])));
+                /* r += f*src */
+                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
+                j += 4;
+            } while(--td);
+        }
+        r4 = vaddq_f32(r4, vrev64q_f32(r4)); // first half of the 4-lane horizontal sum
+        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);
+        /* Step the fixed-point position; whole samples move src, the rest stays in frac. */
+        frac += increment;
+        src  += frac>>MixerFracBits;
+        frac &= MixerFracMask;
+    }
+}
+
+template<>
+void Resample_<FastBSincTag,NEONTag>(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ /* Fast BSinc resampler (NEON): like the BSinc one but without the scale-factor terms. */
+    const float *const filter{state->bsinc.filter};
+    const size_t m{state->bsinc.m};
+    ASSUME(m > 0);
+    ASSUME(frac < MixerFracOne);
+
+    src -= state->bsinc.l; /* back up so the m taps are read from src[0] onward */
+    for(float &out_sample : dst)
+    {
+        // Calculate the phase index and factor.
+        const uint pi{frac >> BSincPhaseDiffBits};
+        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
+
+        // Apply the phase interpolated filter.
+        float32x4_t r4{vdupq_n_f32(0.0f)};
+        {
+            const float32x4_t pf4{vdupq_n_f32(pf)};
+            const float *RESTRICT fil{filter + m*pi*2}; // base coefficients for phase pi
+            const float *RESTRICT phd{fil + m}; // weighted by pf
+            size_t td{m >> 2}; // four taps per pass; the do/while needs m >= 4
+            size_t j{0u};
+
+            do {
+                /* f = fil + pf*phd */
+                const float32x4_t f4 = vmlaq_f32(vld1q_f32(&fil[j]), pf4, vld1q_f32(&phd[j]));
+                /* r += f*src */
+                r4 = vmlaq_f32(r4, f4, vld1q_f32(&src[j]));
+                j += 4;
+            } while(--td);
+        }
+        r4 = vaddq_f32(r4, vrev64q_f32(r4)); // first half of the 4-lane horizontal sum
+        out_sample = vget_lane_f32(vadd_f32(vget_low_f32(r4), vget_high_f32(r4)), 0);
+        /* Step the fixed-point position; whole samples move src, the rest stays in frac. */
+        frac += increment;
+        src  += frac>>MixerFracBits;
+        frac &= MixerFracMask;
+    }
+}
+
+
+template<> /* NEON entry point: forwards unchanged to MixHrtfBase, instantiated with ApplyCoeffs. */
+void MixHrtf_<NEONTag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
+    const MixHrtfFilter *hrtfparams, const size_t BufferSize)
+{ MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, BufferSize); }
+
+template<>
+void MixHrtfBlend_<NEONTag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
+    const HrtfFilter *oldparams, const MixHrtfFilter *newparams, const size_t BufferSize)
+{ /* NEON entry point: forwards unchanged to MixHrtfBlendBase, instantiated with ApplyCoeffs. */
+    MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
+        BufferSize);
+}
+
+template<>
+void MixDirectHrtf_<NEONTag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
+    const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples,
+    float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize)
+{ /* NEON entry point: forwards unchanged to MixDirectHrtfBase, instantiated with ApplyCoeffs. */
+    MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
+        IrSize, BufferSize);
+}
+
+
+template<>
+void Mix_<NEONTag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
+    float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
+{ /* Mixes InSamples into every OutBuffer line at OutPos, using one gain pair per line. */
+    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
+    const auto min_len = minz(Counter, InSamples.size()); /* Counter, capped to the input */
+    const auto aligned_len = minz((min_len+3) & ~size_t{3}, InSamples.size()) - min_len;
+    /* aligned_len: samples from min_len up to the next multiple of 4, capped at the input size. */
+    for(FloatBufferLine &output : OutBuffer)
+        MixLine(InSamples, al::assume_aligned<16>(output.data()+OutPos), *CurrentGains++,
+            *TargetGains++, delta, min_len, aligned_len, Counter);
+}
+
+template<>
+void Mix_<NEONTag>(const al::span<const float> InSamples, float *OutBuffer, float &CurrentGain,
+    const float TargetGain, const size_t Counter)
+{ /* Single-buffer variant: one MixLine call on OutBuffer with a single gain pair. */
+    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
+    const auto min_len = minz(Counter, InSamples.size()); /* Counter, capped to the input */
+    const auto aligned_len = minz((min_len+3) & ~size_t{3}, InSamples.size()) - min_len;
+    /* aligned_len: samples from min_len up to the next multiple of 4, capped at the input size. */
+    MixLine(InSamples, al::assume_aligned<16>(OutBuffer), CurrentGain, TargetGain, delta, min_len,
+        aligned_len, Counter);
+}
diff --git a/core/mixer/mixer_sse.cpp b/core/mixer/mixer_sse.cpp
new file mode 100644
index 00000000..0aa5d5fb
--- /dev/null
+++ b/core/mixer/mixer_sse.cpp
@@ -0,0 +1,327 @@
+#include "config.h"
+
+#include <xmmintrin.h>
+
+#include <cmath>
+#include <limits>
+
+#include "alnumeric.h"
+#include "core/bsinc_defs.h"
+#include "core/cubic_defs.h"
+#include "defs.h"
+#include "hrtfbase.h"
+
+struct SSETag;
+struct CubicTag;
+struct BSincTag;
+struct FastBSincTag;
+
+
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE__)
+#pragma GCC target("sse")
+#endif
+
+namespace {
+
+constexpr uint BSincPhaseDiffBits{MixerFracBits - BSincPhaseBits}; // bits below the phase index
+constexpr uint BSincPhaseDiffOne{1 << BSincPhaseDiffBits};
+constexpr uint BSincPhaseDiffMask{BSincPhaseDiffOne - 1u};
+/* The same split of the fixed-point fraction, sized for the cubic filter's phase bits. */
+constexpr uint CubicPhaseDiffBits{MixerFracBits - CubicPhaseBits};
+constexpr uint CubicPhaseDiffOne{1 << CubicPhaseDiffBits};
+constexpr uint CubicPhaseDiffMask{CubicPhaseDiffOne - 1u};
+/* Multiply-add helper: x + y*z on four packed floats. */
+#define MLA4(x, y, z) _mm_add_ps(x, _mm_mul_ps(y, z))
+
+inline void ApplyCoeffs(float2 *RESTRICT Values, const size_t IrSize, const ConstHrirSpan Coeffs,
+    const float left, const float right)
+{ /* Accumulates Coeffs scaled by {left,right} into Values, two entries per SSE operation. */
+    const __m128 lrlr{_mm_setr_ps(left, right, left, right)};
+
+    ASSUME(IrSize >= MinIrLength);
+    /* This isn't technically correct to test alignment, but it's true for
+     * systems that support SSE, which is the only one that needs to know the
+     * alignment of Values (which alternates between 8- and 16-byte aligned).
+     */
+    if(!(reinterpret_cast<uintptr_t>(Values)&15))
+    { /* Values is 16-byte aligned: plain aligned load, multiply-add, store per pair. */
+        for(size_t i{0};i < IrSize;i += 2)
+        {
+            const __m128 coeffs{_mm_load_ps(Coeffs[i].data())};
+            __m128 vals{_mm_load_ps(Values[i].data())};
+            vals = MLA4(vals, lrlr, coeffs);
+            _mm_store_ps(Values[i].data(), vals);
+        }
+    }
+    else
+    { /* Values is off by one entry: do Values[0] alone, then aligned pairs from Values[1]. */
+        __m128 imp0, imp1;
+        __m128 coeffs{_mm_load_ps(Coeffs[0].data())};
+        __m128 vals{_mm_loadl_pi(_mm_setzero_ps(), reinterpret_cast<__m64*>(Values[0].data()))};
+        imp0 = _mm_mul_ps(lrlr, coeffs);
+        vals = _mm_add_ps(imp0, vals);
+        _mm_storel_pi(reinterpret_cast<__m64*>(Values[0].data()), vals);
+        size_t td{((IrSize+1)>>1) - 1};
+        size_t i{1};
+        do {
+            coeffs = _mm_load_ps(Coeffs[i+1].data());
+            vals = _mm_load_ps(Values[i].data());
+            imp1 = _mm_mul_ps(lrlr, coeffs);
+            imp0 = _mm_shuffle_ps(imp0, imp1, _MM_SHUFFLE(1, 0, 3, 2)); /* {imp0 high, imp1 low} */
+            vals = _mm_add_ps(imp0, vals);
+            _mm_store_ps(Values[i].data(), vals);
+            imp0 = imp1;
+            i += 2;
+        } while(--td);
+        vals = _mm_loadl_pi(vals, reinterpret_cast<__m64*>(Values[i].data()));
+        imp0 = _mm_movehl_ps(imp0, imp0); /* bring the leftover high pair down to the low half */
+        vals = _mm_add_ps(imp0, vals);
+        _mm_storel_pi(reinterpret_cast<__m64*>(Values[i].data()), vals);
+    }
+}
+
+force_inline void MixLine(const al::span<const float> InSamples, float *RESTRICT dst,
+    float &CurrentGain, const float TargetGain, const float delta, const size_t min_len,
+    const size_t aligned_len, size_t Counter)
+{ /* Adds InSamples*gain into dst: a stepped gain over min_len samples, then a constant gain. */
+    float gain{CurrentGain};
+    const float step{(TargetGain-gain) * delta};
+
+    size_t pos{0};
+    if(!(std::abs(step) > std::numeric_limits<float>::epsilon())) /* no usable step */
+        gain = TargetGain;
+    else
+    {
+        float step_count{0.0f};
+        /* Mix with applying gain steps in aligned multiples of 4. */
+        if(size_t todo{min_len >> 2})
+        {
+            const __m128 four4{_mm_set1_ps(4.0f)};
+            const __m128 step4{_mm_set1_ps(step)};
+            const __m128 gain4{_mm_set1_ps(gain)};
+            __m128 step_count4{_mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f)};
+            do {
+                const __m128 val4{_mm_load_ps(&InSamples[pos])};
+                __m128 dry4{_mm_load_ps(&dst[pos])};
+
+                /* dry += val * (gain + step*step_count) */
+                dry4 = MLA4(dry4, val4, MLA4(gain4, step4, step_count4));
+
+                _mm_store_ps(&dst[pos], dry4);
+                step_count4 = _mm_add_ps(step_count4, four4);
+                pos += 4;
+            } while(--todo);
+            /* NOTE: step_count4 now represents the next four counts after the
+             * last four mixed samples, so the lowest element represents the
+             * next step count to apply.
+             */
+            step_count = _mm_cvtss_f32(step_count4);
+        }
+        /* Mix with applying left over gain steps that aren't aligned multiples of 4. */
+        for(size_t leftover{min_len&3};leftover;++pos,--leftover)
+        {
+            dst[pos] += InSamples[pos] * (gain + step*step_count);
+            step_count += 1.0f;
+        }
+        if(pos == Counter) /* every step was applied: use the exact target value */
+            gain = TargetGain;
+        else
+            gain += step*step_count;
+
+        /* Mix until pos is aligned with 4 or the mix is done. */
+        for(size_t leftover{aligned_len&3};leftover;++pos,--leftover)
+            dst[pos] += InSamples[pos] * gain;
+    }
+    CurrentGain = gain; /* hand the gain reached back through the reference parameter */
+    /* Skip the constant-gain remainder when the gain is at or below the silence threshold. */
+    if(!(std::abs(gain) > GainSilenceThreshold))
+        return;
+    if(size_t todo{(InSamples.size()-pos) >> 2})
+    {
+        const __m128 gain4{_mm_set1_ps(gain)};
+        do {
+            const __m128 val4{_mm_load_ps(&InSamples[pos])};
+            __m128 dry4{_mm_load_ps(&dst[pos])};
+            dry4 = _mm_add_ps(dry4, _mm_mul_ps(val4, gain4));
+            _mm_store_ps(&dst[pos], dry4);
+            pos += 4;
+        } while(--todo);
+    }
+    for(size_t leftover{(InSamples.size()-pos)&3};leftover;++pos,--leftover)
+        dst[pos] += InSamples[pos] * gain;
+}
+
+} // namespace
+
+template<>
+void Resample_<CubicTag,SSETag>(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ /* 4-tap cubic resampler (SSE): produces one output sample per loop pass. */
+    ASSUME(frac < MixerFracOne);
+
+    const CubicCoefficients *RESTRICT filter = al::assume_aligned<16>(state->cubic.filter);
+
+    src -= 1; /* the four taps start one sample before src as passed in */
+    for(float &out_sample : dst)
+    {
+        const uint pi{frac >> CubicPhaseDiffBits};
+        const float pf{static_cast<float>(frac&CubicPhaseDiffMask) * (1.0f/CubicPhaseDiffOne)};
+        const __m128 pf4{_mm_set1_ps(pf)};
+
+        /* Apply the phase interpolated filter: pi picks the table entry, pf scales its deltas. */
+
+        /* f = fil + pf*phd */
+        const __m128 f4 = MLA4(_mm_load_ps(filter[pi].mCoeffs), pf4,
+            _mm_load_ps(filter[pi].mDeltas));
+        /* r = f*src (src is read with an unaligned load) */
+        __m128 r4{_mm_mul_ps(f4, _mm_loadu_ps(src))};
+        /* Horizontal sum: add the reversed vector, then fold the high half onto the low. */
+        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
+        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
+        out_sample = _mm_cvtss_f32(r4);
+        /* Step the fixed-point position; whole samples move src, the rest stays in frac. */
+        frac += increment;
+        src  += frac>>MixerFracBits;
+        frac &= MixerFracMask;
+    }
+}
+
+template<>
+void Resample_<BSincTag,SSETag>(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ /* BSinc resampler (SSE): an m-tap filter interpolated by scale factor and phase. */
+    const float *const filter{state->bsinc.filter};
+    const __m128 sf4{_mm_set1_ps(state->bsinc.sf)};
+    const size_t m{state->bsinc.m};
+    ASSUME(m > 0);
+    ASSUME(frac < MixerFracOne);
+
+    src -= state->bsinc.l; /* back up so the m taps are read from src[0] onward */
+    for(float &out_sample : dst)
+    {
+        // Calculate the phase index and factor.
+        const uint pi{frac >> BSincPhaseDiffBits};
+        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
+
+        // Apply the scale and phase interpolated filter.
+        __m128 r4{_mm_setzero_ps()};
+        {
+            const __m128 pf4{_mm_set1_ps(pf)};
+            const float *RESTRICT fil{filter + m*pi*2}; // base coefficients for phase pi
+            const float *RESTRICT phd{fil + m}; // weighted by pf
+            const float *RESTRICT scd{fil + BSincPhaseCount*2*m}; // weighted by sf
+            const float *RESTRICT spd{scd + m}; // weighted by pf*sf
+            size_t td{m >> 2}; // four taps per pass; the do/while needs m >= 4
+            size_t j{0u};
+
+            do {
+                /* f = ((fil + sf*scd) + pf*(phd + sf*spd)) */
+                const __m128 f4 = MLA4(
+                    MLA4(_mm_load_ps(&fil[j]), sf4, _mm_load_ps(&scd[j])),
+                    pf4, MLA4(_mm_load_ps(&phd[j]), sf4, _mm_load_ps(&spd[j])));
+                /* r += f*src */
+                r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
+                j += 4;
+            } while(--td);
+        }
+        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3))); // horizontal sum
+        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
+        out_sample = _mm_cvtss_f32(r4);
+        /* Step the fixed-point position; whole samples move src, the rest stays in frac. */
+        frac += increment;
+        src  += frac>>MixerFracBits;
+        frac &= MixerFracMask;
+    }
+}
+
+template<>
+void Resample_<FastBSincTag,SSETag>(const InterpState *state, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ /* Fast BSinc resampler (SSE): like the BSinc one but without the scale-factor terms. */
+    const float *const filter{state->bsinc.filter};
+    const size_t m{state->bsinc.m};
+    ASSUME(m > 0);
+    ASSUME(frac < MixerFracOne);
+
+    src -= state->bsinc.l; /* back up so the m taps are read from src[0] onward */
+    for(float &out_sample : dst)
+    {
+        // Calculate the phase index and factor.
+        const uint pi{frac >> BSincPhaseDiffBits};
+        const float pf{static_cast<float>(frac&BSincPhaseDiffMask) * (1.0f/BSincPhaseDiffOne)};
+
+        // Apply the phase interpolated filter.
+        __m128 r4{_mm_setzero_ps()};
+        {
+            const __m128 pf4{_mm_set1_ps(pf)};
+            const float *RESTRICT fil{filter + m*pi*2}; // base coefficients for phase pi
+            const float *RESTRICT phd{fil + m}; // weighted by pf
+            size_t td{m >> 2}; // four taps per pass; the do/while needs m >= 4
+            size_t j{0u};
+
+            do {
+                /* f = fil + pf*phd */
+                const __m128 f4 = MLA4(_mm_load_ps(&fil[j]), pf4, _mm_load_ps(&phd[j]));
+                /* r += f*src */
+                r4 = MLA4(r4, f4, _mm_loadu_ps(&src[j]));
+                j += 4;
+            } while(--td);
+        }
+        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3))); // horizontal sum
+        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
+        out_sample = _mm_cvtss_f32(r4);
+        /* Step the fixed-point position; whole samples move src, the rest stays in frac. */
+        frac += increment;
+        src  += frac>>MixerFracBits;
+        frac &= MixerFracMask;
+    }
+}
+
+
+template<> /* SSE entry point: forwards unchanged to MixHrtfBase, instantiated with ApplyCoeffs. */
+void MixHrtf_<SSETag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
+    const MixHrtfFilter *hrtfparams, const size_t BufferSize)
+{ MixHrtfBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, hrtfparams, BufferSize); }
+
+template<>
+void MixHrtfBlend_<SSETag>(const float *InSamples, float2 *AccumSamples, const uint IrSize,
+    const HrtfFilter *oldparams, const MixHrtfFilter *newparams, const size_t BufferSize)
+{ /* SSE entry point: forwards unchanged to MixHrtfBlendBase, instantiated with ApplyCoeffs. */
+    MixHrtfBlendBase<ApplyCoeffs>(InSamples, AccumSamples, IrSize, oldparams, newparams,
+        BufferSize);
+}
+
+template<>
+void MixDirectHrtf_<SSETag>(const FloatBufferSpan LeftOut, const FloatBufferSpan RightOut,
+    const al::span<const FloatBufferLine> InSamples, float2 *AccumSamples,
+    float *TempBuf, HrtfChannelState *ChanState, const size_t IrSize, const size_t BufferSize)
+{ /* SSE entry point: forwards unchanged to MixDirectHrtfBase, instantiated with ApplyCoeffs. */
+    MixDirectHrtfBase<ApplyCoeffs>(LeftOut, RightOut, InSamples, AccumSamples, TempBuf, ChanState,
+        IrSize, BufferSize);
+}
+
+
+template<>
+void Mix_<SSETag>(const al::span<const float> InSamples, const al::span<FloatBufferLine> OutBuffer,
+    float *CurrentGains, const float *TargetGains, const size_t Counter, const size_t OutPos)
+{ /* Mixes InSamples into every OutBuffer line at OutPos, using one gain pair per line. */
+    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
+    const auto min_len = minz(Counter, InSamples.size()); /* Counter, capped to the input */
+    const auto aligned_len = minz((min_len+3) & ~size_t{3}, InSamples.size()) - min_len;
+    /* aligned_len: samples from min_len up to the next multiple of 4, capped at the input size. */
+    for(FloatBufferLine &output : OutBuffer)
+        MixLine(InSamples, al::assume_aligned<16>(output.data()+OutPos), *CurrentGains++,
+            *TargetGains++, delta, min_len, aligned_len, Counter);
+}
+
+template<>
+void Mix_<SSETag>(const al::span<const float> InSamples, float *OutBuffer, float &CurrentGain,
+    const float TargetGain, const size_t Counter)
+{ /* Single-buffer variant: one MixLine call on OutBuffer; CurrentGain is updated in place. */
+    const float delta{(Counter > 0) ? 1.0f / static_cast<float>(Counter) : 0.0f};
+    const auto min_len = minz(Counter, InSamples.size()); /* Counter, capped to the input */
+    const auto aligned_len = minz((min_len+3) & ~size_t{3}, InSamples.size()) - min_len;
+    /* aligned_len: samples from min_len up to the next multiple of 4, capped at the input size. */
+    MixLine(InSamples, al::assume_aligned<16>(OutBuffer), CurrentGain, TargetGain, delta, min_len,
+        aligned_len, Counter);
+}
diff --git a/core/mixer/mixer_sse2.cpp b/core/mixer/mixer_sse2.cpp
new file mode 100644
index 00000000..edaaf7a1
--- /dev/null
+++ b/core/mixer/mixer_sse2.cpp
@@ -0,0 +1,90 @@
+/**
+ * OpenAL cross platform audio library
+ * Copyright (C) 2014 by Timothy Arceri <[email protected]>.
+ * This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Library General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ *  License along with this library; if not, write to the
+ *  Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * Or go to http://www.gnu.org/copyleft/lgpl.html
+ */
+
+#include "config.h"
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+#include "alnumeric.h"
+#include "defs.h"
+
+struct SSE2Tag;
+struct LerpTag;
+
+
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE2__)
+#pragma GCC target("sse2")
+#endif
+
+template<>
+void Resample_<LerpTag,SSE2Tag>(const InterpState*, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ /* Linear-interpolating resampler (SSE2): fills dst, four samples per pass plus a tail. */
+    ASSUME(frac < MixerFracOne);
+    /* Every lane steps by four increments per pass, since a pass emits four outputs. */
+    const __m128i increment4{_mm_set1_epi32(static_cast<int>(increment*4))};
+    const __m128 fracOne4{_mm_set1_ps(1.0f/MixerFracOne)};
+    const __m128i fracMask4{_mm_set1_epi32(MixerFracMask)};
+    /* NOTE(review): InitPosArrays is defined elsewhere; presumably it sets each lane's start. */
+    alignas(16) uint pos_[4], frac_[4];
+    InitPosArrays(frac, increment, frac_, pos_);
+    __m128i frac4{_mm_setr_epi32(static_cast<int>(frac_[0]), static_cast<int>(frac_[1]),
+        static_cast<int>(frac_[2]), static_cast<int>(frac_[3]))};
+    __m128i pos4{_mm_setr_epi32(static_cast<int>(pos_[0]), static_cast<int>(pos_[1]),
+        static_cast<int>(pos_[2]), static_cast<int>(pos_[3]))};
+
+    auto dst_iter = dst.begin();
+    for(size_t todo{dst.size()>>2};todo;--todo)
+    {
+        const int pos0{_mm_cvtsi128_si32(pos4)};
+        const int pos1{_mm_cvtsi128_si32(_mm_srli_si128(pos4, 4))}; /* byte-shift lane 1 down */
+        const int pos2{_mm_cvtsi128_si32(_mm_srli_si128(pos4, 8))};
+        const int pos3{_mm_cvtsi128_si32(_mm_srli_si128(pos4, 12))};
+        const __m128 val1{_mm_setr_ps(src[pos0  ], src[pos1  ], src[pos2  ], src[pos3  ])};
+        const __m128 val2{_mm_setr_ps(src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1])};
+
+        /* val1 + (val2-val1)*mu, where mu is frac scaled by 1/MixerFracOne */
+        const __m128 r0{_mm_sub_ps(val2, val1)};
+        const __m128 mu{_mm_mul_ps(_mm_cvtepi32_ps(frac4), fracOne4)};
+        const __m128 out{_mm_add_ps(val1, _mm_mul_ps(mu, r0))};
+
+        _mm_store_ps(dst_iter, out); /* aligned store: needs a 16-byte aligned destination */
+        dst_iter += 4;
+        /* Advance all lanes: carry whole samples into pos4 and keep only the fraction bits. */
+        frac4 = _mm_add_epi32(frac4, increment4);
+        pos4 = _mm_add_epi32(pos4, _mm_srli_epi32(frac4, MixerFracBits));
+        frac4 = _mm_and_si128(frac4, fracMask4);
+    }
+    /* Scalar tail (1-3 samples), continuing from lane 0's position and fraction. */
+    if(size_t todo{dst.size()&3})
+    {
+        src += static_cast<uint>(_mm_cvtsi128_si32(pos4));
+        frac = static_cast<uint>(_mm_cvtsi128_si32(frac4));
+
+        do {
+            *(dst_iter++) = lerpf(src[0], src[1], static_cast<float>(frac) * (1.0f/MixerFracOne));
+
+            frac += increment;
+            src  += frac>>MixerFracBits;
+            frac &= MixerFracMask;
+        } while(--todo);
+    }
+}
diff --git a/core/mixer/mixer_sse3.cpp b/core/mixer/mixer_sse3.cpp
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/core/mixer/mixer_sse3.cpp
diff --git a/core/mixer/mixer_sse41.cpp b/core/mixer/mixer_sse41.cpp
new file mode 100644
index 00000000..8ccd9fd3
--- /dev/null
+++ b/core/mixer/mixer_sse41.cpp
@@ -0,0 +1,95 @@
+/**
+ * OpenAL cross platform audio library
+ * Copyright (C) 2014 by Timothy Arceri <[email protected]>.
+ * This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Library General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ *  License along with this library; if not, write to the
+ *  Free Software Foundation, Inc.,
+ *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ * Or go to http://www.gnu.org/copyleft/lgpl.html
+ */
+
+#include "config.h"
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "alnumeric.h"
+#include "defs.h"
+
+struct SSE4Tag;
+struct LerpTag;
+
+
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__SSE4_1__)
+#pragma GCC target("sse4.1")
+#endif
+
+template<>
+void Resample_<LerpTag,SSE4Tag>(const InterpState*, const float *RESTRICT src, uint frac,
+    const uint increment, const al::span<float> dst)
+{ /* Linear-interpolating resampler (SSE4.1): fills dst, four samples per pass plus a tail. */
+    ASSUME(frac < MixerFracOne);
+    /* Every lane steps by four increments per pass, since a pass emits four outputs. */
+    const __m128i increment4{_mm_set1_epi32(static_cast<int>(increment*4))};
+    const __m128 fracOne4{_mm_set1_ps(1.0f/MixerFracOne)};
+    const __m128i fracMask4{_mm_set1_epi32(MixerFracMask)};
+    /* NOTE(review): InitPosArrays is defined elsewhere; presumably it sets each lane's start. */
+    alignas(16) uint pos_[4], frac_[4];
+    InitPosArrays(frac, increment, frac_, pos_);
+    __m128i frac4{_mm_setr_epi32(static_cast<int>(frac_[0]), static_cast<int>(frac_[1]),
+        static_cast<int>(frac_[2]), static_cast<int>(frac_[3]))};
+    __m128i pos4{_mm_setr_epi32(static_cast<int>(pos_[0]), static_cast<int>(pos_[1]),
+        static_cast<int>(pos_[2]), static_cast<int>(pos_[3]))};
+
+    auto dst_iter = dst.begin();
+    for(size_t todo{dst.size()>>2};todo;--todo)
+    {
+        const int pos0{_mm_extract_epi32(pos4, 0)}; /* direct per-lane reads of the positions */
+        const int pos1{_mm_extract_epi32(pos4, 1)};
+        const int pos2{_mm_extract_epi32(pos4, 2)};
+        const int pos3{_mm_extract_epi32(pos4, 3)};
+        const __m128 val1{_mm_setr_ps(src[pos0  ], src[pos1  ], src[pos2  ], src[pos3  ])};
+        const __m128 val2{_mm_setr_ps(src[pos0+1], src[pos1+1], src[pos2+1], src[pos3+1])};
+
+        /* val1 + (val2-val1)*mu, where mu is frac scaled by 1/MixerFracOne */
+        const __m128 r0{_mm_sub_ps(val2, val1)};
+        const __m128 mu{_mm_mul_ps(_mm_cvtepi32_ps(frac4), fracOne4)};
+        const __m128 out{_mm_add_ps(val1, _mm_mul_ps(mu, r0))};
+
+        _mm_store_ps(dst_iter, out); /* aligned store: needs a 16-byte aligned destination */
+        dst_iter += 4;
+        /* Advance all lanes: carry whole samples into pos4 and keep only the fraction bits. */
+        frac4 = _mm_add_epi32(frac4, increment4);
+        pos4 = _mm_add_epi32(pos4, _mm_srli_epi32(frac4, MixerFracBits));
+        frac4 = _mm_and_si128(frac4, fracMask4);
+    }
+    /* Scalar tail for the remaining 1-3 samples. */
+    if(size_t todo{dst.size()&3})
+    {
+        /* NOTE: These four elements represent the position *after* the last
+         * four samples, so the lowest element is the next position to
+         * resample.
+         */
+        src += static_cast<uint>(_mm_cvtsi128_si32(pos4));
+        frac = static_cast<uint>(_mm_cvtsi128_si32(frac4));
+
+        do {
+            *(dst_iter++) = lerpf(src[0], src[1], static_cast<float>(frac) * (1.0f/MixerFracOne));
+
+            frac += increment;
+            src  += frac>>MixerFracBits;
+            frac &= MixerFracMask;
+        } while(--todo);
+    }
+}
diff --git a/core/resampler_limits.h b/core/resampler_limits.h
new file mode 100644
index 00000000..9d4cefda
--- /dev/null
+++ b/core/resampler_limits.h
@@ -0,0 +1,12 @@
+#ifndef CORE_RESAMPLER_LIMITS_H
+#define CORE_RESAMPLER_LIMITS_H
+
+/* Maximum number of samples to pad on the ends of a buffer for resampling.
+ * Note that the padding is symmetric (half at the beginning and half at the
+ * end)!
+ */
+constexpr int MaxResamplerPadding{48};
+/* Half of the total padding, i.e. the amount on a single end. */
+constexpr int MaxResamplerEdge{MaxResamplerPadding >> 1};
+
+#endif /* CORE_RESAMPLER_LIMITS_H */
diff --git a/core/rtkit.cpp b/core/rtkit.cpp
new file mode 100644
index 00000000..ff944ebf
--- /dev/null
+++ b/core/rtkit.cpp
@@ -0,0 +1,236 @@
+/*-*- Mode: C; c-basic-offset: 8 -*-*/
+
+/***
+        Copyright 2009 Lennart Poettering
+        Copyright 2010 David Henningsson <[email protected]>
+        Copyright 2021 Chris Robinson
+
+        Permission is hereby granted, free of charge, to any person
+        obtaining a copy of this software and associated documentation files
+        (the "Software"), to deal in the Software without restriction,
+        including without limitation the rights to use, copy, modify, merge,
+        publish, distribute, sublicense, and/or sell copies of the Software,
+        and to permit persons to whom the Software is furnished to do so,
+        subject to the following conditions:
+
+        The above copyright notice and this permission notice shall be
+        included in all copies or substantial portions of the Software.
+
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+        EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+        MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+        NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+        BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+        ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+        CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+***/
+
+#include "config.h"
+
+#include "rtkit.h"
+
+#include <errno.h>
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <memory>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#ifdef __linux__
+#include <sys/syscall.h>
+#elif defined(__FreeBSD__)
+#include <sys/thr.h>
+#endif
+
+
+namespace dbus {
+/* Single-character argument type codes used with the dbus_message_* calls in this file. */
+constexpr int TypeString{'s'};
+constexpr int TypeVariant{'v'};
+constexpr int TypeInt32{'i'};
+constexpr int TypeUInt32{'u'};
+constexpr int TypeInt64{'x'};
+constexpr int TypeUInt64{'t'};
+constexpr int TypeInvalid{'\0'};
+/* Deleter so that a unique_ptr releases its DBusMessage with dbus_message_unref. */
+struct MessageDeleter {
+    void operator()(DBusMessage *m) { dbus_message_unref(m); }
+};
+using MessagePtr = std::unique_ptr<DBusMessage,MessageDeleter>;
+
+} // namespace dbus
+
+namespace {
+
+inline pid_t _gettid()
+{ /* Returns the calling thread's kernel thread id, or 0 where no method is known. */
+#ifdef __linux__
+    return static_cast<pid_t>(syscall(SYS_gettid));
+#elif defined(__FreeBSD__)
+    long pid{};
+    thr_self(&pid);
+    return static_cast<pid_t>(pid);
+#else
+#warning gettid not available
+    return 0; /* callers in this file treat 0 as "not supported" */
+#endif
+}
+
+int translate_error(const char *name)
+{ /* Maps a D-Bus error name to a negative errno value; unrecognized names become -EIO. */
+    if(strcmp(name, DBUS_ERROR_NO_MEMORY) == 0)
+        return -ENOMEM;
+    if(strcmp(name, DBUS_ERROR_SERVICE_UNKNOWN) == 0
+        || strcmp(name, DBUS_ERROR_NAME_HAS_NO_OWNER) == 0)
+        return -ENOENT;
+    if(strcmp(name, DBUS_ERROR_ACCESS_DENIED) == 0
+        || strcmp(name, DBUS_ERROR_AUTH_FAILED) == 0)
+        return -EACCES;
+    return -EIO;
+}
+
+int rtkit_get_int_property(DBusConnection *connection, const char *propname, long long *propval)
+{ /* Fetches an integer property via org.freedesktop.DBus.Properties.Get into *propval. */
+    dbus::MessagePtr m{dbus_message_new_method_call(RTKIT_SERVICE_NAME, RTKIT_OBJECT_PATH,
+        "org.freedesktop.DBus.Properties", "Get")};
+    if(!m) return -ENOMEM;
+    /* The call takes two strings: the interface (RTKIT_SERVICE_NAME here) and the property. */
+    const char *interfacestr = RTKIT_SERVICE_NAME;
+    auto ready = dbus_message_append_args(m.get(),
+        dbus::TypeString, &interfacestr,
+        dbus::TypeString, &propname,
+        dbus::TypeInvalid);
+    if(!ready) return -ENOMEM;
+
+    dbus::Error error;
+    dbus::MessagePtr r{dbus_connection_send_with_reply_and_block(connection, m.get(), -1,
+        &error.get())};
+    if(!r) return translate_error(error->name);
+    /* A reply can itself carry an error; convert that to an errno value as well. */
+    if(dbus_set_error_from_message(&error.get(), r.get()))
+        return translate_error(error->name);
+    /* Walk the reply for a variant holding an int32 or int64; stays -EBADMSG if none is found. */
+    int ret{-EBADMSG};
+    DBusMessageIter iter{};
+    dbus_message_iter_init(r.get(), &iter);
+    while(int curtype{dbus_message_iter_get_arg_type(&iter)})
+    {
+        if(curtype == dbus::TypeVariant)
+        {
+            DBusMessageIter subiter{};
+            dbus_message_iter_recurse(&iter, &subiter);
+
+            while((curtype=dbus_message_iter_get_arg_type(&subiter)) != dbus::TypeInvalid)
+            {
+                if(curtype == dbus::TypeInt32)
+                {
+                    dbus_int32_t i32{};
+                    dbus_message_iter_get_basic(&subiter, &i32);
+                    *propval = i32;
+                    ret = 0;
+                }
+
+                if(curtype == dbus::TypeInt64)
+                {
+                    dbus_int64_t i64{};
+                    dbus_message_iter_get_basic(&subiter, &i64);
+                    *propval = i64;
+                    ret = 0;
+                }
+
+                dbus_message_iter_next(&subiter);
+            }
+        }
+        dbus_message_iter_next(&iter);
+    }
+    /* 0 when a value was stored in *propval, otherwise -EBADMSG. */
+    return ret;
+}
+
+} // namespace
+
+int rtkit_get_max_realtime_priority(DBusConnection *connection)
+{ /* Returns the "MaxRealtimePriority" property, or a negative errno value on failure. */
+    long long retval{};
+    int err{rtkit_get_int_property(connection, "MaxRealtimePriority", &retval)};
+    return err < 0 ? err : static_cast<int>(retval);
+}
+
+int rtkit_get_min_nice_level(DBusConnection *connection, int *min_nice_level)
+{ /* Stores the "MinNiceLevel" property in *min_nice_level on success; returns the status. */
+    long long retval{};
+    int err{rtkit_get_int_property(connection, "MinNiceLevel", &retval)};
+    if(err >= 0) *min_nice_level = static_cast<int>(retval); /* left untouched on failure */
+    return err;
+}
+
+long long rtkit_get_rttime_usec_max(DBusConnection *connection)
+{ /* Returns the "RTTimeUSecMax" property, or a negative errno value on failure. */
+    long long retval{};
+    int err{rtkit_get_int_property(connection, "RTTimeUSecMax", &retval)};
+    return err < 0 ? err : retval;
+}
+
+int rtkit_make_realtime(DBusConnection *connection, pid_t thread, int priority)
+{ /* Calls RealtimeKit1.MakeThreadRealtime(thread, priority); returns 0 or a negative errno. */
+    if(thread == 0)
+        thread = _gettid(); /* 0 selects the calling thread */
+    if(thread == 0)
+        return -ENOTSUP; /* _gettid() could not provide an id */
+
+    dbus::MessagePtr m{dbus_message_new_method_call(RTKIT_SERVICE_NAME, RTKIT_OBJECT_PATH,
+        "org.freedesktop.RealtimeKit1", "MakeThreadRealtime")};
+    if(!m) return -ENOMEM;
+    /* Arguments are sent as a 64-bit unsigned thread id and a 32-bit unsigned priority. */
+    auto u64 = static_cast<dbus_uint64_t>(thread);
+    auto u32 = static_cast<dbus_uint32_t>(priority);
+    auto ready = dbus_message_append_args(m.get(),
+        dbus::TypeUInt64, &u64,
+        dbus::TypeUInt32, &u32,
+        dbus::TypeInvalid);
+    if(!ready) return -ENOMEM;
+
+    dbus::Error error;
+    dbus::MessagePtr r{dbus_connection_send_with_reply_and_block(connection, m.get(), -1,
+        &error.get())};
+    if(!r) return translate_error(error->name);
+    /* A reply can itself carry an error; convert that to an errno value as well. */
+    if(dbus_set_error_from_message(&error.get(), r.get()))
+        return translate_error(error->name);
+
+    return 0;
+}
+
+int rtkit_make_high_priority(DBusConnection *connection, pid_t thread, int nice_level)
+{ /* Calls RealtimeKit1.MakeThreadHighPriority(thread, nice_level); returns 0 or -errno. */
+    if(thread == 0)
+        thread = _gettid(); /* 0 selects the calling thread */
+    if(thread == 0)
+        return -ENOTSUP; /* _gettid() could not provide an id */
+
+    dbus::MessagePtr m{dbus_message_new_method_call(RTKIT_SERVICE_NAME, RTKIT_OBJECT_PATH,
+        "org.freedesktop.RealtimeKit1", "MakeThreadHighPriority")};
+    if(!m) return -ENOMEM;
+    /* Arguments are sent as a 64-bit unsigned thread id and a 32-bit signed nice level. */
+    auto u64 = static_cast<dbus_uint64_t>(thread);
+    auto s32 = static_cast<dbus_int32_t>(nice_level);
+    auto ready = dbus_message_append_args(m.get(),
+        dbus::TypeUInt64, &u64,
+        dbus::TypeInt32, &s32,
+        dbus::TypeInvalid);
+    if(!ready) return -ENOMEM;
+
+    dbus::Error error;
+    dbus::MessagePtr r{dbus_connection_send_with_reply_and_block(connection, m.get(), -1,
+        &error.get())};
+    if(!r) return translate_error(error->name);
+    /* A reply can itself carry an error; convert that to an errno value as well. */
+    if(dbus_set_error_from_message(&error.get(), r.get()))
+        return translate_error(error->name);
+
+    return 0;
+}
diff --git a/core/rtkit.h b/core/rtkit.h
new file mode 100644
index 00000000..d4994e27
--- /dev/null
+++ b/core/rtkit.h
@@ -0,0 +1,71 @@
+/*-*- Mode: C; c-basic-offset: 8 -*-*/
+
+#ifndef foortkithfoo
+#define foortkithfoo
+
+/***
+        Copyright 2009 Lennart Poettering
+        Copyright 2010 David Henningsson <[email protected]>
+
+        Permission is hereby granted, free of charge, to any person
+        obtaining a copy of this software and associated documentation files
+        (the "Software"), to deal in the Software without restriction,
+        including without limitation the rights to use, copy, modify, merge,
+        publish, distribute, sublicense, and/or sell copies of the Software,
+        and to permit persons to whom the Software is furnished to do so,
+        subject to the following conditions:
+
+        The above copyright notice and this permission notice shall be
+        included in all copies or substantial portions of the Software.
+
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+        EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+        MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+        NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+        BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+        ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+        CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+***/
+
+#include <sys/types.h>
+
+#include "dbus_wrap.h"
+
+/* This is the reference implementation for a client for
+ * RealtimeKit. You don't have to use this, but if you do, just copy
+ * these sources into your repository. */
+
+#define RTKIT_SERVICE_NAME "org.freedesktop.RealtimeKit1"
+#define RTKIT_OBJECT_PATH "/org/freedesktop/RealtimeKit1"
+
+/* This is mostly equivalent to sched_setscheduler(thread, SCHED_RR, {
+ * .sched_priority = priority }). 'thread' needs to be a kernel thread
+ * id as returned by gettid(), not a pthread_t! If 'thread' is 0 the
+ * current thread is used. The returned value is a negative errno
+ * style error code, or 0 on success. */
+int rtkit_make_realtime(DBusConnection *system_bus, pid_t thread, int priority);
+
+/* This is mostly equivalent to setpriority(PRIO_PROCESS, thread,
+ * nice_level). 'thread' needs to be a kernel thread id as returned by
+ * gettid(), not a pthread_t! If 'thread' is 0 the current thread is
+ * used. The returned value is a negative errno style error code, or 0
+ * on success. */
+int rtkit_make_high_priority(DBusConnection *system_bus, pid_t thread, int nice_level);
+
+/* Return the maximum value of realtime priority available. Realtime requests
+ * above this value will fail. A negative value is an errno style error code.
+ */
+int rtkit_get_max_realtime_priority(DBusConnection *system_bus);
+
+/* Retrieve the minimum value of nice level available. High prio requests
+ * below this value will fail. The returned value is a negative errno
+ * style error code, or 0 on success. */
+int rtkit_get_min_nice_level(DBusConnection *system_bus, int *min_nice_level);
+
+/* Return the maximum value of RLIMIT_RTTIME to set before attempting a
+ * realtime request. A negative value is an errno style error code.
+ */
+long long rtkit_get_rttime_usec_max(DBusConnection *system_bus);
+
+#endif
diff --git a/core/uhjfilter.cpp b/core/uhjfilter.cpp
new file mode 100644
index 00000000..df50956a
--- /dev/null
+++ b/core/uhjfilter.cpp
@@ -0,0 +1,539 @@
+
+#include "config.h"
+
+#include "uhjfilter.h"
+
+#include <algorithm>
+#include <iterator>
+
+#include "alcomplex.h"
+#include "alnumeric.h"
+#include "opthelpers.h"
+#include "phase_shifter.h"
+
+
+UhjQualityType UhjDecodeQuality{UhjQualityType::Default}; /* Global decode quality selection. */
+UhjQualityType UhjEncodeQuality{UhjQualityType::Default}; /* Global encode quality selection. */
+
+
+namespace {
+
+const PhaseShifterT<UhjLength256> PShiftLq{}; /* Shared shifter for the 256-length FIR path. */
+const PhaseShifterT<UhjLength512> PShiftHq{}; /* Shared shifter for the 512-length FIR path. */
+
+template<size_t N>
+struct GetPhaseShifter; /* Maps a filter length N to its shared shifter; only specialized below. */
+template<>
+struct GetPhaseShifter<UhjLength256> { static auto& Get() noexcept { return PShiftLq; } };
+template<>
+struct GetPhaseShifter<UhjLength512> { static auto& Get() noexcept { return PShiftHq; } };
+
+
+constexpr float square(float x) noexcept
+{ return x*x; } /* Compile-time helper; the coefficient tables below store squared values. */
+
+/* Filter coefficients for the 'base' all-pass IIR, which applies a frequency-
+ * dependent phase-shift of N degrees. There is one coefficient per chained
+ * section. The filter output requires a 1-sample delay, applied by callers.
+ */
+constexpr std::array<float,4> Filter1Coeff{{
+    square(0.6923878f), square(0.9360654322959f), square(0.9882295226860f),
+    square(0.9987488452737f)
+}};
+/* Filter coefficients for the offset all-pass IIR, which applies a frequency-
+ * dependent phase-shift of N+90 degrees. One coefficient per chained section.
+ */
+constexpr std::array<float,4> Filter2Coeff{{
+    square(0.4021921162426f), square(0.8561710882420f), square(0.9722909545651f),
+    square(0.9952884791278f)
+}};
+
+} // namespace
+
+void UhjAllPassFilter::process(const al::span<const float,4> coeffs,
+    const al::span<const float> src, const bool updateState, float *RESTRICT dst)
+{
+    /* Filter on a scratch copy of the state so it is only committed on request. */
+    auto work = mState;
+    for(const float input : src)
+    {
+        float sample{input};
+        for(size_t stage{0};stage < 4;++stage)
+        {
+            const float out{sample*coeffs[stage] + work[stage].z[0]};
+            work[stage].z[0] = work[stage].z[1];
+            work[stage].z[1] = out*coeffs[stage] - sample;
+            sample = out;
+        }
+        *dst++ = sample;
+    }
+    if(updateState) LIKELY mState = work;
+}
+
+
+/* Encoding UHJ from B-Format is done as:
+ *
+ * S = 0.9396926*W + 0.1855740*X
+ * D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y
+ *
+ * Left = (S + D)/2.0
+ * Right = (S - D)/2.0
+ * T = j(-0.1432*W + 0.6512*X) - 0.7071068*Y
+ * Q = 0.9772*Z
+ *
+ * where j is a wide-band +90 degree phase shift. 3-channel UHJ excludes Q,
+ * while 2-channel excludes Q and T.
+ *
+ * The phase shift is done using a linear FIR filter derived from an FFT'd
+ * impulse with the desired shift.
+ */
+
+template<size_t N>
+void UhjEncoder<N>::encode(float *LeftOut, float *RightOut,
+    const al::span<const float*const,3> InSamples, const size_t SamplesToDo)
+{
+    const auto &PShift = GetPhaseShifter<N>::Get(); /* Shared FIR phase shifter for length N. */
+
+    ASSUME(SamplesToDo > 0);
+
+    const float *RESTRICT winput{al::assume_aligned<16>(InSamples[0])}; /* W */
+    const float *RESTRICT xinput{al::assume_aligned<16>(InSamples[1])}; /* X */
+    const float *RESTRICT yinput{al::assume_aligned<16>(InSamples[2])}; /* Y */
+
+    std::copy_n(winput, SamplesToDo, mW.begin()+sFilterDelay); /* New input is appended after */
+    std::copy_n(xinput, SamplesToDo, mX.begin()+sFilterDelay); /* the sFilterDelay samples kept */
+    std::copy_n(yinput, SamplesToDo, mY.begin()+sFilterDelay); /* from the previous call. */
+
+    /* S = 0.9396926*W + 0.1855740*X, read from the delayed W and X buffers. */
+    for(size_t i{0};i < SamplesToDo;++i)
+        mS[i] = 0.9396926f*mW[i] + 0.1855740f*mX[i];
+
+    /* Precompute j(-0.3420201*W + 0.5098604*X) and store in mD. */
+    std::transform(winput, winput+SamplesToDo, xinput, mWX.begin() + sWXInOffset,
+        [](const float w, const float x) noexcept -> float
+        { return -0.3420201f*w + 0.5098604f*x; });
+    PShift.process({mD.data(), SamplesToDo}, mWX.data()); /* mWX = kept history + new mix. */
+
+    /* D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y */
+    for(size_t i{0};i < SamplesToDo;++i)
+        mD[i] = mD[i] + 0.6554516f*mY[i]; /* mY[i] is the delayed Y. */
+
+    /* Move the not-yet-consumed tail of each buffer to the front for next time. */
+    std::copy(mW.cbegin()+SamplesToDo, mW.cbegin()+SamplesToDo+sFilterDelay, mW.begin());
+    std::copy(mX.cbegin()+SamplesToDo, mX.cbegin()+SamplesToDo+sFilterDelay, mX.begin());
+    std::copy(mY.cbegin()+SamplesToDo, mY.cbegin()+SamplesToDo+sFilterDelay, mY.begin());
+    std::copy(mWX.cbegin()+SamplesToDo, mWX.cbegin()+SamplesToDo+sWXInOffset, mWX.begin());
+
+    /* Apply a delay to the existing output to align with the input delay. */
+    auto *delayBuffer = mDirectDelay.data(); /* One delay line per output channel. */
+    for(float *buffer : {LeftOut, RightOut})
+    {
+        float *distbuf{al::assume_aligned<16>(delayBuffer->data())};
+        ++delayBuffer;
+
+        float *inout{al::assume_aligned<16>(buffer)};
+        auto inout_end = inout + SamplesToDo;
+        if(SamplesToDo >= sFilterDelay) LIKELY
+        { /* Bring the newest sFilterDelay samples to the front, then swap with the stored ones. */
+            auto delay_end = std::rotate(inout, inout_end - sFilterDelay, inout_end);
+            std::swap_ranges(inout, delay_end, distbuf);
+        }
+        else
+        { /* Fewer samples than the delay: swap them all in, then rotate them to the line's end. */
+            auto delay_start = std::swap_ranges(inout, inout_end, distbuf);
+            std::rotate(distbuf, delay_start, distbuf + sFilterDelay);
+        }
+    }
+
+    /* Combine the direct signal with the produced output. */
+
+    /* Left = (S + D)/2.0 */
+    float *RESTRICT left{al::assume_aligned<16>(LeftOut)};
+    for(size_t i{0};i < SamplesToDo;i++)
+        left[i] += (mS[i] + mD[i]) * 0.5f; /* Added to the delayed existing output. */
+    /* Right = (S - D)/2.0 */
+    float *RESTRICT right{al::assume_aligned<16>(RightOut)};
+    for(size_t i{0};i < SamplesToDo;i++)
+        right[i] += (mS[i] - mD[i]) * 0.5f; /* Added to the delayed existing output. */
+}
+
+/* This encoding implementation uses two sets of four chained IIR filters to
+ * produce the desired relative phase shift. The first filter chain produces a
+ * phase shift of varying degrees over a wide range of frequencies, while the
+ * second filter chain produces a phase shift 90 degrees ahead of the first
+ * over the same range. Further details are described here:
+ *
+ * https://web.archive.org/web/20060708031958/http://www.biochem.oulu.fi/~oniemita/dsp/hilbert/
+ *
+ * 2-channel UHJ output requires the use of three filter chains. The S channel
+ * output uses a Filter1 chain on the W and X channel mix, while the D channel
+ * output uses a Filter1 chain on the Y channel plus a Filter2 chain on the W
+ * and X channel mix. This results in the W and X input mix on the D channel
+ * output having the required +90 degree phase shift relative to the other
+ * inputs.
+ */
+void UhjEncoderIIR::encode(float *LeftOut, float *RightOut,
+    const al::span<const float *const, 3> InSamples, const size_t SamplesToDo)
+{
+    ASSUME(SamplesToDo > 0);
+
+    const float *RESTRICT winput{al::assume_aligned<16>(InSamples[0])}; /* W */
+    const float *RESTRICT xinput{al::assume_aligned<16>(InSamples[1])}; /* X */
+    const float *RESTRICT yinput{al::assume_aligned<16>(InSamples[2])}; /* Y */
+
+    /* S = 0.9396926*W + 0.1855740*X, run through filter1 and written one sample late. */
+    std::transform(winput, winput+SamplesToDo, xinput, mTemp.begin(),
+        [](const float w, const float x) noexcept { return 0.9396926f*w + 0.1855740f*x; });
+    mFilter1WX.process(Filter1Coeff, {mTemp.data(), SamplesToDo}, true, mS.data()+1);
+    mS[0] = mDelayWX; mDelayWX = mS[SamplesToDo]; /* Carry the last sample to the next call. */
+
+    /* Precompute j(-0.3420201*W + 0.5098604*X) and store in mWX. */
+    std::transform(winput, winput+SamplesToDo, xinput, mTemp.begin(),
+        [](const float w, const float x) noexcept { return -0.3420201f*w + 0.5098604f*x; });
+    mFilter2WX.process(Filter2Coeff, {mTemp.data(), SamplesToDo}, true, mWX.data());
+
+    /* Apply filter1 to Y and store in mD. The filter state is always committed, as above. */
+    mFilter1Y.process(Filter1Coeff, {yinput, SamplesToDo}, true, mD.data()+1);
+    mD[0] = mDelayY; mDelayY = mD[SamplesToDo]; /* Same 1-sample delay carry as for S. */
+
+    /* D = j(-0.3420201*W + 0.5098604*X) + 0.6554516*Y */
+    for(size_t i{0};i < SamplesToDo;++i)
+        mD[i] = mWX[i] + 0.6554516f*mD[i];
+
+    /* Apply the base filter to the existing output to align with the processed
+     * signal. mTemp is reused as scratch, with the same 1-sample delay carry.
+     */
+    mFilter1Direct[0].process(Filter1Coeff, {LeftOut, SamplesToDo}, true, mTemp.data()+1);
+    mTemp[0] = mDirectDelay[0]; mDirectDelay[0] = mTemp[SamplesToDo];
+
+    /* Left = (S + D)/2.0, plus the filtered existing left output. */
+    float *RESTRICT left{al::assume_aligned<16>(LeftOut)};
+    for(size_t i{0};i < SamplesToDo;i++)
+        left[i] = (mS[i] + mD[i])*0.5f + mTemp[i];
+
+    mFilter1Direct[1].process(Filter1Coeff, {RightOut, SamplesToDo}, true, mTemp.data()+1);
+    mTemp[0] = mDirectDelay[1]; mDirectDelay[1] = mTemp[SamplesToDo];
+
+    /* Right = (S - D)/2.0, plus the filtered existing right output. */
+    float *RESTRICT right{al::assume_aligned<16>(RightOut)};
+    for(size_t i{0};i < SamplesToDo;i++)
+        right[i] = (mS[i] - mD[i])*0.5f + mTemp[i];
+}
+
+
+/* Decoding UHJ is done as:
+ *
+ * S = Left + Right
+ * D = Left - Right
+ *
+ * W = 0.981532*S + 0.197484*j(0.828331*D + 0.767820*T)
+ * X = 0.418496*S - j(0.828331*D + 0.767820*T)
+ * Y = 0.795968*D - 0.676392*T + j(0.186633*S)
+ * Z = 1.023332*Q
+ *
+ * where j is a +90 degree phase shift. 3-channel UHJ excludes Q, while 2-
+ * channel excludes Q and T.
+ */
+template<size_t N>
+void UhjDecoder<N>::decode(const al::span<float*> samples, const size_t samplesToDo,
+    const bool updateState)
+{
+    static_assert(sInputPadding <= sMaxPadding, "Filter padding is too large");
+
+    const auto &PShift = GetPhaseShifter<N>::Get(); /* Shared FIR phase shifter for length N. */
+
+    ASSUME(samplesToDo > 0);
+
+    { /* Read Left/Right/T, including the sInputPadding extra samples each buffer must hold. */
+        const float *RESTRICT left{al::assume_aligned<16>(samples[0])};
+        const float *RESTRICT right{al::assume_aligned<16>(samples[1])};
+        const float *RESTRICT t{al::assume_aligned<16>(samples[2])};
+
+        /* S = Left + Right */
+        for(size_t i{0};i < samplesToDo+sInputPadding;++i)
+            mS[i] = left[i] + right[i];
+
+        /* D = Left - Right */
+        for(size_t i{0};i < samplesToDo+sInputPadding;++i)
+            mD[i] = left[i] - right[i];
+
+        /* T */
+        for(size_t i{0};i < samplesToDo+sInputPadding;++i)
+            mT[i] = t[i];
+    }
+
+    float *RESTRICT woutput{al::assume_aligned<16>(samples[0])}; /* W/X/Y are written in */
+    float *RESTRICT xoutput{al::assume_aligned<16>(samples[1])}; /* place over the input */
+    float *RESTRICT youtput{al::assume_aligned<16>(samples[2])}; /* Left/Right/T buffers. */
+
+    /* Precompute j(0.828331*D + 0.767820*T) and store in xoutput. */
+    auto tmpiter = std::copy(mDTHistory.cbegin(), mDTHistory.cend(), mTemp.begin());
+    std::transform(mD.cbegin(), mD.cbegin()+samplesToDo+sInputPadding, mT.cbegin(), tmpiter,
+        [](const float d, const float t) noexcept { return 0.828331f*d + 0.767820f*t; });
+    if(updateState) LIKELY /* Keep the samples starting at samplesToDo as the next history. */
+        std::copy_n(mTemp.cbegin()+samplesToDo, mDTHistory.size(), mDTHistory.begin());
+    PShift.process({xoutput, samplesToDo}, mTemp.data());
+
+    /* W = 0.981532*S + 0.197484*j(0.828331*D + 0.767820*T) */
+    for(size_t i{0};i < samplesToDo;++i)
+        woutput[i] = 0.981532f*mS[i] + 0.197484f*xoutput[i];
+    /* X = 0.418496*S - j(0.828331*D + 0.767820*T) */
+    for(size_t i{0};i < samplesToDo;++i)
+        xoutput[i] = 0.418496f*mS[i] - xoutput[i];
+
+    /* Precompute j*S and store in youtput. */
+    tmpiter = std::copy(mSHistory.cbegin(), mSHistory.cend(), mTemp.begin());
+    std::copy_n(mS.cbegin(), samplesToDo+sInputPadding, tmpiter);
+    if(updateState) LIKELY /* Same history refresh as for the D/T mix above. */
+        std::copy_n(mTemp.cbegin()+samplesToDo, mSHistory.size(), mSHistory.begin());
+    PShift.process({youtput, samplesToDo}, mTemp.data());
+
+    /* Y = 0.795968*D - 0.676392*T + j(0.186633*S) */
+    for(size_t i{0};i < samplesToDo;++i)
+        youtput[i] = 0.795968f*mD[i] - 0.676392f*mT[i] + 0.186633f*youtput[i];
+
+    if(samples.size() > 3) /* A 4th channel, when present, is Q. */
+    {
+        float *RESTRICT zoutput{al::assume_aligned<16>(samples[3])};
+        /* Z = 1.023332*Q, scaled in place. */
+        for(size_t i{0};i < samplesToDo;++i)
+            zoutput[i] = 1.023332f*zoutput[i];
+    }
+}
+
+void UhjDecoderIIR::decode(const al::span<float*> samples, const size_t samplesToDo,
+    const bool updateState)
+{
+    static_assert(sInputPadding <= sMaxPadding, "Filter padding is too large");
+
+    ASSUME(samplesToDo > 0);
+
+    { /* Only samplesToDo input samples are read; this variant uses no input padding. */
+        const float *RESTRICT left{al::assume_aligned<16>(samples[0])};
+        const float *RESTRICT right{al::assume_aligned<16>(samples[1])};
+
+        /* S = Left + Right */
+        for(size_t i{0};i < samplesToDo;++i)
+            mS[i] = left[i] + right[i];
+
+        /* D = Left - Right */
+        for(size_t i{0};i < samplesToDo;++i)
+            mD[i] = left[i] - right[i];
+    }
+
+    float *RESTRICT woutput{al::assume_aligned<16>(samples[0])}; /* Outputs overwrite inputs. */
+    float *RESTRICT xoutput{al::assume_aligned<16>(samples[1])};
+    float *RESTRICT youtput{al::assume_aligned<16>(samples[2])}; /* Still holds T on entry. */
+
+    /* Precompute j(0.828331*D + 0.767820*T) and store in xoutput. */
+    std::transform(mD.cbegin(), mD.cbegin()+samplesToDo, youtput, mTemp.begin(),
+        [](const float d, const float t) noexcept { return 0.828331f*d + 0.767820f*t; });
+    mFilter2DT.process(Filter2Coeff, {mTemp.data(), samplesToDo}, updateState, xoutput);
+
+    /* Apply filter1 to S and store in mTemp, one sample late (mDelayS fills index 0). */
+    mTemp[0] = mDelayS;
+    mFilter1S.process(Filter1Coeff, {mS.data(), samplesToDo}, updateState, mTemp.data()+1);
+    if(updateState) LIKELY mDelayS = mTemp[samplesToDo];
+
+    /* W = 0.981532*S + 0.197484*j(0.828331*D + 0.767820*T) */
+    for(size_t i{0};i < samplesToDo;++i)
+        woutput[i] = 0.981532f*mTemp[i] + 0.197484f*xoutput[i];
+    /* X = 0.418496*S - j(0.828331*D + 0.767820*T) */
+    for(size_t i{0};i < samplesToDo;++i)
+        xoutput[i] = 0.418496f*mTemp[i] - xoutput[i];
+
+
+    /* Apply filter1 to (0.795968*D - 0.676392*T) and store in mTemp. */
+    std::transform(mD.cbegin(), mD.cbegin()+samplesToDo, youtput, youtput,
+        [](const float d, const float t) noexcept { return 0.795968f*d - 0.676392f*t; });
+    mTemp[0] = mDelayDT;
+    mFilter1DT.process(Filter1Coeff, {youtput, samplesToDo}, updateState, mTemp.data()+1);
+    if(updateState) LIKELY mDelayDT = mTemp[samplesToDo];
+
+    /* Precompute j*S and store in youtput (its previous contents are now in mTemp). */
+    mFilter2S.process(Filter2Coeff, {mS.data(), samplesToDo}, updateState, youtput);
+
+    /* Y = 0.795968*D - 0.676392*T + j(0.186633*S) */
+    for(size_t i{0};i < samplesToDo;++i)
+        youtput[i] = mTemp[i] + 0.186633f*youtput[i];
+
+
+    if(samples.size() > 3) /* A 4th channel, when present, is Q. */
+    {
+        float *RESTRICT zoutput{al::assume_aligned<16>(samples[3])};
+
+        /* Apply filter1 to Q and store in mTemp, with the same 1-sample delay. */
+        mTemp[0] = mDelayQ;
+        mFilter1Q.process(Filter1Coeff, {zoutput, samplesToDo}, updateState, mTemp.data()+1);
+        if(updateState) LIKELY mDelayQ = mTemp[samplesToDo];
+
+        /* Z = 1.023332*Q */
+        for(size_t i{0};i < samplesToDo;++i)
+            zoutput[i] = 1.023332f*mTemp[i];
+    }
+}
+
+
+/* Super Stereo processing is done as:
+ *
+ * S = Left + Right
+ * D = Left - Right
+ *
+ * W = 0.6098637*S - 0.6896511*j*w*D
+ * X = 0.8624776*S + 0.7626955*j*w*D
+ * Y = 1.6822415*w*D - 0.2156194*j*S
+ *
+ * where j is a +90 degree phase shift. w is a variable control for the
+ * resulting stereo width, with the range 0 <= w <= 0.7.
+ */
+template<size_t N>
+void UhjStereoDecoder<N>::decode(const al::span<float*> samples, const size_t samplesToDo,
+    const bool updateState)
+{
+    static_assert(sInputPadding <= sMaxPadding, "Filter padding is too large");
+
+    const auto &PShift = GetPhaseShifter<N>::Get(); /* Shared FIR phase shifter for length N. */
+
+    ASSUME(samplesToDo > 0);
+
+    { /* Read Left/Right, including the sInputPadding extra samples each buffer must hold. */
+        const float *RESTRICT left{al::assume_aligned<16>(samples[0])};
+        const float *RESTRICT right{al::assume_aligned<16>(samples[1])};
+
+        for(size_t i{0};i < samplesToDo+sInputPadding;++i)
+            mS[i] = left[i] + right[i];
+
+        /* Pre-apply the width factor to the difference signal D. Smoothly
+         * interpolate when it changes. A negative mCurrentWidth means unset.
+         */
+        const float wtarget{mWidthControl};
+        const float wcurrent{(mCurrentWidth < 0.0f) ? wtarget : mCurrentWidth};
+        if(wtarget == wcurrent || !updateState)
+        { /* Nothing to ramp, or a call that must not advance the width. */
+            for(size_t i{0};i < samplesToDo+sInputPadding;++i)
+                mD[i] = (left[i] - right[i]) * wcurrent;
+            mCurrentWidth = wcurrent;
+        }
+        else
+        { /* Linear ramp over samplesToDo samples; the padding uses the target width. */
+            const float wstep{(wtarget - wcurrent) / static_cast<float>(samplesToDo)};
+            float fi{0.0f};
+            for(size_t i{0};i < samplesToDo;++i)
+            {
+                mD[i] = (left[i] - right[i]) * (wcurrent + wstep*fi);
+                fi += 1.0f;
+            }
+            for(size_t i{samplesToDo};i < samplesToDo+sInputPadding;++i)
+                mD[i] = (left[i] - right[i]) * wtarget;
+            mCurrentWidth = wtarget;
+        }
+    }
+
+    float *RESTRICT woutput{al::assume_aligned<16>(samples[0])}; /* W and X overwrite the */
+    float *RESTRICT xoutput{al::assume_aligned<16>(samples[1])}; /* Left and Right inputs. */
+    float *RESTRICT youtput{al::assume_aligned<16>(samples[2])}; /* Written, never read. */
+
+    /* Precompute j*D and store in xoutput. */
+    auto tmpiter = std::copy(mDTHistory.cbegin(), mDTHistory.cend(), mTemp.begin());
+    std::copy_n(mD.cbegin(), samplesToDo+sInputPadding, tmpiter);
+    if(updateState) LIKELY /* Keep the samples starting at samplesToDo as the next history. */
+        std::copy_n(mTemp.cbegin()+samplesToDo, mDTHistory.size(), mDTHistory.begin());
+    PShift.process({xoutput, samplesToDo}, mTemp.data());
+
+    /* W = 0.6098637*S - 0.6896511*j*w*D */
+    for(size_t i{0};i < samplesToDo;++i)
+        woutput[i] = 0.6098637f*mS[i] - 0.6896511f*xoutput[i];
+    /* X = 0.8624776*S + 0.7626955*j*w*D */
+    for(size_t i{0};i < samplesToDo;++i)
+        xoutput[i] = 0.8624776f*mS[i] + 0.7626955f*xoutput[i];
+
+    /* Precompute j*S and store in youtput. */
+    tmpiter = std::copy(mSHistory.cbegin(), mSHistory.cend(), mTemp.begin());
+    std::copy_n(mS.cbegin(), samplesToDo+sInputPadding, tmpiter);
+    if(updateState) LIKELY /* Same history refresh as for D above. */
+        std::copy_n(mTemp.cbegin()+samplesToDo, mSHistory.size(), mSHistory.begin());
+    PShift.process({youtput, samplesToDo}, mTemp.data());
+
+    /* Y = 1.6822415*w*D - 0.2156194*j*S (mD already includes the width factor w). */
+    for(size_t i{0};i < samplesToDo;++i)
+        youtput[i] = 1.6822415f*mD[i] - 0.2156194f*youtput[i];
+}
+
+void UhjStereoDecoderIIR::decode(const al::span<float*> samples, const size_t samplesToDo,
+    const bool updateState)
+{
+    static_assert(sInputPadding <= sMaxPadding, "Filter padding is too large");
+
+    ASSUME(samplesToDo > 0);
+
+    { /* Only samplesToDo input samples are read; this variant uses no input padding. */
+        const float *RESTRICT left{al::assume_aligned<16>(samples[0])};
+        const float *RESTRICT right{al::assume_aligned<16>(samples[1])};
+
+        for(size_t i{0};i < samplesToDo;++i)
+            mS[i] = left[i] + right[i];
+
+        /* Pre-apply the width factor to the difference signal D. Smoothly
+         * interpolate when it changes. A negative mCurrentWidth means unset.
+         */
+        const float wtarget{mWidthControl};
+        const float wcurrent{(mCurrentWidth < 0.0f) ? wtarget : mCurrentWidth};
+        if(wtarget == wcurrent || !updateState)
+        { /* Nothing to ramp, or a call that must not advance the width. */
+            for(size_t i{0};i < samplesToDo;++i)
+                mD[i] = (left[i] - right[i]) * wcurrent;
+            mCurrentWidth = wcurrent;
+        }
+        else
+        { /* Linear ramp from the current width toward the target over samplesToDo samples. */
+            const float wstep{(wtarget - wcurrent) / static_cast<float>(samplesToDo)};
+            float fi{0.0f};
+            for(size_t i{0};i < samplesToDo;++i)
+            {
+                mD[i] = (left[i] - right[i]) * (wcurrent + wstep*fi);
+                fi += 1.0f;
+            }
+            mCurrentWidth = wtarget;
+        }
+    }
+
+    float *RESTRICT woutput{al::assume_aligned<16>(samples[0])}; /* W and X overwrite the */
+    float *RESTRICT xoutput{al::assume_aligned<16>(samples[1])}; /* Left and Right inputs. */
+    float *RESTRICT youtput{al::assume_aligned<16>(samples[2])}; /* Written, never read. */
+
+    /* Apply filter1 to S and store in mTemp, one sample late (mDelayS fills index 0). */
+    mTemp[0] = mDelayS;
+    mFilter1S.process(Filter1Coeff, {mS.data(), samplesToDo}, updateState, mTemp.data()+1);
+    if(updateState) LIKELY mDelayS = mTemp[samplesToDo];
+
+    /* Precompute j*D and store in xoutput. */
+    mFilter2D.process(Filter2Coeff, {mD.data(), samplesToDo}, updateState, xoutput);
+
+    /* W = 0.6098637*S - 0.6896511*j*w*D */
+    for(size_t i{0};i < samplesToDo;++i)
+        woutput[i] = 0.6098637f*mTemp[i] - 0.6896511f*xoutput[i];
+    /* X = 0.8624776*S + 0.7626955*j*w*D */
+    for(size_t i{0};i < samplesToDo;++i)
+        xoutput[i] = 0.8624776f*mTemp[i] + 0.7626955f*xoutput[i];
+
+    /* Precompute j*S and store in youtput. */
+    mFilter2S.process(Filter2Coeff, {mS.data(), samplesToDo}, updateState, youtput);
+
+    /* Apply filter1 to D and store in mTemp, with the same 1-sample delay as S. */
+    mTemp[0] = mDelayD;
+    mFilter1D.process(Filter1Coeff, {mD.data(), samplesToDo}, updateState, mTemp.data()+1);
+    if(updateState) LIKELY mDelayD = mTemp[samplesToDo];
+
+    /* Y = 1.6822415*w*D - 0.2156194*j*S (mD already includes the width factor w). */
+    for(size_t i{0};i < samplesToDo;++i)
+        youtput[i] = 1.6822415f*mTemp[i] - 0.2156194f*youtput[i];
+}
+
+
+template struct UhjEncoder<UhjLength256>; /* Explicit instantiations for the two filter */
+template struct UhjDecoder<UhjLength256>; /* lengths that have a GetPhaseShifter */
+template struct UhjStereoDecoder<UhjLength256>; /* specialization in this file. */
+
+template struct UhjEncoder<UhjLength512>;
+template struct UhjDecoder<UhjLength512>;
+template struct UhjStereoDecoder<UhjLength512>;
diff --git a/core/uhjfilter.h b/core/uhjfilter.h
new file mode 100644
index 00000000..df308094
--- /dev/null
+++ b/core/uhjfilter.h
@@ -0,0 +1,234 @@
+#ifndef CORE_UHJFILTER_H
+#define CORE_UHJFILTER_H
+
+#include <array>
+
+#include "almalloc.h"
+#include "alspan.h"
+#include "bufferline.h"
+
+
+static constexpr size_t UhjLength256{256}; /* Filter length used by the "Lq" phase shifter. */
+static constexpr size_t UhjLength512{512}; /* Filter length used by the "Hq" phase shifter. */
+
+enum class UhjQualityType : uint8_t {
+    IIR = 0,
+    FIR256, /* Presumably selects the UhjLength256 FIR variants; the mapping is not in view. */
+    FIR512, /* Presumably selects the UhjLength512 FIR variants; the mapping is not in view. */
+    Default = IIR /* IIR is the default quality. */
+};
+
+extern UhjQualityType UhjDecodeQuality; /* Defined in uhjfilter.cpp, initially Default. */
+extern UhjQualityType UhjEncodeQuality; /* Defined in uhjfilter.cpp, initially Default. */
+
+
+struct UhjAllPassFilter {
+    struct AllPassState {
+        /* Last two delayed components for direct form II. */
+        float z[2];
+    };
+    std::array<AllPassState,4> mState; /* One state per chained section; no initializer here. */
+
+    void process(const al::span<const float,4> coeffs, const al::span<const float> src,
+        const bool update, float *RESTRICT dst); /* update: commit the new state to mState. */
+};
+
+
+struct UhjEncoderBase {
+    virtual ~UhjEncoderBase() = default;
+
+    virtual size_t getDelay() noexcept = 0; /* The implementations return their sFilterDelay. */
+
+    /**
+     * Encodes a 2-channel UHJ (stereo-compatible) signal from a B-Format input
+     * signal. The input must use FuMa channel ordering and UHJ scaling (FuMa
+     * with an additional +3dB boost). InSamples holds W, X and Y, in order.
+     */
+    virtual void encode(float *LeftOut, float *RightOut,
+        const al::span<const float*const,3> InSamples, const size_t SamplesToDo) = 0;
+};
+
+template<size_t N>
+struct UhjEncoder final : public UhjEncoderBase {
+    static constexpr size_t sFilterDelay{N/2}; /* Half the filter length N. */
+
+    /* Delays and processing storage for the input signal. */
+    alignas(16) std::array<float,BufferLineSize+sFilterDelay> mW{};
+    alignas(16) std::array<float,BufferLineSize+sFilterDelay> mX{};
+    alignas(16) std::array<float,BufferLineSize+sFilterDelay> mY{};
+
+    alignas(16) std::array<float,BufferLineSize> mS{}; /* Sum signal for the current call. */
+    alignas(16) std::array<float,BufferLineSize> mD{}; /* Difference signal for the current call. */
+
+    /* History and temp storage for the FIR filter. New samples should be
+     * written to index sFilterDelay*2 - 1.
+     */
+    static constexpr size_t sWXInOffset{sFilterDelay*2 - 1};
+    alignas(16) std::array<float,BufferLineSize + sFilterDelay*2> mWX{};
+
+    alignas(16) std::array<std::array<float,sFilterDelay>,2> mDirectDelay{}; /* L/R delay lines. */
+
+    size_t getDelay() noexcept override { return sFilterDelay; }
+
+    /**
+     * Encodes a 2-channel UHJ (stereo-compatible) signal from a B-Format input
+     * signal. The input must use FuMa channel ordering and UHJ scaling (FuMa
+     * with an additional +3dB boost). The result is added to the delayed output.
+     */
+    void encode(float *LeftOut, float *RightOut, const al::span<const float*const,3> InSamples,
+        const size_t SamplesToDo) override;
+
+    DEF_NEWDEL(UhjEncoder)
+};
+
+struct UhjEncoderIIR final : public UhjEncoderBase {
+    static constexpr size_t sFilterDelay{1}; /* The filter1 outputs are used one sample late. */
+
+    /* Processing storage for the input signal. */
+    alignas(16) std::array<float,BufferLineSize+1> mS{};
+    alignas(16) std::array<float,BufferLineSize+1> mD{};
+    alignas(16) std::array<float,BufferLineSize+sFilterDelay> mWX{};
+    alignas(16) std::array<float,BufferLineSize+sFilterDelay> mTemp{};
+    float mDelayWX{}, mDelayY{}; /* Last filter1 output samples, carried to the next call. */
+
+    UhjAllPassFilter mFilter1WX; /* Filter1 chain for the S mix of W and X. */
+    UhjAllPassFilter mFilter2WX; /* Filter2 chain for the W and X mix used in D. */
+    UhjAllPassFilter mFilter1Y; /* Filter1 chain for Y. */
+
+    std::array<UhjAllPassFilter,2> mFilter1Direct; /* Filter1 chains for existing L/R output. */
+    std::array<float,2> mDirectDelay{}; /* Carried last samples of the filtered L/R output. */
+
+    size_t getDelay() noexcept override { return sFilterDelay; }
+
+    /**
+     * Encodes a 2-channel UHJ (stereo-compatible) signal from a B-Format input
+     * signal. The input must use FuMa channel ordering and UHJ scaling (FuMa
+     * with an additional +3dB boost). Existing output is filtered and summed in.
+     */
+    void encode(float *LeftOut, float *RightOut, const al::span<const float*const,3> InSamples,
+        const size_t SamplesToDo) override;
+
+    DEF_NEWDEL(UhjEncoderIIR)
+};
+
+
+struct DecoderBase {
+    static constexpr size_t sMaxPadding{256}; /* Upper bound each decoder's sInputPadding must meet. */
+
+    /* For 2-channel UHJ, shelf filters should use these LF responses. */
+    static constexpr float sWLFScale{0.661f};
+    static constexpr float sXYLFScale{1.293f};
+
+    virtual ~DecoderBase() = default;
+
+    virtual void decode(const al::span<float*> samples, const size_t samplesToDo,
+        const bool updateState) = 0; /* updateState: whether to keep filter state and history. */
+
+    /**
+     * The width factor for Super Stereo processing. Can be changed in between
+     * calls to decode, with valid values being between 0...0.7.
+     */
+    float mWidthControl{0.593f};
+};
+
+template<size_t N>
+struct UhjDecoder final : public DecoderBase {
+    /* The number of extra sample frames needed for input. */
+    static constexpr size_t sInputPadding{N/2};
+
+    alignas(16) std::array<float,BufferLineSize+sInputPadding> mS{}; /* Left + Right */
+    alignas(16) std::array<float,BufferLineSize+sInputPadding> mD{}; /* Left - Right */
+    alignas(16) std::array<float,BufferLineSize+sInputPadding> mT{}; /* Copy of the T input. */
+
+    alignas(16) std::array<float,sInputPadding-1> mDTHistory{}; /* Past D/T mix for the FIR. */
+    alignas(16) std::array<float,sInputPadding-1> mSHistory{}; /* Past S samples for the FIR. */
+
+    alignas(16) std::array<float,BufferLineSize + sInputPadding*2> mTemp{}; /* History + input. */
+
+    /**
+     * Decodes a 3- or 4-channel UHJ signal into a B-Format signal with FuMa
+     * channel ordering and UHJ scaling. For 3-channel, the 3rd channel may be
+     * attenuated by 'n', where 0 <= n <= 1. So to decode 2-channel UHJ, supply
+     * 3 channels with the 3rd channel silent (n=0). The B-Format signal
+     * reconstructed from 2-channel UHJ should not be run through a normal
+     * B-Format decoder, as it needs different shelf filters.
+     */
+    void decode(const al::span<float*> samples, const size_t samplesToDo,
+        const bool updateState) override;
+
+    DEF_NEWDEL(UhjDecoder)
+};
+
+struct UhjDecoderIIR final : public DecoderBase {
+    /* FIXME: These IIR decoder filters actually have a 1-sample delay on the
+     * non-filtered components, which is not reflected in the source latency
+     * value. sInputPadding is 0, however, because it doesn't need any extra
+     * input samples.
+     */
+    static constexpr size_t sInputPadding{0};
+
+    alignas(16) std::array<float,BufferLineSize> mS{}; /* Left + Right */
+    alignas(16) std::array<float,BufferLineSize> mD{}; /* Left - Right */
+    alignas(16) std::array<float,BufferLineSize+1> mTemp{}; /* +1 for the delayed sample. */
+    float mDelayS{}, mDelayDT{}, mDelayQ{}; /* Last filter1 outputs, carried between calls. */
+
+    UhjAllPassFilter mFilter1S;
+    UhjAllPassFilter mFilter2DT;
+    UhjAllPassFilter mFilter1DT;
+    UhjAllPassFilter mFilter2S;
+    UhjAllPassFilter mFilter1Q; /* Only used when a 4th channel is supplied. */
+
+    void decode(const al::span<float*> samples, const size_t samplesToDo,
+        const bool updateState) override;
+
+    DEF_NEWDEL(UhjDecoderIIR)
+};
+
+template<size_t N>
+struct UhjStereoDecoder final : public DecoderBase {
+    static constexpr size_t sInputPadding{N/2}; /* Extra input samples decode() reads. */
+
+    float mCurrentWidth{-1.0f}; /* Width last applied; negative until the first decode(). */
+
+    alignas(16) std::array<float,BufferLineSize+sInputPadding> mS{}; /* Left + Right */
+    alignas(16) std::array<float,BufferLineSize+sInputPadding> mD{}; /* Width-scaled Left - Right */
+
+    alignas(16) std::array<float,sInputPadding-1> mDTHistory{}; /* Past D samples for the FIR. */
+    alignas(16) std::array<float,sInputPadding-1> mSHistory{}; /* Past S samples for the FIR. */
+
+    alignas(16) std::array<float,BufferLineSize + sInputPadding*2> mTemp{}; /* History + input. */
+
+    /**
+     * Applies Super Stereo processing on a stereo signal to create a B-Format
+     * signal with FuMa channel ordering and UHJ scaling. The samples span
+     * should contain 3 channels, the first two being the left and right stereo
+     * channels, and the third left empty.
+     */
+    void decode(const al::span<float*> samples, const size_t samplesToDo,
+        const bool updateState) override;
+
+    DEF_NEWDEL(UhjStereoDecoder)
+};
+
+struct UhjStereoDecoderIIR final : public DecoderBase {
+    static constexpr size_t sInputPadding{0}; /* decode() reads no extra input samples. */
+
+    float mCurrentWidth{-1.0f}; /* Width last applied; negative until the first decode(). */
+
+    alignas(16) std::array<float,BufferLineSize> mS{}; /* Left + Right */
+    alignas(16) std::array<float,BufferLineSize> mD{}; /* Width-scaled Left - Right */
+    alignas(16) std::array<float,BufferLineSize+1> mTemp{}; /* +1 for the delayed sample. */
+    float mDelayS{}, mDelayD{}; /* Last filter1 outputs, carried between calls. */
+
+    UhjAllPassFilter mFilter1S;
+    UhjAllPassFilter mFilter2D;
+    UhjAllPassFilter mFilter1D;
+    UhjAllPassFilter mFilter2S;
+
+    void decode(const al::span<float*> samples, const size_t samplesToDo,
+        const bool updateState) override;
+
+    DEF_NEWDEL(UhjStereoDecoderIIR)
+};
+
+#endif /* CORE_UHJFILTER_H */
diff --git a/core/uiddefs.cpp b/core/uiddefs.cpp
new file mode 100644
index 00000000..244c01a5
--- /dev/null
+++ b/core/uiddefs.cpp
@@ -0,0 +1,37 @@
+
+#include "config.h"
+
+/* Provides the GUID and property key definitions below, unless AL_NO_UID_DEFS is defined. */
+#ifndef AL_NO_UID_DEFS
+
+#if defined(HAVE_GUIDDEF_H) || defined(HAVE_INITGUID_H)
+#define INITGUID /* Per the Windows SDK, makes DEFINE_GUID emit definitions, not extern declarations. */
+#include <windows.h>
+#ifdef HAVE_GUIDDEF_H
+#include <guiddef.h>
+#else
+#include <initguid.h>
+#endif /* HAVE_GUIDDEF_H */
+
+DEFINE_GUID(KSDATAFORMAT_SUBTYPE_PCM,        0x00000001, 0x0000, 0x0010, 0x80,0x00, 0x00,0xaa,0x00,0x38,0x9b,0x71);
+DEFINE_GUID(KSDATAFORMAT_SUBTYPE_IEEE_FLOAT, 0x00000003, 0x0000, 0x0010, 0x80,0x00, 0x00,0xaa,0x00,0x38,0x9b,0x71);
+
+DEFINE_GUID(IID_IDirectSoundNotify,   0xb0210783, 0x89cd, 0x11d0, 0xaf,0x08, 0x00,0xa0,0xc9,0x25,0xcd,0x16);
+
+DEFINE_GUID(CLSID_MMDeviceEnumerator, 0xbcde0395, 0xe52f, 0x467c, 0x8e,0x3d, 0xc4,0x57,0x92,0x91,0x69,0x2e);
+DEFINE_GUID(IID_IMMDeviceEnumerator,  0xa95664d2, 0x9614, 0x4f35, 0xa7,0x46, 0xde,0x8d,0xb6,0x36,0x17,0xe6);
+DEFINE_GUID(IID_IAudioClient,         0x1cb9ad4c, 0xdbfa, 0x4c32, 0xb1,0x78, 0xc2,0xf5,0x68,0xa7,0x03,0xb2);
+DEFINE_GUID(IID_IAudioRenderClient,   0xf294acfc, 0x3146, 0x4483, 0xa7,0xbf, 0xad,0xdc,0xa7,0xc2,0x60,0xe2);
+DEFINE_GUID(IID_IAudioCaptureClient,  0xc8adbd64, 0xe71e, 0x48a0, 0xa4,0xde, 0x18,0x5c,0x39,0x5c,0xd3,0x17);
+
+#ifdef HAVE_WASAPI
+#include <wtypes.h>
+#include <devpropdef.h>
+#include <propkeydef.h>
+DEFINE_DEVPROPKEY(DEVPKEY_Device_FriendlyName, 0xa45c254e, 0xdf1c, 0x4efd, 0x80,0x20, 0x67,0xd1,0x46,0xa8,0x50,0xe0, 14);
+DEFINE_PROPERTYKEY(PKEY_AudioEndpoint_FormFactor, 0x1da5d803, 0xd492, 0x4edd, 0x8c,0x23, 0xe0,0xc0,0xff,0xee,0x7f,0x0e, 0);
+DEFINE_PROPERTYKEY(PKEY_AudioEndpoint_GUID, 0x1da5d803, 0xd492, 0x4edd, 0x8c, 0x23,0xe0, 0xc0,0xff,0xee,0x7f,0x0e, 4 );
+#endif /* HAVE_WASAPI */
+#endif /* HAVE_GUIDDEF_H || HAVE_INITGUID_H */
+
+#endif /* AL_NO_UID_DEFS */
diff --git a/core/voice.cpp b/core/voice.cpp
new file mode 100644
index 00000000..e8fbcccd
--- /dev/null
+++ b/core/voice.cpp
@@ -0,0 +1,1304 @@
+
+#include "config.h"
+
+#include "voice.h"
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <climits>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <new>
+#include <stdlib.h>
+#include <utility>
+#include <vector>
+
+#include "albyte.h"
+#include "alnumeric.h"
+#include "aloptional.h"
+#include "alspan.h"
+#include "alstring.h"
+#include "ambidefs.h"
+#include "async_event.h"
+#include "buffer_storage.h"
+#include "context.h"
+#include "cpu_caps.h"
+#include "devformat.h"
+#include "device.h"
+#include "filters/biquad.h"
+#include "filters/nfc.h"
+#include "filters/splitter.h"
+#include "fmt_traits.h"
+#include "logging.h"
+#include "mixer.h"
+#include "mixer/defs.h"
+#include "mixer/hrtfdefs.h"
+#include "opthelpers.h"
+#include "resampler_limits.h"
+#include "ringbuffer.h"
+#include "vector.h"
+#include "voice_change.h"
+
+struct CTag; /* Tag types naming a mixer implementation; used as template arguments below. */
+#ifdef HAVE_SSE
+struct SSETag;
+#endif
+#ifdef HAVE_NEON
+struct NEONTag;
+#endif
+
+/* Compile-time checks on the buffer and resampler constants this file relies on. */
+static_assert(!(sizeof(DeviceBase::MixerBufferLine)&15),
+    "DeviceBase::MixerBufferLine must be a multiple of 16 bytes");
+static_assert(!(MaxResamplerEdge&3), "MaxResamplerEdge is not a multiple of 4");
+
+static_assert((BufferLineSize-1)/MaxPitch > 0, "MaxPitch is too large for BufferLineSize!");
+static_assert((INT_MAX>>MixerFracBits)/MaxPitch > BufferLineSize,
+    "MaxPitch and/or BufferLineSize are too large for MixerFracBits!");
+
+Resampler ResamplerDefault{Resampler::Cubic}; /* Replaced by Voice::InitMixer when given a known name. */
+
+namespace {
+
+using uint = unsigned int;
+using namespace std::chrono; /* For nanoseconds, seconds and duration_cast used by Voice::mix. */
+/* Signature of an HRTF mixing routine using a single set of filter parameters. */
+using HrtfMixerFunc = void(*)(const float *InSamples, float2 *AccumSamples, const uint IrSize,
+    const MixHrtfFilter *hrtfparams, const size_t BufferSize);
+using HrtfMixerBlendFunc = void(*)(const float *InSamples, float2 *AccumSamples,
+    const uint IrSize, const HrtfFilter *oldparams, const MixHrtfFilter *newparams,
+    const size_t BufferSize); /* As above, but taking both old and new filter parameters. */
+/* Active HRTF mixers; these start as the C versions and are reselected by Voice::InitMixer. */
+HrtfMixerFunc MixHrtfSamples{MixHrtf_<CTag>};
+HrtfMixerBlendFunc MixHrtfBlendSamples{MixHrtfBlend_<CTag>};
+
+inline MixerOutFunc SelectMixer()
+{
+    /* Pick the output mixer: NEON first, then SSE, otherwise plain C. A SIMD
+     * variant needs both build-time support and the runtime CPU flag. */
+#ifdef HAVE_NEON
+    if((CPUCapFlags&CPU_CAP_NEON) != 0) return Mix_<NEONTag>;
+#endif
+#ifdef HAVE_SSE
+    if((CPUCapFlags&CPU_CAP_SSE) != 0) return Mix_<SSETag>;
+#endif
+    return Mix_<CTag>;
+}
+
+inline MixerOneFunc SelectMixerOne()
+{
+    /* Pick the MixerOneFunc flavor of Mix_: NEON first, then SSE, otherwise
+     * plain C, each SIMD variant gated on build support and the CPU flag. */
+#ifdef HAVE_NEON
+    if((CPUCapFlags&CPU_CAP_NEON) != 0) return Mix_<NEONTag>;
+#endif
+#ifdef HAVE_SSE
+    if((CPUCapFlags&CPU_CAP_SSE) != 0) return Mix_<SSETag>;
+#endif
+    return Mix_<CTag>;
+}
+
+inline HrtfMixerFunc SelectHrtfMixer()
+{
+    /* Pick the HRTF mixer: NEON first, then SSE, otherwise plain C. A SIMD
+     * variant needs both build-time support and the runtime CPU flag. */
+#ifdef HAVE_NEON
+    if((CPUCapFlags&CPU_CAP_NEON) != 0) return MixHrtf_<NEONTag>;
+#endif
+#ifdef HAVE_SSE
+    if((CPUCapFlags&CPU_CAP_SSE) != 0) return MixHrtf_<SSETag>;
+#endif
+    return MixHrtf_<CTag>;
+}
+
+inline HrtfMixerBlendFunc SelectHrtfBlendMixer()
+{
+    /* Pick the blending HRTF mixer: NEON first, then SSE, otherwise plain C.
+     * A SIMD variant needs both build-time support and the CPU flag. */
+#ifdef HAVE_NEON
+    if((CPUCapFlags&CPU_CAP_NEON) != 0) return MixHrtfBlend_<NEONTag>;
+#endif
+#ifdef HAVE_SSE
+    if((CPUCapFlags&CPU_CAP_SSE) != 0) return MixHrtfBlend_<SSETag>;
+#endif
+    return MixHrtfBlend_<CTag>;
+}
+
+} // namespace
+
+void Voice::InitMixer(al::optional<std::string> resampler)
+{
+    /* Apply the requested default resampler, if one was given and is known. */
+    if(resampler)
+    {
+        struct NamedResampler {
+            const char label[16];
+            const Resampler type;
+        };
+        constexpr NamedResampler KnownResamplers[]{
+            {"none", Resampler::Point}, {"point", Resampler::Point},
+            {"linear", Resampler::Linear}, {"cubic", Resampler::Cubic},
+            {"bsinc12", Resampler::BSinc12}, {"fast_bsinc12", Resampler::FastBSinc12},
+            {"bsinc24", Resampler::BSinc24}, {"fast_bsinc24", Resampler::FastBSinc24},
+        };
+
+        /* Deprecated names are remapped, with a warning, before the lookup. */
+        const char *wanted{resampler->c_str()};
+        if(al::strcasecmp(wanted, "bsinc") == 0)
+        {
+            WARN("Resampler option \"%s\" is deprecated, using bsinc12\n", wanted);
+            wanted = "bsinc12";
+        }
+        else if(al::strcasecmp(wanted, "sinc4") == 0 || al::strcasecmp(wanted, "sinc8") == 0)
+        {
+            WARN("Resampler option \"%s\" is deprecated, using cubic\n", wanted);
+            wanted = "cubic";
+        }
+
+        const NamedResampler *match{nullptr};
+        for(const NamedResampler &known : KnownResamplers)
+        {
+            if(al::strcasecmp(wanted, known.label) != 0) continue;
+            match = &known;
+            break;
+        }
+        if(!match) ERR("Invalid resampler: %s\n", wanted);
+        else ResamplerDefault = match->type;
+    }
+
+    MixSamplesOut = SelectMixer();
+    MixSamplesOne = SelectMixerOne();
+    MixHrtfBlendSamples = SelectHrtfBlendMixer();
+    MixHrtfSamples = SelectHrtfMixer();
+}
+
+
+namespace {
+
+/* IMA ADPCM step size table (89 entries), indexed by the decoder's clamped step index. */
+constexpr int IMAStep_size[89] = {
+       7,    8,    9,   10,   11,   12,   13,   14,   16,   17,   19,
+      21,   23,   25,   28,   31,   34,   37,   41,   45,   50,   55,
+      60,   66,   73,   80,   88,   97,  107,  118,  130,  143,  157,
+     173,  190,  209,  230,  253,  279,  307,  337,  371,  408,  449,
+     494,  544,  598,  658,  724,  796,  876,  963, 1060, 1166, 1282,
+    1411, 1552, 1707, 1878, 2066, 2272, 2499, 2749, 3024, 3327, 3660,
+    4026, 4428, 4871, 5358, 5894, 6484, 7132, 7845, 8630, 9493,10442,
+   11487,12635,13899,15289,16818,18500,20350,22358,24633,27086,29794,
+   32767
+};
+
+/* IMA4 ADPCM codeword table: per-nibble multiplier, applied as codeword*step/8. */
+constexpr int IMA4Codeword[16] = {
+    1, 3, 5, 7, 9, 11, 13, 15,
+   -1,-3,-5,-7,-9,-11,-13,-15,
+};
+
+/* IMA4 ADPCM step index adjustment, added to the step index after each nibble. */
+constexpr int IMA4Index_adjust[16] = {
+   -1,-1,-1,-1, 2, 4, 6, 8,
+   -1,-1,-1,-1, 2, 4, 6, 8
+};
+
+/* MSADPCM adaptation table: per-nibble scale applied to the delta, in 1/256 units. */
+constexpr int MSADPCMAdaption[16] = {
+    230, 230, 230, 230, 307, 409, 512, 614,
+    768, 614, 512, 409, 307, 230, 230, 230
+};
+
+/* MSADPCM predictor coefficient pairs (1/256 units), selected by the block predictor. */
+constexpr int MSADPCMAdaptionCoeff[7][2] = {
+    { 256,    0 },
+    { 512, -256 },
+    {   0,    0 },
+    { 192,   64 },
+    { 240,    0 },
+    { 460, -208 },
+    { 392, -232 }
+};
+
+
+void SendSourceStoppedEvent(ContextBase *context, uint id)
+{
+    /* Post a source-stopped event; it is dropped if the ring has no room. */
+    RingBuffer *events{context->mAsyncEvents.get()};
+    auto writable = events->getWriteVector();
+    if(writable.first.len < 1) return;
+
+    auto *slot = reinterpret_cast<AsyncEvent*>(writable.first.buf);
+    AsyncEvent *stopped{al::construct_at(slot, AsyncEvent::SourceStateChange)};
+    stopped->u.srcstate.id = id;
+    stopped->u.srcstate.state = AsyncEvent::SrcState::Stop;
+    events->writeAdvance(1);
+}
+
+
+const float *DoFilters(BiquadFilter &lpfilter, BiquadFilter &hpfilter, float *dst,
+    const al::span<const float> src, int type)
+{
+    /* Filtered output goes to dst, which is then returned. A filter the type
+     * leaves unused is cleared; with nothing applied, src itself is returned. */
+    switch(type)
+    {
+    case AF_BandPass:
+        DualBiquad{lpfilter, hpfilter}.process(src, dst);
+        return dst;
+    case AF_HighPass:
+        lpfilter.clear();
+        hpfilter.process(src, dst);
+        return dst;
+    case AF_LowPass:
+        lpfilter.process(src, dst);
+        hpfilter.clear();
+        return dst;
+    case AF_None:
+        lpfilter.clear();
+        hpfilter.clear();
+        break;
+    }
+    return src.data();
+}
+
+
+template<FmtType Type>
+inline void LoadSamples(float *RESTRICT dstSamples, const al::byte *src, const size_t srcChan,
+    const size_t srcOffset, const size_t srcStep, const size_t /*samplesPerBlock*/,
+    const size_t samplesToLoad) noexcept
+{
+    /* Seek to this channel's sample within frame srcOffset, then convert. */
+    using SampleType = typename al::FmtTypeTraits<Type>::Type;
+    const al::byte *first{src + (srcOffset*srcStep + srcChan)*sizeof(SampleType)};
+    al::LoadSampleArray<Type>(dstSamples, first, srcStep, samplesToLoad);
+}
+/* IMA4 ADPCM: decodes samplesToLoad samples of channel srcChan, starting at sample srcOffset. */
+template<>
+inline void LoadSamples<FmtIMA4>(float *RESTRICT dstSamples, const al::byte *src,
+    const size_t srcChan, const size_t srcOffset, const size_t srcStep,
+    const size_t samplesPerBlock, const size_t samplesToLoad) noexcept
+{
+    const size_t blockBytes{((samplesPerBlock-1)/2 + 4)*srcStep};
+    /* Above: 4 header bytes plus one nibble per remaining sample, times srcStep. */
+    /* Skip to the ADPCM block containing the srcOffset sample. */
+    src += srcOffset/samplesPerBlock*blockBytes;
+    /* Calculate how many samples need to be skipped in the block. */
+    size_t skip{srcOffset % samplesPerBlock};
+
+    /* NOTE: This could probably be optimized better. */
+    size_t wrote{0};
+    do {
+        /* Each IMA4 block starts with a signed 16-bit sample, and a signed
+         * 16-bit table index, both read here as little-endian byte pairs.
+         */
+        int sample{src[srcChan*4] | (src[srcChan*4 + 1] << 8)};
+        int index{src[srcChan*4 + 2] | (src[srcChan*4 + 3] << 8)};
+        /* Sign-extend both 16-bit values; the index is clamped to the step table's range. */
+        sample = (sample^0x8000) - 32768;
+        index = clampi((index^0x8000) - 32768, 0, al::size(IMAStep_size)-1);
+        /* The header sample is the block's first sample: output it unless it's being skipped. */
+        if(skip == 0)
+        {
+            dstSamples[wrote++] = static_cast<float>(sample) / 32768.0f;
+            if(wrote == samplesToLoad) return;
+        }
+        else
+            --skip;
+        /* Applies one 4-bit code to the running sample and step index; returns the new sample. */
+        auto decode_sample = [&sample,&index](const uint nibble)
+        {
+            sample += IMA4Codeword[nibble] * IMAStep_size[index] / 8;
+            sample = clampi(sample, -32768, 32767);
+
+            index += IMA4Index_adjust[nibble];
+            index = clampi(index, 0, al::size(IMAStep_size)-1);
+
+            return sample;
+        };
+
+        /* The rest of the block is arranged as a series of nibbles, contained
+         * in 4 *bytes* per channel interleaved. So every 8 nibbles we need to
+         * skip 4 bytes per channel to get the next nibbles for this channel.
+         *
+         * First, decode the samples that we need to skip in the block (will
+         * always be less than the block size). They need to be decoded despite
+         * being ignored for proper state on the remaining samples.
+         */
+        const al::byte *nibbleData{src + (srcStep+srcChan)*4};
+        size_t nibbleOffset{0};
+        const size_t startOffset{skip + 1}; /* Header sample plus the nibbles skipped below. */
+        for(;skip;--skip)
+        {
+            const size_t byteShift{(nibbleOffset&1) * 4}; /* Even nibbles: low 4 bits; odd: high. */
+            const size_t wordOffset{(nibbleOffset>>1) & ~size_t{3}};
+            const size_t byteOffset{wordOffset*srcStep + ((nibbleOffset>>1)&3u)};
+            ++nibbleOffset;
+
+            std::ignore = decode_sample((nibbleData[byteOffset]>>byteShift) & 15u);
+        }
+
+        /* Second, decode the rest of the block and write to the output, until
+         * the end of the block or the end of output.
+         */
+        const size_t todo{minz(samplesPerBlock-startOffset, samplesToLoad-wrote)};
+        for(size_t i{0};i < todo;++i)
+        {
+            const size_t byteShift{(nibbleOffset&1) * 4};
+            const size_t wordOffset{(nibbleOffset>>1) & ~size_t{3}};
+            const size_t byteOffset{wordOffset*srcStep + ((nibbleOffset>>1)&3u)};
+            ++nibbleOffset;
+
+            const int result{decode_sample((nibbleData[byteOffset]>>byteShift) & 15u)};
+            dstSamples[wrote++] = static_cast<float>(result) / 32768.0f;
+        }
+        if(wrote == samplesToLoad)
+            return;
+        /* More output is needed: continue from the start of the next block (skip is 0 here). */
+        src += blockBytes;
+    } while(true);
+}
+/* MS ADPCM: decodes samplesToLoad samples of channel srcChan, starting at sample srcOffset. */
+template<>
+inline void LoadSamples<FmtMSADPCM>(float *RESTRICT dstSamples, const al::byte *src,
+    const size_t srcChan, const size_t srcOffset, const size_t srcStep,
+    const size_t samplesPerBlock, const size_t samplesToLoad) noexcept
+{
+    const size_t blockBytes{((samplesPerBlock-2)/2 + 7)*srcStep};
+    /* Above: 7 header bytes plus one nibble per remaining sample, times srcStep. Seek next. */
+    src += srcOffset/samplesPerBlock*blockBytes;
+    size_t skip{srcOffset % samplesPerBlock}; /* Samples to decode but not output in this block. */
+
+    size_t wrote{0};
+    do {
+        /* Each MS ADPCM block starts with an 8-bit block predictor, used to
+         * dictate how the two sample history values are mixed with the decoded
+         * sample, and an initial signed 16-bit delta value which scales the
+         * nibble sample value. This is followed by the two initial 16-bit
+         * sample history values.
+         */
+        const al::byte *input{src};
+        const uint8_t blockpred{std::min(input[srcChan], uint8_t{6})}; /* Clamped to the table's last row. */
+        input += srcStep;
+        int delta{input[2*srcChan + 0] | (input[2*srcChan + 1] << 8)};
+        input += srcStep*2;
+
+        int sampleHistory[2]{};
+        sampleHistory[0] = input[2*srcChan + 0] | (input[2*srcChan + 1]<<8);
+        input += srcStep*2;
+        sampleHistory[1] = input[2*srcChan + 0] | (input[2*srcChan + 1]<<8);
+        input += srcStep*2;
+        /* Sign-extend the 16-bit header values read above. */
+        const auto coeffs = al::as_span(MSADPCMAdaptionCoeff[blockpred]);
+        delta = (delta^0x8000) - 32768;
+        sampleHistory[0] = (sampleHistory[0]^0x8000) - 32768;
+        sampleHistory[1] = (sampleHistory[1]^0x8000) - 32768;
+
+        /* The second history sample is "older", so it's the first to be
+         * written out.
+         */
+        if(skip == 0)
+        {
+            dstSamples[wrote++] = static_cast<float>(sampleHistory[1]) / 32768.0f;
+            if(wrote == samplesToLoad) return;
+            dstSamples[wrote++] = static_cast<float>(sampleHistory[0]) / 32768.0f;
+            if(wrote == samplesToLoad) return;
+        }
+        else if(skip == 1)
+        {
+            --skip;
+            dstSamples[wrote++] = static_cast<float>(sampleHistory[0]) / 32768.0f;
+            if(wrote == samplesToLoad) return;
+        }
+        else
+            skip -= 2;
+        /* Decodes one nibble, updating the sample history and delta; returns the new sample. */
+        auto decode_sample = [&sampleHistory,&delta,coeffs](const int nibble)
+        {
+            int pred{(sampleHistory[0]*coeffs[0] + sampleHistory[1]*coeffs[1]) / 256};
+            pred += ((nibble^0x08) - 0x08) * delta; /* The nibble is sign-extended from 4 bits. */
+            pred  = clampi(pred, -32768, 32767);
+
+            sampleHistory[1] = sampleHistory[0];
+            sampleHistory[0] = pred;
+
+            delta = (MSADPCMAdaption[nibble] * delta) / 256;
+            delta = maxi(16, delta); /* The delta never drops below 16. */
+
+            return pred;
+        };
+
+        /* The rest of the block is a series of nibbles, interleaved per-
+         * channel. First, skip samples.
+         */
+        const size_t startOffset{skip + 2}; /* Two header samples plus the nibbles skipped below. */
+        size_t nibbleOffset{srcChan};
+        for(;skip;--skip)
+        {
+            const size_t byteOffset{nibbleOffset>>1};
+            const size_t byteShift{((nibbleOffset&1)^1) * 4}; /* Even nibbles: high 4 bits; odd: low. */
+            nibbleOffset += srcStep;
+
+            std::ignore = decode_sample((input[byteOffset]>>byteShift) & 15);
+        }
+
+        /* Now decode the rest of the block, until the end of the block or the
+         * dst buffer is filled.
+         */
+        const size_t todo{minz(samplesPerBlock-startOffset, samplesToLoad-wrote)};
+        for(size_t j{0};j < todo;++j)
+        {
+            const size_t byteOffset{nibbleOffset>>1};
+            const size_t byteShift{((nibbleOffset&1)^1) * 4};
+            nibbleOffset += srcStep;
+
+            const int sample{decode_sample((input[byteOffset]>>byteShift) & 15)};
+            dstSamples[wrote++] = static_cast<float>(sample) / 32768.0f;
+        }
+        if(wrote == samplesToLoad)
+            return;
+        /* More output is needed: continue from the start of the next block (skip is 0 here). */
+        src += blockBytes;
+    } while(true);
+}
+
+void LoadSamples(float *dstSamples, const al::byte *src, const size_t srcChan,
+    const size_t srcOffset, const FmtType srcType, const size_t srcStep,
+    const size_t samplesPerBlock, const size_t samplesToLoad) noexcept
+{
+    /* Forward to the LoadSamples<T> instantiation matching the runtime sample
+     * type. A value matching none of the cases loads nothing. */
+#define LOAD_AS(T) case T:                                                    \
+    return LoadSamples<T>(dstSamples, src, srcChan, srcOffset, srcStep,       \
+        samplesPerBlock, samplesToLoad)
+    switch(srcType)
+    {
+    LOAD_AS(FmtUByte);
+    LOAD_AS(FmtShort);
+    LOAD_AS(FmtFloat);
+    LOAD_AS(FmtDouble);
+    LOAD_AS(FmtMulaw);
+    LOAD_AS(FmtAlaw);
+    LOAD_AS(FmtIMA4);
+    LOAD_AS(FmtMSADPCM);
+    }
+#undef LOAD_AS
+}
+
+/* Loads samples [samplesLoaded, samplesToLoad) of voiceSamples for one channel
+ * of a static voice, reading from sample position dataPosInt. With no loop
+ * item, a shortfall past the buffer end is padded with the preceding sample
+ * (which must exist); otherwise [mLoopStart, mLoopEnd) repeats to fill it.
+ */
+void LoadBufferStatic(VoiceBufferItem *buffer, VoiceBufferItem *bufferLoopItem,
+    const size_t dataPosInt, const FmtType sampleType, const size_t srcChannel,
+    const size_t srcStep, size_t samplesLoaded, const size_t samplesToLoad,
+    float *voiceSamples)
+{
+    if(!bufferLoopItem)
+    {
+        /* Load what's left to play from the buffer */
+        if(buffer->mSampleLen > dataPosInt) LIKELY
+        {
+            const size_t buffer_remaining{buffer->mSampleLen - dataPosInt};
+            const size_t remaining{minz(samplesToLoad-samplesLoaded, buffer_remaining)};
+            LoadSamples(voiceSamples+samplesLoaded, buffer->mSamples, srcChannel, dataPosInt,
+                sampleType, srcStep, buffer->mBlockAlign, remaining);
+            samplesLoaded += remaining;
+        }
+        if(const size_t toFill{samplesToLoad - samplesLoaded})
+            std::fill_n(voiceSamples+samplesLoaded, toFill, *(voiceSamples+samplesLoaded-1));
+    }
+    else
+    {
+        const size_t loopStart{buffer->mLoopStart};
+        const size_t loopEnd{buffer->mLoopEnd};
+        ASSUME(loopEnd > loopStart);
+        /* Wrap a position at or past the loop end back into the loop range. */
+        const size_t intPos{(dataPosInt < loopEnd) ? dataPosInt
+            : (((dataPosInt-loopStart)%(loopEnd-loopStart)) + loopStart)};
+        /* Load what's left of this loop iteration, measured from the wrapped
+         * position: loopEnd-dataPosInt would wrap once dataPosInt >= loopEnd. */
+        const size_t remaining{minz(samplesToLoad-samplesLoaded, loopEnd-intPos)};
+        LoadSamples(voiceSamples+samplesLoaded, buffer->mSamples, srcChannel, intPos, sampleType,
+            srcStep, buffer->mBlockAlign, remaining);
+        samplesLoaded += remaining;
+        /* Load repeats of the loop to fill the buffer. */
+        const size_t loopSize{loopEnd - loopStart};
+        while(const size_t toFill{minz(samplesToLoad - samplesLoaded, loopSize)})
+        {
+            LoadSamples(voiceSamples+samplesLoaded, buffer->mSamples, srcChannel, loopStart,
+                sampleType, srcStep, buffer->mBlockAlign, toFill);
+            samplesLoaded += toFill;
+        }
+    }
+}
+
+void LoadBufferCallback(VoiceBufferItem *buffer, const size_t dataPosInt,
+    const size_t numCallbackSamples, const FmtType sampleType, const size_t srcChannel,
+    const size_t srcStep, size_t samplesLoaded, const size_t samplesToLoad, float *voiceSamples)
+{
+    /* Take whatever callback-provided samples lie beyond the read position... */
+    if(dataPosInt < numCallbackSamples) LIKELY
+    {
+        const size_t available{numCallbackSamples - dataPosInt};
+        const size_t count{minz(samplesToLoad-samplesLoaded, available)};
+        LoadSamples(voiceSamples+samplesLoaded, buffer->mSamples, srcChannel, dataPosInt,
+            sampleType, srcStep, buffer->mBlockAlign, count);
+        samplesLoaded += count;
+    }
+    /* ...and pad any shortfall by repeating the sample just before the gap. */
+    float *const padStart{voiceSamples + samplesLoaded};
+    const size_t padCount{samplesToLoad - samplesLoaded};
+    if(padCount > 0)
+        std::fill_n(padStart, padCount, *(padStart-1));
+}
+
+void LoadBufferQueue(VoiceBufferItem *buffer, VoiceBufferItem *bufferLoopItem,
+    size_t dataPosInt, const FmtType sampleType, const size_t srcChannel,
+    const size_t srcStep, size_t samplesLoaded, const size_t samplesToLoad,
+    float *voiceSamples)
+{
+    /* Next queue entry, or the loop item once the queue's end is reached. */
+    auto next_item = [bufferLoopItem](const VoiceBufferItem *item)
+    {
+        VoiceBufferItem *following{item->mNext.load(std::memory_order_acquire)};
+        return following ? following : bufferLoopItem;
+    };
+
+    /* Walk the queue until the request is filled or no item remains. */
+    for(;buffer && samplesLoaded != samplesToLoad;buffer = next_item(buffer))
+    {
+        const auto itemLength = buffer->mSampleLen;
+        if(dataPosInt >= itemLength)
+        {
+            /* The read position lies beyond this item; step over it. */
+            dataPosInt -= itemLength;
+            continue;
+        }
+
+        const size_t count{minz(samplesToLoad-samplesLoaded, itemLength-dataPosInt)};
+        LoadSamples(voiceSamples+samplesLoaded, buffer->mSamples, srcChannel, dataPosInt,
+            sampleType, srcStep, buffer->mBlockAlign, count);
+        samplesLoaded += count;
+        if(samplesLoaded == samplesToLoad) break;
+        dataPosInt = 0;
+    }
+    /* Pad any shortfall by repeating the sample just before the gap. */
+    if(const size_t toFill{samplesToLoad - samplesLoaded})
+        std::fill_n(voiceSamples+samplesLoaded, toFill, *(voiceSamples+samplesLoaded-1));
+}
+
+/* Mixes samples through the voice's HRTF filter into Device->HrtfAccumData, starting at OutPos. */
+void DoHrtfMix(const float *samples, const uint DstBufferSize, DirectParams &parms,
+    const float TargetGain, const uint Counter, uint OutPos, const bool IsPlaying,
+    DeviceBase *Device)
+{
+    const uint IrSize{Device->mIrSize};
+    auto &HrtfSamples = Device->HrtfSourceData;
+    auto &AccumSamples = Device->HrtfAccumData;
+
+    /* Copy the HRTF history and new input samples into a temp buffer. */
+    auto src_iter = std::copy(parms.Hrtf.History.begin(), parms.Hrtf.History.end(),
+        std::begin(HrtfSamples));
+    std::copy_n(samples, DstBufferSize, src_iter);
+    /* Copy the last used samples back into the history buffer for later. */
+    if(IsPlaying) LIKELY
+        std::copy_n(std::begin(HrtfSamples) + DstBufferSize, parms.Hrtf.History.size(),
+            parms.Hrtf.History.begin());
+
+    /* If fading and this is the first mixing pass, fade between the IRs. */
+    uint fademix{0u}; /* Number of samples handled by the blend pass; stays 0 when not fading. */
+    if(Counter && OutPos == 0)
+    {
+        fademix = minu(DstBufferSize, Counter);
+
+        float gain{TargetGain};
+
+        /* The new coefficients need to fade in completely since they're
+         * replacing the old ones. To keep the gain fading consistent,
+         * interpolate between the old and new target gains given how much of
+         * the fade time this mix handles.
+         */
+        if(Counter > fademix)
+        {
+            const float a{static_cast<float>(fademix) / static_cast<float>(Counter)};
+            gain = lerpf(parms.Hrtf.Old.Gain, TargetGain, a);
+        }
+        /* The last two values start the new filter at zero and ramp it to `gain` over fademix. */
+        MixHrtfFilter hrtfparams{
+            parms.Hrtf.Target.Coeffs,
+            parms.Hrtf.Target.Delay,
+            0.0f, gain / static_cast<float>(fademix)};
+        MixHrtfBlendSamples(HrtfSamples, AccumSamples+OutPos, IrSize, &parms.Hrtf.Old, &hrtfparams,
+            fademix);
+
+        /* Update the old parameters with the result. */
+        parms.Hrtf.Old = parms.Hrtf.Target;
+        parms.Hrtf.Old.Gain = gain;
+        OutPos += fademix;
+    }
+    /* Mix whatever remains after the blend pass using the target filter only. */
+    if(fademix < DstBufferSize)
+    {
+        const uint todo{DstBufferSize - fademix};
+        float gain{TargetGain};
+
+        /* Interpolate the target gain if the gain fading lasts longer than
+         * this mix.
+         */
+        if(Counter > DstBufferSize)
+        {
+            const float a{static_cast<float>(todo) / static_cast<float>(Counter-fademix)};
+            gain = lerpf(parms.Hrtf.Old.Gain, TargetGain, a);
+        }
+        /* Start from the current gain and step evenly so `gain` is reached after todo samples. */
+        MixHrtfFilter hrtfparams{
+            parms.Hrtf.Target.Coeffs,
+            parms.Hrtf.Target.Delay,
+            parms.Hrtf.Old.Gain,
+            (gain - parms.Hrtf.Old.Gain) / static_cast<float>(todo)};
+        MixHrtfSamples(HrtfSamples+fademix, AccumSamples+OutPos, IrSize, &hrtfparams, todo);
+
+        /* Store the now-current gain for next time. */
+        parms.Hrtf.Old.Gain = gain;
+    }
+}
+
+void DoNfcMix(const al::span<const float> samples, FloatBufferLine *OutBuffer, DirectParams &parms,
+    const float *TargetGains, const uint Counter, const uint OutPos, DeviceBase *Device)
+{
+    using NfcMethod = void (NfcFilter::*)(const al::span<const float>, float*);
+    static constexpr NfcMethod FilterForOrder[MaxAmbiOrder+1]{
+        nullptr, &NfcFilter::process1, &NfcFilter::process2, &NfcFilter::process3};
+
+    /* The first output channel is mixed as-is, with no near-field filter. */
+    float *currentGains{parms.Gains.Current.data()};
+    MixSamples(samples, {OutBuffer, 1u}, currentGains, TargetGains, Counter, OutPos);
+
+    /* Each following order filters the input with its own NFC routine and
+     * mixes that to its channels, stopping at the first order with none. */
+    const al::span<float> filtered{Device->NfcSampleData, samples.size()};
+    size_t chanOffset{1};
+    for(size_t order{1};order != MaxAmbiOrder+1;++order)
+    {
+        const size_t chancount{Device->NumChannelsPerOrder[order]};
+        if(chancount == 0) break;
+
+        (parms.NFCtrlFilter.*FilterForOrder[order])(samples, filtered.data());
+        MixSamples(filtered, {OutBuffer+chanOffset, chancount}, currentGains+chanOffset,
+            TargetGains+chanOffset, Counter, OutPos);
+        chanOffset += chancount;
+    }
+}
+
+} // namespace
+
+void Voice::mix(const State vstate, ContextBase *Context, const nanoseconds deviceTime,
+    const uint SamplesToDo)
+{
+    static constexpr std::array<float,MAX_OUTPUT_CHANNELS> SilentTarget{};
+
+    ASSUME(SamplesToDo > 0);
+
+    DeviceBase *Device{Context->mDevice};
+    const uint NumSends{Device->NumAuxSends};
+
+    /* Get voice info */
+    int DataPosInt{mPosition.load(std::memory_order_relaxed)};
+    uint DataPosFrac{mPositionFrac.load(std::memory_order_relaxed)};
+    VoiceBufferItem *BufferListItem{mCurrentBuffer.load(std::memory_order_relaxed)};
+    VoiceBufferItem *BufferLoopItem{mLoopBuffer.load(std::memory_order_relaxed)};
+    const uint increment{mStep};
+    if(increment < 1) UNLIKELY
+    {
+        /* If the voice is supposed to be stopping but can't be mixed, just
+         * stop it before bailing.
+         */
+        if(vstate == Stopping)
+            mPlayState.store(Stopped, std::memory_order_release);
+        return;
+    }
+
+    /* If the static voice's current position is beyond the buffer loop end
+     * position, disable looping.
+     */
+    if(mFlags.test(VoiceIsStatic) && BufferLoopItem)
+    {
+        if(DataPosInt >= 0 && static_cast<uint>(DataPosInt) >= BufferListItem->mLoopEnd)
+            BufferLoopItem = nullptr;
+    }
+
+    uint OutPos{0u};
+
+    /* Check if we're doing a delayed start, and we start in this update. */
+    if(mStartTime > deviceTime) UNLIKELY
+    {
+        /* If the voice is supposed to be stopping but hasn't actually started
+         * yet, make sure its stopped.
+         */
+        if(vstate == Stopping)
+        {
+            mPlayState.store(Stopped, std::memory_order_release);
+            return;
+        }
+
+        /* If the start time is too far ahead, don't bother. */
+        auto diff = mStartTime - deviceTime;
+        if(diff >= seconds{1})
+            return;
+
+        /* Get the number of samples ahead of the current time that output
+         * should start at. Skip this update if it's beyond the output sample
+         * count.
+         *
+         * Round the start position to a multiple of 4, which some mixers want.
+         * This makes the start time accurate to 4 samples. This could be made
+         * sample-accurate by forcing non-SIMD functions on the first run.
+         */
+        seconds::rep sampleOffset{duration_cast<seconds>(diff * Device->Frequency).count()};
+        sampleOffset = (sampleOffset+2) & ~seconds::rep{3};
+        if(sampleOffset >= SamplesToDo)
+            return;
+
+        OutPos = static_cast<uint>(sampleOffset);
+    }
+
+    /* Calculate the number of samples to mix, and the number of (resampled)
+     * samples that need to be loaded (mixing samples and decoder padding).
+     */
+    const uint samplesToMix{SamplesToDo - OutPos};
+    const uint samplesToLoad{samplesToMix + mDecoderPadding};
+
+    /* Get a span of pointers to hold the floating point, deinterlaced,
+     * resampled buffer data to be mixed.
+     */
+    std::array<float*,DeviceBase::MixerChannelsMax> SamplePointers;
+    const al::span<float*> MixingSamples{SamplePointers.data(), mChans.size()};
+    auto get_bufferline = [](DeviceBase::MixerBufferLine &bufline) noexcept -> float*
+    { return bufline.data(); };
+    std::transform(Device->mSampleData.end() - mChans.size(), Device->mSampleData.end(),
+        MixingSamples.begin(), get_bufferline);
+
+    /* If there's a matching sample step and no phase offset, use a simple copy
+     * for resampling.
+     */
+    const ResamplerFunc Resample{(increment == MixerFracOne && DataPosFrac == 0)
+        ? ResamplerFunc{[](const InterpState*, const float *RESTRICT src, uint, const uint,
+            const al::span<float> dst) { std::copy_n(src, dst.size(), dst.begin()); }}
+        : mResampler};
+
+    /* UHJ2 and SuperStereo only have 2 buffer channels, but 3 mixing channels
+     * (3rd channel is generated from decoding).
+     */
+    const size_t realChannels{(mFmtChannels == FmtUHJ2 || mFmtChannels == FmtSuperStereo) ? 2u
+        : MixingSamples.size()};
+    for(size_t chan{0};chan < realChannels;++chan)
+    {
+        using ResBufType = decltype(DeviceBase::mResampleData);
+        static constexpr uint srcSizeMax{static_cast<uint>(ResBufType{}.size()-MaxResamplerEdge)};
+
+        const auto prevSamples = al::as_span(mPrevSamples[chan]);
+        const auto resampleBuffer = std::copy(prevSamples.cbegin(), prevSamples.cend(),
+            Device->mResampleData.begin()) - MaxResamplerEdge;
+        int intPos{DataPosInt};
+        uint fracPos{DataPosFrac};
+
+        /* Load samples for this channel from the available buffer(s), with
+         * resampling.
+         */
+        for(uint samplesLoaded{0};samplesLoaded < samplesToLoad;)
+        {
+            /* Calculate the number of dst samples that can be loaded this
+             * iteration, given the available resampler buffer size, and the
+             * number of src samples that are needed to load it.
+             */
+            auto calc_buffer_sizes = [fracPos,increment](uint dstBufferSize)
+            {
+                /* If ext=true, calculate the last written dst pos from the dst
+                 * count, convert to the last read src pos, then add one to get
+                 * the src count.
+                 *
+                 * If ext=false, convert the dst count to src count directly.
+                 *
+                 * Without this, the src count could be short by one when
+                 * increment < 1.0, or not have a full src at the end when
+                 * increment > 1.0.
+                 */
+                const bool ext{increment <= MixerFracOne};
+                uint64_t dataSize64{dstBufferSize - ext};
+                dataSize64 = (dataSize64*increment + fracPos) >> MixerFracBits;
+                /* Also include resampler padding. */
+                dataSize64 += ext + MaxResamplerEdge;
+
+                if(dataSize64 <= srcSizeMax)
+                    return std::make_pair(dstBufferSize, static_cast<uint>(dataSize64));
+
+                /* If the source size got saturated, we can't fill the desired
+                 * dst size. Figure out how many dst samples we can fill.
+                 */
+                dataSize64 = srcSizeMax - MaxResamplerEdge;
+                dataSize64 = ((dataSize64<<MixerFracBits) - fracPos) / increment;
+                if(dataSize64 < dstBufferSize)
+                {
+                    /* Some resamplers require the destination being 16-byte
+                     * aligned, so limit to a multiple of 4 samples to maintain
+                     * alignment if we need to do another iteration after this.
+                     */
+                    dstBufferSize = static_cast<uint>(dataSize64) & ~3u;
+                }
+                return std::make_pair(dstBufferSize, srcSizeMax);
+            };
+            const auto bufferSizes = calc_buffer_sizes(samplesToLoad - samplesLoaded);
+            const auto dstBufferSize = bufferSizes.first;
+            const auto srcBufferSize = bufferSizes.second;
+
+            /* Load the necessary samples from the given buffer(s). */
+            if(!BufferListItem)
+            {
+                const uint avail{minu(srcBufferSize, MaxResamplerEdge)};
+                const uint tofill{maxu(srcBufferSize, MaxResamplerEdge)};
+
+                /* When loading from a voice that ended prematurely, only take
+                 * the samples that get closest to 0 amplitude. This helps
+                 * certain sounds fade out better.
+                 */
+                auto abs_lt = [](const float lhs, const float rhs) noexcept -> bool
+                { return std::abs(lhs) < std::abs(rhs); };
+                auto srciter = std::min_element(resampleBuffer, resampleBuffer+avail, abs_lt);
+
+                std::fill(srciter+1, resampleBuffer+tofill, *srciter);
+            }
+            else
+            {
+                size_t srcSampleDelay{0};
+                if(intPos < 0) UNLIKELY
+                {
+                    /* If the current position is negative, there's that many
+                     * silent samples to load before using the buffer.
+                     */
+                    srcSampleDelay = static_cast<uint>(-intPos);
+                    if(srcSampleDelay >= srcBufferSize)
+                    {
+                        /* If the number of silent source samples exceeds the
+                         * number to load, the output will be silent.
+                         */
+                        std::fill_n(MixingSamples[chan]+samplesLoaded, dstBufferSize, 0.0f);
+                        std::fill_n(resampleBuffer, srcBufferSize, 0.0f);
+                        goto skip_resample;
+                    }
+
+                    std::fill_n(resampleBuffer, srcSampleDelay, 0.0f);
+                }
+                const uint uintPos{static_cast<uint>(maxi(intPos, 0))};
+
+                if(mFlags.test(VoiceIsStatic))
+                    LoadBufferStatic(BufferListItem, BufferLoopItem, uintPos, mFmtType, chan,
+                        mFrameStep, srcSampleDelay, srcBufferSize, al::to_address(resampleBuffer));
+                else if(mFlags.test(VoiceIsCallback))
+                {
+                    const uint callbackBase{mCallbackBlockBase * mSamplesPerBlock};
+                    const size_t bufferOffset{uintPos - callbackBase};
+                    const size_t needSamples{bufferOffset + srcBufferSize - srcSampleDelay};
+                    const size_t needBlocks{(needSamples + mSamplesPerBlock-1) / mSamplesPerBlock};
+                    if(!mFlags.test(VoiceCallbackStopped) && needBlocks > mNumCallbackBlocks)
+                    {
+                        const size_t byteOffset{mNumCallbackBlocks*mBytesPerBlock};
+                        const size_t needBytes{(needBlocks-mNumCallbackBlocks)*mBytesPerBlock};
+
+                        const int gotBytes{BufferListItem->mCallback(BufferListItem->mUserData,
+                            &BufferListItem->mSamples[byteOffset], static_cast<int>(needBytes))};
+                        if(gotBytes < 0)
+                            mFlags.set(VoiceCallbackStopped);
+                        else if(static_cast<uint>(gotBytes) < needBytes)
+                        {
+                            mFlags.set(VoiceCallbackStopped);
+                            mNumCallbackBlocks += static_cast<uint>(gotBytes) / mBytesPerBlock;
+                        }
+                        else
+                            mNumCallbackBlocks = static_cast<uint>(needBlocks);
+                    }
+                    const size_t numSamples{uint{mNumCallbackBlocks} * mSamplesPerBlock};
+                    LoadBufferCallback(BufferListItem, bufferOffset, numSamples, mFmtType, chan,
+                        mFrameStep, srcSampleDelay, srcBufferSize, al::to_address(resampleBuffer));
+                }
+                else
+                    LoadBufferQueue(BufferListItem, BufferLoopItem, uintPos, mFmtType, chan,
+                        mFrameStep, srcSampleDelay, srcBufferSize, al::to_address(resampleBuffer));
+            }
+
+            Resample(&mResampleState, al::to_address(resampleBuffer), fracPos, increment,
+                {MixingSamples[chan]+samplesLoaded, dstBufferSize});
+
+            /* Store the last source samples used for next time. */
+            if(vstate == Playing) LIKELY
+            {
+                /* Only store samples for the end of the mix, excluding what
+                 * gets loaded for decoder padding.
+                 */
+                const uint loadEnd{samplesLoaded + dstBufferSize};
+                if(samplesToMix > samplesLoaded && samplesToMix <= loadEnd) LIKELY
+                {
+                    const size_t dstOffset{samplesToMix - samplesLoaded};
+                    const size_t srcOffset{(dstOffset*increment + fracPos) >> MixerFracBits};
+                    std::copy_n(resampleBuffer-MaxResamplerEdge+srcOffset, prevSamples.size(),
+                        prevSamples.begin());
+                }
+            }
+
+        skip_resample:
+            samplesLoaded += dstBufferSize;
+            if(samplesLoaded < samplesToLoad)
+            {
+                fracPos += dstBufferSize*increment;
+                const uint srcOffset{fracPos >> MixerFracBits};
+                fracPos &= MixerFracMask;
+                intPos += srcOffset;
+
+                /* If more samples need to be loaded, copy the back of the
+                 * resampleBuffer to the front to reuse it. prevSamples isn't
+                 * reliable since it's only updated for the end of the mix.
+                 */
+                std::copy(resampleBuffer-MaxResamplerEdge+srcOffset,
+                    resampleBuffer+MaxResamplerEdge+srcOffset, resampleBuffer-MaxResamplerEdge);
+            }
+        }
+    }
+    for(auto &samples : MixingSamples.subspan(realChannels))
+        std::fill_n(samples, samplesToLoad, 0.0f);
+
+    if(mDecoder)
+        mDecoder->decode(MixingSamples, samplesToMix, (vstate==Playing));
+
+    if(mFlags.test(VoiceIsAmbisonic))
+    {
+        auto voiceSamples = MixingSamples.begin();
+        for(auto &chandata : mChans)
+        {
+            chandata.mAmbiSplitter.processScale({*voiceSamples, samplesToMix},
+                chandata.mAmbiHFScale, chandata.mAmbiLFScale);
+            ++voiceSamples;
+        }
+    }
+
+    const uint Counter{mFlags.test(VoiceIsFading) ? minu(samplesToMix, 64u) : 0u};
+    if(!Counter)
+    {
+        /* No fading, just overwrite the old/current params. */
+        for(auto &chandata : mChans)
+        {
+            {
+                DirectParams &parms = chandata.mDryParams;
+                if(!mFlags.test(VoiceHasHrtf))
+                    parms.Gains.Current = parms.Gains.Target;
+                else
+                    parms.Hrtf.Old = parms.Hrtf.Target;
+            }
+            for(uint send{0};send < NumSends;++send)
+            {
+                if(mSend[send].Buffer.empty())
+                    continue;
+
+                SendParams &parms = chandata.mWetParams[send];
+                parms.Gains.Current = parms.Gains.Target;
+            }
+        }
+    }
+
+    auto voiceSamples = MixingSamples.begin();
+    for(auto &chandata : mChans)
+    {
+        /* Now filter and mix to the appropriate outputs. */
+        const al::span<float,BufferLineSize> FilterBuf{Device->FilteredData};
+        {
+            DirectParams &parms = chandata.mDryParams;
+            const float *samples{DoFilters(parms.LowPass, parms.HighPass, FilterBuf.data(),
+                {*voiceSamples, samplesToMix}, mDirect.FilterType)};
+
+            if(mFlags.test(VoiceHasHrtf))
+            {
+                const float TargetGain{parms.Hrtf.Target.Gain * (vstate == Playing)};
+                DoHrtfMix(samples, samplesToMix, parms, TargetGain, Counter, OutPos,
+                    (vstate == Playing), Device);
+            }
+            else
+            {
+                const float *TargetGains{(vstate == Playing) ? parms.Gains.Target.data()
+                    : SilentTarget.data()};
+                if(mFlags.test(VoiceHasNfc))
+                    DoNfcMix({samples, samplesToMix}, mDirect.Buffer.data(), parms,
+                        TargetGains, Counter, OutPos, Device);
+                else
+                    MixSamples({samples, samplesToMix}, mDirect.Buffer,
+                        parms.Gains.Current.data(), TargetGains, Counter, OutPos);
+            }
+        }
+
+        for(uint send{0};send < NumSends;++send)
+        {
+            if(mSend[send].Buffer.empty())
+                continue;
+
+            SendParams &parms = chandata.mWetParams[send];
+            const float *samples{DoFilters(parms.LowPass, parms.HighPass, FilterBuf.data(),
+                {*voiceSamples, samplesToMix}, mSend[send].FilterType)};
+
+            const float *TargetGains{(vstate == Playing) ? parms.Gains.Target.data()
+                : SilentTarget.data()};
+            MixSamples({samples, samplesToMix}, mSend[send].Buffer,
+                parms.Gains.Current.data(), TargetGains, Counter, OutPos);
+        }
+
+        ++voiceSamples;
+    }
+
+    mFlags.set(VoiceIsFading);
+
+    /* Don't update positions and buffers if we were stopping. */
+    if(vstate == Stopping) UNLIKELY
+    {
+        mPlayState.store(Stopped, std::memory_order_release);
+        return;
+    }
+
+    /* Update voice positions and buffers as needed. */
+    DataPosFrac += increment*samplesToMix;
+    const uint SrcSamplesDone{DataPosFrac>>MixerFracBits};
+    DataPosInt  += SrcSamplesDone;
+    DataPosFrac &= MixerFracMask;
+
+    uint buffers_done{0u};
+    if(BufferListItem && DataPosInt >= 0) LIKELY
+    {
+        if(mFlags.test(VoiceIsStatic))
+        {
+            if(BufferLoopItem)
+            {
+                /* Handle looping static source */
+                const uint LoopStart{BufferListItem->mLoopStart};
+                const uint LoopEnd{BufferListItem->mLoopEnd};
+                uint DataPosUInt{static_cast<uint>(DataPosInt)};
+                if(DataPosUInt >= LoopEnd)
+                {
+                    assert(LoopEnd > LoopStart);
+                    DataPosUInt = ((DataPosUInt-LoopStart)%(LoopEnd-LoopStart)) + LoopStart;
+                    DataPosInt = static_cast<int>(DataPosUInt);
+                }
+            }
+            else
+            {
+                /* Handle non-looping static source */
+                if(static_cast<uint>(DataPosInt) >= BufferListItem->mSampleLen)
+                    BufferListItem = nullptr;
+            }
+        }
+        else if(mFlags.test(VoiceIsCallback))
+        {
+            /* Handle callback buffer source */
+            const uint currentBlock{static_cast<uint>(DataPosInt) / mSamplesPerBlock};
+            const uint blocksDone{currentBlock - mCallbackBlockBase};
+            if(blocksDone < mNumCallbackBlocks)
+            {
+                const size_t byteOffset{blocksDone*mBytesPerBlock};
+                const size_t byteEnd{mNumCallbackBlocks*mBytesPerBlock};
+                al::byte *data{BufferListItem->mSamples};
+                std::copy(data+byteOffset, data+byteEnd, data);
+                mNumCallbackBlocks -= blocksDone;
+                mCallbackBlockBase += blocksDone;
+            }
+            else
+            {
+                BufferListItem = nullptr;
+                mNumCallbackBlocks = 0;
+                mCallbackBlockBase += blocksDone;
+            }
+        }
+        else
+        {
+            /* Handle streaming source */
+            do {
+                if(BufferListItem->mSampleLen > static_cast<uint>(DataPosInt))
+                    break;
+
+                DataPosInt -= BufferListItem->mSampleLen;
+
+                ++buffers_done;
+                BufferListItem = BufferListItem->mNext.load(std::memory_order_relaxed);
+                if(!BufferListItem) BufferListItem = BufferLoopItem;
+            } while(BufferListItem);
+        }
+    }
+
+    /* Capture the source ID in case it gets reset for stopping. */
+    const uint SourceID{mSourceID.load(std::memory_order_relaxed)};
+
+    /* Update voice info */
+    mPosition.store(DataPosInt, std::memory_order_relaxed);
+    mPositionFrac.store(DataPosFrac, std::memory_order_relaxed);
+    mCurrentBuffer.store(BufferListItem, std::memory_order_relaxed);
+    if(!BufferListItem)
+    {
+        mLoopBuffer.store(nullptr, std::memory_order_relaxed);
+        mSourceID.store(0u, std::memory_order_relaxed);
+    }
+    std::atomic_thread_fence(std::memory_order_release);
+
+    /* Send any events now, after the position/buffer info was updated. */
+    const auto enabledevt = Context->mEnabledEvts.load(std::memory_order_acquire);
+    if(buffers_done > 0 && enabledevt.test(AsyncEvent::BufferCompleted))
+    {
+        RingBuffer *ring{Context->mAsyncEvents.get()};
+        auto evt_vec = ring->getWriteVector();
+        if(evt_vec.first.len > 0)
+        {
+            AsyncEvent *evt{al::construct_at(reinterpret_cast<AsyncEvent*>(evt_vec.first.buf),
+                AsyncEvent::BufferCompleted)};
+            evt->u.bufcomp.id = SourceID;
+            evt->u.bufcomp.count = buffers_done;
+            ring->writeAdvance(1);
+        }
+    }
+
+    if(!BufferListItem)
+    {
+        /* If the voice just ended, set it to Stopping so the next render
+         * ensures any residual noise fades to 0 amplitude.
+         */
+        mPlayState.store(Stopping, std::memory_order_release);
+        if(enabledevt.test(AsyncEvent::SourceStateChange))
+            SendSourceStoppedEvent(Context, SourceID);
+    }
+}
+
+void Voice::prepare(DeviceBase *device)
+{
+    /* UHJ2 and Super Stereo are always handled as 3 channels. Otherwise only
+     * channels for orders up to the device's ambisonic order are mixed.
+     */
+    uint chanCount{3u};
+    if(mFmtChannels != FmtUHJ2 && mFmtChannels != FmtSuperStereo)
+        chanCount = ChannelsFromFmt(mFmtChannels, minu(mAmbiOrder, device->mAmbiOrder));
+    const auto chanLimit = device->mSampleData.size();
+    if(chanCount > chanLimit) UNLIKELY
+    {
+        ERR("Unexpected channel count: %u (limit: %zu, %d:%d)\n", chanCount, chanLimit,
+            mFmtChannels, mAmbiOrder);
+        chanCount = static_cast<uint>(chanLimit);
+    }
+    /* Swap in empty vectors to release storage that's larger than needed. */
+    if(mChans.capacity() > 2 && chanCount < mChans.capacity())
+    {
+        decltype(mChans){}.swap(mChans);
+        decltype(mPrevSamples){}.swap(mPrevSamples);
+    }
+    const auto minCapacity = maxu(2, chanCount);
+    mChans.reserve(minCapacity);
+    mChans.resize(chanCount);
+    mPrevSamples.reserve(minCapacity);
+    mPrevSamples.resize(chanCount);
+
+    mDecoder = nullptr;
+    mDecoderPadding = 0;
+    if(mFmtChannels == FmtSuperStereo)
+    {
+        switch(UhjDecodeQuality)
+        {
+        case UhjQualityType::IIR:
+            mDecoder = std::make_unique<UhjStereoDecoderIIR>();
+            mDecoderPadding = UhjStereoDecoderIIR::sInputPadding;
+            break;
+        case UhjQualityType::FIR256:
+            mDecoder = std::make_unique<UhjStereoDecoder<UhjLength256>>();
+            mDecoderPadding = UhjStereoDecoder<UhjLength256>::sInputPadding;
+            break;
+        case UhjQualityType::FIR512:
+            mDecoder = std::make_unique<UhjStereoDecoder<UhjLength512>>();
+            mDecoderPadding = UhjStereoDecoder<UhjLength512>::sInputPadding;
+            break;
+        }
+    }
+    else if(IsUHJ(mFmtChannels))
+    {
+        switch(UhjDecodeQuality)
+        {
+        case UhjQualityType::IIR:
+            mDecoder = std::make_unique<UhjDecoderIIR>();
+            mDecoderPadding = UhjDecoderIIR::sInputPadding;
+            break;
+        case UhjQualityType::FIR256:
+            mDecoder = std::make_unique<UhjDecoder<UhjLength256>>();
+            mDecoderPadding = UhjDecoder<UhjLength256>::sInputPadding;
+            break;
+        case UhjQualityType::FIR512:
+            mDecoder = std::make_unique<UhjDecoder<UhjLength512>>();
+            mDecoderPadding = UhjDecoder<UhjLength512>::sInputPadding;
+            break;
+        }
+    }
+
+    /* A zero step keeps the mixer from mixing this until an update is applied. */
+    mStep = 0;
+
+    /* Start every channel with a cleared sample history. */
+    for(auto &history : mPrevSamples)
+        history = HistoryLine{};
+
+    /* Resets a channel's dry and wet mixing parameters to their defaults. */
+    auto reset_params = [device](ChannelData &chandata)
+    {
+        chandata.mDryParams = DirectParams{};
+        chandata.mDryParams.NFCtrlFilter = device->mNFCtrlFilter;
+        std::fill_n(chandata.mWetParams.begin(), device->NumAuxSends, SendParams{});
+    };
+
+    if(mFmtChannels == FmtUHJ2 && !device->mUhjEncoder)
+    {
+        /* 2-channel UHJ wants its own shelf filters, which can't simply be
+         * applied after mixing given whatever speaker setup the user has.
+         * Instead, apply the shelf filters expected for decoding UHJ2 to
+         * quad (only LF scaling is needed) and treat those 4 quad channels
+         * as if they were encoded right back into B-Format. It isn't
+         * perfect, but short of a separate and limited UHJ2 path, it beats
+         * doing nothing. This is skipped with UHJ output, as UHJ2->B-Format->
+         * UHJ2 is identity and shouldn't be messed with.
+         */
+        const BandSplitter xover{device->mXOverFreq / static_cast<float>(device->Frequency)};
+        for(auto &chandata : mChans)
+        {
+            chandata.mAmbiHFScale = 1.0f;
+            chandata.mAmbiLFScale = 1.0f;
+            chandata.mAmbiSplitter = xover;
+            reset_params(chandata);
+        }
+        mChans[0].mAmbiLFScale = DecoderBase::sWLFScale;
+        mChans[1].mAmbiLFScale = DecoderBase::sXYLFScale;
+        mChans[2].mAmbiLFScale = DecoderBase::sXYLFScale;
+        mFlags.set(VoiceIsAmbisonic);
+    }
+    /* When the device isn't of a higher order than the voice, no HF scaling is
+     * needed to mix it, so the VoiceIsAmbisonic flag can stay unset.
+     */
+    else if(mAmbiOrder && device->mAmbiOrder > mAmbiOrder)
+    {
+        const uint8_t *orderIter{Is2DAmbisonic(mFmtChannels) ?
+            AmbiIndex::OrderFrom2DChannel().data() : AmbiIndex::OrderFromChannel().data()};
+        const auto hfScales = AmbiScale::GetHFOrderScales(mAmbiOrder, device->mAmbiOrder,
+            device->m2DMixing);
+
+        const BandSplitter xover{device->mXOverFreq / static_cast<float>(device->Frequency)};
+        for(auto &chandata : mChans)
+        {
+            chandata.mAmbiHFScale = hfScales[*(orderIter++)];
+            chandata.mAmbiLFScale = 1.0f;
+            chandata.mAmbiSplitter = xover;
+            reset_params(chandata);
+        }
+        mFlags.set(VoiceIsAmbisonic);
+    }
+    else
+    {
+        for(auto &chandata : mChans)
+            reset_params(chandata);
+        mFlags.reset(VoiceIsAmbisonic);
+    }
+}
diff --git a/core/voice.h b/core/voice.h
new file mode 100644
index 00000000..57ee7b01
--- /dev/null
+++ b/core/voice.h
@@ -0,0 +1,280 @@
+#ifndef CORE_VOICE_H
+#define CORE_VOICE_H
+
+#include <array>
+#include <atomic>
+#include <bitset>
+#include <chrono>
+#include <memory>
+#include <stddef.h>
+#include <string>
+
+#include "albyte.h"
+#include "almalloc.h"
+#include "aloptional.h"
+#include "alspan.h"
+#include "bufferline.h"
+#include "buffer_storage.h"
+#include "devformat.h"
+#include "filters/biquad.h"
+#include "filters/nfc.h"
+#include "filters/splitter.h"
+#include "mixer/defs.h"
+#include "mixer/hrtfdefs.h"
+#include "resampler_limits.h"
+#include "uhjfilter.h"
+#include "vector.h"
+
+struct ContextBase;
+struct DeviceBase;
+struct EffectSlot;
+enum class DistanceModel : unsigned char;
+
+using uint = unsigned int;
+
+
+#define MAX_SENDS  6 /* Length of the fixed per-send arrays declared in this header. */
+
+
+enum class SpatializeMode : unsigned char { /* Stored in VoiceProps::mSpatializeMode. */
+    Off,
+    On,
+    Auto
+};
+
+enum class DirectMode : unsigned char { /* Stored in VoiceProps::DirectChannels. */
+    Off,
+    DropMismatch, /* NOTE(review): presumably drops channels with no matching output - confirm. */
+    RemixMismatch /* NOTE(review): presumably remixes such channels instead - confirm. */
+};
+
+
+constexpr uint MaxPitch{10}; /* NOTE(review): presumably the largest pitch ratio handled - confirm. */
+
+
+enum { /* Filter bit flags; presumably what TargetData::FilterType holds (confirm). */
+    AF_None = 0,
+    AF_LowPass = 1,
+    AF_HighPass = 2,
+    AF_BandPass = AF_LowPass | AF_HighPass /* Both the low-pass and high-pass bits. */
+};
+
+
+struct DirectParams { /* Per-channel state for mixing to the direct (dry) output. */
+    BiquadFilter LowPass; /* Voice::mix() hands this and HighPass to DoFilters(). */
+    BiquadFilter HighPass;
+
+    NfcFilter NFCtrlFilter; /* Assigned from the device's mNFCtrlFilter by Voice::prepare(). */
+
+    struct { /* Mixed with in place of Gains when the voice has VoiceHasHrtf set. */
+        HrtfFilter Old; /* Overwritten with Target when mix() isn't fading. */
+        HrtfFilter Target;
+        alignas(16) std::array<float,HrtfHistoryLength> History;
+    } Hrtf;
+
+    struct { /* For non-HRTF voices, mix() copies Target to Current when it isn't fading. */
+        std::array<float,MAX_OUTPUT_CHANNELS> Current;
+        std::array<float,MAX_OUTPUT_CHANNELS> Target;
+    } Gains;
+};
+
+struct SendParams { /* Per-channel state for mixing to one auxiliary send (wet) output. */
+    BiquadFilter LowPass; /* Voice::mix() hands this and HighPass to DoFilters(). */
+    BiquadFilter HighPass;
+
+    struct { /* mix() copies Target to Current when it isn't fading. */
+        std::array<float,MaxAmbiChannels> Current;
+        std::array<float,MaxAmbiChannels> Target;
+    } Gains;
+};
+
+
+struct VoiceBufferItem { /* One entry in a voice's buffer queue. */
+    std::atomic<VoiceBufferItem*> mNext{nullptr}; /* Next queue entry, null at the end. */
+
+    CallbackType mCallback{nullptr}; /* Called with mUserData to fetch more bytes into mSamples. */
+    void *mUserData{nullptr};
+
+    uint mBlockAlign{0u};
+    uint mSampleLen{0u}; /* In samples; the mixer moves past this item once the position reaches it. */
+    uint mLoopStart{0u}; /* Loop points in samples, used by mix() for looping static voices. */
+    uint mLoopEnd{0u};
+
+    al::byte *mSamples{nullptr}; /* Sample storage, indexed by byte offset for callback voices. */
+};
+
+
+struct VoiceProps { /* Plain set of property values; Voice::mProps holds one. */
+    float Pitch;
+    float Gain;
+    float OuterGain;
+    float MinGain;
+    float MaxGain;
+    float InnerAngle;
+    float OuterAngle;
+    float RefDistance;
+    float MaxDistance;
+    float RolloffFactor;
+    std::array<float,3> Position;
+    std::array<float,3> Velocity;
+    std::array<float,3> Direction;
+    std::array<float,3> OrientAt;
+    std::array<float,3> OrientUp;
+    bool HeadRelative;
+    DistanceModel mDistanceModel;
+    Resampler mResampler;
+    DirectMode DirectChannels;
+    SpatializeMode mSpatializeMode;
+
+    bool DryGainHFAuto;
+    bool WetGainAuto;
+    bool WetGainHFAuto;
+    float OuterGainHF;
+
+    float AirAbsorptionFactor;
+    float RoomRolloffFactor;
+    float DopplerFactor;
+
+    std::array<float,2> StereoPan;
+
+    float Radius;
+    float EnhWidth;
+
+    /** Direct filter and auxiliary send info. */
+    struct {
+        float Gain;
+        float GainHF;
+        float HFReference;
+        float GainLF;
+        float LFReference;
+    } Direct;
+    struct SendData { /* Same filter fields as Direct, plus the target effect slot. */
+        EffectSlot *Slot;
+        float Gain;
+        float GainHF;
+        float HFReference;
+        float GainLF;
+        float LFReference;
+    } Send[MAX_SENDS]; /* One entry per auxiliary send. */
+};
+
+struct VoicePropsItem : public VoiceProps { /* A VoiceProps with a link to another item. */
+    std::atomic<VoicePropsItem*> next{nullptr}; /* Null when nothing is linked after this. */
+
+    DEF_NEWDEL(VoicePropsItem)
+};
+
+enum : uint { /* Bit positions within Voice::mFlags. */
+    VoiceIsStatic,
+    VoiceIsCallback, /* Sample data is pulled through VoiceBufferItem::mCallback. */
+    VoiceIsAmbisonic, /* Set by Voice::prepare() when per-channel band scaling applies. */
+    VoiceCallbackStopped, /* The callback returned an error or fewer bytes than asked for. */
+    VoiceIsFading, /* Set at the end of a mix; enables gain fading on later mixes. */
+    VoiceHasHrtf,
+    VoiceHasNfc,
+
+    VoiceFlagCount /* Number of flags above; sizes the Voice::mFlags bitset. */
+};
+
+struct Voice { /* State the mixer keeps for one source being played. */
+    enum State {
+        Stopped,
+        Playing,
+        Stopping, /* Gets one more mix() to fade out, which then stores Stopped. */
+        Pending
+    };
+
+    std::atomic<VoicePropsItem*> mUpdate{nullptr};
+
+    VoiceProps mProps;
+
+    std::atomic<uint> mSourceID{0u}; /* Reset to 0 by mix() once the voice runs out of buffers. */
+    std::atomic<State> mPlayState{Stopped};
+    std::atomic<bool> mPendingChange{false};
+
+    /**
+     * Source offset in samples, relative to the currently playing buffer, NOT
+     * the whole queue.
+     */
+    std::atomic<int> mPosition{0};
+    /** Fractional (fixed-point) offset to the next sample. */
+    std::atomic<uint> mPositionFrac{0u};
+
+    /* Current buffer queue item being played. */
+    std::atomic<VoiceBufferItem*> mCurrentBuffer{nullptr};
+
+    /* Buffer queue item to loop to at end of queue (will be NULL for non-
+     * looping voices).
+     */
+    std::atomic<VoiceBufferItem*> mLoopBuffer{nullptr};
+
+    std::chrono::nanoseconds mStartTime{};
+
+    /* Properties for the attached buffer(s). Zeroed until a buffer is set. */
+    FmtChannels mFmtChannels{};
+    FmtType mFmtType{};
+    uint mFrequency{};
+    uint mFrameStep{}; /**< In steps of the sample type size. */
+    uint mBytesPerBlock{}; /**< Or for PCM formats, BytesPerFrame. */
+    uint mSamplesPerBlock{}; /**< Always 1 for PCM formats. */
+    AmbiLayout mAmbiLayout{};
+    AmbiScaling mAmbiScaling{};
+    uint mAmbiOrder{};
+
+    std::unique_ptr<DecoderBase> mDecoder; /* UHJ decoder picked by prepare(), else null. */
+    uint mDecoderPadding{}; /* The picked decoder's sInputPadding, else 0. */
+
+    /** Current target parameters used for mixing. */
+    uint mStep{0}; /* prepare() zeroes this so the mixer skips the voice until updated. */
+
+    ResamplerFunc mResampler;
+
+    InterpState mResampleState;
+
+    std::bitset<VoiceFlagCount> mFlags{};
+    uint mNumCallbackBlocks{0}; /* Blocks currently held in the callback buffer's mSamples. */
+    uint mCallbackBlockBase{0}; /* Block index that the start of mSamples corresponds to. */
+
+    struct TargetData {
+        int FilterType; /* Passed to DoFilters() by mix(). */
+        al::span<FloatBufferLine> Buffer; /* Output lines; mix() skips a send whose span is empty. */
+    };
+    TargetData mDirect;
+    std::array<TargetData,MAX_SENDS> mSend;
+
+    /* The first MaxResamplerPadding/2 elements are the sample history from the
+     * previous mix, with an additional MaxResamplerPadding/2 elements that are
+     * now current (which may be overwritten if the buffer data is still
+     * available).
+     */
+    using HistoryLine = std::array<float,MaxResamplerPadding>;
+    al::vector<HistoryLine,16> mPrevSamples{2};
+
+    struct ChannelData {
+        float mAmbiHFScale, mAmbiLFScale; /* Band scales given to mAmbiSplitter.processScale(). */
+        BandSplitter mAmbiSplitter;
+
+        DirectParams mDryParams;
+        std::array<SendParams,MAX_SENDS> mWetParams;
+    };
+    al::vector<ChannelData> mChans{2}; /* One entry per mixed channel; resized by prepare(). */
+
+    Voice() = default;
+    ~Voice() = default;
+
+    Voice(const Voice&) = delete;
+    Voice& operator=(const Voice&) = delete;
+    /* Loads, resamples, filters and mixes the voice's samples, then advances its position. */
+    void mix(const State vstate, ContextBase *Context, const std::chrono::nanoseconds deviceTime,
+        const uint SamplesToDo);
+    /* Sizes the per-channel state, picks a UHJ decoder, and resets the mixing parameters. */
+    void prepare(DeviceBase *device);
+
+    static void InitMixer(al::optional<std::string> resampler);
+
+    DEF_NEWDEL(Voice)
+};
+
+extern Resampler ResamplerDefault; /* Declaration only; the definition is not in this header. */
+
+#endif /* CORE_VOICE_H */
diff --git a/core/voice_change.h b/core/voice_change.h
new file mode 100644
index 00000000..ddc6186f
--- /dev/null
+++ b/core/voice_change.h
@@ -0,0 +1,31 @@
+#ifndef VOICE_CHANGE_H
+#define VOICE_CHANGE_H
+
+#include <atomic>
+
+#include "almalloc.h"
+
+struct Voice;
+
+using uint = unsigned int;
+
+
+enum class VChangeState { /* Stored in VoiceChange::mState. */
+    Reset, /* First enumerator, so also the value-initialized default. */
+    Stop,
+    Play,
+    Pause,
+    Restart
+};
+struct VoiceChange { /* Record of a state change involving up to two voices and a source ID. */
+    Voice *mOldVoice{nullptr};
+    Voice *mVoice{nullptr};
+    uint mSourceID{0};
+    VChangeState mState{}; /* Value-initialized, i.e. VChangeState::Reset. */
+
+    std::atomic<VoiceChange*> mNext{nullptr}; /* Link to another VoiceChange, null if none. */
+
+    DEF_NEWDEL(VoiceChange)
+};
+
+#endif /* VOICE_CHANGE_H */