alc/uhjfilter.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184


#include "config.h"

#include "uhjfilter.h"

#ifdef HAVE_SSE_INTRINSICS
#include <xmmintrin.h>
#endif

#include <algorithm>
#include <iterator>

#include "AL/al.h"

#include "alcomplex.h"
#include "alnumeric.h"
#include "opthelpers.h"


namespace {

using complex_d = std::complex<double>;

std::array<float,Uhj2Encoder::sFilterSize> GenerateFilter()
{
    /* Some notes on this filter construction.
     *
     * An impulse in the frequency domain is represented by a continuous series
     * of +1,-1 values, with a 0 imaginary term. Consequently, that impulse
     * with a +90 degree phase offset would be represented by 0s with imaginary
     * terms that alternate between +1,-1. Converting that to the time domain
     * results in a FIR filter that can be convolved with the incoming signal
     * to apply a wide-band 90-degree phase shift.
     *
     * A particularly notable aspect of the time-domain filter response is that
     * every other coefficient is 0. This allows doubling the effective size of
     * the filter, by only storing the non-0 coefficients and double-stepping
     * over the input to apply it.
     *
     * Additionally, the resulting filter is independent of the sample rate.
     * The same filter can be applied regardless of the device's sample rate
     * and achieve the same effect, although a lower rate allows the filter to
     * cover more time and improve the results.
     */
    constexpr complex_d c0{0.0,  1.0};
    constexpr complex_d c1{0.0, -1.0};
    constexpr size_t half_size{32768};

    /* Generate a frequency domain impulse with a +90 degree phase offset. Keep
     * the mirrored frequencies clear for converting to the time domain.
     */
    auto fftBuffer = std::vector<complex_d>(half_size*2, complex_d{});
    for(size_t i{0};i < half_size;i += 2)
    {
        fftBuffer[i  ] = c0;
        fftBuffer[i+1] = c1;
    }
    fftBuffer[half_size] = c0;
    complex_fft(fftBuffer, 1.0);

    /* Reverse and truncate the filter to a usable size, and store only the
     * non-0 terms. Should this be windowed?
     */
    std::array<float,Uhj2Encoder::sFilterSize> ret;
    auto fftiter = fftBuffer.data() + half_size + (Uhj2Encoder::sFilterSize-1);
    for(float &coeff : ret)
    {
        coeff = static_cast<float>(fftiter->real() / (half_size+1));
        fftiter -= 2;
    }
    return ret;
}
const auto PShiftCoeffs = GenerateFilter();


void allpass_process(al::span<float> dst, const float *RESTRICT src)
{
    for(float &output : dst)
    {
#ifdef HAVE_SSE_INTRINSICS
        constexpr size_t todo{PShiftCoeffs.size()>>2};
        __m128 r4{_mm_setzero_ps()};
        for(size_t i{0};i < todo;i+=4)
        {
            const __m128 coeffs{_mm_load_ps(&PShiftCoeffs[i])};
            /* NOTE: This could alternatively be done with two unaligned loads
             * and a shuffle. Which would be better?
             */
            const __m128 s{_mm_setr_ps(src[i*2], src[i*2 + 2], src[i*2 + 4], src[i*2 + 6])};
            r4 = _mm_add_ps(r4, _mm_mul_ps(s, coeffs));
        }
        r4 = _mm_add_ps(r4, _mm_shuffle_ps(r4, r4, _MM_SHUFFLE(0, 1, 2, 3)));
        r4 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
        float ret{_mm_cvtss_f32(r4)};
#else
        float ret{0.0f};
        for(size_t i{0};i < PShiftCoeffs.size();++i)
            ret += src[i*2] * PShiftCoeffs[i];
#endif
        output += ret;
        ++src;
    }
}

} // namespace


/* NOTE: There seems to be a bit of an inconsistency in how this encoding is
 * supposed to work. Some references, such as
 *
 * http://members.tripod.com/martin_leese/Ambisonic/UHJ_file_format.html
 *
 * specify a pre-scaling of sqrt(2) on the W channel input, while other
 * references, such as
 *
 * https://en.wikipedia.org/wiki/Ambisonic_UHJ_format#Encoding.5B1.5D
 * and
 * https://wiki.xiph.org/Ambisonics#UHJ_format
 *
 * do not. The sqrt(2) scaling is in line with B-Format decoder coefficients
 * which include such a scaling for the W channel input, however the original
 * source for this equation is a 1985 paper by Michael Gerzon, which does not
 * apparently include the scaling. Applying the extra scaling creates a louder
 * result with a narrower stereo image compared to not scaling, and I don't
 * know which is the intended result.
 */

void Uhj2Encoder::encode(FloatBufferLine &LeftOut, FloatBufferLine &RightOut,
    const FloatBufferLine *InSamples, const size_t SamplesToDo)
{
    ASSUME(SamplesToDo > 0);

    const float *RESTRICT winput{al::assume_aligned<16>(InSamples[0].data())};
    const float *RESTRICT xinput{al::assume_aligned<16>(InSamples[1].data())};
    const float *RESTRICT yinput{al::assume_aligned<16>(InSamples[2].data())};

    /* S = 0.9396926*W + 0.1855740*X */
    std::transform(winput, winput+SamplesToDo, xinput, mMid.begin(),
        [](const float w, const float x) noexcept -> float
        { return 0.9396926f*w + 0.1855740f*x; });

    /* D = 0.6554516*Y */
    std::transform(yinput, yinput+SamplesToDo, mSide.begin(),
        [](const float y) noexcept -> float { return 0.6554516f*y; });

    /* Apply a delay to the non-filtered signal to align with the filter delay. */
    if LIKELY(SamplesToDo >= sFilterSize)
    {
        auto buffer_end = mMid.begin() + SamplesToDo;
        auto delay_end = std::rotate(mMid.begin(), buffer_end - sFilterSize, buffer_end);
        std::swap_ranges(mMid.begin(), delay_end, mMidDelay.begin());

        buffer_end = mSide.begin() + SamplesToDo;
        delay_end = std::rotate(mSide.begin(), buffer_end - sFilterSize, buffer_end);
        std::swap_ranges(mSide.begin(), delay_end, mSideDelay.begin());
    }
    else
    {
        auto buffer_end = mMid.begin() + SamplesToDo;
        auto delay_start = std::swap_ranges(mMid.begin(), buffer_end, mMidDelay.begin());
        std::rotate(mMidDelay.begin(), delay_start, mMidDelay.end());

        buffer_end = mSide.begin() + SamplesToDo;
        delay_start = std::swap_ranges(mSide.begin(), buffer_end, mSideDelay.begin());
        std::rotate(mSideDelay.begin(), delay_start, mSideDelay.end());
    }

    /* D += j(-0.3420201*W + 0.5098604*X) */
    auto tmpiter = std::copy(mSideHistory.cbegin(), mSideHistory.cend(), mTemp.begin());
    std::transform(winput, winput+SamplesToDo, xinput, tmpiter,
        [](const float w, const float x) noexcept -> float
        { return -0.3420201f*w + 0.5098604f*x; });
    std::copy_n(mTemp.cbegin()+SamplesToDo, mSideHistory.size(), mSideHistory.begin());
    allpass_process({mSide.data(), SamplesToDo}, mTemp.data());

    /* Left = (S + D)/2.0 */
    float *RESTRICT left{al::assume_aligned<16>(LeftOut.data())};
    for(size_t i{0};i < SamplesToDo;i++)
        left[i] += (mMid[i] + mSide[i]) * 0.5f;
    /* Right = (S - D)/2.0 */
    float *RESTRICT right{al::assume_aligned<16>(RightOut.data())};
    for(size_t i{0};i < SamplesToDo;i++)
        right[i] += (mMid[i] - mSide[i]) * 0.5f;
}