aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChris Robinson <[email protected]>2023-10-06 00:58:21 -0700
committerChris Robinson <[email protected]>2023-10-06 00:58:21 -0700
commitf1bf7a42ac66124b6f6dc5e01897a5f052519df3 (patch)
tree79c618159131741bdcb1edb11a9770ecc8c1321d
parent6a70a30ca1ca1f0d0397ef9e6a9817b0f6cba079 (diff)
Use PFFFT for the convolution effect
-rw-r--r--alc/effects/convolution.cpp132
1 files changed, 92 insertions, 40 deletions
diff --git a/alc/effects/convolution.cpp b/alc/effects/convolution.cpp
index f46422d4..c7a342dc 100644
--- a/alc/effects/convolution.cpp
+++ b/alc/effects/convolution.cpp
@@ -34,6 +34,7 @@
#include "core/fmt_traits.h"
#include "core/mixer.h"
#include "intrusive_ptr.h"
+#include "pffft.h"
#include "polyphase_resampler.h"
#include "vector.h"
@@ -45,6 +46,10 @@ namespace {
* each segment has an FFT applied with a 256-sample buffer (the latter half
* left silent) to get its frequency-domain response. The resulting response
* has its positive/non-mirrored frequencies saved (129 bins) in each segment.
+ * Note that since the 0th and 129th bins are real for a real signal, their
+ * imaginary components are always 0 and can be dropped, allowing their real
+ * components to be combined so only 128 complex values are stored for the 129
+ * bins.
*
* Input samples are similarly broken up into 128-sample segments, with an FFT
* applied to each new incoming segment to get its 129 bins. A history of FFT'd
@@ -52,8 +57,8 @@ namespace {
*
* To apply the reverberation, each impulse response segment is convolved with
* its paired input segment (using complex multiplies, far cheaper than FIRs),
- * accumulating into a 256-bin FFT buffer. The input history is then shifted to
- * align with later impulse response segments for next time.
+ * accumulating into a 129-bin FFT buffer. The input history is then shifted to
+ * align with later impulse response segments for the next input segment.
*
* An inverse FFT is then applied to the accumulated FFT buffer to get a 256-
* sample time-domain response for output, which is split in two halves. The
@@ -183,6 +188,35 @@ void apply_fir(al::span<float> dst, const float *RESTRICT src, const float *REST
#endif
}
+
+template<typename T>
+struct AlignedDeleter { };
+
+template<typename T>
+struct AlignedDeleter<T[]> {
+ static_assert(std::is_trivially_destructible_v<T>);
+ using type = T;
+
+ void operator()(T *ptr) { al_free(ptr); }
+};
+template<typename T>
+using AlignedUPtr = std::unique_ptr<T,AlignedDeleter<T>>;
+
+template<typename T, size_t A>
+auto MakeAlignedPtr(size_t count) -> AlignedUPtr<T>
+{
+ using Type = typename AlignedDeleter<T>::type;
+ void *ptr{al_calloc(A, sizeof(Type)*count)};
+ return AlignedUPtr<T>{::new(ptr) Type[count]};
+}
+
+
+struct PFFFTSetupDeleter {
+ void operator()(PFFFT_Setup *ptr) { pffft_destroy_setup(ptr); }
+};
+using PFFFTSetupPtr = std::unique_ptr<PFFFT_Setup,PFFFTSetupDeleter>;
+
+
struct ConvolutionState final : public EffectState {
FmtChannels mChannels{};
AmbiLayout mAmbiLayout{};
@@ -194,7 +228,9 @@ struct ConvolutionState final : public EffectState {
al::vector<std::array<float,ConvolveUpdateSamples>,16> mFilter;
al::vector<std::array<float,ConvolveUpdateSamples*2>,16> mOutput;
- alignas(16) std::array<complex_f,ConvolveUpdateSize> mFftBuffer{};
+ PFFFTSetupPtr mFft{};
+ alignas(16) std::array<float,ConvolveUpdateSize> mFftBuffer{};
+ alignas(16) std::array<float,ConvolveUpdateSize> mFftWorkBuffer{};
size_t mCurrentSegment{0};
size_t mNumConvolveSegs{0};
@@ -208,7 +244,7 @@ struct ConvolutionState final : public EffectState {
};
using ChannelDataArray = al::FlexArray<ChannelData>;
std::unique_ptr<ChannelDataArray> mChans;
- std::unique_ptr<complex_f[]> mComplexData;
+ AlignedUPtr<float[]> mComplexData;
ConvolutionState() = default;
@@ -253,13 +289,17 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const BufferStorag
using UhjDecoderType = UhjDecoder<512>;
static constexpr auto DecoderPadding = UhjDecoderType::sInputPadding;
- constexpr uint MaxConvolveAmbiOrder{1u};
+ static constexpr uint MaxConvolveAmbiOrder{1u};
+
+ if(!mFft)
+ mFft = PFFFTSetupPtr{pffft_new_setup(ConvolveUpdateSize, PFFFT_REAL)};
mFifoPos = 0;
mInput.fill(0.0f);
decltype(mFilter){}.swap(mFilter);
decltype(mOutput){}.swap(mOutput);
- mFftBuffer.fill(complex_f{});
+ mFftBuffer.fill(0.0f);
+ mFftWorkBuffer.fill(0.0f);
mCurrentSegment = 0;
mNumConvolveSegs = 0;
@@ -275,7 +315,6 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const BufferStorag
mAmbiScaling = IsUHJ(mChannels) ? AmbiScaling::UHJ : buffer->mAmbiScaling;
mAmbiOrder = minu(buffer->mAmbiOrder, MaxConvolveAmbiOrder);
- constexpr size_t m{ConvolveUpdateSize/2 + 1};
const auto bytesPerSample = BytesFromFmt(buffer->mType);
const auto realChannels = buffer->channelsFromFmt();
const auto numChannels = (mChannels == FmtUHJ2) ? 3u : ChannelsFromFmt(mChannels, mAmbiOrder);
@@ -309,9 +348,9 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const BufferStorag
mNumConvolveSegs = (resampledCount+(ConvolveUpdateSamples-1)) / ConvolveUpdateSamples;
mNumConvolveSegs = maxz(mNumConvolveSegs, 2) - 1;
- const size_t complex_length{mNumConvolveSegs * m * (numChannels+1)};
- mComplexData = std::make_unique<complex_f[]>(complex_length);
- std::fill_n(mComplexData.get(), complex_length, complex_f{});
+ const size_t complex_length{mNumConvolveSegs * ConvolveUpdateSize * (numChannels+1)};
+ mComplexData = MakeAlignedPtr<float[],16>(complex_length);
+ std::fill_n(mComplexData.get(), complex_length, 0.0f);
/* Load the samples from the buffer. */
const size_t srclinelength{RoundUp(buffer->mSampleLen+DecoderPadding, 16)};
@@ -332,7 +371,10 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const BufferStorag
auto ressamples = std::make_unique<double[]>(buffer->mSampleLen +
(resampler ? resampledCount : 0));
- complex_f *filteriter = mComplexData.get() + mNumConvolveSegs*m;
+ auto ffttmp = al::vector<float,16>(ConvolveUpdateSize);
+ auto fftbuffer = std::vector<std::complex<double>>(ConvolveUpdateSize);
+
+ float *filteriter = mComplexData.get() + mNumConvolveSegs*ConvolveUpdateSize;
for(size_t c{0};c < numChannels;++c)
{
/* Resample to match the device. */
@@ -353,18 +395,34 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const BufferStorag
std::transform(ressamples.get(), ressamples.get()+first_size, mFilter[c].rbegin(),
[](const double d) noexcept -> float { return static_cast<float>(d); });
- auto fftbuffer = std::vector<std::complex<double>>(ConvolveUpdateSize);
size_t done{first_size};
for(size_t s{0};s < mNumConvolveSegs;++s)
{
const size_t todo{minz(resampledCount-done, ConvolveUpdateSamples)};
+ /* Apply a double-precision forward FFT for more precise frequency
+ * measurements.
+ */
auto iter = std::copy_n(&ressamples[done], todo, fftbuffer.begin());
done += todo;
std::fill(iter, fftbuffer.end(), std::complex<double>{});
-
forward_fft(al::span{fftbuffer});
- filteriter = std::copy_n(fftbuffer.cbegin(), m, filteriter);
+
+ /* Convert to, and pack in, a float buffer for PFFFT. Note that the
+ * first bin stores the real component of the half-frequency bin in
+ * the imaginary component.
+ */
+ for(size_t i{0};i < ConvolveUpdateSamples;++i)
+ {
+ ffttmp[i*2 ] = static_cast<float>(fftbuffer[i].real());
+ ffttmp[i*2 + 1] = static_cast<float>((i == 0) ?
+ fftbuffer[ConvolveUpdateSamples+1].real() : fftbuffer[i].imag());
+ }
+ /* Reorder backward to make it suitable for pffft_zconvolve and the
+ * subsequent pffft_transform(..., PFFFT_BACKWARD).
+ */
+ pffft_zreorder(mFft.get(), ffttmp.data(), al::to_address(filteriter), PFFFT_BACKWARD);
+ filteriter += ConvolveUpdateSize;
}
}
}
@@ -554,7 +612,6 @@ void ConvolutionState::process(const size_t samplesToDo,
if(mNumConvolveSegs < 1) UNLIKELY
return;
- constexpr size_t m{ConvolveUpdateSize/2 + 1};
size_t curseg{mCurrentSegment};
auto &chans = *mChans;
@@ -591,54 +648,49 @@ void ConvolutionState::process(const size_t samplesToDo,
* frequency bins to the FFT history.
*/
auto fftiter = std::copy_n(mInput.cbegin(), ConvolveUpdateSamples, mFftBuffer.begin());
- std::fill(fftiter, mFftBuffer.end(), complex_f{});
- forward_fft(al::span{mFftBuffer});
+ std::fill(fftiter, mFftBuffer.end(), 0.0f);
+ pffft_transform(mFft.get(), mFftBuffer.data(),
+ mComplexData.get() + curseg*ConvolveUpdateSize, mFftWorkBuffer.data(), PFFFT_FORWARD);
- std::copy_n(mFftBuffer.cbegin(), m, &mComplexData[curseg*m]);
-
- const complex_f *RESTRICT filter{mComplexData.get() + mNumConvolveSegs*m};
+ const float *RESTRICT filter{mComplexData.get() + mNumConvolveSegs*ConvolveUpdateSize};
for(size_t c{0};c < chans.size();++c)
{
- std::fill_n(mFftBuffer.begin(), m, complex_f{});
+ /* The iFFT'd response is scaled up by the number of bins, so apply
+ * the inverse to normalize the output.
+ */
+ static constexpr float fftscale{1.0f / float{ConvolveUpdateSize}};
/* Convolve each input segment with its IR filter counterpart
* (aligned in time).
*/
- const complex_f *RESTRICT input{&mComplexData[curseg*m]};
+ mFftBuffer.fill(0.0f);
+ const float *RESTRICT input{&mComplexData[curseg*ConvolveUpdateSize]};
for(size_t s{curseg};s < mNumConvolveSegs;++s)
{
- for(size_t i{0};i < m;++i,++input,++filter)
- mFftBuffer[i] += *input * *filter;
+ pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data(), fftscale);
+ input += ConvolveUpdateSize;
+ filter += ConvolveUpdateSize;
}
input = mComplexData.get();
for(size_t s{0};s < curseg;++s)
{
- for(size_t i{0};i < m;++i,++input,++filter)
- mFftBuffer[i] += *input * *filter;
+ pffft_zconvolve_accumulate(mFft.get(), input, filter, mFftBuffer.data(), fftscale);
+ input += ConvolveUpdateSize;
+ filter += ConvolveUpdateSize;
}
- /* Reconstruct the mirrored/negative frequencies to do a proper
- * inverse FFT.
- */
- for(size_t i{m};i < ConvolveUpdateSize;++i)
- mFftBuffer[i] = std::conj(mFftBuffer[ConvolveUpdateSize-i]);
-
/* Apply iFFT to get the 256 (really 255) samples for output. The
* 128 output samples are combined with the last output's 127
* second-half samples (and this output's second half is
* subsequently saved for next time).
*/
- inverse_fft(al::span{mFftBuffer});
+ pffft_transform(mFft.get(), mFftBuffer.data(), mFftBuffer.data(),
+ mFftWorkBuffer.data(), PFFFT_BACKWARD);
- /* The iFFT'd response is scaled up by the number of bins, so apply
- * the inverse to normalize the output.
- */
for(size_t i{0};i < ConvolveUpdateSamples;++i)
- mOutput[c][i] =
- (mFftBuffer[i].real()+mOutput[c][ConvolveUpdateSamples+i]) *
- (1.0f/float{ConvolveUpdateSize});
+ mOutput[c][i] = mFftBuffer[i] + mOutput[c][ConvolveUpdateSamples+i];
for(size_t i{0};i < ConvolveUpdateSamples;++i)
- mOutput[c][ConvolveUpdateSamples+i] = mFftBuffer[ConvolveUpdateSamples+i].real();
+ mOutput[c][ConvolveUpdateSamples+i] = mFftBuffer[ConvolveUpdateSamples+i];
}
/* Shift the input history. */