5 files changed, 49 insertions, 37 deletions
diff --git a/alc/effects/convolution.cpp b/alc/effects/convolution.cpp
index f655cf89..26ef6fd9 100644
--- a/alc/effects/convolution.cpp
+++ b/alc/effects/convolution.cpp
@@ -124,7 +124,7 @@ constexpr float Deg2Rad(float x) noexcept
 { return static_cast<float>(al::numbers::pi / 180.0 * x); }
 
 
-using complex_d = std::complex<double>;
+using complex_f = std::complex<float>;
 
 constexpr size_t ConvolveUpdateSize{256};
 constexpr size_t ConvolveUpdateSamples{ConvolveUpdateSize / 2};
@@ -187,7 +187,7 @@ struct ConvolutionState final : public EffectState {
     al::vector<std::array<float,ConvolveUpdateSamples>,16> mFilter;
     al::vector<std::array<float,ConvolveUpdateSamples*2>,16> mOutput;
 
-    alignas(16) std::array<complex_d,ConvolveUpdateSize> mFftBuffer{};
+    alignas(16) std::array<complex_f,ConvolveUpdateSize> mFftBuffer{};
 
     size_t mCurrentSegment{0};
     size_t mNumConvolveSegs{0};
@@ -201,7 +201,7 @@ struct ConvolutionState final : public EffectState {
     };
     using ChannelDataArray = al::FlexArray<ChannelData>;
     std::unique_ptr<ChannelDataArray> mChans;
-    std::unique_ptr<complex_d[]> mComplexData;
+    std::unique_ptr<complex_f[]> mComplexData;
 
 
     ConvolutionState() = default;
@@ -249,7 +249,7 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const Buffer &buff
     mInput.fill(0.0f);
     decltype(mFilter){}.swap(mFilter);
     decltype(mOutput){}.swap(mOutput);
-    mFftBuffer.fill(complex_d{});
+    mFftBuffer.fill(complex_f{});
 
     mCurrentSegment = 0;
     mNumConvolveSegs = 0;
@@ -296,8 +296,8 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const Buffer &buff
     mNumConvolveSegs = maxz(mNumConvolveSegs, 2) - 1;
 
     const size_t complex_length{mNumConvolveSegs * m * (numChannels+1)};
-    mComplexData = std::make_unique<complex_d[]>(complex_length);
-    std::fill_n(mComplexData.get(), complex_length, complex_d{});
+    mComplexData = std::make_unique<complex_f[]>(complex_length);
+    std::fill_n(mComplexData.get(), complex_length, complex_f{});
 
     mChannels = buffer.storage->mChannels;
     mAmbiLayout = buffer.storage->mAmbiLayout;
@@ -305,7 +305,8 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const Buffer &buff
     mAmbiOrder = minu(buffer.storage->mAmbiOrder, MaxConvolveAmbiOrder);
 
     auto srcsamples = std::make_unique<double[]>(maxz(buffer.storage->mSampleLen, resampledCount));
-    complex_d *filteriter = mComplexData.get() + mNumConvolveSegs*m;
+    complex_f *filteriter = mComplexData.get() + mNumConvolveSegs*m;
+    auto fftbuffer = std::vector<std::complex<double>>(ConvolveUpdateSize);
     for(size_t c{0};c < numChannels;++c)
     {
         /* Load the samples from the buffer, and resample to match the device. */
@@ -322,17 +323,17 @@ void ConvolutionState::deviceUpdate(const DeviceBase *device, const Buffer &buff
         std::transform(srcsamples.get(), srcsamples.get()+first_size, mFilter[c].rbegin(),
             [](const double d) noexcept -> float { return static_cast<float>(d); });
 
         size_t done{first_size};
         for(size_t s{0};s < mNumConvolveSegs;++s)
         {
             const size_t todo{minz(resampledCount-done, ConvolveUpdateSamples)};
 
-            auto iter = std::copy_n(&srcsamples[done], todo, mFftBuffer.begin());
+            auto iter = std::copy_n(&srcsamples[done], todo, fftbuffer.begin());
             done += todo;
-            std::fill(iter, mFftBuffer.end(), complex_d{});
+            std::fill(iter, fftbuffer.end(), std::complex<double>{});
 
-            forward_fft(mFftBuffer);
-            filteriter = std::copy_n(mFftBuffer.cbegin(), m, filteriter);
+            forward_fft<double>(fftbuffer);
+            filteriter = std::copy_n(fftbuffer.cbegin(), m, filteriter);
         }
     }
 }
@@ -537,20 +538,20 @@ void ConvolutionState::process(const size_t samplesToDo,
          * frequency bins to the FFT history.
          */
         auto fftiter = std::copy_n(mInput.cbegin(), ConvolveUpdateSamples, mFftBuffer.begin());
-        std::fill(fftiter, mFftBuffer.end(), complex_d{});
-        forward_fft(mFftBuffer);
+        std::fill(fftiter, mFftBuffer.end(), complex_f{});
+        forward_fft<float>(mFftBuffer);
 
         std::copy_n(mFftBuffer.cbegin(), m, &mComplexData[curseg*m]);
 
-        const complex_d *RESTRICT filter{mComplexData.get() + mNumConvolveSegs*m};
+        const complex_f *RESTRICT filter{mComplexData.get() + mNumConvolveSegs*m};
         for(size_t c{0};c < chans.size();++c)
         {
-            std::fill_n(mFftBuffer.begin(), m, complex_d{});
+            std::fill_n(mFftBuffer.begin(), m, complex_f{});
 
             /* Convolve each input segment with its IR filter counterpart
              * (aligned in time).
              */
-            const complex_d *RESTRICT input{&mComplexData[curseg*m]};
+            const complex_f *RESTRICT input{&mComplexData[curseg*m]};
             for(size_t s{curseg};s < mNumConvolveSegs;++s)
             {
                 for(size_t i{0};i < m;++i,++input,++filter)
@@ -574,19 +575,17 @@ void ConvolutionState::process(const size_t samplesToDo,
              * second-half samples (and this output's second half is
              * subsequently saved for next time).
              */
-            inverse_fft(mFftBuffer);
+            inverse_fft<float>(mFftBuffer);
 
             /* The iFFT'd response is scaled up by the number of bins, so apply
              * the inverse to normalize the output.
              */
             for(size_t i{0};i < ConvolveUpdateSamples;++i)
                 mOutput[c][i] =
-                    static_cast<float>(mFftBuffer[i].real() * (1.0/double{ConvolveUpdateSize})) +
-                    mOutput[c][ConvolveUpdateSamples+i];
+                    (mFftBuffer[i].real()+mOutput[c][ConvolveUpdateSamples+i]) *
+                    (1.0f/float{ConvolveUpdateSize});
             for(size_t i{0};i < ConvolveUpdateSamples;++i)
-                mOutput[c][ConvolveUpdateSamples+i] =
-                    static_cast<float>(mFftBuffer[ConvolveUpdateSamples+i].real() *
-                        (1.0/double{ConvolveUpdateSize}));
+                mOutput[c][ConvolveUpdateSamples+i] = mFftBuffer[ConvolveUpdateSamples+i].real();
         }
 
         /* Shift the input history. */
diff --git a/alc/effects/pshifter.cpp b/alc/effects/pshifter.cpp
index b1f6d859..f8409292 100644
--- a/alc/effects/pshifter.cpp
+++ b/alc/effects/pshifter.cpp
@@ -184,7 +184,7 @@ void PshifterState::process(const size_t samplesToDo, const al::span<const Float
             mFftBuffer[k] = mFIFO[src] * HannWindow[k];
         for(size_t src{0u}, k{STFT_SIZE-mPos};src < mPos;++src,++k)
             mFftBuffer[k] = mFIFO[src] * HannWindow[k];
-        forward_fft(mFftBuffer);
+        forward_fft<double>(mFftBuffer);
 
         /* Analyze the obtained data. Since the real FFT is symmetric, only
          * STFT_HALF_SIZE+1 samples are needed.
@@ -243,7 +243,7 @@ void PshifterState::process(const size_t samplesToDo, const al::span<const Float
         /* Apply an inverse FFT to get the time-domain siganl, and accumulate
          * for the output with windowing.
          */
-        inverse_fft(mFftBuffer);
+        inverse_fft<double>(mFftBuffer);
         for(size_t dst{mPos}, k{0u};dst < STFT_SIZE;++dst,++k)
             mOutputAccum[dst] += HannWindow[k]*mFftBuffer[k].real() * (4.0/OVERSAMP/STFT_SIZE);
         for(size_t dst{0u}, k{STFT_SIZE-mPos};dst < mPos;++dst,++k)
diff --git a/common/alcomplex.cpp b/common/alcomplex.cpp
index eae47227..c08ac751 100644
--- a/common/alcomplex.cpp
+++ b/common/alcomplex.cpp
@@ -91,7 +91,9 @@ constexpr std::array<al::span<const ushort2>,11> gBitReverses{{
 
 } // namespace
 
-void complex_fft(const al::span<std::complex<double>> buffer, const double sign)
+template<typename Real>
+std::enable_if_t<std::is_floating_point<Real>::value>
+complex_fft(const al::span<std::complex<Real>> buffer, const Real sign)
 {
     const size_t fftsize{buffer.size()};
     /* Get the number of bits used for indexing. Simplifies bit-reversal and
@@ -118,21 +120,21 @@ void complex_fft(const al::span<std::complex<double>> buffer, const double sign)
         std::swap(buffer[rev.first], buffer[rev.second]);
 
     /* Iterative form of Danielson-Lanczos lemma */
-    const double pi{al::numbers::pi * sign};
+    const Real pi{al::numbers::pi_v<Real> * sign};
     size_t step2{1u};
     for(size_t i{0};i < log2_size;++i)
     {
-        const double arg{pi / static_cast<double>(step2)};
+        const Real arg{pi / static_cast<Real>(step2)};
 
         /* TODO: Would std::polar(1.0, arg) be any better? */
-        const std::complex<double> w{std::cos(arg), std::sin(arg)};
-        std::complex<double> u{1.0, 0.0};
+        const std::complex<Real> w{std::cos(arg), std::sin(arg)};
+        std::complex<Real> u{1.0, 0.0};
         const size_t step{step2 << 1};
         for(size_t j{0};j < step2;j++)
         {
             for(size_t k{j};k < fftsize;k+=step)
             {
-                std::complex<double> temp{buffer[k+step2] * u};
+                std::complex<Real> temp{buffer[k+step2] * u};
                 buffer[k+step2] = buffer[k] - temp;
                 buffer[k] += temp;
             }
@@ -163,3 +165,7 @@ void complex_hilbert(const al::span<std::complex<double>> buffer)
 
     forward_fft(buffer);
 }
+
+
+template void complex_fft<>(const al::span<std::complex<float>> buffer, const float sign);
+template void complex_fft<>(const al::span<std::complex<double>> buffer, const double sign);
diff --git a/common/alcomplex.h b/common/alcomplex.h
index 23b8114a..46dbb5bc 100644
--- a/common/alcomplex.h
+++ b/common/alcomplex.h
@@ -2,6 +2,7 @@
 #define ALCOMPLEX_H
 
 #include <complex>
+#include <type_traits>
 
 #include "alspan.h"
 
@@ -10,21 +11,27 @@
  * FFT and 1 is inverse FFT. Applies the Discrete Fourier Transform (DFT) to
  * the data supplied in the buffer, which MUST BE power of two.
  */
-void complex_fft(const al::span<std::complex<double>> buffer, const double sign);
+template<typename Real>
+std::enable_if_t<std::is_floating_point<Real>::value>
+complex_fft(const al::span<std::complex<Real>> buffer, const Real sign);
 
 /**
  * Calculate the frequency-domain response of the time-domain signal in the
  * provided buffer, which MUST BE power of two.
  */
-inline void forward_fft(const al::span<std::complex<double>> buffer)
-{ complex_fft(buffer, -1.0); }
+template<typename Real>
+std::enable_if_t<std::is_floating_point<Real>::value>
+forward_fft(const al::span<std::complex<Real>> buffer)
+{ complex_fft(buffer, Real{-1}); }
 
 /**
  * Calculate the time-domain signal of the frequency-domain response in the
  * provided buffer, which MUST BE power of two.
  */
-inline void inverse_fft(const al::span<std::complex<double>> buffer)
-{ complex_fft(buffer, 1.0); }
+template<typename Real>
+std::enable_if_t<std::is_floating_point<Real>::value>
+inverse_fft(const al::span<std::complex<Real>> buffer)
+{ complex_fft(buffer, Real{1}); }
 
 /**
  * Calculate the complex helical sequence (discrete-time analytical signal) of
diff --git a/common/phase_shifter.h b/common/phase_shifter.h
index 83e07c7a..6cfdf053 100644
--- a/common/phase_shifter.h
+++ b/common/phase_shifter.h
@@ -53,12 +53,12 @@ struct PhaseShifterT {
         std::fill_n(fftBuffer.get(), fft_size, complex_d{});
         fftBuffer[half_size] = 1.0;
 
-        forward_fft({fftBuffer.get(), fft_size});
+        forward_fft<double>({fftBuffer.get(), fft_size});
         for(size_t i{0};i < half_size+1;++i)
             fftBuffer[i] = complex_d{-fftBuffer[i].imag(), fftBuffer[i].real()};
         for(size_t i{half_size+1};i < fft_size;++i)
             fftBuffer[i] = std::conj(fftBuffer[fft_size - i]);
-        inverse_fft({fftBuffer.get(), fft_size});
+        inverse_fft<double>({fftBuffer.get(), fft_size});
 
         auto fftiter = fftBuffer.get() + half_size + (FilterSize/2 - 1);
         for(float &coeff : mCoeffs)