common/pffft.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239

/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )

   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
   authored by Dr Paul Swarztrauber of NCAR, in 1985.

   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.

   FFTPACK license:

   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html

   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.

   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:

   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.

   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.

   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
*/

/* PFFFT : a Pretty Fast FFT.
 *
 * This is basically an adaptation of the single precision fftpack (v4) as
 * found on netlib taking advantage of SIMD instructions found on CPUs such as
 * Intel x86 (SSE1), PowerPC (Altivec), and Arm (NEON).
 *
 * For architectures where SIMD instructions aren't available, the code falls
 * back to a scalar version.
 *
 * Restrictions:
 *
 * - 1D transforms only, with 32-bit single precision.
 *
 * - supports only transforms for inputs of length N of the form
 * N=(2^a)*(3^b)*(5^c), given a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, 144,
 * 160, etc are all acceptable lengths). Performance is best for 128<=N<=8192.
 *
 * - all (float*) pointers for the functions below are expected to have a
 * "SIMD-compatible" alignment, that is 16 bytes.
 *
 * You can allocate such buffers with the pffft_aligned_malloc function, and
 * deallocate them with pffft_aligned_free (or with stuff like posix_memalign,
 * aligned_alloc, etc).
 *
 * Note that for the z-domain data of real transforms, when in the canonical
 * order (as interleaved complex numbers) both 0-frequency and half-frequency
 * components, which are real, are assembled in the first entry as
 * F(0)+i*F(n/2+1). The original fftpack placed F(n/2+1) at the end of the
 * arrays instead.
 */

#ifndef PFFFT_H
#define PFFFT_H

#include <stddef.h> // for size_t
#include <stdint.h>

#ifdef __cplusplus
#include <cstddef>
#include <utility>

extern "C" {
#endif

/* opaque struct holding internal stuff (precomputed twiddle factors) this
 * struct can be shared by many threads as it contains only read-only data.
 */
struct PFFFT_Setup;

/* direction of the transform */
enum pffft_direction_t { PFFFT_FORWARD, PFFFT_BACKWARD };

/* type of transform */
enum pffft_transform_t { PFFFT_REAL, PFFFT_COMPLEX };

#ifndef __cplusplus
typedef struct PFFFT_Setup PFFFT_Setup;
typedef enum pffft_direction_t pffft_direction_t;
typedef enum pffft_transform_t pffft_transform_t;
#endif

/**
 * Prepare for performing transforms of size N -- the returned PFFFT_Setup
 * structure is read-only so it can safely be shared by multiple concurrent
 * threads.
 */
[[gnu::malloc]]
PFFFT_Setup *pffft_new_setup(unsigned int N, pffft_transform_t transform);
void pffft_destroy_setup(PFFFT_Setup *setup) noexcept;

/**
 * Perform a Fourier transform. The z-domain data is stored in the most
 * efficient order for transforming back or using for convolution, and as
 * such, there's no guarantee to the order of the values. If you need to have
 * its content sorted in the usual way, that is as an array of interleaved
 * complex numbers, either use pffft_transform_ordered, or call pffft_zreorder
 * after the forward fft and before the backward fft.
 *
 * Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. Typically
 * you will want to scale the backward transform by 1/N.
 *
 * The 'work' pointer must point to an area of N (2*N for complex fft) floats,
 * properly aligned. It cannot be NULL.
 *
 * The input and output parameters may alias.
 */
void pffft_transform(const PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);

/**
 * Similar to pffft_transform, but handles the complex values in the usual form
 * (interleaved complex numbers). This is similar to calling
 * pffft_transform(..., PFFFT_FORWARD) followed by
 * pffft_zreorder(..., PFFFT_FORWARD), or
 * pffft_zreorder(..., PFFFT_BACKWARD) followed by
 * pffft_transform(..., PFFFT_BACKWARD), for the given direction.
 *
 * The input and output parameters may alias.
 */
void pffft_transform_ordered(const PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);

/**
 * Reorder the z-domain data. For PFFFT_FORWARD, it reorders from the internal
 * representation to the "canonical" order (as interleaved complex numbers).
 * For PFFFT_BACKWARD, it reorders from the canonical order to the internal
 * order suitable for pffft_transform(..., PFFFT_BACKWARD) or
 * pffft_zconvolve_accumulate.
 *
 * The input and output parameters should not alias.
 */
void pffft_zreorder(const PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);

/**
 * Perform a multiplication of the z-domain data in dft_a and dft_b, and scale
 * and accumulate into dft_ab. The arrays should have been obtained with
 * pffft_transform(..., PFFFT_FORWARD) or pffft_zreorder(..., PFFFT_BACKWARD)
 * and should *not* be in the usual order (otherwise just perform the operation
 * yourself as the dft coeffs are stored as interleaved complex numbers).
 *
 * The operation performed is: dft_ab += (dft_a * dft_b)*scaling
 *
 * The dft_a, dft_b, and dft_ab parameters may alias.
 */
void pffft_zconvolve_scale_accumulate(const PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);

/**
 * Perform a multiplication of the z-domain data in dft_a and dft_b, and
 * accumulate into dft_ab.
 *
 * The operation performed is: dft_ab += dft_a * dft_b
 *
 * The dft_a, dft_b, and dft_ab parameters may alias.
 */
void pffft_zconvolve_accumulate(const PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab);

/**
 * The float buffers must have the correct alignment (16-byte boundary on intel
 * and powerpc). This function may be used to obtain such correctly aligned
 * buffers.
 */
[[gnu::alloc_size(1), gnu::malloc]]
void *pffft_aligned_malloc(size_t nb_bytes);
void pffft_aligned_free(void *ptr) noexcept;

/* Return 4 or 1 depending if vectorization was enable when building pffft.cpp. */
int pffft_simd_size() noexcept;

#ifdef __cplusplus
}

struct PFFFTSetup {
    PFFFT_Setup *mSetup{};

    PFFFTSetup() = default;
    PFFFTSetup(const PFFFTSetup&) = delete;
    PFFFTSetup(PFFFTSetup&& rhs) noexcept : mSetup{rhs.mSetup} { rhs.mSetup = nullptr; }
    explicit PFFFTSetup(std::nullptr_t) { }
    explicit PFFFTSetup(unsigned int n, pffft_transform_t transform)
        : mSetup{pffft_new_setup(n, transform)}
    { }
    ~PFFFTSetup() { if(mSetup) pffft_destroy_setup(mSetup); }

    PFFFTSetup& operator=(const PFFFTSetup&) = delete;
    PFFFTSetup& operator=(PFFFTSetup&& rhs) noexcept
    {
        if(mSetup)
            pffft_destroy_setup(mSetup);
        mSetup = std::exchange(rhs.mSetup, nullptr);
        return *this;
    }

    void transform(const float *input, float *output, float *work, pffft_direction_t direction) const
    { pffft_transform(mSetup, input, output, work, direction); }

    void transform_ordered(const float *input, float *output, float *work,
        pffft_direction_t direction) const
    { pffft_transform_ordered(mSetup, input, output, work, direction); }

    void zreorder(const float *input, float *output, pffft_direction_t direction) const
    { pffft_zreorder(mSetup, input, output, direction); }

    void zconvolve_scale_accumulate(const float *dft_a, const float *dft_b, float *dft_ab,
        float scaling) const
    { pffft_zconvolve_scale_accumulate(mSetup, dft_a, dft_b, dft_ab, scaling); }

    void zconvolve_accumulate(const float *dft_a, const float *dft_b, float *dft_ab) const
    { pffft_zconvolve_accumulate(mSetup, dft_a, dft_b, dft_ab); }

    [[nodiscard]] operator bool() const noexcept { return mSetup != nullptr; }
};
#endif

#endif // PFFFT_H