diff options
-rw-r--r-- | common/pffft.cpp | 8 | ||||
-rw-r--r-- | common/pffft.h | 176 |
2 files changed, 93 insertions, 91 deletions
diff --git a/common/pffft.cpp b/common/pffft.cpp index 146afef5..0c2dd940 100644 --- a/common/pffft.cpp +++ b/common/pffft.cpp @@ -79,8 +79,7 @@ #endif -/* - * vector support macros: the rest of the code is independant of +/* Vector support macros: the rest of the code is independent of * SSE/Altivec/NEON -- adding support for other platforms with 4-element * vectors should be limited to these macros */ @@ -248,7 +247,7 @@ typedef float v4sf; #define VALIGNED(ptr) ((reinterpret_cast<uintptr_t>(ptr) & 0x3) == 0) #endif -// shortcuts for complex multiplcations +// shortcuts for complex multiplications #define VCPLXMUL(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMADD(ai,br,tmp); } while(0) #define VCPLXMULCONJ(ar,ai,br,bi) do { v4sf tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VMADD(ai,bi,ar); ai=VSUB(VMUL(ai,br),tmp); } while(0) #ifndef SVMUL @@ -1866,6 +1865,8 @@ void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *fo void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + const int Ncvec = s->Ncvec; const v4sf *RESTRICT va = reinterpret_cast<const v4sf*>(a); const v4sf *RESTRICT vb = reinterpret_cast<const v4sf*>(b); @@ -1892,7 +1893,6 @@ void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, #ifndef ZCONVOLVE_USING_INLINE_ASM const v4sf vscal = LD_PS1(scaling); #endif - assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); float ar1 = reinterpret_cast<const v4sf_union*>(va)[0].f[0]; float ai1 = reinterpret_cast<const v4sf_union*>(va)[1].f[0]; float br1 = reinterpret_cast<const v4sf_union*>(vb)[0].f[0]; diff --git a/common/pffft.h b/common/pffft.h index d9dfa808..87d10216 100644 --- a/common/pffft.h +++ b/common/pffft.h @@ -45,34 +45,36 @@ SOFTWARE. */ -/* - PFFFT : a Pretty Fast FFT. - - This is basically an adaptation of the single precision fftpack - (v4) as found on netlib taking advantage of SIMD instruction found - on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON). - - For architectures where no SIMD instruction is available, the code - falls back to a scalar version. - - Restrictions: - - - 1D transforms only, with 32-bit single precision. - - - supports only transforms for inputs of length N of the form - N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, - 144, 160, etc are all acceptable lengths). Performance is best for - 128<=N<=8192. - - - all (float*) pointers in the functions below are expected to - have an "simd-compatible" alignment, that is 16 bytes on x86 and - powerpc CPUs. - - You can allocate such buffers with the functions - pffft_aligned_malloc / pffft_aligned_free (or with stuff like - posix_memalign..) - -*/ +/* PFFFT : a Pretty Fast FFT. + * + * This is basically an adaptation of the single precision fftpack (v4) as + * found on netlib taking advantage of SIMD instructions found on CPUs such as + * Intel x86 (SSE1), PowerPC (Altivec), and Arm (NEON). + * + * For architectures where SIMD instructions aren't available, the code falls + * back to a scalar version. + * + * Restrictions: + * + * - 1D transforms only, with 32-bit single precision. + * + * - supports only transforms for inputs of length N of the form + * N=(2^a)*(3^b)*(5^c), given a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, 144, + * 160, etc are all acceptable lengths). Performance is best for 128<=N<=8192. + * + * - all (float*) pointers for the functions below are expected to have a + * "SIMD-compatible" alignment, that is 16 bytes. + * + * You can allocate such buffers with the pffft_aligned_malloc function, and + * deallocate them with pffft_aligned_free (or with stuff like posix_memalign, + * aligned_alloc, etc). + * + * Note that for the z-domain data of real transforms, when in the canonical + * order (as interleaved complex numbers) both 0-frequency and half-frequency + * components, which are real, are assembled in the first entry as + * F(0)+i*F(n/2+1). The original fftpack placed F(n/2+1) at the end of the + * arrays instead. + */ #ifndef PFFFT_H #define PFFFT_H @@ -100,77 +102,77 @@ typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t; #endif -/* - prepare for performing transforms of size N -- the returned - PFFFT_Setup structure is read-only so it can safely be shared by - multiple concurrent threads. -*/ +/** + * Prepare for performing transforms of size N -- the returned PFFFT_Setup + * structure is read-only so it can safely be shared by multiple concurrent + * threads. + */ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); -void pffft_destroy_setup(PFFFT_Setup *); -/* - Perform a Fourier transform , The z-domain data is stored in the - most efficient order for transforming it back, or using it for - convolution. If you need to have its content sorted in the - "usual" way, that is as an array of interleaved complex numbers, - either use pffft_transform_ordered , or call pffft_zreorder after - the forward fft, and before the backward fft. - - Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. - Typically you will want to scale the backward transform by 1/N. - - The 'work' pointer must point to an area of N (2*N for complex - fft) floats, properly aligned. It cannot be NULL. - - input and output may alias. -*/ +void pffft_destroy_setup(PFFFT_Setup *setup); + +/** + * Perform a Fourier transform. The z-domain data is stored in the most + * efficient order for transforming back or using for convolution, and as + * such, there's no guarantee to the order of the values. If you need to have + * its content sorted in the usual way, that is as an array of interleaved + * complex numbers, either use pffft_transform_ordered, or call pffft_zreorder + * after the forward fft and before the backward fft. + * + * Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x. Typically + * you will want to scale the backward transform by 1/N. + * + * The 'work' pointer must point to an area of N (2*N for complex fft) floats, + * properly aligned. It cannot be NULL. + * + * The input and output parameters may alias. + */ void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); -/* - Similar to pffft_transform, but makes sure that the output is - ordered as expected (interleaved complex numbers). This is - similar to calling pffft_transform and then pffft_zreorder. - - input and output may alias. -*/ +/** + * Similar to pffft_transform, but handles the complex values in the usual form + * (interleaved complex numbers). This is similar to calling + * pffft_transform(..., PFFFT_FORWARD) followed by + * pffft_zreorder(..., PFFFT_FORWARD), or + * pffft_zreorder(..., PFFFT_BACKWARD) followed by + * pffft_transform(..., PFFFT_BACKWARD), for the given direction. + * + * The input and output parameters may alias. + */ void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); -/* - call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., - PFFFT_FORWARD) if you want to have the frequency components in - the correct "canonical" order, as interleaved complex numbers. - - (for real transforms, both 0-frequency and half frequency - components, which are real, are assembled in the first entry as - F(0)+i*F(n/2+1). Note that the original fftpack did place - F(n/2+1) at the end of the arrays). - - input and output should not alias. -*/ +/** + * Reorder the z-domain data. For PFFFT_FORWARD, it reorders from the internal + * representation to the "canonical" order (as interleaved complex numbers). + * For PFFFT_BACKWARD, it reorders from the canonical order to the internal + * order suitable for pffft_transform(..., PFFFT_BACKWARD) or + * pffft_zconvolve_accumulate. + * + * The input and output parameters should not alias. + */ void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); -/* - Perform a multiplication of the frequency components of dft_a and - dft_b and accumulate them into dft_ab. The arrays should have - been obtained with pffft_transform(.., PFFFT_FORWARD) and should - *not* have been reordered with pffft_zreorder (otherwise just - perform the operation yourself as the dft coefs are stored as - interleaved complex numbers). - - the operation performed is: dft_ab += (dft_a * fdt_b)*scaling - - The dft_a, dft_b and dft_ab pointers may alias. -*/ +/** + * Perform a multiplication of the z-domain data in dft_a and dft_b and + * accumulate them into dft_ab. The arrays should have been obtained with + * pffft_transform(..., PFFFT_FORWARD) or pffft_zreorder(..., PFFFT_BACKWARD) + * and should *not* be in the usual order (otherwise just perform the operation + * yourself as the dft coeffs are stored as interleaved complex numbers). + * + * The operation performed is: dft_ab += (dft_a * dft_b)*scaling + * + * The dft_a, dft_b, and dft_ab parameters may alias. + */ void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); -/* - the float buffers must have the correct alignment (16-byte boundary - on intel and powerpc). This function may be used to obtain such - correctly aligned buffers. -*/ +/** + * The float buffers must have the correct alignment (16-byte boundary on intel + * and powerpc). This function may be used to obtain such correctly aligned + * buffers. + */ void *pffft_aligned_malloc(size_t nb_bytes); void pffft_aligned_free(void *); -/* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */ +/* Return 4 or 1 depending if vectorization was enable when building pffft.cpp. */ int pffft_simd_size(); #ifdef __cplusplus |