1/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
2
3 Based on original fortran 77 code from FFTPACKv4 from NETLIB
4 (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
5 of NCAR, in 1985.
6
7 As confirmed by the NCAR fftpack software curators, the following
8 FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
9 released under the same terms.
10
11 FFTPACK license:
12
13 http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
14
15 Copyright (c) 2004 the University Corporation for Atmospheric
16 Research ("UCAR"). All rights reserved. Developed by NCAR's
17 Computational and Information Systems Laboratory, UCAR,
18 www.cisl.ucar.edu.
19
20 Redistribution and use of the Software in source and binary forms,
21 with or without modification, is permitted provided that the
22 following conditions are met:
23
24 - Neither the names of NCAR's Computational and Information Systems
25 Laboratory, the University Corporation for Atmospheric Research,
26 nor the names of its sponsors or contributors may be used to
27 endorse or promote products derived from this Software without
28 specific prior written permission.
29
30 - Redistributions of source code must retain the above copyright
31 notices, this list of conditions, and the disclaimer below.
32
33 - Redistributions in binary form must reproduce the above copyright
34 notice, this list of conditions, and the disclaimer below in the
35 documentation and/or other materials provided with the
36 distribution.
37
38 THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
39 EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
40 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41 NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
42 HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
43 EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
44 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
45 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
46 SOFTWARE.
47
48
   PFFFT : a Pretty Fast FFT.

   This file is largely based on the original FFTPACK implementation, modified in
   order to take advantage of the SIMD instructions of modern CPUs.
*/
54
55/*
56 ChangeLog:
57 - 2011/10/02, version 1: This is the very first release of this file.
58*/
59
60#ifndef _USE_MATH_DEFINES
# define _USE_MATH_DEFINES // gently ask MSVC to define M_PI, M_SQRT2, etc.
62#endif
63
64#include "pffft.h"
65#include <stdlib.h>
66#include <stdio.h>
67#include <math.h>
68#include <assert.h>
69
70/* detect compiler flavour */
71#if defined(_MSC_VER)
72# define COMPILER_MSVC
73#elif defined(__GNUC__)
74# define COMPILER_GCC
75#endif
76
77#if defined(COMPILER_GCC)
78# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
79# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
80# define RESTRICT __restrict
81# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
82#elif defined(COMPILER_MSVC)
83# define ALWAYS_INLINE(return_type) __forceinline return_type
84# define NEVER_INLINE(return_type) __declspec(noinline) return_type
85# define RESTRICT __restrict
86# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
87#endif
88
89
/*
  vector support macros: the rest of the code is independent of
  SSE/Altivec/NEON -- adding support for other platforms with 4-element
  vectors should be limited to these macros
*/
95
96
97// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
98//#define PFFFT_SIMD_DISABLE
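// For example (illustrative only -- any equivalent compiler invocation works), the
// macro can also be set from the build line instead of editing this file:
//   cc -DPFFFT_SIMD_DISABLE -c pffft.c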
99
100/* select which SIMD intrinsics will be used */
101#if !defined(PFFFT_SIMD_DISABLE)
102# if (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \
103 && (defined(__VEC__) || defined(__ALTIVEC__))
104# define PFFFT_SIMD_ALTIVEC
105# elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64) \
106 || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__wasm_simd128__)
107 // we test _M_ARM64EC before _M_X64 because when _M_ARM64EC is defined, the microsoft compiler also defines _M_X64
108# define PFFFT_SIMD_NEON
109# elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
110# define PFFFT_SIMD_SSE
111# endif
112#endif // PFFFT_SIMD_DISABLE
113
114/*
115 Altivec support macros
116*/
117#ifdef PFFFT_SIMD_ALTIVEC
118#include <altivec.h>
119typedef vector float v4sf;
120# define SIMD_SZ 4
121# define VZERO() ((vector float) vec_splat_u8(0))
122# define VMUL(a,b) vec_madd(a,b, VZERO())
123# define VADD(a,b) vec_add(a,b)
124# define VMADD(a,b,c) vec_madd(a,b,c)
125# define VSUB(a,b) vec_sub(a,b)
126inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
127# define LD_PS1(p) ld_ps1(&p)
128# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
129# define UNINTERLEAVE2(in1, in2, out1, out2) { \
130 vector unsigned char vperm1 = (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \
131 vector unsigned char vperm2 = (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \
132 v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
133 }
134# define VTRANSPOSE4(x0,x1,x2,x3) { \
135 v4sf y0 = vec_mergeh(x0, x2); \
136 v4sf y1 = vec_mergel(x0, x2); \
137 v4sf y2 = vec_mergeh(x1, x3); \
138 v4sf y3 = vec_mergel(x1, x3); \
139 x0 = vec_mergeh(y0, y2); \
140 x1 = vec_mergel(y0, y2); \
141 x2 = vec_mergeh(y1, y3); \
142 x3 = vec_mergel(y1, y3); \
143 }
144# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
145# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
146
147/*
148 SSE1 support macros
149*/
150#elif defined(PFFFT_SIMD_SSE)
151
152#include <xmmintrin.h>
153typedef __m128 v4sf;
# define SIMD_SZ 4 // 4 floats per SIMD vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway, so you will have some work to do if you want to enable AVX with its 256-bit vectors.
155# define VZERO() _mm_setzero_ps()
156# define VMUL(a,b) _mm_mul_ps(a,b)
157# define VADD(a,b) _mm_add_ps(a,b)
158# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
159# define VSUB(a,b) _mm_sub_ps(a,b)
160# define LD_PS1(p) _mm_set1_ps(p)
161# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
162# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
163# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
164# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
165# define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
166
167/*
168 ARM NEON support macros
169*/
170#elif defined(PFFFT_SIMD_NEON)
171# include <arm_neon.h>
172typedef float32x4_t v4sf;
173# define SIMD_SZ 4
174# define VZERO() vdupq_n_f32(0)
175# define VMUL(a,b) vmulq_f32(a,b)
176# define VADD(a,b) vaddq_f32(a,b)
177# define VMADD(a,b,c) vmlaq_f32(c,a,b)
178# define VSUB(a,b) vsubq_f32(a,b)
179# define LD_PS1(p) vld1q_dup_f32(&(p))
180# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
181# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
182# define VTRANSPOSE4(x0,x1,x2,x3) { \
183 float32x4x2_t t0_ = vzipq_f32(x0, x2); \
184 float32x4x2_t t1_ = vzipq_f32(x1, x3); \
185 float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
186 float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
187 x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
188 }
189// marginally faster version
190//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
191# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
192# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
193#else
194# if !defined(PFFFT_SIMD_DISABLE)
195# warning "building with simd disabled !\n";
196# define PFFFT_SIMD_DISABLE // fallback to scalar code
197# endif
198#endif
199
// fallback mode for situations where SSE/Altivec/NEON are not available: use scalar code instead
201#ifdef PFFFT_SIMD_DISABLE
202typedef float v4sf;
203# define SIMD_SZ 1
204# define VZERO() 0.f
205# define VMUL(a,b) ((a)*(b))
206# define VADD(a,b) ((a)+(b))
207# define VMADD(a,b,c) ((a)*(b)+(c))
208# define VSUB(a,b) ((a)-(b))
209# define LD_PS1(p) (p)
210# define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0)
211#endif
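// Note: in this scalar fallback SIMD_SZ == 1, a "v4sf" is a single float, and the
// macros above reduce to plain float arithmetic -- the kernels below then process
// one float per "vector", which makes this mode a convenient reference when porting
// the vector macros to a new SIMD instruction set.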
212
// shortcuts for complex multiplications
214#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
215#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
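// VCPLXMUL computes the in-place product (ar + i*ai) *= (br + i*bi), i.e.
//   ar' = ar*br - ai*bi,  ai' = ai*br + ar*bi,
// while VCPLXMULCONJ multiplies by the conjugate (br - i*bi):
//   ar' = ar*br + ai*bi,  ai' = ai*br - ar*bi.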
216#ifndef SVMUL
217// multiply a scalar with a vector
218#define SVMUL(f,v) VMUL(LD_PS1(f),v)
219#endif
220
221#if !defined(PFFFT_SIMD_DISABLE)
222typedef union v4sf_union {
223 v4sf v;
224 float f[4];
225} v4sf_union;
226
227#include <string.h>
228
229#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
230
231/* detect bugs with the vector support macros */
232void validate_pffft_simd(void) {
233 float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
234 v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0; u = a1; t.v = VZERO();
  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v,a0.v);
  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);

  t.v=LD_PS1(f[15]);
  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
266 assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
267}
268#else
void validate_pffft_simd(void) {} // allow test_pffft.c to call this function even when simd is not available.
270#endif //!PFFFT_SIMD_DISABLE
271
272/* SSE and co like 16-bytes aligned pointers */
273#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
274void *pffft_aligned_malloc(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
276 if (!p0) return (void *) 0;
277 p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
278 *((void **) p - 1) = p0;
279 return p;
280}
281
282void pffft_aligned_free(void *p) {
  if (p) free(*((void **) p - 1));
284}
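/* Illustrative usage sketch (not part of the library): buffers handed to the SIMD
   code paths must satisfy VALIGNED(); pffft_aligned_malloc guarantees that by
   over-allocating and stashing the original malloc() pointer just before the
   returned address, which pffft_aligned_free later retrieves:

     float *buf = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     assert(VALIGNED(buf));   // 64-byte aligned, hence also 16-byte aligned
     // ... use buf ...
     pffft_aligned_free(buf); // do NOT pass buf to plain free()
*/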
285
286int pffft_simd_size(void) { return SIMD_SZ; }
287
288/*
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
290*/
291static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
292 int k, i;
293 int l1ido = l1*ido;
294 if (ido <= 2) {
295 for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
296 ch[0] = VADD(cc[0], cc[ido+0]);
297 ch[l1ido] = VSUB(cc[0], cc[ido+0]);
298 ch[1] = VADD(cc[1], cc[ido+1]);
299 ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
300 }
301 } else {
302 for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
303 for (i=0; i<ido-1; i+=2) {
304 v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
305 v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
306 v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
307 ch[i] = VADD(cc[i+0], cc[i+ido+0]);
308 ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
309 VCPLXMUL(tr2, ti2, wr, wi);
310 ch[i+l1ido] = tr2;
311 ch[i+l1ido+1] = ti2;
312 }
313 }
314 }
315}
316
317/*
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
319*/
320static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
321 const float *wa1, const float *wa2, float fsign) {
322 static const float taur = -0.5f;
323 float taui = 0.866025403784439f*fsign;
324 int i, k;
325 v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
326 int l1ido = l1*ido;
327 float wr1, wi1, wr2, wi2;
328 assert(ido > 2);
329 for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
330 for (i=0; i<ido-1; i+=2) {
331 tr2 = VADD(cc[i+ido], cc[i+2*ido]);
332 cr2 = VADD(cc[i], SVMUL(taur,tr2));
333 ch[i] = VADD(cc[i], tr2);
334 ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
335 ci2 = VADD(cc[i +1], SVMUL(taur,ti2));
336 ch[i+1] = VADD(cc[i+1], ti2);
337 cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
338 ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
339 dr2 = VSUB(cr2, ci3);
340 dr3 = VADD(cr2, ci3);
341 di2 = VADD(ci2, cr3);
342 di3 = VSUB(ci2, cr3);
343 wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
344 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
345 ch[i+l1ido] = dr2;
346 ch[i+l1ido + 1] = di2;
347 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
348 ch[i+2*l1ido] = dr3;
349 ch[i+2*l1ido+1] = di3;
350 }
351 }
352} /* passf3 */
353
354static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
355 const float *wa1, const float *wa2, const float *wa3, float fsign) {
356 /* isign == -1 for forward transform and +1 for backward transform */
357
358 int i, k;
359 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
360 int l1ido = l1*ido;
361 if (ido == 2) {
362 for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
363 tr1 = VSUB(cc[0], cc[2*ido + 0]);
364 tr2 = VADD(cc[0], cc[2*ido + 0]);
365 ti1 = VSUB(cc[1], cc[2*ido + 1]);
366 ti2 = VADD(cc[1], cc[2*ido + 1]);
367 ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
368 tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
369 tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
370 ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
371
372 ch[0*l1ido + 0] = VADD(tr2, tr3);
373 ch[0*l1ido + 1] = VADD(ti2, ti3);
374 ch[1*l1ido + 0] = VADD(tr1, tr4);
375 ch[1*l1ido + 1] = VADD(ti1, ti4);
376 ch[2*l1ido + 0] = VSUB(tr2, tr3);
377 ch[2*l1ido + 1] = VSUB(ti2, ti3);
378 ch[3*l1ido + 0] = VSUB(tr1, tr4);
379 ch[3*l1ido + 1] = VSUB(ti1, ti4);
380 }
381 } else {
382 for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
383 for (i=0; i<ido-1; i+=2) {
384 float wr1, wi1, wr2, wi2, wr3, wi3;
385 tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
386 tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
387 ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
388 ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
389 tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
390 ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
391 tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
392 ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
393
394 ch[i] = VADD(tr2, tr3);
395 cr3 = VSUB(tr2, tr3);
396 ch[i + 1] = VADD(ti2, ti3);
397 ci3 = VSUB(ti2, ti3);
398
399 cr2 = VADD(tr1, tr4);
400 cr4 = VSUB(tr1, tr4);
401 ci2 = VADD(ti1, ti4);
402 ci4 = VSUB(ti1, ti4);
403 wr1=wa1[i]; wi1=fsign*wa1[i+1];
404 VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
405 wr2=wa2[i]; wi2=fsign*wa2[i+1];
406 ch[i + l1ido] = cr2;
407 ch[i + l1ido + 1] = ci2;
408
409 VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
410 wr3=wa3[i]; wi3=fsign*wa3[i+1];
411 ch[i + 2*l1ido] = cr3;
412 ch[i + 2*l1ido + 1] = ci3;
413
414 VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
415 ch[i + 3*l1ido] = cr4;
416 ch[i + 3*l1ido + 1] = ci4;
417 }
418 }
419 }
420} /* passf4 */
421
422/*
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
424*/
425static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
426 const float *wa1, const float *wa2,
427 const float *wa3, const float *wa4, float fsign) {
428 static const float tr11 = .309016994374947f;
429 const float ti11 = .951056516295154f*fsign;
430 static const float tr12 = -.809016994374947f;
431 const float ti12 = .587785252292473f*fsign;
432
433 /* Local variables */
434 int i, k;
435 v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
436 ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
437
438 float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
439
440#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
441#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
442
443 assert(ido > 2);
444 for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
445 for (i = 0; i < ido-1; i += 2) {
446 ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5));
447 ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5));
448 ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4));
449 ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4));
450 tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
451 tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
452 tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
453 tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
454 ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
455 ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3));
456 cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
457 ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
458 cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
459 ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
460 cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
461 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
462 cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
463 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
464 dr3 = VSUB(cr3, ci4);
465 dr4 = VADD(cr3, ci4);
466 di3 = VADD(ci3, cr4);
467 di4 = VSUB(ci3, cr4);
468 dr5 = VADD(cr2, ci5);
469 dr2 = VSUB(cr2, ci5);
470 di5 = VSUB(ci2, cr5);
471 di2 = VADD(ci2, cr5);
472 wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
473 wr3=wa3[i]; wi3=fsign*wa3[i+1]; wr4=wa4[i]; wi4=fsign*wa4[i+1];
474 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
475 ch_ref(i - 1, 2) = dr2;
476 ch_ref(i, 2) = di2;
477 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
478 ch_ref(i - 1, 3) = dr3;
479 ch_ref(i, 3) = di3;
480 VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
481 ch_ref(i - 1, 4) = dr4;
482 ch_ref(i, 4) = di4;
483 VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
484 ch_ref(i - 1, 5) = dr5;
485 ch_ref(i, 5) = di5;
486 }
487 }
488#undef ch_ref
489#undef cc_ref
490}
491
492static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
493 static const float minus_one = -1.f;
494 int i, k, l1ido = l1*ido;
495 for (k=0; k < l1ido; k += ido) {
496 v4sf a = cc[k], b = cc[k + l1ido];
497 ch[2*k] = VADD(a, b);
498 ch[2*(k+ido)-1] = VSUB(a, b);
499 }
500 if (ido < 2) return;
501 if (ido != 2) {
502 for (k=0; k < l1ido; k += ido) {
503 for (i=2; i<ido; i+=2) {
504 v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
505 v4sf br = cc[i - 1 + k], bi = cc[i + k];
506 VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
507 ch[i + 2*k] = VADD(bi, ti2);
508 ch[2*(k+ido) - i] = VSUB(ti2, bi);
509 ch[i - 1 + 2*k] = VADD(br, tr2);
510 ch[2*(k+ido) - i -1] = VSUB(br, tr2);
511 }
512 }
513 if (ido % 2 == 1) return;
514 }
515 for (k=0; k < l1ido; k += ido) {
516 ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
517 ch[2*k + ido-1] = cc[k + ido-1];
518 }
519} /* radf2 */
520
521
522static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
523 static const float minus_two=-2;
524 int i, k, l1ido = l1*ido;
525 v4sf a,b,c,d, tr2, ti2;
526 for (k=0; k < l1ido; k += ido) {
527 a = cc[2*k]; b = cc[2*(k+ido) - 1];
528 ch[k] = VADD(a, b);
529 ch[k + l1ido] =VSUB(a, b);
530 }
531 if (ido < 2) return;
532 if (ido != 2) {
533 for (k = 0; k < l1ido; k += ido) {
534 for (i = 2; i < ido; i += 2) {
535 a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
536 c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
537 ch[i-1 + k] = VADD(a, b);
538 tr2 = VSUB(a, b);
539 ch[i+0 + k] = VSUB(c, d);
540 ti2 = VADD(c, d);
541 VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
542 ch[i-1 + k + l1ido] = tr2;
543 ch[i+0 + k + l1ido] = ti2;
544 }
545 }
546 if (ido % 2 == 1) return;
547 }
548 for (k = 0; k < l1ido; k += ido) {
549 a = cc[2*k + ido-1]; b = cc[2*k + ido];
550 ch[k + ido-1] = VADD(a,a);
551 ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
552 }
553} /* radb2 */
554
555static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
556 const float *wa1, const float *wa2) {
557 static const float taur = -0.5f;
558 static const float taui = 0.866025403784439f;
559 int i, k, ic;
560 v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
561 for (k=0; k<l1; k++) {
562 cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
563 ch[3*k*ido] = VADD(cc[k*ido], cr2);
564 ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
565 ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
566 }
567 if (ido == 1) return;
568 for (k=0; k<l1; k++) {
569 for (i=2; i<ido; i+=2) {
570 ic = ido - i;
571 wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
572 dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
573 VCPLXMULCONJ(dr2, di2, wr1, wi1);
574
575 wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
576 dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
577 VCPLXMULCONJ(dr3, di3, wr2, wi2);
578
579 cr2 = VADD(dr2, dr3);
580 ci2 = VADD(di2, di3);
581 ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
582 ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
583 tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
584 ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
585 tr3 = SVMUL(taui, VSUB(di2, di3));
586 ti3 = SVMUL(taui, VSUB(dr3, dr2));
587 ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
588 ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
589 ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
590 ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
591 }
592 }
593} /* radf3 */
594
595
596static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
597 const float *wa1, const float *wa2)
598{
599 static const float taur = -0.5f;
600 static const float taui = 0.866025403784439f;
601 static const float taui_2 = 0.866025403784439f*2;
602 int i, k, ic;
603 v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
604 for (k=0; k<l1; k++) {
605 tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
606 cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
607 ch[k*ido] = VADD(cc[3*k*ido], tr2);
608 ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
609 ch[(k + l1)*ido] = VSUB(cr2, ci3);
610 ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
611 }
612 if (ido == 1) return;
613 for (k=0; k<l1; k++) {
614 for (i=2; i<ido; i+=2) {
615 ic = ido - i;
616 tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
617 cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
618 ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
619 ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
620 ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
621 ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
622 cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
623 ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
624 dr2 = VSUB(cr2, ci3);
625 dr3 = VADD(cr2, ci3);
626 di2 = VADD(ci2, cr3);
627 di3 = VSUB(ci2, cr3);
628 VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
629 ch[i - 1 + (k + l1)*ido] = dr2;
630 ch[i + (k + l1)*ido] = di2;
631 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
632 ch[i - 1 + (k + 2*l1)*ido] = dr3;
633 ch[i + (k + 2*l1)*ido] = di3;
634 }
635 }
636} /* radb3 */
637
638static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
639 const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
640{
641 static const float minus_hsqt2 = (float)-0.7071067811865475;
642 int i, k, l1ido = l1*ido;
643 {
644 const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
645 v4sf * RESTRICT ch_ = ch;
646 while (cc < cc_end) {
647 // this loop represents between 25% and 40% of total radf4_ps cost !
648 v4sf a0 = cc[0], a1 = cc[l1ido];
649 v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
650 v4sf tr1 = VADD(a1, a3);
651 v4sf tr2 = VADD(a0, a2);
652 ch[2*ido-1] = VSUB(a0, a2);
653 ch[2*ido ] = VSUB(a3, a1);
654 ch[0 ] = VADD(tr1, tr2);
655 ch[4*ido-1] = VSUB(tr2, tr1);
656 cc += ido; ch += 4*ido;
657 }
658 cc = cc_; ch = ch_;
659 }
660 if (ido < 2) return;
661 if (ido != 2) {
662 for (k = 0; k < l1ido; k += ido) {
663 const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
664 for (i=2; i<ido; i += 2, pc += 2) {
665 int ic = ido - i;
666 v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
667 v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
668
669 cr2 = pc[1*l1ido+0];
670 ci2 = pc[1*l1ido+1];
671 wr=LD_PS1(wa1[i - 2]);
672 wi=LD_PS1(wa1[i - 1]);
673 VCPLXMULCONJ(cr2,ci2,wr,wi);
674
675 cr3 = pc[2*l1ido+0];
676 ci3 = pc[2*l1ido+1];
677 wr = LD_PS1(wa2[i-2]);
678 wi = LD_PS1(wa2[i-1]);
679 VCPLXMULCONJ(cr3, ci3, wr, wi);
680
681 cr4 = pc[3*l1ido];
682 ci4 = pc[3*l1ido+1];
683 wr = LD_PS1(wa3[i-2]);
684 wi = LD_PS1(wa3[i-1]);
685 VCPLXMULCONJ(cr4, ci4, wr, wi);
686
687 /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
688
689 tr1 = VADD(cr2,cr4);
690 tr4 = VSUB(cr4,cr2);
691 tr2 = VADD(pc[0],cr3);
692 tr3 = VSUB(pc[0],cr3);
693 ch[i - 1 + 4*k] = VADD(tr1,tr2);
694 ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
695 ti1 = VADD(ci2,ci4);
696 ti4 = VSUB(ci2,ci4);
697 ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
698 ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
699 ti2 = VADD(pc[1],ci3);
700 ti3 = VSUB(pc[1],ci3);
701 ch[i + 4*k] = VADD(ti1, ti2);
702 ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
703 ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
704 ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
705 }
706 }
707 if (ido % 2 == 1) return;
708 }
709 for (k=0; k<l1ido; k += ido) {
710 v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
711 v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
712 v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
713 v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
714 ch[ido-1 + 4*k] = VADD(tr1, c);
715 ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
716 ch[4*k + 1*ido] = VSUB(ti1, d);
717 ch[4*k + 3*ido] = VADD(ti1, d);
718 }
719} /* radf4 */
720
721
722static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
723 const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
724{
725 static const float minus_sqrt2 = (float)-1.414213562373095;
726 static const float two = 2.f;
727 int i, k, l1ido = l1*ido;
728 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
729 {
730 const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
731 v4sf *ch_ = ch;
732 while (ch < ch_end) {
733 v4sf a = cc[0], b = cc[4*ido-1];
734 v4sf c = cc[2*ido], d = cc[2*ido-1];
735 tr3 = SVMUL(two,d);
736 tr2 = VADD(a,b);
737 tr1 = VSUB(a,b);
738 tr4 = SVMUL(two,c);
739 ch[0*l1ido] = VADD(tr2, tr3);
740 ch[2*l1ido] = VSUB(tr2, tr3);
741 ch[1*l1ido] = VSUB(tr1, tr4);
742 ch[3*l1ido] = VADD(tr1, tr4);
743
744 cc += 4*ido; ch += ido;
745 }
746 cc = cc_; ch = ch_;
747 }
748 if (ido < 2) return;
749 if (ido != 2) {
750 for (k = 0; k < l1ido; k += ido) {
751 const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
752 v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
753 for (i = 2; i < ido; i += 2) {
754
755 tr1 = VSUB(pc[i], pc[4*ido - i]);
756 tr2 = VADD(pc[i], pc[4*ido - i]);
757 ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
758 tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
759 ph[0] = VADD(tr2, tr3);
760 cr3 = VSUB(tr2, tr3);
761
762 ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
763 tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
764 cr2 = VSUB(tr1, tr4);
765 cr4 = VADD(tr1, tr4);
766
767 ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
768 ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
769
770 ph[1] = VADD(ti2, ti3); ph += l1ido;
771 ci3 = VSUB(ti2, ti3);
772 ci2 = VADD(ti1, ti4);
773 ci4 = VSUB(ti1, ti4);
774 VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
775 ph[0] = cr2;
776 ph[1] = ci2; ph += l1ido;
777 VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
778 ph[0] = cr3;
779 ph[1] = ci3; ph += l1ido;
780 VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
781 ph[0] = cr4;
782 ph[1] = ci4; ph = ph - 3*l1ido + 2;
783 }
784 }
785 if (ido % 2 == 1) return;
786 }
787 for (k=0; k < l1ido; k+=ido) {
788 int i0 = 4*k + ido;
789 v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
790 v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
791 tr1 = VSUB(c,d);
792 tr2 = VADD(c,d);
793 ti1 = VADD(b,a);
794 ti2 = VSUB(b,a);
795 ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
796 ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
797 ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
798 ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
799 }
800} /* radb4 */
801
802static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
803 const float *wa1, const float *wa2, const float *wa3, const float *wa4)
804{
805 static const float tr11 = .309016994374947f;
806 static const float ti11 = .951056516295154f;
807 static const float tr12 = -.809016994374947f;
808 static const float ti12 = .587785252292473f;
809
810 /* System generated locals */
811 int cc_offset, ch_offset;
812
813 /* Local variables */
814 int i, k, ic;
815 v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
816 cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
817 int idp2;
818
819
820#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
821#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
822
823 /* Parameter adjustments */
824 ch_offset = 1 + ido * 6;
825 ch -= ch_offset;
826 cc_offset = 1 + ido * (1 + l1);
827 cc -= cc_offset;
828
829 /* Function Body */
830 for (k = 1; k <= l1; ++k) {
831 cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
832 ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
833 cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
834 ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
835 ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
836 ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
837 ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
838 ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
839 ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
840 //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
841 }
842 if (ido == 1) {
843 return;
844 }
845 idp2 = ido + 2;
846 for (k = 1; k <= l1; ++k) {
847 for (i = 3; i <= ido; i += 2) {
848 ic = idp2 - i;
849 dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
850 dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
851 dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
852 dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
853 VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
854 VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
855 VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
856 VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
857 cr2 = VADD(dr2, dr5);
858 ci5 = VSUB(dr5, dr2);
859 cr5 = VSUB(di2, di5);
860 ci2 = VADD(di2, di5);
861 cr3 = VADD(dr3, dr4);
862 ci4 = VSUB(dr4, dr3);
863 cr4 = VSUB(di3, di4);
864 ci3 = VADD(di3, di4);
865 ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
866 ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
867 tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
868 ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
869 tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
870 ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
871 tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
872 ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
873 tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
874 ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
875 ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
876 ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
877 ch_ref(i, 3, k) = VADD(ti2, ti5);
878 ch_ref(ic, 2, k) = VSUB(ti5, ti2);
879 ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
880 ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
881 ch_ref(i, 5, k) = VADD(ti3, ti4);
882 ch_ref(ic, 4, k) = VSUB(ti4, ti3);
883 }
884 }
885#undef cc_ref
886#undef ch_ref
887} /* radf5 */
888
889static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
890 const float *wa1, const float *wa2, const float *wa3, const float *wa4)
891{
892 static const float tr11 = .309016994374947f;
893 static const float ti11 = .951056516295154f;
894 static const float tr12 = -.809016994374947f;
895 static const float ti12 = .587785252292473f;
896
897 int cc_offset, ch_offset;
898
899 /* Local variables */
900 int i, k, ic;
901 v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
902 ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
903 int idp2;
904
905#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
906#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
907
908 /* Parameter adjustments */
909 ch_offset = 1 + ido * (1 + l1);
910 ch -= ch_offset;
911 cc_offset = 1 + ido * 6;
912 cc -= cc_offset;
913
914 /* Function Body */
915 for (k = 1; k <= l1; ++k) {
916 ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
917 ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
918 tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
919 tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
920 ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
921 cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
922 cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
923 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
924 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
925 ch_ref(1, k, 2) = VSUB(cr2, ci5);
926 ch_ref(1, k, 3) = VSUB(cr3, ci4);
927 ch_ref(1, k, 4) = VADD(cr3, ci4);
928 ch_ref(1, k, 5) = VADD(cr2, ci5);
929 }
930 if (ido == 1) {
931 return;
932 }
933 idp2 = ido + 2;
934 for (k = 1; k <= l1; ++k) {
935 for (i = 3; i <= ido; i += 2) {
936 ic = idp2 - i;
937 ti5 = VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k));
938 ti2 = VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k));
939 ti4 = VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k));
940 ti3 = VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k));
941 tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
942 tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
943 tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
944 tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
945 ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
946 ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
947 cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
948 ci2 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
949 cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
950 ci3 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
951 cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
952 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
953 cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
954 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
955 dr3 = VSUB(cr3, ci4);
956 dr4 = VADD(cr3, ci4);
957 di3 = VADD(ci3, cr4);
958 di4 = VSUB(ci3, cr4);
959 dr5 = VADD(cr2, ci5);
960 dr2 = VSUB(cr2, ci5);
961 di5 = VSUB(ci2, cr5);
962 di2 = VADD(ci2, cr5);
963 VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
964 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
965 VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
966 VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
967
968 ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
969 ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
970 ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
971 ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
972 }
973 }
974#undef cc_ref
975#undef ch_ref
976} /* radb5 */
977
978static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
979 const float *wa, const int *ifac) {
980 v4sf *in = (v4sf*)input_readonly;
981 v4sf *out = (in == work2 ? work1 : work2);
982 int nf = ifac[1], k1;
983 int l2 = n;
984 int iw = n-1;
985 assert(in != out && work1 != work2);
986 for (k1 = 1; k1 <= nf; ++k1) {
987 int kh = nf - k1;
988 int ip = ifac[kh + 2];
989 int l1 = l2 / ip;
990 int ido = n / l2;
991 iw -= (ip - 1)*ido;
992 switch (ip) {
993 case 5: {
994 int ix2 = iw + ido;
995 int ix3 = ix2 + ido;
996 int ix4 = ix3 + ido;
        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
998 } break;
999 case 4: {
1000 int ix2 = iw + ido;
1001 int ix3 = ix2 + ido;
        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
1003 } break;
1004 case 3: {
1005 int ix2 = iw + ido;
        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
1007 } break;
1008 case 2:
        radf2_ps(ido, l1, in, out, &wa[iw]);
1010 break;
1011 default:
1012 assert(0);
1013 break;
1014 }
1015 l2 = l1;
1016 if (out == work2) {
1017 out = work1; in = work2;
1018 } else {
1019 out = work2; in = work1;
1020 }
1021 }
1022 return in; /* this is in fact the output .. */
1023} /* rfftf1 */
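/* Note: because the stages ping-pong between work1 and work2, the final result ends
   up in whichever buffer the last stage wrote to; callers (see pffft_transform_internal
   below) compare the returned pointer against their own buffers to find out which one. */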
1024
1025static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1026 const float *wa, const int *ifac) {
1027 v4sf *in = (v4sf*)input_readonly;
1028 v4sf *out = (in == work2 ? work1 : work2);
1029 int nf = ifac[1], k1;
1030 int l1 = 1;
1031 int iw = 0;
1032 assert(in != out);
1033 for (k1=1; k1<=nf; k1++) {
1034 int ip = ifac[k1 + 1];
1035 int l2 = ip*l1;
1036 int ido = n / l2;
1037 switch (ip) {
1038 case 5: {
1039 int ix2 = iw + ido;
1040 int ix3 = ix2 + ido;
1041 int ix4 = ix3 + ido;
        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
1043 } break;
1044 case 4: {
1045 int ix2 = iw + ido;
1046 int ix3 = ix2 + ido;
        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
1048 } break;
1049 case 3: {
1050 int ix2 = iw + ido;
        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
1052 } break;
1053 case 2:
        radb2_ps(ido, l1, in, out, &wa[iw]);
1055 break;
1056 default:
1057 assert(0);
1058 break;
1059 }
1060 l1 = l2;
1061 iw += (ip - 1)*ido;
1062
1063 if (out == work2) {
1064 out = work1; in = work2;
1065 } else {
1066 out = work2; in = work1;
1067 }
1068 }
1069 return in; /* this is in fact the output .. */
1070}
1071
1072#define IFAC_MAX_SIZE 25 /* max number of integer factors for the decomposition, +2 */
1073static int decompose(int n, int *ifac, const int *ntryh) {
1074 int nl = n, nf = 0, i, j = 0;
1075 for (j=0; ntryh[j]; ++j) {
1076 int ntry = ntryh[j];
1077 while (nl != 1) {
1078 int nq = nl / ntry;
1079 int nr = nl - ntry * nq;
1080 if (nr == 0) {
1081 assert(2 + nf < IFAC_MAX_SIZE);
1082 ifac[2+nf++] = ntry;
1083 nl = nq;
1084 if (ntry == 2 && nf != 1) {
1085 for (i = 2; i <= nf; ++i) {
1086 int ib = nf - i + 2;
1087 ifac[ib + 1] = ifac[ib];
1088 }
1089 ifac[2] = 2;
1090 }
1091 } else break;
1092 }
1093 }
1094 ifac[0] = n;
1095 ifac[1] = nf;
1096 return nf;
1097}
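/* Worked example (hand-traced, assuming the real-transform factor table {4,2,3,5,0}
   used by rffti1_ps below):

     int ifac[IFAC_MAX_SIZE];
     decompose(32, ifac, ntryh);  // returns 3
     // ifac = { 32, 3, 2, 4, 4 } : n, the number of factors, then the factors,
     // with a single remaining factor 2 moved to the front when present.
*/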
1098
1099
1100
1101static void rffti1_ps(int n, float *wa, int *ifac)
1102{
1103 static const int ntryh[] = { 4,2,3,5,0 };
1104 int k1, j, ii;
1105
1106 int nf = decompose(n,ifac,ntryh);
1107 float argh = (2*M_PI) / n;
1108 int is = 0;
1109 int nfm1 = nf - 1;
1110 int l1 = 1;
1111 for (k1 = 1; k1 <= nfm1; k1++) {
1112 int ip = ifac[k1 + 1];
1113 int ld = 0;
1114 int l2 = l1*ip;
1115 int ido = n / l2;
1116 int ipm = ip - 1;
1117 for (j = 1; j <= ipm; ++j) {
1118 float argld;
1119 int i = is, fi=0;
1120 ld += l1;
1121 argld = ld*argh;
1122 for (ii = 3; ii <= ido; ii += 2) {
1123 i += 2;
1124 fi += 1;
        wa[i - 2] = cos(fi*argld);
        wa[i - 1] = sin(fi*argld);
1127 }
1128 is += ido;
1129 }
1130 l1 = l2;
1131 }
1132} /* rffti1 */
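/* The wa[] table filled above packs, stage by stage, the twiddle factors
   cos(2*pi*m*ld/n) / sin(2*pi*m*ld/n) (m = 1, 2, ...) that the radfN/radbN kernels
   read through their waN arguments; the last factor is processed with ido == 1 and
   needs no twiddles, hence the loop only goes up to nfm1. */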
1133
1134void cffti1_ps(int n, float *wa, int *ifac)
1135{
1136 static const int ntryh[] = { 5,3,4,2,0 };
1137 int k1, j, ii;
1138
1139 int nf = decompose(n,ifac,ntryh);
1140 float argh = (2*M_PI)/(float)n;
1141 int i = 1;
1142 int l1 = 1;
1143 for (k1=1; k1<=nf; k1++) {
1144 int ip = ifac[k1+1];
1145 int ld = 0;
1146 int l2 = l1*ip;
1147 int ido = n / l2;
1148 int idot = ido + ido + 2;
1149 int ipm = ip - 1;
1150 for (j=1; j<=ipm; j++) {
1151 float argld;
1152 int i1 = i, fi = 0;
1153 wa[i-1] = 1;
1154 wa[i] = 0;
1155 ld += l1;
1156 argld = ld*argh;
1157 for (ii = 4; ii <= idot; ii += 2) {
1158 i += 2;
1159 fi += 1;
        wa[i-1] = cos(fi*argld);
        wa[i] = sin(fi*argld);
1162 }
1163 if (ip > 5) {
1164 wa[i1-1] = wa[i-1];
1165 wa[i1] = wa[i];
1166 }
1167 }
1168 l1 = l2;
1169 }
1170} /* cffti1 */
1171
1172
1173v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
1174 v4sf *in = (v4sf*)input_readonly;
1175 v4sf *out = (in == work2 ? work1 : work2);
1176 int nf = ifac[1], k1;
1177 int l1 = 1;
1178 int iw = 0;
1179 assert(in != out && work1 != work2);
1180 for (k1=2; k1<=nf+1; k1++) {
1181 int ip = ifac[k1];
1182 int l2 = ip*l1;
1183 int ido = n / l2;
1184 int idot = ido + ido;
1185 switch (ip) {
1186 case 5: {
1187 int ix2 = iw + idot;
1188 int ix3 = ix2 + idot;
1189 int ix4 = ix3 + idot;
        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1191 } break;
1192 case 4: {
1193 int ix2 = iw + idot;
1194 int ix3 = ix2 + idot;
        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
1196 } break;
1197 case 2: {
        passf2_ps(idot, l1, in, out, &wa[iw], isign);
1199 } break;
1200 case 3: {
1201 int ix2 = iw + idot;
        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
1203 } break;
1204 default:
1205 assert(0);
1206 }
1207 l1 = l2;
1208 iw += (ip - 1)*idot;
1209 if (out == work2) {
1210 out = work1; in = work2;
1211 } else {
1212 out = work2; in = work1;
1213 }
1214 }
1215
1216 return in; /* this is in fact the output .. */
1217}
1218
1219
1220struct PFFFT_Setup {
1221 int N;
1222 int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
1223 // hold the decomposition into small integers of N
  int ifac[IFAC_MAX_SIZE]; // N, number of factors, then the factors (admitted values: 2, 3, 4 or 5)
1225 pffft_transform_t transform;
1226 v4sf *data; // allocated room for twiddle coefs
1227 float *e; // points into 'data' , N/4*3 elements
1228 float *twiddle; // points into 'data', N/4 elements
1229};
1230
1231PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
1232 // validate N for negative values or potential int overflow
1233 if (N < 0) {
1234 return 0;
1235 }
1236 if (N > (1<<26)) {
    // higher values of N will make you enter the integer overflow world...
1238 assert(0);
1239 return 0;
1240 }
  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
1242 int k, m;
1243 /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1244 and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1245 handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1246 if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
1247 if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
1248 //assert((N % 32) == 0);
1249 s->N = N;
1250 s->transform = transform;
1251 /* nb of complex simd vectors */
1252 s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
  s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
1254 s->e = (float*)s->data;
1255 s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
1256
1257 for (k=0; k < s->Ncvec; ++k) {
1258 int i = k/SIMD_SZ;
1259 int j = k%SIMD_SZ;
1260 for (m=0; m < SIMD_SZ-1; ++m) {
1261 float A = -2*M_PI*(m+1)*k / N;
      s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
      s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
1264 }
1265 }
1266
  if (transform == PFFFT_REAL) {
    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  } else {
    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  }
1272
1273 /* check that N is decomposable with allowed prime factors */
1274 for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
1275 if (m != N/SIMD_SZ) {
1276 pffft_destroy_setup(s); s = 0;
1277 }
1278
1279 return s;
1280}
1281
1282
1283void pffft_destroy_setup(PFFFT_Setup *s) {
  pffft_aligned_free(s->data);
  free(s);
1286}
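/* Typical lifecycle, as an illustrative sketch (pffft_transform_ordered and the other
   entry points are declared in pffft.h; see that header for the authoritative API and
   the ordering of the frequency-domain data):

     PFFFT_Setup *s = pffft_new_setup(1024, PFFFT_REAL);     // N multiple of 32 for real FFTs
     float *in  = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     float *out = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     // ... fill in[] ...
     pffft_transform_ordered(s, in, out, 0, PFFFT_FORWARD);  // work == 0: scratch on the stack
     // ... use out[] ...
     pffft_aligned_free(out);
     pffft_aligned_free(in);
     pffft_destroy_setup(s);
*/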
1287
1288#if !defined(PFFFT_SIMD_DISABLE)
1289
1290/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1291static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
1292 v4sf g0, g1;
1293 int k;
1294 INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
1295
1296 *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1297 for (k=1; k < N; ++k) {
1298 v4sf h0, h1;
1299 INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
1300 *--out = VSWAPHL(g1, h0);
1301 *--out = VSWAPHL(h0, h1);
1302 g1 = h1;
1303 }
1304 *--out = VSWAPHL(g1, g0);
1305}
1306
1307static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
1308 v4sf g0, g1, h0, h1;
1309 int k;
1310 g0 = g1 = in[0]; ++in;
1311 for (k=1; k < N; ++k) {
1312 h0 = *in++; h1 = *in++;
1313 g1 = VSWAPHL(g1, h0);
1314 h0 = VSWAPHL(h0, h1);
1315 UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
1316 g1 = h1;
1317 }
1318 h0 = *in++; h1 = g0;
1319 g1 = VSWAPHL(g1, h0);
1320 h0 = VSWAPHL(h0, h1);
1321 UNINTERLEAVE2(h0, g1, out[0], out[1]);
1322}
1323
1324void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1325 int k, N = setup->N, Ncvec = setup->Ncvec;
1326 const v4sf *vin = (const v4sf*)in;
1327 v4sf *vout = (v4sf*)out;
1328 assert(in != out);
1329 if (setup->transform == PFFFT_REAL) {
1330 int k, dk = N/32;
1331 if (direction == PFFFT_FORWARD) {
1332 for (k=0; k < dk; ++k) {
1333 INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1334 INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1335 }
      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1338 } else {
1339 for (k=0; k < dk; ++k) {
1340 UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1341 UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1342 }
      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1345 }
1346 } else {
1347 if (direction == PFFFT_FORWARD) {
1348 for (k=0; k < Ncvec; ++k) {
1349 int kk = (k/4) + (k%4)*(Ncvec/4);
1350 INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1351 }
1352 } else {
1353 for (k=0; k < Ncvec; ++k) {
1354 int kk = (k/4) + (k%4)*(Ncvec/4);
1355 UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1356 }
1357 }
1358 }
1359}
1360
1361void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1362 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1363 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1364 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1365 assert(in != out);
1366 for (k=0; k < dk; ++k) {
1367 r0 = in[8*k+0]; i0 = in[8*k+1];
1368 r1 = in[8*k+2]; i1 = in[8*k+3];
1369 r2 = in[8*k+4]; i2 = in[8*k+5];
1370 r3 = in[8*k+6]; i3 = in[8*k+7];
1371 VTRANSPOSE4(r0,r1,r2,r3);
1372 VTRANSPOSE4(i0,i1,i2,i3);
1373 VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1374 VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1375 VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1376
1377 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1378 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1379 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1380 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1381
1382 /*
1383 transformation for each column is:
1384
1385 [1 1 1 1 0 0 0 0] [r0]
1386 [1 0 -1 0 0 -1 0 1] [r1]
1387 [1 -1 1 -1 0 0 0 0] [r2]
1388 [1 0 -1 0 0 1 0 -1] [r3]
1389 [0 0 0 0 1 1 1 1] * [i0]
1390 [0 1 0 -1 1 0 -1 0] [i1]
1391 [0 0 0 0 1 -1 1 -1] [i2]
1392 [0 -1 0 1 1 0 -1 0] [i3]
1393 */
1394
1395 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1396 r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
1397 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1398 r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
1399
1400 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1401 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1402 }
1403}
1404
1405void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1406 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1407 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1408 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1409 assert(in != out);
1410 for (k=0; k < dk; ++k) {
1411 r0 = in[8*k+0]; i0 = in[8*k+1];
1412 r1 = in[8*k+2]; i1 = in[8*k+3];
1413 r2 = in[8*k+4]; i2 = in[8*k+5];
1414 r3 = in[8*k+6]; i3 = in[8*k+7];
1415
1416 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1417 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1418 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1419 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1420
1421 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1422 r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
1423 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1424 r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
1425
1426 VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
1427 VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
1428 VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
1429
1430 VTRANSPOSE4(r0,r1,r2,r3);
1431 VTRANSPOSE4(i0,i1,i2,i3);
1432
1433 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1434 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1435 }
1436}
1437
1438
1439static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1440 const v4sf *e, v4sf *out) {
1441 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1442 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1443 r0 = *in0; i0 = *in1;
1444 r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
1445 VTRANSPOSE4(r0,r1,r2,r3);
1446 VTRANSPOSE4(i0,i1,i2,i3);
1447
1448 /*
1449 transformation for each column is:
1450
1451 [1 1 1 1 0 0 0 0] [r0]
1452 [1 0 -1 0 0 -1 0 1] [r1]
1453 [1 0 -1 0 0 1 0 -1] [r2]
1454 [1 -1 1 -1 0 0 0 0] [r3]
1455 [0 0 0 0 1 1 1 1] * [i0]
1456 [0 -1 0 1 -1 0 1 0] [i1]
1457 [0 -1 0 1 1 0 -1 0] [i2]
1458 [0 0 0 0 -1 1 -1 1] [i3]
1459 */
1460
1461 //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1462 //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1463
1464 VCPLXMUL(r1,i1,e[0],e[1]);
1465 VCPLXMUL(r2,i2,e[2],e[3]);
1466 VCPLXMUL(r3,i3,e[4],e[5]);
1467
1468 //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1469 //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1470
1471 sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
1472 sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
1473 si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
1474 si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
1475
1476 r0 = VADD(sr0, sr1);
1477 r3 = VSUB(sr0, sr1);
1478 i0 = VADD(si0, si1);
1479 i3 = VSUB(si1, si0);
1480 r1 = VADD(dr0, di1);
1481 r2 = VSUB(dr0, di1);
1482 i1 = VSUB(dr1, di0);
1483 i2 = VADD(dr1, di0);
1484
1485 *out++ = r0;
1486 *out++ = i0;
1487 *out++ = r1;
1488 *out++ = i1;
1489 *out++ = r2;
1490 *out++ = i2;
1491 *out++ = r3;
1492 *out++ = i3;
1493
1494}
1495
1496static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1497 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1498 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1499
1500 v4sf_union cr, ci, *uout = (v4sf_union*)out;
1501 v4sf save = in[7], zero=VZERO();
1502 float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
1503 static const float s = (float)(M_SQRT2/2);
1504
1505 cr.v = in[0]; ci.v = in[Ncvec*2-1];
1506 assert(in != out);
  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1508
1509 /*
1510 [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1511
1512 [Xr(1)] ] [1 1 1 1 0 0 0 0]
1513 [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
1514 [Xr(N/2) ] [1 0 -1 0 0 0 0 0]
1515 [Xr(3N/4)] [0 0 0 0 1 -s 0 s]
1516 [Xi(1) ] [1 -1 1 -1 0 0 0 0]
1517 [Xi(N/4) ] [0 0 0 0 0 -s -1 -s]
1518 [Xi(N/2) ] [0 -1 0 1 0 0 0 0]
1519 [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
1520 */
1521
1522 xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
1523 xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
1524 xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2;
1525 xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2;
1526 xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1;
1527 xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1;
1528 xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3;
1529 xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3;
1530
1531 for (k=1; k < dk; ++k) {
1532 v4sf save_next = in[8*k+7];
    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
                            e + k*6, out + k*8);
1535 save = save_next;
1536 }
1537
1538}
1539
1540static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
1541 const v4sf *e, v4sf *out, int first) {
1542 v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
1543 /*
1544 transformation for each column is:
1545
1546 [1 1 1 1 0 0 0 0] [r0]
1547 [1 0 0 -1 0 -1 -1 0] [r1]
1548 [1 -1 -1 1 0 0 0 0] [r2]
1549 [1 0 0 -1 0 1 1 0] [r3]
1550 [0 0 0 0 1 -1 1 -1] * [i0]
1551 [0 -1 1 0 1 0 0 1] [i1]
1552 [0 0 0 0 1 1 -1 -1] [i2]
1553 [0 1 -1 0 1 0 0 1] [i3]
1554 */
1555
1556 v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
1557 v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
1558 v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
1559 v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
1560
1561 r0 = VADD(sr0, sr1);
1562 r2 = VSUB(sr0, sr1);
1563 r1 = VSUB(dr0, si1);
1564 r3 = VADD(dr0, si1);
1565 i0 = VSUB(di0, di1);
1566 i2 = VADD(di0, di1);
1567 i1 = VSUB(si0, dr1);
1568 i3 = VADD(si0, dr1);
1569
1570 VCPLXMULCONJ(r1,i1,e[0],e[1]);
1571 VCPLXMULCONJ(r2,i2,e[2],e[3]);
1572 VCPLXMULCONJ(r3,i3,e[4],e[5]);
1573
1574 VTRANSPOSE4(r0,r1,r2,r3);
1575 VTRANSPOSE4(i0,i1,i2,i3);
1576
1577 if (!first) {
1578 *out++ = r0;
1579 *out++ = i0;
1580 }
1581 *out++ = r1;
1582 *out++ = i1;
1583 *out++ = r2;
1584 *out++ = i2;
1585 *out++ = r3;
1586 *out++ = i3;
1587}
1588
1589static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1590 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1591 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1592
1593 v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
1594 float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
1595 static const float s = (float)M_SQRT2;
1596 assert(in != out);
1597 for (k=0; k < 4; ++k) {
1598 Xr.f[k] = ((float*)in)[8*k];
1599 Xi.f[k] = ((float*)in)[8*k+4];
1600 }
1601
  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
1603
1604 /*
1605 [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1606
1607 [cr0] [1 0 2 0 1 0 0 0]
1608 [cr1] [1 0 0 0 -1 0 -2 0]
1609 [cr2] [1 0 -2 0 1 0 0 0]
1610 [cr3] [1 0 0 0 -1 0 2 0]
1611 [ci0] [0 2 0 2 0 0 0 0]
1612 [ci1] [0 s 0 -s 0 -s 0 -s]
1613 [ci2] [0 0 0 0 0 -2 0 2]
1614 [ci3] [0 -s 0 s 0 -s 0 -s]
1615 */
1616 for (k=1; k < dk; ++k) {
1617 pffft_real_preprocess_4x4(in: in+8*k, e: e + k*6, out: out-1+k*8, first: 0);
1618 }
1619
1620 cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
1621 cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
1622 cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
1623 cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
1624 ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0;
1625 ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
1626 ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2;
1627 ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
1628}
1629
1630
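/*
  Common implementation behind pffft_transform() and pffft_transform_ordered(): it runs the
  fftpack passes on the SIMD vectors and applies the finalize (forward) or preprocess
  (backward) steps above to convert between the time-domain data and the internal pffft
  frequency-domain layout. When 'ordered' is non-zero, the spectrum is additionally passed
  through pffft_zreorder() to obtain the canonical order.
*/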
void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
                              pffft_direction_t direction, int ordered) {
  int k, Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);

  const v4sf *vinput = (const v4sf*)finput;
  v4sf *voutput = (v4sf*)foutput;
  v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
  int ib = (nf_odd ^ ordered ? 1 : 0);
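  // ib selects which of the two ping-pong buffers (buff[0] == voutput, buff[1] == scratch)
  // receives the next intermediate result; its initial value is derived from the parity of
  // the fftpack factorization (nf_odd) and from 'ordered' so that the last pass lands
  // directly in voutput whenever possible.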
  assert(VALIGNED(finput) && VALIGNED(foutput));

  //assert(finput != foutput);
  if (direction == PFFFT_FORWARD) {
    ib = !ib;
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    } else {
      v4sf *tmp = buff[ib];
      for (k=0; k < Ncvec; ++k) {
        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
      }
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
    } else ib = !ib;
  } else {
    if (vinput == buff[ib]) {
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
      vinput = buff[ib]; ib = !ib;
    }
    if (setup->transform == PFFFT_REAL) {
      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
      for (k=0; k < Ncvec; ++k) {
        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
      }
    }
  }

  if (buff[ib] != voutput) {
    /* extra copy required -- this situation should only happen when finput == foutput */
    assert(finput==foutput);
    for (k=0; k < Ncvec; ++k) {
      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
      voutput[2*k] = a; voutput[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == voutput);
}
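/*
  Frequency-domain multiply-accumulate: ab += a*b*scaling, where 'a' and 'b' are spectra in
  the internal (non-reordered) layout produced by pffft_transform(..., PFFFT_FORWARD). For
  real transforms the DC and Nyquist bins are purely real and stored in the first float of
  the first two SIMD vectors; they are saved before the vectorized loop and patched up
  afterwards.
*/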
void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
  int Ncvec = s->Ncvec;
  const v4sf * RESTRICT va = (const v4sf*)a;
  const v4sf * RESTRICT vb = (const v4sf*)b;
  v4sf * RESTRICT vab = (v4sf*)ab;

#ifdef __arm__
  __builtin_prefetch(va);
  __builtin_prefetch(vb);
  __builtin_prefetch(vab);
  __builtin_prefetch(va+2);
  __builtin_prefetch(vb+2);
  __builtin_prefetch(vab+2);
  __builtin_prefetch(va+4);
  __builtin_prefetch(vb+4);
  __builtin_prefetch(vab+4);
  __builtin_prefetch(va+6);
  __builtin_prefetch(vb+6);
  __builtin_prefetch(vab+6);
# ifndef __clang__
#  define ZCONVOLVE_USING_INLINE_ASM
# endif
#endif

  float ar0, ai0, br0, bi0, abr0, abi0;
#ifndef ZCONVOLVE_USING_INLINE_ASM
  v4sf vscal = LD_PS1(scaling);
  int i;
#endif

  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
  ar0 = ((v4sf_union*)va)[0].f[0];
  ai0 = ((v4sf_union*)va)[1].f[0];
  br0 = ((v4sf_union*)vb)[0].f[0];
  bi0 = ((v4sf_union*)vb)[1].f[0];
  abr0 = ((v4sf_union*)vab)[0].f[0];
  abi0 = ((v4sf_union*)vab)[1].f[0];

#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version; unfortunately miscompiled by clang 3.2 (at least on Ubuntu), so this path is restricted to gcc
  const float *a_ = a, *b_ = b; float *ab_ = ab;
  int N = Ncvec;
  asm volatile("mov r8, %2 \n"
               "vdup.f32 q15, %4 \n"
               "1: \n"
               "pld [%0,#64] \n"
               "pld [%1,#64] \n"
               "pld [%2,#64] \n"
               "pld [%0,#96] \n"
               "pld [%1,#96] \n"
               "pld [%2,#96] \n"
               "vld1.f32 {q0,q1}, [%0,:128]! \n"
               "vld1.f32 {q4,q5}, [%1,:128]! \n"
               "vld1.f32 {q2,q3}, [%0,:128]! \n"
               "vld1.f32 {q6,q7}, [%1,:128]! \n"
               "vld1.f32 {q8,q9}, [r8,:128]! \n"

               "vmul.f32 q10, q0, q4 \n"
               "vmul.f32 q11, q0, q5 \n"
               "vmul.f32 q12, q2, q6 \n"
               "vmul.f32 q13, q2, q7 \n"
               "vmls.f32 q10, q1, q5 \n"
               "vmla.f32 q11, q1, q4 \n"
               "vld1.f32 {q0,q1}, [r8,:128]! \n"
               "vmls.f32 q12, q3, q7 \n"
               "vmla.f32 q13, q3, q6 \n"
               "vmla.f32 q8, q10, q15 \n"
               "vmla.f32 q9, q11, q15 \n"
               "vmla.f32 q0, q12, q15 \n"
               "vmla.f32 q1, q13, q15 \n"
               "vst1.f32 {q8,q9},[%2,:128]! \n"
               "vst1.f32 {q0,q1},[%2,:128]! \n"
               "subs %3, #2 \n"
               "bne 1b \n"
               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
#else // default routine, works fine for non-arm cpus with current compilers
  for (i=0; i < Ncvec; i += 2) {
    v4sf ar, ai, br, bi;
    ar = va[2*i+0]; ai = va[2*i+1];
    br = vb[2*i+0]; bi = vb[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
    ar = va[2*i+2]; ai = va[2*i+3];
    br = vb[2*i+2]; bi = vb[2*i+3];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
  }
#endif
  if (s->transform == PFFFT_REAL) {
    ((v4sf_union*)vab)[0].f[0] = abr0 + ar0*br0*scaling;
    ((v4sf_union*)vab)[1].f[0] = abi0 + ai0*bi0*scaling;
  }
}

#else // defined(PFFFT_SIMD_DISABLE)

// standard routine using scalar floats, without SIMD stuff.

#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
  int k, N = setup->N;
  if (setup->transform == PFFFT_COMPLEX) {
    for (k=0; k < 2*N; ++k) out[k] = in[k];
    return;
  }
  else if (direction == PFFFT_FORWARD) {
    float x_N = in[N-1];
    for (k=N-1; k > 1; --k) out[k] = in[k-1];
    out[0] = in[0];
    out[1] = x_N;
  } else {
    float x_N = in[1];
    for (k=1; k < N-1; ++k) out[k] = in[k+1];
    out[0] = in[0];
    out[N-1] = x_N;
  }
}

#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
                                     pffft_direction_t direction, int ordered) {
  int Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
  float *buff[2];
  int ib;
  if (scratch == 0) scratch = scratch_on_stack;
  buff[0] = output; buff[1] = scratch;

  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
  ib = (nf_odd ^ ordered ? 1 : 0);

  if (direction == PFFFT_FORWARD) {
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
    }
    if (ordered) {
      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
    }
  } else {
    if (input == buff[ib]) {
      ib = !ib; // may happen when input == output
    }
    if (ordered) {
      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
      input = buff[!ib];
    }
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
    }
  }
  if (buff[ib] != output) {
    int k;
    // extra copy required -- this situation should only happen when input == output
    assert(input==output);
    for (k=0; k < Ncvec; ++k) {
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
      output[2*k] = a; output[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == output);
}

#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
                                       float *ab, float scaling) {
  int i, Ncvec = s->Ncvec;

  if (s->transform == PFFFT_REAL) {
    // take care of the fftpack ordering
    ab[0] += a[0]*b[0]*scaling;
    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
    ++ab; ++a; ++b; --Ncvec;
  }
  for (i=0; i < Ncvec; ++i) {
    float ar, ai, br, bi;
    ar = a[2*i+0]; ai = a[2*i+1];
    br = b[2*i+0]; bi = b[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[2*i+0] += ar*scaling;
    ab[2*i+1] += ai*scaling;
  }
}

#endif // defined(PFFFT_SIMD_DISABLE)

void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}
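
/* Illustrative usage sketch (not part of the library, kept under "#if 0"): a minimal
   forward/inverse round trip of the real transform through the public pffft.h API. The
   function name and the size N = 1024 are arbitrary; N only has to be a size supported by
   pffft (for the real transform, a multiple of 32 when SIMD is enabled), and the buffers
   must satisfy the alignment checked by the VALIGNED() asserts above, hence the use of
   pffft_aligned_malloc(). */
#if 0
#include "pffft.h"

static void example_real_roundtrip(void) {
  const int N = 1024;
  PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
  float *in   = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *freq = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *out  = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *work = (float*)pffft_aligned_malloc(N * sizeof(float)); // may also be NULL (the stack is used then)
  int k;

  for (k = 0; k < N; ++k) in[k] = (float)k;      // some test signal

  // ordered forward transform: spectrum written to 'freq' (see pffft.h for the exact packing)
  pffft_transform_ordered(s, in, freq, work, PFFFT_FORWARD);
  // inverse transform; pffft does not normalize, so scale by 1/N to recover the input
  pffft_transform_ordered(s, freq, out, work, PFFFT_BACKWARD);
  for (k = 0; k < N; ++k) out[k] *= 1.0f/N;

  pffft_aligned_free(work);
  pffft_aligned_free(out);
  pffft_aligned_free(freq);
  pffft_aligned_free(in);
  pffft_destroy_setup(s);
}
#endif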