1/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
2
3 Based on original fortran 77 code from FFTPACKv4 from NETLIB
4 (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
5 of NCAR, in 1985.
6
7 As confirmed by the NCAR fftpack software curators, the following
8 FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
9 released under the same terms.
10
11 FFTPACK license:
12
13 http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
14
15 Copyright (c) 2004 the University Corporation for Atmospheric
16 Research ("UCAR"). All rights reserved. Developed by NCAR's
17 Computational and Information Systems Laboratory, UCAR,
18 www.cisl.ucar.edu.
19
20 Redistribution and use of the Software in source and binary forms,
21 with or without modification, is permitted provided that the
22 following conditions are met:
23
24 - Neither the names of NCAR's Computational and Information Systems
25 Laboratory, the University Corporation for Atmospheric Research,
26 nor the names of its sponsors or contributors may be used to
27 endorse or promote products derived from this Software without
28 specific prior written permission.
29
30 - Redistributions of source code must retain the above copyright
31 notices, this list of conditions, and the disclaimer below.
32
33 - Redistributions in binary form must reproduce the above copyright
34 notice, this list of conditions, and the disclaimer below in the
35 documentation and/or other materials provided with the
36 distribution.
37
38 THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
39 EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
40 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41 NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
42 HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
43 EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
44 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
45 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
46 SOFTWARE.
47
48
49 PFFFT : a Pretty Fast FFT.
50
   This file is largely based on the original FFTPACK implementation, modified
   to take advantage of the SIMD instructions of modern CPUs.
53*/
54
55/*
56 ChangeLog:
57 - 2011/10/02, version 1: This is the very first release of this file.
58*/
59
60#include "pffft.h"
61#include <stdlib.h>
62#include <stdio.h>
63#include <math.h>
64#include <assert.h>
65
66#ifndef M_PI
67#define M_PI 3.14159265358979323846264338327950288
68#endif
69#ifndef M_SQRT2
70#define M_SQRT2 1.41421356237309504880168872420969808
71#endif
72
73/* detect compiler flavour */
74#if defined(_MSC_VER)
75# define COMPILER_MSVC
76#elif defined(__GNUC__)
77# define COMPILER_GCC
78#endif
79
80#if defined(COMPILER_GCC)
81# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
82# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
83# define RESTRICT __restrict
84# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
85#elif defined(COMPILER_MSVC)
86# define ALWAYS_INLINE(return_type) __forceinline return_type
87# define NEVER_INLINE(return_type) __declspec(noinline) return_type
88# define RESTRICT __restrict
89# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
90#endif
91
92
/*
  vector support macros: the rest of the code is independent of
  SSE/Altivec/NEON -- adding support for other platforms with 4-element
  vectors should be limited to these macros
*/
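
/* Porting checklist (informal, derived from the SSE/Altivec/NEON sections
   below; the scalar fallback at the end of this block is the simplest
   reference). A new 4-wide SIMD backend only has to provide:

     typedef <4-float vector type> v4sf;
     SIMD_SZ, VZERO(), VMUL(a,b), VADD(a,b), VMADD(a,b,c), VSUB(a,b),
     LD_PS1(p), INTERLEAVE2, UNINTERLEAVE2, VTRANSPOSE4, VSWAPHL, VALIGNED(ptr)
*/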
98
99
100// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
101//#define PFFFT_SIMD_DISABLE
102
103/*
104 Altivec support macros
105*/
106#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__))
107#include <altivec.h>
108typedef vector float v4sf;
109# define SIMD_SZ 4
110# define VZERO() ((vector float) vec_splat_u8(0))
111# define VMUL(a,b) vec_madd(a,b, VZERO())
112# define VADD(a,b) vec_add(a,b)
113# define VMADD(a,b,c) vec_madd(a,b,c)
114# define VSUB(a,b) vec_sub(a,b)
115inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
116# define LD_PS1(p) ld_ps1(&p)
117# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
118# define UNINTERLEAVE2(in1, in2, out1, out2) { \
119 vector unsigned char vperm1 = (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \
120 vector unsigned char vperm2 = (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \
121 v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
122 }
123# define VTRANSPOSE4(x0,x1,x2,x3) { \
124 v4sf y0 = vec_mergeh(x0, x2); \
125 v4sf y1 = vec_mergel(x0, x2); \
126 v4sf y2 = vec_mergeh(x1, x3); \
127 v4sf y3 = vec_mergel(x1, x3); \
128 x0 = vec_mergeh(y0, y2); \
129 x1 = vec_mergel(y0, y2); \
130 x2 = vec_mergeh(y1, y3); \
131 x3 = vec_mergel(y1, y3); \
132 }
133# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
134# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
135
136/*
137 SSE1 support macros
138*/
139#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
140
141#include <xmmintrin.h>
142typedef __m128 v4sf;
# define SIMD_SZ 4 // 4 floats per SIMD vector -- this is pretty much hardcoded in the preprocess/finalize functions, so enabling AVX with its 256-bit vectors would require significant rework.
144# define VZERO() _mm_setzero_ps()
145# define VMUL(a,b) _mm_mul_ps(a,b)
146# define VADD(a,b) _mm_add_ps(a,b)
147# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
148# define VSUB(a,b) _mm_sub_ps(a,b)
149# define LD_PS1(p) _mm_set1_ps(p)
150# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
151# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
152# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
153# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
154# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
155
156/*
157 ARM NEON support macros
158*/
159#elif !defined(PFFFT_SIMD_DISABLE) && \
160 (((defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM)) && defined(__ARM_NEON__)) || \
161 defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__))
162# include <arm_neon.h>
163typedef float32x4_t v4sf;
164# define SIMD_SZ 4
165# define VZERO() vdupq_n_f32(0)
166# define VMUL(a,b) vmulq_f32(a,b)
167# define VADD(a,b) vaddq_f32(a,b)
168# define VMADD(a,b,c) vmlaq_f32(c,a,b)
169# define VSUB(a,b) vsubq_f32(a,b)
170# define LD_PS1(p) vld1q_dup_f32(&(p))
171# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
172# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
173# define VTRANSPOSE4(x0,x1,x2,x3) { \
174 float32x4x2_t t0_ = vzipq_f32(x0, x2); \
175 float32x4x2_t t1_ = vzipq_f32(x1, x3); \
176 float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
177 float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
178 x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
179 }
180// marginally faster version
181//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
182# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
183# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
184#else
185# if !defined(PFFFT_SIMD_DISABLE)
# warning "building with SIMD disabled!"
187# define PFFFT_SIMD_DISABLE // fallback to scalar code
188# endif
189#endif
190
// fallback mode for situations where SSE/Altivec/NEON are not available: use scalar code instead
192#ifdef PFFFT_SIMD_DISABLE
193typedef float v4sf;
194# define SIMD_SZ 1
195# define VZERO() 0.f
196# define VMUL(a,b) ((a)*(b))
197# define VADD(a,b) ((a)+(b))
198# define VMADD(a,b,c) ((a)*(b)+(c))
199# define VSUB(a,b) ((a)-(b))
200# define LD_PS1(p) (p)
201# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
202#endif
203
// shortcuts for complex multiplications
205#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
206#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
207#ifndef SVMUL
208// multiply a scalar with a vector
209#define SVMUL(f,v) VMUL(LD_PS1(f),v)
210#endif
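
/* Illustrative note (not used by the build): in scalar terms, VCPLXMUL maps
   (ar,ai) to the complex product (ar + i*ai)*(br + i*bi), and VCPLXMULCONJ to
   the conjugate product (ar + i*ai)*(br - i*bi):

     ar' = ar*br - ai*bi;  ai' = ai*br + ar*bi;    // VCPLXMUL
     ar' = ar*br + ai*bi;  ai' = ai*br - ar*bi;    // VCPLXMULCONJ
*/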
211
212#if !defined(PFFFT_SIMD_DISABLE)
213typedef union v4sf_union {
214 v4sf v;
215 float f[4];
216} v4sf_union;
217
218#include <string.h>
219
220#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
221
222/* detect bugs with the vector support macros */
void validate_pffft_simd(void) {
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0; u = a1; t.v = VZERO();
  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v, a0.v);
  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);

  t.v=LD_PS1(f[15]);
  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
259#else
void validate_pffft_simd(void) {} // allow test_pffft.c to call this function even when simd is not available.
261#endif //!PFFFT_SIMD_DISABLE
262
263/* SSE and co like 16-bytes aligned pointers */
264#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
void *pffft_aligned_malloc(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
}

void pffft_aligned_free(void *p) {
  if (p) free(*((void **) p - 1));
}
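
/* Usage sketch (illustrative only; 'count' is a hypothetical element count):

     float *buf = (float*)pffft_aligned_malloc(count * sizeof(float));
     if (buf) {
       // ... buf is aligned on MALLOC_V4SF_ALIGNMENT bytes ...
       pffft_aligned_free(buf);
     }

   pffft_aligned_malloc over-allocates by MALLOC_V4SF_ALIGNMENT bytes, rounds
   the pointer up to the next aligned address, and stores the original
   malloc() pointer just before the returned block so that pffft_aligned_free
   can hand it back to free(). */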
276
277int pffft_simd_size() { return SIMD_SZ; }
278
/*
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
*/
282static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
283 int k, i;
284 int l1ido = l1*ido;
285 if (ido <= 2) {
286 for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
287 ch[0] = VADD(cc[0], cc[ido+0]);
288 ch[l1ido] = VSUB(cc[0], cc[ido+0]);
289 ch[1] = VADD(cc[1], cc[ido+1]);
290 ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
291 }
292 } else {
293 for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
294 for (i=0; i<ido-1; i+=2) {
295 v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
296 v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
297 v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
298 ch[i] = VADD(cc[i+0], cc[i+ido+0]);
299 ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
300 VCPLXMUL(tr2, ti2, wr, wi);
301 ch[i+l1ido] = tr2;
302 ch[i+l1ido+1] = ti2;
303 }
304 }
305 }
306}
307
/*
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
*/
311static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
312 const float *wa1, const float *wa2, float fsign) {
313 static const float taur = -0.5f;
314 float taui = 0.866025403784439f*fsign;
315 int i, k;
316 v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
317 int l1ido = l1*ido;
318 float wr1, wi1, wr2, wi2;
319 assert(ido > 2);
320 for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
321 for (i=0; i<ido-1; i+=2) {
322 tr2 = VADD(cc[i+ido], cc[i+2*ido]);
323 cr2 = VADD(cc[i], SVMUL(taur,tr2));
324 ch[i] = VADD(cc[i], tr2);
325 ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
326 ci2 = VADD(cc[i +1], SVMUL(taur,ti2));
327 ch[i+1] = VADD(cc[i+1], ti2);
328 cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
329 ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
330 dr2 = VSUB(cr2, ci3);
331 dr3 = VADD(cr2, ci3);
332 di2 = VADD(ci2, cr3);
333 di3 = VSUB(ci2, cr3);
334 wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
335 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
336 ch[i+l1ido] = dr2;
337 ch[i+l1ido + 1] = di2;
338 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
339 ch[i+2*l1ido] = dr3;
340 ch[i+2*l1ido+1] = di3;
341 }
342 }
343} /* passf3 */
344
345static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
346 const float *wa1, const float *wa2, const float *wa3, float fsign) {
  /* fsign == -1 for the forward transform and +1 for the backward transform */
348
349 int i, k;
350 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
351 int l1ido = l1*ido;
352 if (ido == 2) {
353 for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
354 tr1 = VSUB(cc[0], cc[2*ido + 0]);
355 tr2 = VADD(cc[0], cc[2*ido + 0]);
356 ti1 = VSUB(cc[1], cc[2*ido + 1]);
357 ti2 = VADD(cc[1], cc[2*ido + 1]);
358 ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
359 tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
360 tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
361 ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
362
363 ch[0*l1ido + 0] = VADD(tr2, tr3);
364 ch[0*l1ido + 1] = VADD(ti2, ti3);
365 ch[1*l1ido + 0] = VADD(tr1, tr4);
366 ch[1*l1ido + 1] = VADD(ti1, ti4);
367 ch[2*l1ido + 0] = VSUB(tr2, tr3);
368 ch[2*l1ido + 1] = VSUB(ti2, ti3);
369 ch[3*l1ido + 0] = VSUB(tr1, tr4);
370 ch[3*l1ido + 1] = VSUB(ti1, ti4);
371 }
372 } else {
373 for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
374 for (i=0; i<ido-1; i+=2) {
375 float wr1, wi1, wr2, wi2, wr3, wi3;
376 tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
377 tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
378 ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
379 ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
380 tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
381 ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
382 tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
383 ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
384
385 ch[i] = VADD(tr2, tr3);
386 cr3 = VSUB(tr2, tr3);
387 ch[i + 1] = VADD(ti2, ti3);
388 ci3 = VSUB(ti2, ti3);
389
390 cr2 = VADD(tr1, tr4);
391 cr4 = VSUB(tr1, tr4);
392 ci2 = VADD(ti1, ti4);
393 ci4 = VSUB(ti1, ti4);
394 wr1=wa1[i]; wi1=fsign*wa1[i+1];
395 VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
396 wr2=wa2[i]; wi2=fsign*wa2[i+1];
397 ch[i + l1ido] = cr2;
398 ch[i + l1ido + 1] = ci2;
399
400 VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
401 wr3=wa3[i]; wi3=fsign*wa3[i+1];
402 ch[i + 2*l1ido] = cr3;
403 ch[i + 2*l1ido + 1] = ci3;
404
405 VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
406 ch[i + 3*l1ido] = cr4;
407 ch[i + 3*l1ido + 1] = ci4;
408 }
409 }
410 }
411} /* passf4 */
412
/*
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
*/
416static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
417 const float *wa1, const float *wa2,
418 const float *wa3, const float *wa4, float fsign) {
419 static const float tr11 = .309016994374947f;
420 const float ti11 = .951056516295154f*fsign;
421 static const float tr12 = -.809016994374947f;
422 const float ti12 = .587785252292473f*fsign;
423
424 /* Local variables */
425 int i, k;
426 v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
427 ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
428
429 float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
430
431#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
432#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
433
434 assert(ido > 2);
435 for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
436 for (i = 0; i < ido-1; i += 2) {
437 ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5));
438 ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5));
439 ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4));
440 ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4));
441 tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
442 tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
443 tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
444 tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
445 ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
446 ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3));
447 cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
448 ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
449 cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
450 ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
451 cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
452 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
453 cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
454 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
455 dr3 = VSUB(cr3, ci4);
456 dr4 = VADD(cr3, ci4);
457 di3 = VADD(ci3, cr4);
458 di4 = VSUB(ci3, cr4);
459 dr5 = VADD(cr2, ci5);
460 dr2 = VSUB(cr2, ci5);
461 di5 = VSUB(ci2, cr5);
462 di2 = VADD(ci2, cr5);
463 wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
464 wr3=wa3[i]; wi3=fsign*wa3[i+1]; wr4=wa4[i]; wi4=fsign*wa4[i+1];
465 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
466 ch_ref(i - 1, 2) = dr2;
467 ch_ref(i, 2) = di2;
468 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
469 ch_ref(i - 1, 3) = dr3;
470 ch_ref(i, 3) = di3;
471 VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
472 ch_ref(i - 1, 4) = dr4;
473 ch_ref(i, 4) = di4;
474 VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
475 ch_ref(i - 1, 5) = dr5;
476 ch_ref(i, 5) = di5;
477 }
478 }
479#undef ch_ref
480#undef cc_ref
481}
482
483static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
484 static const float minus_one = -1.f;
485 int i, k, l1ido = l1*ido;
486 for (k=0; k < l1ido; k += ido) {
487 v4sf a = cc[k], b = cc[k + l1ido];
488 ch[2*k] = VADD(a, b);
489 ch[2*(k+ido)-1] = VSUB(a, b);
490 }
491 if (ido < 2) return;
492 if (ido != 2) {
493 for (k=0; k < l1ido; k += ido) {
494 for (i=2; i<ido; i+=2) {
495 v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
496 v4sf br = cc[i - 1 + k], bi = cc[i + k];
497 VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
498 ch[i + 2*k] = VADD(bi, ti2);
499 ch[2*(k+ido) - i] = VSUB(ti2, bi);
500 ch[i - 1 + 2*k] = VADD(br, tr2);
501 ch[2*(k+ido) - i -1] = VSUB(br, tr2);
502 }
503 }
504 if (ido % 2 == 1) return;
505 }
506 for (k=0; k < l1ido; k += ido) {
507 ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
508 ch[2*k + ido-1] = cc[k + ido-1];
509 }
510} /* radf2 */
511
512
513static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
514 static const float minus_two=-2;
515 int i, k, l1ido = l1*ido;
516 v4sf a,b,c,d, tr2, ti2;
517 for (k=0; k < l1ido; k += ido) {
518 a = cc[2*k]; b = cc[2*(k+ido) - 1];
519 ch[k] = VADD(a, b);
520 ch[k + l1ido] =VSUB(a, b);
521 }
522 if (ido < 2) return;
523 if (ido != 2) {
524 for (k = 0; k < l1ido; k += ido) {
525 for (i = 2; i < ido; i += 2) {
526 a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
527 c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
528 ch[i-1 + k] = VADD(a, b);
529 tr2 = VSUB(a, b);
530 ch[i+0 + k] = VSUB(c, d);
531 ti2 = VADD(c, d);
532 VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
533 ch[i-1 + k + l1ido] = tr2;
534 ch[i+0 + k + l1ido] = ti2;
535 }
536 }
537 if (ido % 2 == 1) return;
538 }
539 for (k = 0; k < l1ido; k += ido) {
540 a = cc[2*k + ido-1]; b = cc[2*k + ido];
541 ch[k + ido-1] = VADD(a,a);
542 ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
543 }
544} /* radb2 */
545
546static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
547 const float *wa1, const float *wa2) {
548 static const float taur = -0.5f;
549 static const float taui = 0.866025403784439f;
550 int i, k, ic;
551 v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
552 for (k=0; k<l1; k++) {
553 cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
554 ch[3*k*ido] = VADD(cc[k*ido], cr2);
555 ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
556 ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
557 }
558 if (ido == 1) return;
559 for (k=0; k<l1; k++) {
560 for (i=2; i<ido; i+=2) {
561 ic = ido - i;
562 wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
563 dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
564 VCPLXMULCONJ(dr2, di2, wr1, wi1);
565
566 wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
567 dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
568 VCPLXMULCONJ(dr3, di3, wr2, wi2);
569
570 cr2 = VADD(dr2, dr3);
571 ci2 = VADD(di2, di3);
572 ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
573 ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
574 tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
575 ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
576 tr3 = SVMUL(taui, VSUB(di2, di3));
577 ti3 = SVMUL(taui, VSUB(dr3, dr2));
578 ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
579 ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
580 ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
581 ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
582 }
583 }
584} /* radf3 */
585
586
587static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
588 const float *wa1, const float *wa2)
589{
590 static const float taur = -0.5f;
591 static const float taui = 0.866025403784439f;
592 static const float taui_2 = 0.866025403784439f*2;
593 int i, k, ic;
594 v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
595 for (k=0; k<l1; k++) {
596 tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
597 cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
598 ch[k*ido] = VADD(cc[3*k*ido], tr2);
599 ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
600 ch[(k + l1)*ido] = VSUB(cr2, ci3);
601 ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
602 }
603 if (ido == 1) return;
604 for (k=0; k<l1; k++) {
605 for (i=2; i<ido; i+=2) {
606 ic = ido - i;
607 tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
608 cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
609 ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
610 ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
611 ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
612 ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
613 cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
614 ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
615 dr2 = VSUB(cr2, ci3);
616 dr3 = VADD(cr2, ci3);
617 di2 = VADD(ci2, cr3);
618 di3 = VSUB(ci2, cr3);
619 VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
620 ch[i - 1 + (k + l1)*ido] = dr2;
621 ch[i + (k + l1)*ido] = di2;
622 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
623 ch[i - 1 + (k + 2*l1)*ido] = dr3;
624 ch[i + (k + 2*l1)*ido] = di3;
625 }
626 }
627} /* radb3 */
628
629static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
630 const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
631{
632 static const float minus_hsqt2 = (float)-0.7071067811865475;
633 int i, k, l1ido = l1*ido;
634 {
635 const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
636 v4sf * RESTRICT ch_ = ch;
637 while (cc < cc_end) {
638 // this loop represents between 25% and 40% of total radf4_ps cost !
639 v4sf a0 = cc[0], a1 = cc[l1ido];
640 v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
641 v4sf tr1 = VADD(a1, a3);
642 v4sf tr2 = VADD(a0, a2);
643 ch[2*ido-1] = VSUB(a0, a2);
644 ch[2*ido ] = VSUB(a3, a1);
645 ch[0 ] = VADD(tr1, tr2);
646 ch[4*ido-1] = VSUB(tr2, tr1);
647 cc += ido; ch += 4*ido;
648 }
649 cc = cc_; ch = ch_;
650 }
651 if (ido < 2) return;
652 if (ido != 2) {
653 for (k = 0; k < l1ido; k += ido) {
654 const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
655 for (i=2; i<ido; i += 2, pc += 2) {
656 int ic = ido - i;
657 v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
658 v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
659
660 cr2 = pc[1*l1ido+0];
661 ci2 = pc[1*l1ido+1];
662 wr=LD_PS1(wa1[i - 2]);
663 wi=LD_PS1(wa1[i - 1]);
664 VCPLXMULCONJ(cr2,ci2,wr,wi);
665
666 cr3 = pc[2*l1ido+0];
667 ci3 = pc[2*l1ido+1];
668 wr = LD_PS1(wa2[i-2]);
669 wi = LD_PS1(wa2[i-1]);
670 VCPLXMULCONJ(cr3, ci3, wr, wi);
671
672 cr4 = pc[3*l1ido];
673 ci4 = pc[3*l1ido+1];
674 wr = LD_PS1(wa3[i-2]);
675 wi = LD_PS1(wa3[i-1]);
676 VCPLXMULCONJ(cr4, ci4, wr, wi);
677
678 /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
679
680 tr1 = VADD(cr2,cr4);
681 tr4 = VSUB(cr4,cr2);
682 tr2 = VADD(pc[0],cr3);
683 tr3 = VSUB(pc[0],cr3);
684 ch[i - 1 + 4*k] = VADD(tr1,tr2);
685 ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
686 ti1 = VADD(ci2,ci4);
687 ti4 = VSUB(ci2,ci4);
688 ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
689 ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
690 ti2 = VADD(pc[1],ci3);
691 ti3 = VSUB(pc[1],ci3);
692 ch[i + 4*k] = VADD(ti1, ti2);
693 ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
694 ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
695 ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
696 }
697 }
698 if (ido % 2 == 1) return;
699 }
700 for (k=0; k<l1ido; k += ido) {
701 v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
702 v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
703 v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
704 v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
705 ch[ido-1 + 4*k] = VADD(tr1, c);
706 ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
707 ch[4*k + 1*ido] = VSUB(ti1, d);
708 ch[4*k + 3*ido] = VADD(ti1, d);
709 }
710} /* radf4 */
711
712
713static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
714 const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
715{
716 static const float minus_sqrt2 = (float)-1.414213562373095;
717 static const float two = 2.f;
718 int i, k, l1ido = l1*ido;
719 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
720 {
721 const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
722 v4sf *ch_ = ch;
723 while (ch < ch_end) {
724 v4sf a = cc[0], b = cc[4*ido-1];
725 v4sf c = cc[2*ido], d = cc[2*ido-1];
726 tr3 = SVMUL(two,d);
727 tr2 = VADD(a,b);
728 tr1 = VSUB(a,b);
729 tr4 = SVMUL(two,c);
730 ch[0*l1ido] = VADD(tr2, tr3);
731 ch[2*l1ido] = VSUB(tr2, tr3);
732 ch[1*l1ido] = VSUB(tr1, tr4);
733 ch[3*l1ido] = VADD(tr1, tr4);
734
735 cc += 4*ido; ch += ido;
736 }
737 cc = cc_; ch = ch_;
738 }
739 if (ido < 2) return;
740 if (ido != 2) {
741 for (k = 0; k < l1ido; k += ido) {
742 const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
743 v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
744 for (i = 2; i < ido; i += 2) {
745
746 tr1 = VSUB(pc[i], pc[4*ido - i]);
747 tr2 = VADD(pc[i], pc[4*ido - i]);
748 ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
749 tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
750 ph[0] = VADD(tr2, tr3);
751 cr3 = VSUB(tr2, tr3);
752
753 ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
754 tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
755 cr2 = VSUB(tr1, tr4);
756 cr4 = VADD(tr1, tr4);
757
758 ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
759 ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
760
761 ph[1] = VADD(ti2, ti3); ph += l1ido;
762 ci3 = VSUB(ti2, ti3);
763 ci2 = VADD(ti1, ti4);
764 ci4 = VSUB(ti1, ti4);
765 VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
766 ph[0] = cr2;
767 ph[1] = ci2; ph += l1ido;
768 VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
769 ph[0] = cr3;
770 ph[1] = ci3; ph += l1ido;
771 VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
772 ph[0] = cr4;
773 ph[1] = ci4; ph = ph - 3*l1ido + 2;
774 }
775 }
776 if (ido % 2 == 1) return;
777 }
778 for (k=0; k < l1ido; k+=ido) {
779 int i0 = 4*k + ido;
780 v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
781 v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
782 tr1 = VSUB(c,d);
783 tr2 = VADD(c,d);
784 ti1 = VADD(b,a);
785 ti2 = VSUB(b,a);
786 ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
787 ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
788 ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
789 ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
790 }
791} /* radb4 */
792
793static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
794 const float *wa1, const float *wa2, const float *wa3, const float *wa4)
795{
796 static const float tr11 = .309016994374947f;
797 static const float ti11 = .951056516295154f;
798 static const float tr12 = -.809016994374947f;
799 static const float ti12 = .587785252292473f;
800
801 /* System generated locals */
802 int cc_offset, ch_offset;
803
804 /* Local variables */
805 int i, k, ic;
806 v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
807 cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
808 int idp2;
809
810
811#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
812#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
813
814 /* Parameter adjustments */
815 ch_offset = 1 + ido * 6;
816 ch -= ch_offset;
817 cc_offset = 1 + ido * (1 + l1);
818 cc -= cc_offset;
819
820 /* Function Body */
821 for (k = 1; k <= l1; ++k) {
822 cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
823 ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
824 cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
825 ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
826 ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
827 ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
828 ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
829 ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
830 ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
831 //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
832 }
833 if (ido == 1) {
834 return;
835 }
836 idp2 = ido + 2;
837 for (k = 1; k <= l1; ++k) {
838 for (i = 3; i <= ido; i += 2) {
839 ic = idp2 - i;
840 dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
841 dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
842 dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
843 dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
844 VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
845 VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
846 VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
847 VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
848 cr2 = VADD(dr2, dr5);
849 ci5 = VSUB(dr5, dr2);
850 cr5 = VSUB(di2, di5);
851 ci2 = VADD(di2, di5);
852 cr3 = VADD(dr3, dr4);
853 ci4 = VSUB(dr4, dr3);
854 cr4 = VSUB(di3, di4);
855 ci3 = VADD(di3, di4);
856 ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
857 ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
858 tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
859 ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
860 tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
861 ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
862 tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
863 ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
864 tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
865 ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
866 ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
867 ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
868 ch_ref(i, 3, k) = VADD(ti2, ti5);
869 ch_ref(ic, 2, k) = VSUB(ti5, ti2);
870 ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
871 ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
872 ch_ref(i, 5, k) = VADD(ti3, ti4);
873 ch_ref(ic, 4, k) = VSUB(ti4, ti3);
874 }
875 }
876#undef cc_ref
877#undef ch_ref
878} /* radf5 */
879
880static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
881 const float *wa1, const float *wa2, const float *wa3, const float *wa4)
882{
883 static const float tr11 = .309016994374947f;
884 static const float ti11 = .951056516295154f;
885 static const float tr12 = -.809016994374947f;
886 static const float ti12 = .587785252292473f;
887
888 int cc_offset, ch_offset;
889
890 /* Local variables */
891 int i, k, ic;
892 v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
893 ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
894 int idp2;
895
896#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
897#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
898
899 /* Parameter adjustments */
900 ch_offset = 1 + ido * (1 + l1);
901 ch -= ch_offset;
902 cc_offset = 1 + ido * 6;
903 cc -= cc_offset;
904
905 /* Function Body */
906 for (k = 1; k <= l1; ++k) {
907 ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
908 ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
909 tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
910 tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
911 ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
912 cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
913 cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
914 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
915 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
916 ch_ref(1, k, 2) = VSUB(cr2, ci5);
917 ch_ref(1, k, 3) = VSUB(cr3, ci4);
918 ch_ref(1, k, 4) = VADD(cr3, ci4);
919 ch_ref(1, k, 5) = VADD(cr2, ci5);
920 }
921 if (ido == 1) {
922 return;
923 }
924 idp2 = ido + 2;
925 for (k = 1; k <= l1; ++k) {
926 for (i = 3; i <= ido; i += 2) {
927 ic = idp2 - i;
928 ti5 = VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k));
929 ti2 = VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k));
930 ti4 = VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k));
931 ti3 = VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k));
932 tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
933 tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
934 tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
935 tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
936 ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
937 ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
938 cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
939 ci2 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
940 cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
941 ci3 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
942 cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
943 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
944 cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
945 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
946 dr3 = VSUB(cr3, ci4);
947 dr4 = VADD(cr3, ci4);
948 di3 = VADD(ci3, cr4);
949 di4 = VSUB(ci3, cr4);
950 dr5 = VADD(cr2, ci5);
951 dr2 = VSUB(cr2, ci5);
952 di5 = VSUB(ci2, cr5);
953 di2 = VADD(ci2, cr5);
954 VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
955 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
956 VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
957 VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
958
959 ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
960 ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
961 ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
962 ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
963 }
964 }
965#undef cc_ref
966#undef ch_ref
967} /* radb5 */
968
969static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
970 const float *wa, const int *ifac) {
971 v4sf *in = (v4sf*)input_readonly;
972 v4sf *out = (in == work2 ? work1 : work2);
973 int nf = ifac[1], k1;
974 int l2 = n;
975 int iw = n-1;
976 assert(in != out && work1 != work2);
977 for (k1 = 1; k1 <= nf; ++k1) {
978 int kh = nf - k1;
979 int ip = ifac[kh + 2];
980 int l1 = l2 / ip;
981 int ido = n / l2;
982 iw -= (ip - 1)*ido;
983 switch (ip) {
984 case 5: {
985 int ix2 = iw + ido;
986 int ix3 = ix2 + ido;
987 int ix4 = ix3 + ido;
        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
989 } break;
990 case 4: {
991 int ix2 = iw + ido;
992 int ix3 = ix2 + ido;
        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
994 } break;
995 case 3: {
996 int ix2 = iw + ido;
        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
998 } break;
999 case 2:
        radf2_ps(ido, l1, in, out, &wa[iw]);
1001 break;
1002 default:
1003 assert(0);
1004 break;
1005 }
1006 l2 = l1;
1007 if (out == work2) {
1008 out = work1; in = work2;
1009 } else {
1010 out = work2; in = work1;
1011 }
1012 }
1013 return in; /* this is in fact the output .. */
1014} /* rfftf1 */
1015
1016static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1017 const float *wa, const int *ifac) {
1018 v4sf *in = (v4sf*)input_readonly;
1019 v4sf *out = (in == work2 ? work1 : work2);
1020 int nf = ifac[1], k1;
1021 int l1 = 1;
1022 int iw = 0;
1023 assert(in != out);
1024 for (k1=1; k1<=nf; k1++) {
1025 int ip = ifac[k1 + 1];
1026 int l2 = ip*l1;
1027 int ido = n / l2;
1028 switch (ip) {
1029 case 5: {
1030 int ix2 = iw + ido;
1031 int ix3 = ix2 + ido;
1032 int ix4 = ix3 + ido;
        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
1034 } break;
1035 case 4: {
1036 int ix2 = iw + ido;
1037 int ix3 = ix2 + ido;
        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
1039 } break;
1040 case 3: {
1041 int ix2 = iw + ido;
        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
1043 } break;
1044 case 2:
        radb2_ps(ido, l1, in, out, &wa[iw]);
1046 break;
1047 default:
1048 assert(0);
1049 break;
1050 }
1051 l1 = l2;
1052 iw += (ip - 1)*ido;
1053
1054 if (out == work2) {
1055 out = work1; in = work2;
1056 } else {
1057 out = work2; in = work1;
1058 }
1059 }
1060 return in; /* this is in fact the output .. */
1061}
1062
1063static int decompose(int n, int *ifac, const int *ntryh) {
1064 int nl = n, nf = 0, i, j = 0;
1065 for (j=0; ntryh[j]; ++j) {
1066 int ntry = ntryh[j];
1067 while (nl != 1) {
1068 int nq = nl / ntry;
1069 int nr = nl - ntry * nq;
1070 if (nr == 0) {
1071 ifac[2+nf++] = ntry;
1072 nl = nq;
1073 if (ntry == 2 && nf != 1) {
1074 for (i = 2; i <= nf; ++i) {
1075 int ib = nf - i + 2;
1076 ifac[ib + 1] = ifac[ib];
1077 }
1078 ifac[2] = 2;
1079 }
1080 } else break;
1081 }
1082 }
1083 ifac[0] = n;
1084 ifac[1] = nf;
1085 return nf;
1086}
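
/* Worked example (illustrative): with ntryh = {4,2,3,5,0} as used by
   rffti1_ps below, decompose(96, ifac, ntryh) returns 4 and fills
   ifac = { 96, 4, 2, 4, 4, 3 }, i.e. the factorization 96 = 2*4*4*3 with the
   single factor of 2 moved to the front by the reordering loop above. */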
1087
1088
1089
1090static void rffti1_ps(int n, float *wa, int *ifac)
1091{
1092 static const int ntryh[] = { 4,2,3,5,0 };
1093 int k1, j, ii;
1094
1095 int nf = decompose(n,ifac,ntryh);
1096 float argh = (2*M_PI) / n;
1097 int is = 0;
1098 int nfm1 = nf - 1;
1099 int l1 = 1;
1100 for (k1 = 1; k1 <= nfm1; k1++) {
1101 int ip = ifac[k1 + 1];
1102 int ld = 0;
1103 int l2 = l1*ip;
1104 int ido = n / l2;
1105 int ipm = ip - 1;
1106 for (j = 1; j <= ipm; ++j) {
1107 float argld;
1108 int i = is, fi=0;
1109 ld += l1;
1110 argld = ld*argh;
1111 for (ii = 3; ii <= ido; ii += 2) {
1112 i += 2;
1113 fi += 1;
        wa[i - 2] = cos(fi*argld);
        wa[i - 1] = sin(fi*argld);
1116 }
1117 is += ido;
1118 }
1119 l1 = l2;
1120 }
1121} /* rffti1 */
1122
1123void cffti1_ps(int n, float *wa, int *ifac)
1124{
1125 static const int ntryh[] = { 5,3,4,2,0 };
1126 int k1, j, ii;
1127
1128 int nf = decompose(n,ifac,ntryh);
1129 float argh = (2*M_PI)/(float)n;
1130 int i = 1;
1131 int l1 = 1;
1132 for (k1=1; k1<=nf; k1++) {
1133 int ip = ifac[k1+1];
1134 int ld = 0;
1135 int l2 = l1*ip;
1136 int ido = n / l2;
1137 int idot = ido + ido + 2;
1138 int ipm = ip - 1;
1139 for (j=1; j<=ipm; j++) {
1140 float argld;
1141 int i1 = i, fi = 0;
1142 wa[i-1] = 1;
1143 wa[i] = 0;
1144 ld += l1;
1145 argld = ld*argh;
1146 for (ii = 4; ii <= idot; ii += 2) {
1147 i += 2;
1148 fi += 1;
        wa[i-1] = cos(fi*argld);
        wa[i] = sin(fi*argld);
1151 }
1152 if (ip > 5) {
1153 wa[i1-1] = wa[i-1];
1154 wa[i1] = wa[i];
1155 }
1156 }
1157 l1 = l2;
1158 }
1159} /* cffti1 */
1160
1161
1162v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
1163 v4sf *in = (v4sf*)input_readonly;
1164 v4sf *out = (in == work2 ? work1 : work2);
1165 int nf = ifac[1], k1;
1166 int l1 = 1;
1167 int iw = 0;
1168 assert(in != out && work1 != work2);
1169 for (k1=2; k1<=nf+1; k1++) {
1170 int ip = ifac[k1];
1171 int l2 = ip*l1;
1172 int ido = n / l2;
1173 int idot = ido + ido;
1174 switch (ip) {
1175 case 5: {
1176 int ix2 = iw + idot;
1177 int ix3 = ix2 + idot;
1178 int ix4 = ix3 + idot;
        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1180 } break;
1181 case 4: {
1182 int ix2 = iw + idot;
1183 int ix3 = ix2 + idot;
        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
1185 } break;
1186 case 2: {
        passf2_ps(idot, l1, in, out, &wa[iw], isign);
1188 } break;
1189 case 3: {
1190 int ix2 = iw + idot;
        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
1192 } break;
1193 default:
1194 assert(0);
1195 }
1196 l1 = l2;
1197 iw += (ip - 1)*idot;
1198 if (out == work2) {
1199 out = work1; in = work2;
1200 } else {
1201 out = work2; in = work1;
1202 }
1203 }
1204
1205 return in; /* this is in fact the output .. */
1206}
1207
1208
1209struct PFFFT_Setup {
1210 int N;
1211 int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
1212 int ifac[15];
1213 pffft_transform_t transform;
1214 v4sf *data; // allocated room for twiddle coefs
1215 float *e; // points into 'data' , N/4*3 elements
1216 float *twiddle; // points into 'data', N/4 elements
1217};
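
/* Size sketch (illustrative, real transform, SIMD_SZ == 4): for N = 1024,
   Ncvec = (N/2)/SIMD_SZ = 128, so 'data' holds 2*Ncvec = 256 v4sf (1024
   floats); 'e' covers the first 192 v4sf (768 floats = 3*N/4) and 'twiddle'
   the remaining 64 v4sf (256 floats = N/4), matching the comments above. */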
1218
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
  int k, m;
  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
     and 32 for real FFTs -- a lot of stuff would need to be rewritten to
     handle other cases (or maybe just switch to a scalar fft, I don't know..) */
  if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
  if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
  //assert((N % 32) == 0);
  s->N = N;
  s->transform = transform;
  /* nb of complex simd vectors */
  s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
  s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
  s->e = (float*)s->data;
  s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);

  if (transform == PFFFT_REAL) {
    for (k=0; k < s->Ncvec; ++k) {
      int i = k/SIMD_SZ;
      int j = k%SIMD_SZ;
      for (m=0; m < SIMD_SZ-1; ++m) {
        float A = -2*M_PI*(m+1)*k / N;
        s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
        s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
      }
    }
    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  } else {
    for (k=0; k < s->Ncvec; ++k) {
      int i = k/SIMD_SZ;
      int j = k%SIMD_SZ;
      for (m=0; m < SIMD_SZ-1; ++m) {
        float A = -2*M_PI*(m+1)*k / N;
        s->e[(2*(i*3 + m) + 0)*SIMD_SZ + j] = cos(A);
        s->e[(2*(i*3 + m) + 1)*SIMD_SZ + j] = sin(A);
      }
    }
    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  }

  /* check that N is decomposable with allowed prime factors */
  for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
  if (m != N/SIMD_SZ) {
    pffft_destroy_setup(s); s = 0;
  }

  return s;
}
1268
1269
void pffft_destroy_setup(PFFFT_Setup *s) {
  pffft_aligned_free(s->data);
  free(s);
}
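
/* End-to-end usage sketch (illustrative; see pffft.h for the public API and
   its exact constraints):

     PFFFT_Setup *s = pffft_new_setup(1024, PFFFT_REAL);
     float *in  = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     float *out = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     // ... fill 'in' with 1024 samples ...
     pffft_transform_ordered(s, in, out, 0, PFFFT_FORWARD);
     // ... use the spectrum in 'out' ...
     pffft_aligned_free(out);
     pffft_aligned_free(in);
     pffft_destroy_setup(s);
*/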
1274
1275#if !defined(PFFFT_SIMD_DISABLE)
1276
1277/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1278static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
1279 v4sf g0, g1;
1280 int k;
1281 INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
1282
1283 *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1284 for (k=1; k < N; ++k) {
1285 v4sf h0, h1;
1286 INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
1287 *--out = VSWAPHL(g1, h0);
1288 *--out = VSWAPHL(h0, h1);
1289 g1 = h1;
1290 }
1291 *--out = VSWAPHL(g1, g0);
1292}
1293
1294static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
1295 v4sf g0, g1, h0, h1;
1296 int k;
1297 g0 = g1 = in[0]; ++in;
1298 for (k=1; k < N; ++k) {
1299 h0 = *in++; h1 = *in++;
1300 g1 = VSWAPHL(g1, h0);
1301 h0 = VSWAPHL(h0, h1);
1302 UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
1303 g1 = h1;
1304 }
1305 h0 = *in++; h1 = g0;
1306 g1 = VSWAPHL(g1, h0);
1307 h0 = VSWAPHL(h0, h1);
1308 UNINTERLEAVE2(h0, g1, out[0], out[1]);
1309}
1310
1311void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1312 int k, N = setup->N, Ncvec = setup->Ncvec;
1313 const v4sf *vin = (const v4sf*)in;
1314 v4sf *vout = (v4sf*)out;
1315 assert(in != out);
1316 if (setup->transform == PFFFT_REAL) {
1317 int dk = N/32;
1318 if (direction == PFFFT_FORWARD) {
1319 for (k=0; k < dk; ++k) {
1320 INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1321 INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1322 }
      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1325 } else {
1326 for (k=0; k < dk; ++k) {
1327 UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1328 UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1329 }
      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1332 }
1333 } else {
1334 if (direction == PFFFT_FORWARD) {
1335 for (k=0; k < Ncvec; ++k) {
1336 int kk = (k/4) + (k%4)*(Ncvec/4);
1337 INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1338 }
1339 } else {
1340 for (k=0; k < Ncvec; ++k) {
1341 int kk = (k/4) + (k%4)*(Ncvec/4);
1342 UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1343 }
1344 }
1345 }
1346}
1347
1348void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1349 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1350 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1351 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1352 assert(in != out);
1353 for (k=0; k < dk; ++k) {
1354 r0 = in[8*k+0]; i0 = in[8*k+1];
1355 r1 = in[8*k+2]; i1 = in[8*k+3];
1356 r2 = in[8*k+4]; i2 = in[8*k+5];
1357 r3 = in[8*k+6]; i3 = in[8*k+7];
1358 VTRANSPOSE4(r0,r1,r2,r3);
1359 VTRANSPOSE4(i0,i1,i2,i3);
1360 VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1361 VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1362 VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1363
1364 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1365 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1366 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1367 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1368
1369 /*
1370 transformation for each column is:
1371
1372 [1 1 1 1 0 0 0 0] [r0]
1373 [1 0 -1 0 0 -1 0 1] [r1]
1374 [1 -1 1 -1 0 0 0 0] [r2]
1375 [1 0 -1 0 0 1 0 -1] [r3]
1376 [0 0 0 0 1 1 1 1] * [i0]
1377 [0 1 0 -1 1 0 -1 0] [i1]
1378 [0 0 0 0 1 -1 1 -1] [i2]
1379 [0 -1 0 1 1 0 -1 0] [i3]
1380 */
1381
1382 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1383 r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
1384 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1385 r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
1386
1387 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1388 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1389 }
1390}
1391
1392void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1393 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1394 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1395 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1396 assert(in != out);
1397 for (k=0; k < dk; ++k) {
1398 r0 = in[8*k+0]; i0 = in[8*k+1];
1399 r1 = in[8*k+2]; i1 = in[8*k+3];
1400 r2 = in[8*k+4]; i2 = in[8*k+5];
1401 r3 = in[8*k+6]; i3 = in[8*k+7];
1402
1403 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1404 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1405 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1406 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1407
1408 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1409 r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
1410 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1411 r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
1412
1413 VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
1414 VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
1415 VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
1416
1417 VTRANSPOSE4(r0,r1,r2,r3);
1418 VTRANSPOSE4(i0,i1,i2,i3);
1419
1420 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1421 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1422 }
1423}
1424
1425
1426static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1427 const v4sf *e, v4sf *out) {
1428 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1429 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1430 r0 = *in0; i0 = *in1;
1431 r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
1432 VTRANSPOSE4(r0,r1,r2,r3);
1433 VTRANSPOSE4(i0,i1,i2,i3);
1434
1435 /*
1436 transformation for each column is:
1437
1438 [1 1 1 1 0 0 0 0] [r0]
1439 [1 0 -1 0 0 -1 0 1] [r1]
1440 [1 0 -1 0 0 1 0 -1] [r2]
1441 [1 -1 1 -1 0 0 0 0] [r3]
1442 [0 0 0 0 1 1 1 1] * [i0]
1443 [0 -1 0 1 -1 0 1 0] [i1]
1444 [0 -1 0 1 1 0 -1 0] [i2]
1445 [0 0 0 0 -1 1 -1 1] [i3]
1446 */
1447
1448 //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1449 //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1450
1451 VCPLXMUL(r1,i1,e[0],e[1]);
1452 VCPLXMUL(r2,i2,e[2],e[3]);
1453 VCPLXMUL(r3,i3,e[4],e[5]);
1454
1455 //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1456 //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1457
1458 sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
1459 sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
1460 si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
1461 si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
1462
1463 r0 = VADD(sr0, sr1);
1464 r3 = VSUB(sr0, sr1);
1465 i0 = VADD(si0, si1);
1466 i3 = VSUB(si1, si0);
1467 r1 = VADD(dr0, di1);
1468 r2 = VSUB(dr0, di1);
1469 i1 = VSUB(dr1, di0);
1470 i2 = VADD(dr1, di0);
1471
1472 *out++ = r0;
1473 *out++ = i0;
1474 *out++ = r1;
1475 *out++ = i1;
1476 *out++ = r2;
1477 *out++ = i2;
1478 *out++ = r3;
1479 *out++ = i3;
1480
1481}
1482
1483static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1484 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1485 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1486
1487 v4sf_union cr, ci, *uout = (v4sf_union*)out;
1488 v4sf save = in[7], zero=VZERO();
1489 float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
1490 static const float s = M_SQRT2/2;
1491
1492 cr.v = in[0]; ci.v = in[Ncvec*2-1];
1493 assert(in != out);
  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1495
1496 /*
1497 [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1498
1499 [Xr(1)] ] [1 1 1 1 0 0 0 0]
1500 [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
1501 [Xr(N/2) ] [1 0 -1 0 0 0 0 0]
1502 [Xr(3N/4)] [0 0 0 0 1 -s 0 s]
1503 [Xi(1) ] [1 -1 1 -1 0 0 0 0]
1504 [Xi(N/4) ] [0 0 0 0 0 -s -1 -s]
1505 [Xi(N/2) ] [0 -1 0 1 0 0 0 0]
1506 [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
1507 */
1508
1509 xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
1510 xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
1511 xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2;
1512 xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2;
1513 xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1;
1514 xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1;
1515 xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3;
1516 xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3;
1517
1518 for (k=1; k < dk; ++k) {
1519 v4sf save_next = in[8*k+7];
    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
                            e + k*6, out + k*8);
1522 save = save_next;
1523 }
1524
1525}
1526
1527static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
1528 const v4sf *e, v4sf *out, int first) {
1529 v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
1530 /*
1531 transformation for each column is:
1532
1533 [1 1 1 1 0 0 0 0] [r0]
1534 [1 0 0 -1 0 -1 -1 0] [r1]
1535 [1 -1 -1 1 0 0 0 0] [r2]
1536 [1 0 0 -1 0 1 1 0] [r3]
1537 [0 0 0 0 1 -1 1 -1] * [i0]
1538 [0 -1 1 0 1 0 0 1] [i1]
1539 [0 0 0 0 1 1 -1 -1] [i2]
1540 [0 1 -1 0 1 0 0 1] [i3]
1541 */
1542
1543 v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
1544 v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
1545 v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
1546 v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
1547
1548 r0 = VADD(sr0, sr1);
1549 r2 = VSUB(sr0, sr1);
1550 r1 = VSUB(dr0, si1);
1551 r3 = VADD(dr0, si1);
1552 i0 = VSUB(di0, di1);
1553 i2 = VADD(di0, di1);
1554 i1 = VSUB(si0, dr1);
1555 i3 = VADD(si0, dr1);
1556
1557 VCPLXMULCONJ(r1,i1,e[0],e[1]);
1558 VCPLXMULCONJ(r2,i2,e[2],e[3]);
1559 VCPLXMULCONJ(r3,i3,e[4],e[5]);
1560
1561 VTRANSPOSE4(r0,r1,r2,r3);
1562 VTRANSPOSE4(i0,i1,i2,i3);
1563
1564 if (!first) {
1565 *out++ = r0;
1566 *out++ = i0;
1567 }
1568 *out++ = r1;
1569 *out++ = i1;
1570 *out++ = r2;
1571 *out++ = i2;
1572 *out++ = r3;
1573 *out++ = i3;
1574}
1575
1576static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1577 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1578 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1579
1580 v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
1581 float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
1582 static const float s = M_SQRT2;
1583 assert(in != out);
1584 for (k=0; k < 4; ++k) {
1585 Xr.f[k] = ((float*)in)[8*k];
1586 Xi.f[k] = ((float*)in)[8*k+4];
1587 }
1588
  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
1590
1591 /*
1592 [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1593
1594 [cr0] [1 0 2 0 1 0 0 0]
1595 [cr1] [1 0 0 0 -1 0 -2 0]
1596 [cr2] [1 0 -2 0 1 0 0 0]
1597 [cr3] [1 0 0 0 -1 0 2 0]
1598 [ci0] [0 2 0 2 0 0 0 0]
1599 [ci1] [0 s 0 -s 0 -s 0 -s]
1600 [ci2] [0 0 0 0 0 -2 0 2]
1601 [ci3] [0 -s 0 s 0 -s 0 -s]
1602 */
1603 for (k=1; k < dk; ++k) {
    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
1605 }
1606
1607 cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
1608 cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
1609 cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
1610 cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
1611 ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0;
1612 ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
1613 ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2;
1614 ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
1615}
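
/* pffft_real_preprocess() is the backward-direction counterpart of
   pffft_real_finalize(): it converts the internal SIMD spectrum layout back
   into the fftpack half-complex order (f0r f1r f1i ... f(n)r) that
   rfftb1_ps() expects (see pffft_transform_internal below). */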


void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
                              pffft_direction_t direction, int ordered) {
  int k, Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);

  const v4sf *vinput = (const v4sf*)finput;
  v4sf *voutput      = (v4sf*)foutput;
  v4sf *buff[2]      = { voutput, scratch ? scratch : scratch_on_stack };
  int ib = (nf_odd ^ ordered ? 1 : 0);

  assert(VALIGNED(finput) && VALIGNED(foutput));

  //assert(finput != foutput);
  if (direction == PFFFT_FORWARD) {
    ib = !ib;
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    } else {
      v4sf *tmp = buff[ib];
      for (k=0; k < Ncvec; ++k) {
        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
      }
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
    } else ib = !ib;
  } else {
    if (vinput == buff[ib]) {
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
      vinput = buff[ib]; ib = !ib;
    }
    if (setup->transform == PFFFT_REAL) {
      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
      for (k=0; k < Ncvec; ++k) {
        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
      }
    }
  }

  if (buff[ib] != voutput) {
    /* extra copy required -- this situation should only happen when finput == foutput */
    assert(finput==foutput);
    for (k=0; k < Ncvec; ++k) {
      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
      voutput[2*k] = a; voutput[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == voutput);
}
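
/* buff[0] aliases the caller's output and buff[1] the scratch area; 'ib'
   tracks which of the two currently holds the intermediate result, so that
   after the rfft/cfft passes and the finalize/preprocess step the data lands
   directly in voutput. The copy loop above is only needed for the in-place
   case (finput == foutput). */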

void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
  int Ncvec = s->Ncvec;
  const v4sf * RESTRICT va = (const v4sf*)a;
  const v4sf * RESTRICT vb = (const v4sf*)b;
  v4sf * RESTRICT vab = (v4sf*)ab;

#ifdef __arm__
  __builtin_prefetch(va);
  __builtin_prefetch(vb);
  __builtin_prefetch(vab);
  __builtin_prefetch(va+2);
  __builtin_prefetch(vb+2);
  __builtin_prefetch(vab+2);
  __builtin_prefetch(va+4);
  __builtin_prefetch(vb+4);
  __builtin_prefetch(vab+4);
  __builtin_prefetch(va+6);
  __builtin_prefetch(vb+6);
  __builtin_prefetch(vab+6);
# ifndef __clang__
#   define ZCONVOLVE_USING_INLINE_ASM /* NEON inline asm path below */
# endif
#endif

  float ar, ai, br, bi, abr, abi;
#ifndef ZCONVOLVE_USING_INLINE_ASM
  v4sf vscal = LD_PS1(scaling);
  int i;
#endif

  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
  ar = ((v4sf_union*)va)[0].f[0];
  ai = ((v4sf_union*)va)[1].f[0];
  br = ((v4sf_union*)vb)[0].f[0];
  bi = ((v4sf_union*)vb)[1].f[0];
  abr = ((v4sf_union*)vab)[0].f[0];
  abi = ((v4sf_union*)vab)[1].f[0];

#ifdef ZCONVOLVE_USING_INLINE_ASM
  // inline asm version, unfortunately miscompiled by clang 3.2 (at least on Ubuntu),
  // so this path is restricted to gcc
  const float *a_ = a, *b_ = b; float *ab_ = ab;
  int N = Ncvec;
  asm volatile("mov         r8, %2                  \n"
               "vdup.f32    q15, %4                 \n"
               "1:                                  \n"
               "pld         [%0,#64]                \n"
               "pld         [%1,#64]                \n"
               "pld         [%2,#64]                \n"
               "pld         [%0,#96]                \n"
               "pld         [%1,#96]                \n"
               "pld         [%2,#96]                \n"
               "vld1.f32    {q0,q1},   [%0,:128]!   \n"
               "vld1.f32    {q4,q5},   [%1,:128]!   \n"
               "vld1.f32    {q2,q3},   [%0,:128]!   \n"
               "vld1.f32    {q6,q7},   [%1,:128]!   \n"
               "vld1.f32    {q8,q9},   [r8,:128]!   \n"

               "vmul.f32    q10, q0, q4             \n"
               "vmul.f32    q11, q0, q5             \n"
               "vmul.f32    q12, q2, q6             \n"
               "vmul.f32    q13, q2, q7             \n"
               "vmls.f32    q10, q1, q5             \n"
               "vmla.f32    q11, q1, q4             \n"
               "vld1.f32    {q0,q1},   [r8,:128]!   \n"
               "vmls.f32    q12, q3, q7             \n"
               "vmla.f32    q13, q3, q6             \n"
               "vmla.f32    q8, q10, q15            \n"
               "vmla.f32    q9, q11, q15            \n"
               "vmla.f32    q0, q12, q15            \n"
               "vmla.f32    q1, q13, q15            \n"
               "vst1.f32    {q8,q9}, [%2,:128]!     \n"
               "vst1.f32    {q0,q1}, [%2,:128]!     \n"
               "subs        %3, #2                  \n"
               "bne         1b                      \n"
               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
#else // default routine, works fine for non-arm cpus with current compilers
  for (i=0; i < Ncvec; i += 2) {
    v4sf ar, ai, br, bi;
    ar = va[2*i+0]; ai = va[2*i+1];
    br = vb[2*i+0]; bi = vb[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
    ar = va[2*i+2]; ai = va[2*i+3];
    br = vb[2*i+2]; bi = vb[2*i+3];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
  }
#endif
  if (s->transform == PFFFT_REAL) {
    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
  }
}
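
/* Example (illustrative sketch, not part of the original sources): frequency
   domain convolution / FIR filtering with a PFFFT_REAL setup 's' of size N.
   Both spectra must be in the internal (unordered) layout produced by
   pffft_transform(), and the accumulator must be cleared beforehand since
   this function accumulates into it. The backward transform is unscaled,
   hence the 1.0f/N factor. All buffer names below are placeholders for
   16-byte-aligned N-float buffers, e.g. from pffft_aligned_malloc().

     pffft_transform(s, x, X, work, PFFFT_FORWARD);   // X = FFT(x)
     pffft_transform(s, h, H, work, PFFFT_FORWARD);   // H = FFT(h)
     memset(Y, 0, N * sizeof(float));                 // clear the accumulator
     pffft_zconvolve_accumulate(s, X, H, Y, 1.0f/N);  // Y += (X * H) / N
     pffft_transform(s, Y, y, work, PFFFT_BACKWARD);  // y = x (*) h
*/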


#else // defined(PFFFT_SIMD_DISABLE)

// standard routine using scalar floats, without SIMD stuff.

#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
  int k, N = setup->N;
  if (setup->transform == PFFFT_COMPLEX) {
    for (k=0; k < 2*N; ++k) out[k] = in[k];
    return;
  }
  else if (direction == PFFFT_FORWARD) {
    float x_N = in[N-1];
    for (k=N-1; k > 1; --k) out[k] = in[k-1];
    out[0] = in[0];
    out[1] = x_N;
  } else {
    float x_N = in[1];
    for (k=1; k < N-1; ++k) out[k] = in[k+1];
    out[0] = in[0];
    out[N-1] = x_N;
  }
}
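
/* Illustration of the reordering above (scalar real case): for N = 8 the
   fftpack forward output [X0r X1r X1i X2r X2i X3r X3i X4r] becomes the
   "ordered" layout [X0r X4r X1r X1i X2r X2i X3r X3i], i.e. the (always zero)
   imaginary part of bin 0 is replaced by the real-valued Nyquist bin X(N/2);
   PFFFT_BACKWARD performs the inverse permutation. */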

#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
                                     pffft_direction_t direction, int ordered) {
  int Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
  float *buff[2];
  int ib;
  if (scratch == 0) scratch = scratch_on_stack;
  buff[0] = output; buff[1] = scratch;

  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
  ib = (nf_odd ^ ordered ? 1 : 0);

  if (direction == PFFFT_FORWARD) {
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
    }
    if (ordered) {
      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
    }
  } else {
    if (input == buff[ib]) {
      ib = !ib; // may happen when input == output
    }
    if (ordered) {
      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
      input = buff[!ib];
    }
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
    }
  }
  if (buff[ib] != output) {
    int k;
    // extra copy required -- this situation should only happen when input == output
    assert(input==output);
    for (k=0; k < Ncvec; ++k) {
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
      output[2*k] = a; output[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == output);
}

#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
                                       float *ab, float scaling) {
  int i, Ncvec = s->Ncvec;

  if (s->transform == PFFFT_REAL) {
    // take care of the fftpack ordering
    ab[0] += a[0]*b[0]*scaling;
    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
    ++ab; ++a; ++b; --Ncvec;
  }
  for (i=0; i < Ncvec; ++i) {
    float ar, ai, br, bi;
    ar = a[2*i+0]; ai = a[2*i+1];
    br = b[2*i+0]; bi = b[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[2*i+0] += ar*scaling;
    ab[2*i+1] += ai*scaling;
  }
}

#endif // defined(PFFFT_SIMD_DISABLE)

void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}
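
/* Example (illustrative sketch; pffft.h is the authoritative reference): an
   ordered forward real FFT of 1024 samples. Buffers must be 16-byte aligned,
   which pffft_aligned_malloc() guarantees; 'work' may also be passed as NULL
   for moderate sizes, in which case the scratch buffer is taken on the stack
   (see VLA_ARRAY_ON_STACK above).

     PFFFT_Setup *s = pffft_new_setup(1024, PFFFT_REAL);
     float *in   = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     float *out  = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     float *work = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     // ... fill in[] with samples ...
     pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);
     // out[0] = X(0).re, out[1] = X(N/2).re, then X(1).re, X(1).im, ...
     pffft_aligned_free(work);
     pffft_aligned_free(out);
     pffft_aligned_free(in);
     pffft_destroy_setup(s);
*/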