1/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
2
3 Based on original fortran 77 code from FFTPACKv4 from NETLIB
4 (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
5 of NCAR, in 1985.
6
7 As confirmed by the NCAR fftpack software curators, the following
8 FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
9 released under the same terms.
10
11 FFTPACK license:
12
13 http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
14
15 Copyright (c) 2004 the University Corporation for Atmospheric
16 Research ("UCAR"). All rights reserved. Developed by NCAR's
17 Computational and Information Systems Laboratory, UCAR,
18 www.cisl.ucar.edu.
19
20 Redistribution and use of the Software in source and binary forms,
21 with or without modification, is permitted provided that the
22 following conditions are met:
23
24 - Neither the names of NCAR's Computational and Information Systems
25 Laboratory, the University Corporation for Atmospheric Research,
26 nor the names of its sponsors or contributors may be used to
27 endorse or promote products derived from this Software without
28 specific prior written permission.
29
30 - Redistributions of source code must retain the above copyright
31 notices, this list of conditions, and the disclaimer below.
32
33 - Redistributions in binary form must reproduce the above copyright
34 notice, this list of conditions, and the disclaimer below in the
35 documentation and/or other materials provided with the
36 distribution.
37
38 THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
39 EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
40 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
41 NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
42 HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
43 EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
44 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
45 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
46 SOFTWARE.
47
48
49 PFFFT : a Pretty Fast FFT.
50
   This file is largely based on the original FFTPACK implementation, modified in
52 order to take advantage of SIMD instructions of modern CPUs.
53*/
54
55/*
56 ChangeLog:
57 - 2011/10/02, version 1: This is the very first release of this file.
58*/
59
60#include "pffft.h"
61#include <stdlib.h>
62#include <stdio.h>
63#include <math.h>
64#include <assert.h>
65
66#ifndef M_PI
67#define M_PI 3.14159265358979323846264338327950288
68#endif
69#ifndef M_SQRT2
70#define M_SQRT2 1.41421356237309504880168872420969808
71#endif
72
73/* detect compiler flavour */
74#if defined(_MSC_VER)
75# define COMPILER_MSVC
76#elif defined(__GNUC__)
77# define COMPILER_GCC
78#endif
79
80#if defined(COMPILER_GCC)
81# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
82# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
83# define RESTRICT __restrict
84# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
85#elif defined(COMPILER_MSVC)
86# define ALWAYS_INLINE(return_type) __forceinline return_type
87# define NEVER_INLINE(return_type) __declspec(noinline) return_type
88# define RESTRICT __restrict
89# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
90#endif
91
92
93/*
  vector support macros: the rest of the code is independent of
95 SSE/Altivec/NEON -- adding support for other platforms with 4-element
96 vectors should be limited to these macros
97*/
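
/*
  For reference, a port to another platform with 4-element float vectors would
  (as a sketch, judging from the existing SSE/Altivec/NEON branches below) only
  need to provide the same small macro set:
    SIMD_SZ, VZERO(), VMUL, VADD, VMADD, VSUB, LD_PS1, INTERLEAVE2,
    UNINTERLEAVE2, VTRANSPOSE4, VSWAPHL and VALIGNED -- the scalar
    PFFFT_SIMD_DISABLE fallback further down shows the arithmetic subset of these.
*/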
98
99
100// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code
101//#define PFFFT_SIMD_DISABLE
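// (e.g. build with -DPFFFT_SIMD_DISABLE instead of uncommenting the line above)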
102
103/*
104 Altivec support macros
105*/
106#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__))
107#include <altivec.h>
108typedef vector float v4sf;
109# define SIMD_SZ 4
110# define VZERO() ((vector float) vec_splat_u8(0))
111# define VMUL(a,b) vec_madd(a,b, VZERO())
112# define VADD(a,b) vec_add(a,b)
113# define VMADD(a,b,c) vec_madd(a,b,c)
114# define VSUB(a,b) vec_sub(a,b)
115inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); }
116# define LD_PS1(p) ld_ps1(&p)
117# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
118# define UNINTERLEAVE2(in1, in2, out1, out2) { \
119 vector unsigned char vperm1 = (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \
120 vector unsigned char vperm2 = (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \
121 v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
122 }
123# define VTRANSPOSE4(x0,x1,x2,x3) { \
124 v4sf y0 = vec_mergeh(x0, x2); \
125 v4sf y1 = vec_mergel(x0, x2); \
126 v4sf y2 = vec_mergeh(x1, x3); \
127 v4sf y3 = vec_mergel(x1, x3); \
128 x0 = vec_mergeh(y0, y2); \
129 x1 = vec_mergel(y0, y2); \
130 x2 = vec_mergeh(y1, y3); \
131 x3 = vec_mergel(y1, y3); \
132 }
133# define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15})
134# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
135
136/*
137 SSE1 support macros
138*/
139#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86))
140
141#include <xmmintrin.h>
142typedef __m128 v4sf;
143# define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
144# define VZERO() _mm_setzero_ps()
145# define VMUL(a,b) _mm_mul_ps(a,b)
146# define VADD(a,b) _mm_add_ps(a,b)
147# define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c)
148# define VSUB(a,b) _mm_sub_ps(a,b)
149# define LD_PS1(p) _mm_set1_ps(p)
150# define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
151# define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
152# define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
153# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
154# define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0)
155
156/*
157 ARM NEON support macros
158*/
159#elif !defined(PFFFT_SIMD_DISABLE) && \
160 (((defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM)) && defined(__ARM_NEON__)) || \
161 defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__))
162# include <arm_neon.h>
163typedef float32x4_t v4sf;
164# define SIMD_SZ 4
165# define VZERO() vdupq_n_f32(0)
166# define VMUL(a,b) vmulq_f32(a,b)
167# define VADD(a,b) vaddq_f32(a,b)
168# define VMADD(a,b,c) vmlaq_f32(c,a,b)
169# define VSUB(a,b) vsubq_f32(a,b)
170# define LD_PS1(p) vld1q_dup_f32(&(p))
171# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
172# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
173# define VTRANSPOSE4(x0,x1,x2,x3) { \
174 float32x4x2_t t0_ = vzipq_f32(x0, x2); \
175 float32x4x2_t t1_ = vzipq_f32(x1, x3); \
176 float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
177 float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \
178 x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
179 }
180// marginally faster version
181//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
182# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
183# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
184#else
185# if !defined(PFFFT_SIMD_DISABLE)
186# warning "building with simd disabled !\n";
187# define PFFFT_SIMD_DISABLE // fallback to scalar code
188# endif
189#endif
190
// fallback mode for situations where SSE/Altivec/NEON is not available: use scalar code instead
192#ifdef PFFFT_SIMD_DISABLE
193typedef float v4sf;
194# define SIMD_SZ 1
195# define VZERO() 0.f
196# define VMUL(a,b) ((a)*(b))
197# define VADD(a,b) ((a)+(b))
198# define VMADD(a,b,c) ((a)*(b)+(c))
199# define VSUB(a,b) ((a)-(b))
200# define LD_PS1(p) (p)
201# define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0)
202#endif
203
// shortcuts for complex multiplications
205#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
206#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
207#ifndef SVMUL
208// multiply a scalar with a vector
209#define SVMUL(f,v) VMUL(LD_PS1(f),v)
210#endif
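
/*
  In scalar terms, per SIMD lane, the two macros above compute (a sketch of the
  underlying complex arithmetic, with ar/ai updated in place):
    VCPLXMUL(ar,ai,br,bi):     (ar + i*ai) *= (br + i*bi)
                               -> ar = ar*br - ai*bi,  ai = ai*br + ar_old*bi
    VCPLXMULCONJ(ar,ai,br,bi): (ar + i*ai) *= conj(br + i*bi)
                               -> ar = ar*br + ai*bi,  ai = ai*br - ar_old*bi
  and SVMUL(f,v) broadcasts the scalar f and multiplies it with the vector v.
*/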
211
212#if !defined(PFFFT_SIMD_DISABLE)
213typedef union v4sf_union {
214 v4sf v;
215 float f[4];
216} v4sf_union;
217
218#include <string.h>
219
220#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
221
222/* detect bugs with the vector support macros */
void validate_pffft_simd(void) {
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0; u = a1; t.v = VZERO();
  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v,a0.v);
  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);

  t.v=LD_PS1(f[15]);
  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
259#else
260void validate_pffft_simd() {} // allow test_pffft.c to call this function even when simd is not available..
261#endif //!PFFFT_SIMD_DISABLE
262
/* SSE and co. like 16-byte aligned pointers */
#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines...
void *pffft_aligned_malloc(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
}

void pffft_aligned_free(void *p) {
  if (p) free(*((void **) p - 1));
}

int pffft_simd_size(void) { return SIMD_SZ; }
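
/* Illustrative usage sketch (an assumption-level example, not part of the library):
   buffers passed to the transform routines must be SIMD-aligned (see the
   VALIGNED() asserts in pffft_transform_internal below), which is most easily
   guaranteed by allocating them with the helpers above:

     float *buf = (float*)pffft_aligned_malloc(N * sizeof(float));
     ...
     pffft_aligned_free(buf);
*/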
278
279/*
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
281*/
282static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
283 int k, i;
284 int l1ido = l1*ido;
285 if (ido <= 2) {
286 for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
287 ch[0] = VADD(cc[0], cc[ido+0]);
288 ch[l1ido] = VSUB(cc[0], cc[ido+0]);
289 ch[1] = VADD(cc[1], cc[ido+1]);
290 ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
291 }
292 } else {
293 for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
294 for (i=0; i<ido-1; i+=2) {
295 v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
296 v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
297 v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1]));
298 ch[i] = VADD(cc[i+0], cc[i+ido+0]);
299 ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
300 VCPLXMUL(tr2, ti2, wr, wi);
301 ch[i+l1ido] = tr2;
302 ch[i+l1ido+1] = ti2;
303 }
304 }
305 }
306}
307
308/*
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
310*/
311static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
312 const float *wa1, const float *wa2, float fsign) {
313 static const float taur = -0.5f;
314 float taui = 0.866025403784439f*fsign;
315 int i, k;
316 v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
317 int l1ido = l1*ido;
318 float wr1, wi1, wr2, wi2;
319 assert(ido > 2);
320 for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
321 for (i=0; i<ido-1; i+=2) {
322 tr2 = VADD(cc[i+ido], cc[i+2*ido]);
323 cr2 = VADD(cc[i], SVMUL(taur,tr2));
324 ch[i] = VADD(cc[i], tr2);
325 ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
326 ci2 = VADD(cc[i +1], SVMUL(taur,ti2));
327 ch[i+1] = VADD(cc[i+1], ti2);
328 cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
329 ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
330 dr2 = VSUB(cr2, ci3);
331 dr3 = VADD(cr2, ci3);
332 di2 = VADD(ci2, cr3);
333 di3 = VSUB(ci2, cr3);
334 wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
335 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
336 ch[i+l1ido] = dr2;
337 ch[i+l1ido + 1] = di2;
338 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
339 ch[i+2*l1ido] = dr3;
340 ch[i+2*l1ido+1] = di3;
341 }
342 }
343} /* passf3 */
344
345static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
346 const float *wa1, const float *wa2, const float *wa3, float fsign) {
  /* fsign == -1 for forward transform and +1 for backward transform */
348
349 int i, k;
350 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
351 int l1ido = l1*ido;
352 if (ido == 2) {
353 for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
354 tr1 = VSUB(cc[0], cc[2*ido + 0]);
355 tr2 = VADD(cc[0], cc[2*ido + 0]);
356 ti1 = VSUB(cc[1], cc[2*ido + 1]);
357 ti2 = VADD(cc[1], cc[2*ido + 1]);
358 ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
359 tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
360 tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
361 ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
362
363 ch[0*l1ido + 0] = VADD(tr2, tr3);
364 ch[0*l1ido + 1] = VADD(ti2, ti3);
365 ch[1*l1ido + 0] = VADD(tr1, tr4);
366 ch[1*l1ido + 1] = VADD(ti1, ti4);
367 ch[2*l1ido + 0] = VSUB(tr2, tr3);
368 ch[2*l1ido + 1] = VSUB(ti2, ti3);
369 ch[3*l1ido + 0] = VSUB(tr1, tr4);
370 ch[3*l1ido + 1] = VSUB(ti1, ti4);
371 }
372 } else {
373 for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
374 for (i=0; i<ido-1; i+=2) {
375 float wr1, wi1, wr2, wi2, wr3, wi3;
376 tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
377 tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
378 ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
379 ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
380 tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
381 ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
382 tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
383 ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
384
385 ch[i] = VADD(tr2, tr3);
386 cr3 = VSUB(tr2, tr3);
387 ch[i + 1] = VADD(ti2, ti3);
388 ci3 = VSUB(ti2, ti3);
389
390 cr2 = VADD(tr1, tr4);
391 cr4 = VSUB(tr1, tr4);
392 ci2 = VADD(ti1, ti4);
393 ci4 = VSUB(ti1, ti4);
394 wr1=wa1[i]; wi1=fsign*wa1[i+1];
395 VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
396 wr2=wa2[i]; wi2=fsign*wa2[i+1];
397 ch[i + l1ido] = cr2;
398 ch[i + l1ido + 1] = ci2;
399
400 VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
401 wr3=wa3[i]; wi3=fsign*wa3[i+1];
402 ch[i + 2*l1ido] = cr3;
403 ch[i + 2*l1ido + 1] = ci3;
404
405 VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
406 ch[i + 3*l1ido] = cr4;
407 ch[i + 3*l1ido + 1] = ci4;
408 }
409 }
410 }
411} /* passf4 */
412
413/*
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
415*/
416static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
417 const float *wa1, const float *wa2,
418 const float *wa3, const float *wa4, float fsign) {
419 static const float tr11 = .309016994374947f;
420 const float ti11 = .951056516295154f*fsign;
421 static const float tr12 = -.809016994374947f;
422 const float ti12 = .587785252292473f*fsign;
423
424 /* Local variables */
425 int i, k;
426 v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
427 ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
428
429 float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
430
431#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
432#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
433
434 assert(ido > 2);
435 for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
436 for (i = 0; i < ido-1; i += 2) {
437 ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5));
438 ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5));
439 ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4));
440 ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4));
441 tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
442 tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
443 tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
444 tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
445 ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
446 ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3));
447 cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
448 ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
449 cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
450 ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
451 cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
452 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
453 cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
454 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
455 dr3 = VSUB(cr3, ci4);
456 dr4 = VADD(cr3, ci4);
457 di3 = VADD(ci3, cr4);
458 di4 = VSUB(ci3, cr4);
459 dr5 = VADD(cr2, ci5);
460 dr2 = VSUB(cr2, ci5);
461 di5 = VSUB(ci2, cr5);
462 di2 = VADD(ci2, cr5);
463 wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1];
464 wr3=wa3[i]; wi3=fsign*wa3[i+1]; wr4=wa4[i]; wi4=fsign*wa4[i+1];
465 VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
466 ch_ref(i - 1, 2) = dr2;
467 ch_ref(i, 2) = di2;
468 VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
469 ch_ref(i - 1, 3) = dr3;
470 ch_ref(i, 3) = di3;
471 VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
472 ch_ref(i - 1, 4) = dr4;
473 ch_ref(i, 4) = di4;
474 VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
475 ch_ref(i - 1, 5) = dr5;
476 ch_ref(i, 5) = di5;
477 }
478 }
479#undef ch_ref
480#undef cc_ref
481}
482
483static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
484 static const float minus_one = -1.f;
485 int i, k, l1ido = l1*ido;
486 for (k=0; k < l1ido; k += ido) {
487 v4sf a = cc[k], b = cc[k + l1ido];
488 ch[2*k] = VADD(a, b);
489 ch[2*(k+ido)-1] = VSUB(a, b);
490 }
491 if (ido < 2) return;
492 if (ido != 2) {
493 for (k=0; k < l1ido; k += ido) {
494 for (i=2; i<ido; i+=2) {
495 v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
496 v4sf br = cc[i - 1 + k], bi = cc[i + k];
497 VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
498 ch[i + 2*k] = VADD(bi, ti2);
499 ch[2*(k+ido) - i] = VSUB(ti2, bi);
500 ch[i - 1 + 2*k] = VADD(br, tr2);
501 ch[2*(k+ido) - i -1] = VSUB(br, tr2);
502 }
503 }
504 if (ido % 2 == 1) return;
505 }
506 for (k=0; k < l1ido; k += ido) {
507 ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
508 ch[2*k + ido-1] = cc[k + ido-1];
509 }
510} /* radf2 */
511
512
513static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
514 static const float minus_two=-2;
515 int i, k, l1ido = l1*ido;
516 v4sf a,b,c,d, tr2, ti2;
517 for (k=0; k < l1ido; k += ido) {
518 a = cc[2*k]; b = cc[2*(k+ido) - 1];
519 ch[k] = VADD(a, b);
520 ch[k + l1ido] =VSUB(a, b);
521 }
522 if (ido < 2) return;
523 if (ido != 2) {
524 for (k = 0; k < l1ido; k += ido) {
525 for (i = 2; i < ido; i += 2) {
526 a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
527 c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
528 ch[i-1 + k] = VADD(a, b);
529 tr2 = VSUB(a, b);
530 ch[i+0 + k] = VSUB(c, d);
531 ti2 = VADD(c, d);
532 VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
533 ch[i-1 + k + l1ido] = tr2;
534 ch[i+0 + k + l1ido] = ti2;
535 }
536 }
537 if (ido % 2 == 1) return;
538 }
539 for (k = 0; k < l1ido; k += ido) {
540 a = cc[2*k + ido-1]; b = cc[2*k + ido];
541 ch[k + ido-1] = VADD(a,a);
542 ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
543 }
544} /* radb2 */
545
546static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
547 const float *wa1, const float *wa2) {
548 static const float taur = -0.5f;
549 static const float taui = 0.866025403784439f;
550 int i, k, ic;
551 v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
552 for (k=0; k<l1; k++) {
553 cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
554 ch[3*k*ido] = VADD(cc[k*ido], cr2);
555 ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
556 ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
557 }
558 if (ido == 1) return;
559 for (k=0; k<l1; k++) {
560 for (i=2; i<ido; i+=2) {
561 ic = ido - i;
562 wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
563 dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
564 VCPLXMULCONJ(dr2, di2, wr1, wi1);
565
566 wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
567 dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
568 VCPLXMULCONJ(dr3, di3, wr2, wi2);
569
570 cr2 = VADD(dr2, dr3);
571 ci2 = VADD(di2, di3);
572 ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
573 ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
574 tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
575 ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
576 tr3 = SVMUL(taui, VSUB(di2, di3));
577 ti3 = SVMUL(taui, VSUB(dr3, dr2));
578 ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
579 ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
580 ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
581 ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
582 }
583 }
584} /* radf3 */
585
586
587static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
588 const float *wa1, const float *wa2)
589{
590 static const float taur = -0.5f;
591 static const float taui = 0.866025403784439f;
592 static const float taui_2 = 0.866025403784439f*2;
593 int i, k, ic;
594 v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
595 for (k=0; k<l1; k++) {
596 tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
597 cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
598 ch[k*ido] = VADD(cc[3*k*ido], tr2);
599 ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
600 ch[(k + l1)*ido] = VSUB(cr2, ci3);
601 ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
602 }
603 if (ido == 1) return;
604 for (k=0; k<l1; k++) {
605 for (i=2; i<ido; i+=2) {
606 ic = ido - i;
607 tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
608 cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
609 ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
610 ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
611 ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
612 ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
613 cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
614 ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
615 dr2 = VSUB(cr2, ci3);
616 dr3 = VADD(cr2, ci3);
617 di2 = VADD(ci2, cr3);
618 di3 = VSUB(ci2, cr3);
619 VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
620 ch[i - 1 + (k + l1)*ido] = dr2;
621 ch[i + (k + l1)*ido] = di2;
622 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
623 ch[i - 1 + (k + 2*l1)*ido] = dr3;
624 ch[i + (k + 2*l1)*ido] = di3;
625 }
626 }
627} /* radb3 */
628
629static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
630 const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
631{
632 static const float minus_hsqt2 = (float)-0.7071067811865475;
633 int i, k, l1ido = l1*ido;
634 {
635 const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
636 v4sf * RESTRICT ch_ = ch;
637 while (cc < cc_end) {
638 // this loop represents between 25% and 40% of total radf4_ps cost !
639 v4sf a0 = cc[0], a1 = cc[l1ido];
640 v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
641 v4sf tr1 = VADD(a1, a3);
642 v4sf tr2 = VADD(a0, a2);
643 ch[2*ido-1] = VSUB(a0, a2);
644 ch[2*ido ] = VSUB(a3, a1);
645 ch[0 ] = VADD(tr1, tr2);
646 ch[4*ido-1] = VSUB(tr2, tr1);
647 cc += ido; ch += 4*ido;
648 }
649 cc = cc_; ch = ch_;
650 }
651 if (ido < 2) return;
652 if (ido != 2) {
653 for (k = 0; k < l1ido; k += ido) {
654 const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k);
655 for (i=2; i<ido; i += 2, pc += 2) {
656 int ic = ido - i;
657 v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
658 v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
659
660 cr2 = pc[1*l1ido+0];
661 ci2 = pc[1*l1ido+1];
662 wr=LD_PS1(wa1[i - 2]);
663 wi=LD_PS1(wa1[i - 1]);
664 VCPLXMULCONJ(cr2,ci2,wr,wi);
665
666 cr3 = pc[2*l1ido+0];
667 ci3 = pc[2*l1ido+1];
668 wr = LD_PS1(wa2[i-2]);
669 wi = LD_PS1(wa2[i-1]);
670 VCPLXMULCONJ(cr3, ci3, wr, wi);
671
672 cr4 = pc[3*l1ido];
673 ci4 = pc[3*l1ido+1];
674 wr = LD_PS1(wa3[i-2]);
675 wi = LD_PS1(wa3[i-1]);
676 VCPLXMULCONJ(cr4, ci4, wr, wi);
677
678 /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
679
680 tr1 = VADD(cr2,cr4);
681 tr4 = VSUB(cr4,cr2);
682 tr2 = VADD(pc[0],cr3);
683 tr3 = VSUB(pc[0],cr3);
684 ch[i - 1 + 4*k] = VADD(tr1,tr2);
685 ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed
686 ti1 = VADD(ci2,ci4);
687 ti4 = VSUB(ci2,ci4);
688 ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
689 ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4
690 ti2 = VADD(pc[1],ci3);
691 ti3 = VSUB(pc[1],ci3);
692 ch[i + 4*k] = VADD(ti1, ti2);
693 ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
694 ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
695 ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
696 }
697 }
698 if (ido % 2 == 1) return;
699 }
700 for (k=0; k<l1ido; k += ido) {
701 v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
702 v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
703 v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
704 v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
705 ch[ido-1 + 4*k] = VADD(tr1, c);
706 ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
707 ch[4*k + 1*ido] = VSUB(ti1, d);
708 ch[4*k + 3*ido] = VADD(ti1, d);
709 }
710} /* radf4 */
711
712
713static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
714 const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
715{
716 static const float minus_sqrt2 = (float)-1.414213562373095;
717 static const float two = 2.f;
718 int i, k, l1ido = l1*ido;
719 v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
720 {
721 const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
722 v4sf *ch_ = ch;
723 while (ch < ch_end) {
724 v4sf a = cc[0], b = cc[4*ido-1];
725 v4sf c = cc[2*ido], d = cc[2*ido-1];
726 tr3 = SVMUL(two,d);
727 tr2 = VADD(a,b);
728 tr1 = VSUB(a,b);
729 tr4 = SVMUL(two,c);
730 ch[0*l1ido] = VADD(tr2, tr3);
731 ch[2*l1ido] = VSUB(tr2, tr3);
732 ch[1*l1ido] = VSUB(tr1, tr4);
733 ch[3*l1ido] = VADD(tr1, tr4);
734
735 cc += 4*ido; ch += ido;
736 }
737 cc = cc_; ch = ch_;
738 }
739 if (ido < 2) return;
740 if (ido != 2) {
741 for (k = 0; k < l1ido; k += ido) {
742 const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k);
743 v4sf * RESTRICT ph = (v4sf*)(ch + k + 1);
744 for (i = 2; i < ido; i += 2) {
745
746 tr1 = VSUB(pc[i], pc[4*ido - i]);
747 tr2 = VADD(pc[i], pc[4*ido - i]);
748 ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
749 tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
750 ph[0] = VADD(tr2, tr3);
751 cr3 = VSUB(tr2, tr3);
752
753 ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
754 tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
755 cr2 = VSUB(tr1, tr4);
756 cr4 = VADD(tr1, tr4);
757
758 ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
759 ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
760
761 ph[1] = VADD(ti2, ti3); ph += l1ido;
762 ci3 = VSUB(ti2, ti3);
763 ci2 = VADD(ti1, ti4);
764 ci4 = VSUB(ti1, ti4);
765 VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
766 ph[0] = cr2;
767 ph[1] = ci2; ph += l1ido;
768 VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
769 ph[0] = cr3;
770 ph[1] = ci3; ph += l1ido;
771 VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
772 ph[0] = cr4;
773 ph[1] = ci4; ph = ph - 3*l1ido + 2;
774 }
775 }
776 if (ido % 2 == 1) return;
777 }
778 for (k=0; k < l1ido; k+=ido) {
779 int i0 = 4*k + ido;
780 v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
781 v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
782 tr1 = VSUB(c,d);
783 tr2 = VADD(c,d);
784 ti1 = VADD(b,a);
785 ti2 = VSUB(b,a);
786 ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
787 ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
788 ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
789 ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
790 }
791} /* radb4 */
792
793static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
794 const float *wa1, const float *wa2, const float *wa3, const float *wa4)
795{
796 static const float tr11 = .309016994374947f;
797 static const float ti11 = .951056516295154f;
798 static const float tr12 = -.809016994374947f;
799 static const float ti12 = .587785252292473f;
800
801 /* System generated locals */
802 int cc_offset, ch_offset;
803
804 /* Local variables */
805 int i, k, ic;
806 v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
807 cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
808 int idp2;
809
810
811#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
812#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
813
814 /* Parameter adjustments */
815 ch_offset = 1 + ido * 6;
816 ch -= ch_offset;
817 cc_offset = 1 + ido * (1 + l1);
818 cc -= cc_offset;
819
820 /* Function Body */
821 for (k = 1; k <= l1; ++k) {
822 cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
823 ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
824 cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
825 ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
826 ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
827 ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
828 ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
829 ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
830 ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
831 //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4);
832 }
833 if (ido == 1) {
834 return;
835 }
836 idp2 = ido + 2;
837 for (k = 1; k <= l1; ++k) {
838 for (i = 3; i <= ido; i += 2) {
839 ic = idp2 - i;
840 dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
841 dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
842 dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
843 dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
844 VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
845 VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
846 VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
847 VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
848 cr2 = VADD(dr2, dr5);
849 ci5 = VSUB(dr5, dr2);
850 cr5 = VSUB(di2, di5);
851 ci2 = VADD(di2, di5);
852 cr3 = VADD(dr3, dr4);
853 ci4 = VSUB(dr4, dr3);
854 cr4 = VSUB(di3, di4);
855 ci3 = VADD(di3, di4);
856 ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
857 ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));//
858 tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
859 ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));//
860 tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
861 ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));//
862 tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
863 ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
864 tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
865 ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
866 ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
867 ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
868 ch_ref(i, 3, k) = VADD(ti2, ti5);
869 ch_ref(ic, 2, k) = VSUB(ti5, ti2);
870 ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
871 ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
872 ch_ref(i, 5, k) = VADD(ti3, ti4);
873 ch_ref(ic, 4, k) = VSUB(ti4, ti3);
874 }
875 }
876#undef cc_ref
877#undef ch_ref
878} /* radf5 */
879
880static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
881 const float *wa1, const float *wa2, const float *wa3, const float *wa4)
882{
883 static const float tr11 = .309016994374947f;
884 static const float ti11 = .951056516295154f;
885 static const float tr12 = -.809016994374947f;
886 static const float ti12 = .587785252292473f;
887
888 int cc_offset, ch_offset;
889
890 /* Local variables */
891 int i, k, ic;
892 v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
893 ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
894 int idp2;
895
896#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
897#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
898
899 /* Parameter adjustments */
900 ch_offset = 1 + ido * (1 + l1);
901 ch -= ch_offset;
902 cc_offset = 1 + ido * 6;
903 cc -= cc_offset;
904
905 /* Function Body */
906 for (k = 1; k <= l1; ++k) {
907 ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
908 ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
909 tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
910 tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
911 ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
912 cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
913 cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
914 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
915 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
916 ch_ref(1, k, 2) = VSUB(cr2, ci5);
917 ch_ref(1, k, 3) = VSUB(cr3, ci4);
918 ch_ref(1, k, 4) = VADD(cr3, ci4);
919 ch_ref(1, k, 5) = VADD(cr2, ci5);
920 }
921 if (ido == 1) {
922 return;
923 }
924 idp2 = ido + 2;
925 for (k = 1; k <= l1; ++k) {
926 for (i = 3; i <= ido; i += 2) {
927 ic = idp2 - i;
928 ti5 = VADD(cc_ref(i , 3, k), cc_ref(ic , 2, k));
929 ti2 = VSUB(cc_ref(i , 3, k), cc_ref(ic , 2, k));
930 ti4 = VADD(cc_ref(i , 5, k), cc_ref(ic , 4, k));
931 ti3 = VSUB(cc_ref(i , 5, k), cc_ref(ic , 4, k));
932 tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
933 tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
934 tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
935 tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
936 ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
937 ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
938 cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
939 ci2 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
940 cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
941 ci3 = VADD(cc_ref(i , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
942 cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
943 ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
944 cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
945 ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
946 dr3 = VSUB(cr3, ci4);
947 dr4 = VADD(cr3, ci4);
948 di3 = VADD(ci3, cr4);
949 di4 = VSUB(ci3, cr4);
950 dr5 = VADD(cr2, ci5);
951 dr2 = VSUB(cr2, ci5);
952 di5 = VSUB(ci2, cr5);
953 di2 = VADD(ci2, cr5);
954 VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
955 VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
956 VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
957 VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
958
959 ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
960 ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
961 ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
962 ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
963 }
964 }
965#undef cc_ref
966#undef ch_ref
967} /* radb5 */
968
969static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
970 const float *wa, const int *ifac) {
971 v4sf *in = (v4sf*)input_readonly;
972 v4sf *out = (in == work2 ? work1 : work2);
973 int nf = ifac[1], k1;
974 int l2 = n;
975 int iw = n-1;
976 assert(in != out && work1 != work2);
977 for (k1 = 1; k1 <= nf; ++k1) {
978 int kh = nf - k1;
979 int ip = ifac[kh + 2];
980 int l1 = l2 / ip;
981 int ido = n / l2;
982 iw -= (ip - 1)*ido;
983 switch (ip) {
984 case 5: {
985 int ix2 = iw + ido;
986 int ix3 = ix2 + ido;
987 int ix4 = ix3 + ido;
        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
989 } break;
990 case 4: {
991 int ix2 = iw + ido;
992 int ix3 = ix2 + ido;
        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
994 } break;
995 case 3: {
996 int ix2 = iw + ido;
        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
998 } break;
999 case 2:
        radf2_ps(ido, l1, in, out, &wa[iw]);
1001 break;
1002 default:
1003 assert(0);
1004 break;
1005 }
1006 l2 = l1;
1007 if (out == work2) {
1008 out = work1; in = work2;
1009 } else {
1010 out = work2; in = work1;
1011 }
1012 }
1013 return in; /* this is in fact the output .. */
1014} /* rfftf1 */
1015
1016static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
1017 const float *wa, const int *ifac) {
1018 v4sf *in = (v4sf*)input_readonly;
1019 v4sf *out = (in == work2 ? work1 : work2);
1020 int nf = ifac[1], k1;
1021 int l1 = 1;
1022 int iw = 0;
1023 assert(in != out);
1024 for (k1=1; k1<=nf; k1++) {
1025 int ip = ifac[k1 + 1];
1026 int l2 = ip*l1;
1027 int ido = n / l2;
1028 switch (ip) {
1029 case 5: {
1030 int ix2 = iw + ido;
1031 int ix3 = ix2 + ido;
1032 int ix4 = ix3 + ido;
        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
1034 } break;
1035 case 4: {
1036 int ix2 = iw + ido;
1037 int ix3 = ix2 + ido;
        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
1039 } break;
1040 case 3: {
1041 int ix2 = iw + ido;
        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
1043 } break;
1044 case 2:
        radb2_ps(ido, l1, in, out, &wa[iw]);
1046 break;
1047 default:
1048 assert(0);
1049 break;
1050 }
1051 l1 = l2;
1052 iw += (ip - 1)*ido;
1053
1054 if (out == work2) {
1055 out = work1; in = work2;
1056 } else {
1057 out = work2; in = work1;
1058 }
1059 }
1060 return in; /* this is in fact the output .. */
1061}
1062
1063static int decompose(int n, int *ifac, const int *ntryh) {
1064 int nl = n, nf = 0, i, j = 0;
1065 for (j=0; ntryh[j]; ++j) {
1066 int ntry = ntryh[j];
1067 while (nl != 1) {
1068 int nq = nl / ntry;
1069 int nr = nl - ntry * nq;
1070 if (nr == 0) {
1071 ifac[2+nf++] = ntry;
1072 nl = nq;
1073 if (ntry == 2 && nf != 1) {
1074 for (i = 2; i <= nf; ++i) {
1075 int ib = nf - i + 2;
1076 ifac[ib + 1] = ifac[ib];
1077 }
1078 ifac[2] = 2;
1079 }
1080 } else break;
1081 }
1082 }
1083 ifac[0] = n;
1084 ifac[1] = nf;
1085 return nf;
1086}
1087
1088
1089
1090static void rffti1_ps(int n, float *wa, int *ifac)
1091{
1092 static const int ntryh[] = { 4,2,3,5,0 };
1093 int k1, j, ii;
1094
1095 int nf = decompose(n,ifac,ntryh);
1096 float argh = (2*M_PI) / n;
1097 int is = 0;
1098 int nfm1 = nf - 1;
1099 int l1 = 1;
1100 for (k1 = 1; k1 <= nfm1; k1++) {
1101 int ip = ifac[k1 + 1];
1102 int ld = 0;
1103 int l2 = l1*ip;
1104 int ido = n / l2;
1105 int ipm = ip - 1;
1106 for (j = 1; j <= ipm; ++j) {
1107 float argld;
1108 int i = is, fi=0;
1109 ld += l1;
1110 argld = ld*argh;
1111 for (ii = 3; ii <= ido; ii += 2) {
1112 i += 2;
1113 fi += 1;
        wa[i - 2] = cos(fi*argld);
        wa[i - 1] = sin(fi*argld);
1116 }
1117 is += ido;
1118 }
1119 l1 = l2;
1120 }
1121} /* rffti1 */
1122
1123void cffti1_ps(int n, float *wa, int *ifac)
1124{
1125 static const int ntryh[] = { 5,3,4,2,0 };
1126 int k1, j, ii;
1127
1128 int nf = decompose(n,ifac,ntryh);
1129 float argh = (2*M_PI)/(float)n;
1130 int i = 1;
1131 int l1 = 1;
1132 for (k1=1; k1<=nf; k1++) {
1133 int ip = ifac[k1+1];
1134 int ld = 0;
1135 int l2 = l1*ip;
1136 int ido = n / l2;
1137 int idot = ido + ido + 2;
1138 int ipm = ip - 1;
1139 for (j=1; j<=ipm; j++) {
1140 float argld;
1141 int i1 = i, fi = 0;
1142 wa[i-1] = 1;
1143 wa[i] = 0;
1144 ld += l1;
1145 argld = ld*argh;
1146 for (ii = 4; ii <= idot; ii += 2) {
1147 i += 2;
1148 fi += 1;
        wa[i-1] = cos(fi*argld);
        wa[i] = sin(fi*argld);
1151 }
1152 if (ip > 5) {
1153 wa[i1-1] = wa[i-1];
1154 wa[i1] = wa[i];
1155 }
1156 }
1157 l1 = l2;
1158 }
1159} /* cffti1 */
1160
1161
1162v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
1163 v4sf *in = (v4sf*)input_readonly;
1164 v4sf *out = (in == work2 ? work1 : work2);
1165 int nf = ifac[1], k1;
1166 int l1 = 1;
1167 int iw = 0;
1168 assert(in != out && work1 != work2);
1169 for (k1=2; k1<=nf+1; k1++) {
1170 int ip = ifac[k1];
1171 int l2 = ip*l1;
1172 int ido = n / l2;
1173 int idot = ido + ido;
1174 switch (ip) {
1175 case 5: {
1176 int ix2 = iw + idot;
1177 int ix3 = ix2 + idot;
1178 int ix4 = ix3 + idot;
        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
1180 } break;
1181 case 4: {
1182 int ix2 = iw + idot;
1183 int ix3 = ix2 + idot;
        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
1185 } break;
1186 case 2: {
        passf2_ps(idot, l1, in, out, &wa[iw], isign);
1188 } break;
1189 case 3: {
1190 int ix2 = iw + idot;
        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
1192 } break;
1193 default:
1194 assert(0);
1195 }
1196 l1 = l2;
1197 iw += (ip - 1)*idot;
1198 if (out == work2) {
1199 out = work1; in = work2;
1200 } else {
1201 out = work2; in = work1;
1202 }
1203 }
1204
1205 return in; /* this is in fact the output .. */
1206}
1207
1208
1209struct PFFFT_Setup {
1210 int N;
1211 int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL)
1212 int ifac[15];
1213 pffft_transform_t transform;
1214 v4sf *data; // allocated room for twiddle coefs
1215 float *e; // points into 'data' , N/4*3 elements
1216 float *twiddle; // points into 'data', N/4 elements
1217};
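
/* A minimal usage sketch (assuming the public API declared in pffft.h, e.g.
   pffft_transform_ordered()):

     PFFFT_Setup *s = pffft_new_setup(1024, PFFFT_REAL);  // N must be a multiple of 32 for real FFTs
     float *in   = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     float *out  = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     float *work = (float*)pffft_aligned_malloc(1024 * sizeof(float));
     pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);
     pffft_aligned_free(work); pffft_aligned_free(out); pffft_aligned_free(in);
     pffft_destroy_setup(s);
*/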
1218
1219PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
1221 int k, m;
1222 /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
1223 and 32 for real FFTs -- a lot of stuff would need to be rewritten to
1224 handle other cases (or maybe just switch to a scalar fft, I don't know..) */
1225 if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
1226 if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
1227 //assert((N % 32) == 0);
1228 s->N = N;
1229 s->transform = transform;
1230 /* nb of complex simd vectors */
1231 s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
  s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
1233 s->e = (float*)s->data;
1234 s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
1235
1236 for (k=0; k < s->Ncvec; ++k) {
1237 int i = k/SIMD_SZ;
1238 int j = k%SIMD_SZ;
1239 for (m=0; m < SIMD_SZ-1; ++m) {
1240 float A = -2*M_PI*(m+1)*k / N;
      s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
      s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
1243 }
1244 }
1245
  if (transform == PFFFT_REAL) {
    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  } else {
    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
  }
1251
1252 /* check that N is decomposable with allowed prime factors */
1253 for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
1254 if (m != N/SIMD_SZ) {
1255 pffft_destroy_setup(s); s = 0;
1256 }
1257
1258 return s;
1259}
1260
1261
void pffft_destroy_setup(PFFFT_Setup *s) {
  pffft_aligned_free(s->data);
  free(s);
}
1266
1267#if !defined(PFFFT_SIMD_DISABLE)
1268
1269/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */
1270static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
1271 v4sf g0, g1;
1272 int k;
1273 INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
1274
1275 *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h]
1276 for (k=1; k < N; ++k) {
1277 v4sf h0, h1;
1278 INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
1279 *--out = VSWAPHL(g1, h0);
1280 *--out = VSWAPHL(h0, h1);
1281 g1 = h1;
1282 }
1283 *--out = VSWAPHL(g1, g0);
1284}
1285
1286static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
1287 v4sf g0, g1, h0, h1;
1288 int k;
1289 g0 = g1 = in[0]; ++in;
1290 for (k=1; k < N; ++k) {
1291 h0 = *in++; h1 = *in++;
1292 g1 = VSWAPHL(g1, h0);
1293 h0 = VSWAPHL(h0, h1);
1294 UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
1295 g1 = h1;
1296 }
1297 h0 = *in++; h1 = g0;
1298 g1 = VSWAPHL(g1, h0);
1299 h0 = VSWAPHL(h0, h1);
1300 UNINTERLEAVE2(h0, g1, out[0], out[1]);
1301}
1302
1303void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
1304 int k, N = setup->N, Ncvec = setup->Ncvec;
1305 const v4sf *vin = (const v4sf*)in;
1306 v4sf *vout = (v4sf*)out;
1307 assert(in != out);
1308 if (setup->transform == PFFFT_REAL) {
1309 int dk = N/32;
1310 if (direction == PFFFT_FORWARD) {
1311 for (k=0; k < dk; ++k) {
1312 INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]);
1313 INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
1314 }
      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
1317 } else {
1318 for (k=0; k < dk; ++k) {
1319 UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]);
1320 UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
1321 }
      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
1324 }
1325 } else {
1326 if (direction == PFFFT_FORWARD) {
1327 for (k=0; k < Ncvec; ++k) {
1328 int kk = (k/4) + (k%4)*(Ncvec/4);
1329 INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
1330 }
1331 } else {
1332 for (k=0; k < Ncvec; ++k) {
1333 int kk = (k/4) + (k%4)*(Ncvec/4);
1334 UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
1335 }
1336 }
1337 }
1338}
1339
1340void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1341 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1342 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1343 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1344 assert(in != out);
1345 for (k=0; k < dk; ++k) {
1346 r0 = in[8*k+0]; i0 = in[8*k+1];
1347 r1 = in[8*k+2]; i1 = in[8*k+3];
1348 r2 = in[8*k+4]; i2 = in[8*k+5];
1349 r3 = in[8*k+6]; i3 = in[8*k+7];
1350 VTRANSPOSE4(r0,r1,r2,r3);
1351 VTRANSPOSE4(i0,i1,i2,i3);
1352 VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]);
1353 VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]);
1354 VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]);
1355
1356 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1357 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1358 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1359 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1360
1361 /*
1362 transformation for each column is:
1363
1364 [1 1 1 1 0 0 0 0] [r0]
1365 [1 0 -1 0 0 -1 0 1] [r1]
1366 [1 -1 1 -1 0 0 0 0] [r2]
1367 [1 0 -1 0 0 1 0 -1] [r3]
1368 [0 0 0 0 1 1 1 1] * [i0]
1369 [0 1 0 -1 1 0 -1 0] [i1]
1370 [0 0 0 0 1 -1 1 -1] [i2]
1371 [0 -1 0 1 1 0 -1 0] [i3]
1372 */
1373
1374 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1375 r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1);
1376 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1377 r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1);
1378
1379 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1380 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1381 }
1382}
1383
1384void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1385 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1386 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1387 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1388 assert(in != out);
1389 for (k=0; k < dk; ++k) {
1390 r0 = in[8*k+0]; i0 = in[8*k+1];
1391 r1 = in[8*k+2]; i1 = in[8*k+3];
1392 r2 = in[8*k+4]; i2 = in[8*k+5];
1393 r3 = in[8*k+6]; i3 = in[8*k+7];
1394
1395 sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
1396 sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
1397 si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
1398 si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
1399
1400 r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
1401 r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
1402 r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
1403 r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
1404
1405 VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]);
1406 VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]);
1407 VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]);
1408
1409 VTRANSPOSE4(r0,r1,r2,r3);
1410 VTRANSPOSE4(i0,i1,i2,i3);
1411
1412 *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1;
1413 *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3;
1414 }
1415}
1416
1417
1418static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
1419 const v4sf *e, v4sf *out) {
1420 v4sf r0, i0, r1, i1, r2, i2, r3, i3;
1421 v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
1422 r0 = *in0; i0 = *in1;
1423 r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++;
1424 VTRANSPOSE4(r0,r1,r2,r3);
1425 VTRANSPOSE4(i0,i1,i2,i3);
1426
1427 /*
1428 transformation for each column is:
1429
1430 [1 1 1 1 0 0 0 0] [r0]
1431 [1 0 -1 0 0 -1 0 1] [r1]
1432 [1 0 -1 0 0 1 0 -1] [r2]
1433 [1 -1 1 -1 0 0 0 0] [r3]
1434 [0 0 0 0 1 1 1 1] * [i0]
1435 [0 -1 0 1 -1 0 1 0] [i1]
1436 [0 -1 0 1 1 0 -1 0] [i2]
1437 [0 0 0 0 -1 1 -1 1] [i3]
1438 */
1439
1440 //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1441 //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1442
1443 VCPLXMUL(r1,i1,e[0],e[1]);
1444 VCPLXMUL(r2,i2,e[2],e[3]);
1445 VCPLXMUL(r3,i3,e[4],e[5]);
1446
1447 //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n";
1448 //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n";
1449
1450 sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
1451 sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
1452 si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
1453 si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
1454
1455 r0 = VADD(sr0, sr1);
1456 r3 = VSUB(sr0, sr1);
1457 i0 = VADD(si0, si1);
1458 i3 = VSUB(si1, si0);
1459 r1 = VADD(dr0, di1);
1460 r2 = VSUB(dr0, di1);
1461 i1 = VSUB(dr1, di0);
1462 i2 = VADD(dr1, di0);
1463
1464 *out++ = r0;
1465 *out++ = i0;
1466 *out++ = r1;
1467 *out++ = i1;
1468 *out++ = r2;
1469 *out++ = i2;
1470 *out++ = r3;
1471 *out++ = i3;
1472
1473}
1474
1475static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1476 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1477 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1478
1479 v4sf_union cr, ci, *uout = (v4sf_union*)out;
1480 v4sf save = in[7], zero=VZERO();
1481 float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
1482 static const float s = M_SQRT2/2;
1483
1484 cr.v = in[0]; ci.v = in[Ncvec*2-1];
1485 assert(in != out);
  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
1487
1488 /*
1489 [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
1490
1491 [Xr(1)] ] [1 1 1 1 0 0 0 0]
1492 [Xr(N/4) ] [0 0 0 0 1 s 0 -s]
1493 [Xr(N/2) ] [1 0 -1 0 0 0 0 0]
1494 [Xr(3N/4)] [0 0 0 0 1 -s 0 s]
1495 [Xi(1) ] [1 -1 1 -1 0 0 0 0]
1496 [Xi(N/4) ] [0 0 0 0 0 -s -1 -s]
1497 [Xi(N/2) ] [0 -1 0 1 0 0 0 0]
1498 [Xi(3N/4)] [0 0 0 0 0 -s 1 -s]
1499 */
1500
1501 xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
1502 xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
1503 xr2=(cr.f[0]-cr.f[2]); uout[4].f[0] = xr2;
1504 xi2=(cr.f[3]-cr.f[1]); uout[5].f[0] = xi2;
1505 xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]); uout[2].f[0] = xr1;
1506 xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[3].f[0] = xi1;
1507 xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]); uout[6].f[0] = xr3;
1508 xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]); uout[7].f[0] = xi3;
1509
1510 for (k=1; k < dk; ++k) {
1511 v4sf save_next = in[8*k+7];
    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
                            e + k*6, out + k*8);
1514 save = save_next;
1515 }
1516
1517}
1518
1519static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
1520 const v4sf *e, v4sf *out, int first) {
1521 v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
1522 /*
1523 transformation for each column is:
1524
1525 [1 1 1 1 0 0 0 0] [r0]
1526 [1 0 0 -1 0 -1 -1 0] [r1]
1527 [1 -1 -1 1 0 0 0 0] [r2]
1528 [1 0 0 -1 0 1 1 0] [r3]
1529 [0 0 0 0 1 -1 1 -1] * [i0]
1530 [0 -1 1 0 1 0 0 1] [i1]
1531 [0 0 0 0 1 1 -1 -1] [i2]
1532 [0 1 -1 0 1 0 0 1] [i3]
1533 */
1534
1535 v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
1536 v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
1537 v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
1538 v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
1539
1540 r0 = VADD(sr0, sr1);
1541 r2 = VSUB(sr0, sr1);
1542 r1 = VSUB(dr0, si1);
1543 r3 = VADD(dr0, si1);
1544 i0 = VSUB(di0, di1);
1545 i2 = VADD(di0, di1);
1546 i1 = VSUB(si0, dr1);
1547 i3 = VADD(si0, dr1);
1548
1549 VCPLXMULCONJ(r1,i1,e[0],e[1]);
1550 VCPLXMULCONJ(r2,i2,e[2],e[3]);
1551 VCPLXMULCONJ(r3,i3,e[4],e[5]);
1552
1553 VTRANSPOSE4(r0,r1,r2,r3);
1554 VTRANSPOSE4(i0,i1,i2,i3);
1555
1556 if (!first) {
1557 *out++ = r0;
1558 *out++ = i0;
1559 }
1560 *out++ = r1;
1561 *out++ = i1;
1562 *out++ = r2;
1563 *out++ = i2;
1564 *out++ = r3;
1565 *out++ = i3;
1566}
1567
1568static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
1569 int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks
1570 /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
1571
1572 v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
1573 float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
1574 static const float s = M_SQRT2;
1575 assert(in != out);
1576 for (k=0; k < 4; ++k) {
1577 Xr.f[k] = ((float*)in)[8*k];
1578 Xi.f[k] = ((float*)in)[8*k+4];
1579 }
1580
  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
1582
1583 /*
1584 [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
1585
1586 [cr0] [1 0 2 0 1 0 0 0]
1587 [cr1] [1 0 0 0 -1 0 -2 0]
1588 [cr2] [1 0 -2 0 1 0 0 0]
1589 [cr3] [1 0 0 0 -1 0 2 0]
1590 [ci0] [0 2 0 2 0 0 0 0]
1591 [ci1] [0 s 0 -s 0 -s 0 -s]
1592 [ci2] [0 0 0 0 0 -2 0 2]
1593 [ci3] [0 -s 0 s 0 -s 0 -s]
1594 */
1595 for (k=1; k < dk; ++k) {
    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
1597 }
1598
1599 cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
1600 cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
1601 cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
1602 cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
1603 ci0= 2*(Xr.f[1]+Xr.f[3]); uout[2*Ncvec-1].f[0] = ci0;
1604 ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
1605 ci2= 2*(Xi.f[3]-Xi.f[1]); uout[2*Ncvec-1].f[2] = ci2;
1606 ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
1607}


void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
                              pffft_direction_t direction, int ordered) {
  int k, Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);

  const v4sf *vinput = (const v4sf*)finput;
  v4sf *voutput = (v4sf*)foutput;
  v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack };
  int ib = (nf_odd ^ ordered ? 1 : 0);

  assert(VALIGNED(finput) && VALIGNED(foutput));

  //assert(finput != foutput);
  if (direction == PFFFT_FORWARD) {
    ib = !ib;
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    } else {
      v4sf *tmp = buff[ib];
      for (k=0; k < Ncvec; ++k) {
        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
      }
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
    } else ib = !ib;
  } else {
    if (vinput == buff[ib]) {
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
      vinput = buff[ib]; ib = !ib;
    }
    if (setup->transform == PFFFT_REAL) {
      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
      for (k=0; k < Ncvec; ++k) {
        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
      }
    }
  }

  if (buff[ib] != voutput) {
    /* extra copy required -- this situation should only happen when finput == foutput */
    assert(finput==foutput);
    for (k=0; k < Ncvec; ++k) {
      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
      voutput[2*k] = a; voutput[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == voutput);
}
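
/* Note on the scratch buffer: it must hold Ncvec*2 SIMD vectors, i.e. N floats
   for a PFFFT_REAL setup and 2*N floats for PFFFT_COMPLEX -- the same size that
   pffft.h documents for the 'work' argument of pffft_transform(). When no
   scratch is supplied, the buffer is carved from the stack via
   VLA_ARRAY_ON_STACK, so large transforms should pass an explicit work area. */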

void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
  int Ncvec = s->Ncvec;
  const v4sf * RESTRICT va = (const v4sf*)a;
  const v4sf * RESTRICT vb = (const v4sf*)b;
  v4sf * RESTRICT vab = (v4sf*)ab;

#ifdef __arm__
  __builtin_prefetch(va);
  __builtin_prefetch(vb);
  __builtin_prefetch(vab);
  __builtin_prefetch(va+2);
  __builtin_prefetch(vb+2);
  __builtin_prefetch(vab+2);
  __builtin_prefetch(va+4);
  __builtin_prefetch(vb+4);
  __builtin_prefetch(vab+4);
  __builtin_prefetch(va+6);
  __builtin_prefetch(vb+6);
  __builtin_prefetch(vab+6);
# ifndef __clang__
#   define ZCONVOLVE_USING_INLINE_ASM
# endif
#endif

  float ar0, ai0, br0, bi0, abr0, abi0;
#ifndef ZCONVOLVE_USING_INLINE_ASM
  v4sf vscal = LD_PS1(scaling);
  int i;
#endif

  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
  ar0 = ((v4sf_union*)va)[0].f[0];
  ai0 = ((v4sf_union*)va)[1].f[0];
  br0 = ((v4sf_union*)vb)[0].f[0];
  bi0 = ((v4sf_union*)vb)[1].f[0];
  abr0 = ((v4sf_union*)vab)[0].f[0];
  abi0 = ((v4sf_union*)vab)[1].f[0];

#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc
  const float *a_ = a, *b_ = b; float *ab_ = ab;
  int N = Ncvec;
  asm volatile("mov         r8, %2                  \n"
               "vdup.f32    q15, %4                 \n"
               "1:                                  \n"
               "pld         [%0,#64]                \n"
               "pld         [%1,#64]                \n"
               "pld         [%2,#64]                \n"
               "pld         [%0,#96]                \n"
               "pld         [%1,#96]                \n"
               "pld         [%2,#96]                \n"
               "vld1.f32    {q0,q1},   [%0,:128]!   \n"
               "vld1.f32    {q4,q5},   [%1,:128]!   \n"
               "vld1.f32    {q2,q3},   [%0,:128]!   \n"
               "vld1.f32    {q6,q7},   [%1,:128]!   \n"
               "vld1.f32    {q8,q9},   [r8,:128]!   \n"

               "vmul.f32    q10, q0, q4             \n"
               "vmul.f32    q11, q0, q5             \n"
               "vmul.f32    q12, q2, q6             \n"
               "vmul.f32    q13, q2, q7             \n"
               "vmls.f32    q10, q1, q5             \n"
               "vmla.f32    q11, q1, q4             \n"
               "vld1.f32    {q0,q1},   [r8,:128]!   \n"
               "vmls.f32    q12, q3, q7             \n"
               "vmla.f32    q13, q3, q6             \n"
               "vmla.f32    q8, q10, q15            \n"
               "vmla.f32    q9, q11, q15            \n"
               "vmla.f32    q0, q12, q15            \n"
               "vmla.f32    q1, q13, q15            \n"
               "vst1.f32    {q8,q9},[%2,:128]!      \n"
               "vst1.f32    {q0,q1},[%2,:128]!      \n"
               "subs        %3, #2                  \n"
               "bne         1b                      \n"
               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
#else // default routine, works fine for non-arm cpus with current compilers
  for (i=0; i < Ncvec; i += 2) {
    v4sf ar, ai, br, bi;
    ar = va[2*i+0]; ai = va[2*i+1];
    br = vb[2*i+0]; bi = vb[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
    ar = va[2*i+2]; ai = va[2*i+3];
    br = vb[2*i+2]; bi = vb[2*i+3];
    VCPLXMUL(ar, ai, br, bi);
    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
  }
#endif
  if (s->transform == PFFFT_REAL) {
    ((v4sf_union*)vab)[0].f[0] = abr0 + ar0*br0*scaling;
    ((v4sf_union*)vab)[1].f[0] = abi0 + ai0*bi0*scaling;
  }
}
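
/* Illustrative sketch (not part of the library proper): how
   pffft_zconvolve_accumulate is typically combined with the unordered
   transforms to perform fast convolution. The buffer names x, h, y and the
   length N are placeholders; x, h and y are assumed to be SIMD-aligned float
   arrays of length N (e.g. also obtained from pffft_aligned_malloc). The 1/N
   factor compensates for the transforms being unscaled.

     PFFFT_Setup *setup = pffft_new_setup(N, PFFFT_REAL);
     float *A    = pffft_aligned_malloc(N * sizeof(float));  // spectrum of x
     float *B    = pffft_aligned_malloc(N * sizeof(float));  // spectrum of h
     float *prod = pffft_aligned_malloc(N * sizeof(float));  // accumulated product
     float *work = pffft_aligned_malloc(N * sizeof(float));

     pffft_transform(setup, x, A, work, PFFFT_FORWARD);   // internal order is fine here
     pffft_transform(setup, h, B, work, PFFFT_FORWARD);
     memset(prod, 0, N * sizeof(float));
     pffft_zconvolve_accumulate(setup, A, B, prod, 1.0f / N);
     pffft_transform(setup, prod, y, work, PFFFT_BACKWARD); // y = circular convolution of x and h

     pffft_aligned_free(work); pffft_aligned_free(prod);
     pffft_aligned_free(B); pffft_aligned_free(A);
     pffft_destroy_setup(setup);
*/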


#else // defined(PFFFT_SIMD_DISABLE)

// standard routine using scalar floats, without SIMD stuff.

#define pffft_zreorder_nosimd pffft_zreorder
void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
  int k, N = setup->N;
  if (setup->transform == PFFFT_COMPLEX) {
    for (k=0; k < 2*N; ++k) out[k] = in[k];
    return;
  }
  else if (direction == PFFFT_FORWARD) {
    float x_N = in[N-1];
    for (k=N-1; k > 1; --k) out[k] = in[k-1];
    out[0] = in[0];
    out[1] = x_N;
  } else {
    float x_N = in[1];
    for (k=1; k < N-1; ++k) out[k] = in[k+1];
    out[0] = in[0];
    out[N-1] = x_N;
  }
}

#define pffft_transform_internal_nosimd pffft_transform_internal
void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
                                     pffft_direction_t direction, int ordered) {
  int Ncvec = setup->Ncvec;
  int nf_odd = (setup->ifac[1] & 1);

  // temporary buffer is allocated on the stack if the scratch pointer is NULL
  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
  float *buff[2];
  int ib;
  if (scratch == 0) scratch = scratch_on_stack;
  buff[0] = output; buff[1] = scratch;

  if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered.
  ib = (nf_odd ^ ordered ? 1 : 0);

  if (direction == PFFFT_FORWARD) {
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
    }
    if (ordered) {
      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib;
    }
  } else {
    if (input == buff[ib]) {
      ib = !ib; // may happen when finput == foutput
    }
    if (ordered) {
      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD);
      input = buff[!ib];
    }
    if (setup->transform == PFFFT_REAL) {
      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
    } else {
      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
    }
  }
  if (buff[ib] != output) {
    int k;
    // extra copy required -- this situation should only happen when finput == foutput
    assert(input==output);
    for (k=0; k < Ncvec; ++k) {
      float a = buff[ib][2*k], b = buff[ib][2*k+1];
      output[2*k] = a; output[2*k+1] = b;
    }
    ib = !ib;
  }
  assert(buff[ib] == output);
}

#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
                                       float *ab, float scaling) {
  int i, Ncvec = s->Ncvec;

  if (s->transform == PFFFT_REAL) {
    // take care of the fftpack ordering
    ab[0] += a[0]*b[0]*scaling;
    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
    ++ab; ++a; ++b; --Ncvec;
  }
  for (i=0; i < Ncvec; ++i) {
    float ar, ai, br, bi;
    ar = a[2*i+0]; ai = a[2*i+1];
    br = b[2*i+0]; bi = b[2*i+1];
    VCPLXMUL(ar, ai, br, bi);
    ab[2*i+0] += ar*scaling;
    ab[2*i+1] += ai*scaling;
  }
}

#endif // defined(PFFFT_SIMD_DISABLE)

void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}

void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}
1886
