| 1 | /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com ) | 
| 2 |  | 
| 3 |    Based on original fortran 77 code from FFTPACKv4 from NETLIB | 
| 4 |    (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber | 
| 5 |    of NCAR, in 1985. | 
| 6 |  | 
| 7 |    As confirmed by the NCAR fftpack software curators, the following | 
| 8 |    FFTPACKv5 license applies to FFTPACKv4 sources. My changes are | 
| 9 |    released under the same terms. | 
| 10 |  | 
| 11 |    FFTPACK license: | 
| 12 |  | 
| 13 |    http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html | 
| 14 |  | 
| 15 |    Copyright (c) 2004 the University Corporation for Atmospheric | 
| 16 |    Research ("UCAR"). All rights reserved. Developed by NCAR's | 
| 17 |    Computational and Information Systems Laboratory, UCAR, | 
| 18 |    www.cisl.ucar.edu. | 
| 19 |  | 
| 20 |    Redistribution and use of the Software in source and binary forms, | 
| 21 |    with or without modification, is permitted provided that the | 
| 22 |    following conditions are met: | 
| 23 |  | 
| 24 |    - Neither the names of NCAR's Computational and Information Systems | 
| 25 |    Laboratory, the University Corporation for Atmospheric Research, | 
| 26 |    nor the names of its sponsors or contributors may be used to | 
| 27 |    endorse or promote products derived from this Software without | 
| 28 |    specific prior written permission. | 
| 29 |  | 
| 30 |    - Redistributions of source code must retain the above copyright | 
| 31 |    notices, this list of conditions, and the disclaimer below. | 
| 32 |  | 
| 33 |    - Redistributions in binary form must reproduce the above copyright | 
| 34 |    notice, this list of conditions, and the disclaimer below in the | 
| 35 |    documentation and/or other materials provided with the | 
| 36 |    distribution. | 
| 37 |  | 
| 38 |    THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | 
| 39 |    EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF | 
| 40 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | 
| 41 |    NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT | 
| 42 |    HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, | 
| 43 |    EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN | 
| 44 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | 
| 45 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE | 
| 46 |    SOFTWARE. | 
| 47 |  | 
| 48 |  | 
| 49 |    PFFFT : a Pretty Fast FFT. | 
| 50 |  | 
   This file is largely based on the original FFTPACK implementation, modified in
   order to take advantage of the SIMD instructions of modern CPUs.
| 53 | */ | 
| 54 |  | 
| 55 | /* | 
| 56 |   ChangeLog: | 
| 57 |   - 2011/10/02, version 1: This is the very first release of this file. | 
| 58 | */ | 
| 59 |  | 
| 60 | #ifndef _USE_MATH_DEFINES | 
#  define _USE_MATH_DEFINES // gently ask MSVC to define M_PI, M_SQRT2, etc.
| 62 | #endif | 
| 63 |  | 
| 64 | #include "pffft.h" | 
| 65 | #include <stdlib.h> | 
| 66 | #include <stdio.h> | 
| 67 | #include <math.h> | 
| 68 | #include <assert.h> | 
| 69 |  | 
| 70 | /* detect compiler flavour */ | 
| 71 | #if defined(_MSC_VER) | 
| 72 | #  define COMPILER_MSVC | 
| 73 | #elif defined(__GNUC__) | 
| 74 | #  define COMPILER_GCC | 
| 75 | #endif | 
| 76 |  | 
| 77 | #if defined(COMPILER_GCC) | 
| 78 | #  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) | 
| 79 | #  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) | 
| 80 | #  define RESTRICT __restrict | 
| 81 | #  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; | 
| 82 | #elif defined(COMPILER_MSVC) | 
| 83 | #  define ALWAYS_INLINE(return_type) __forceinline return_type | 
| 84 | #  define NEVER_INLINE(return_type) __declspec(noinline) return_type | 
| 85 | #  define RESTRICT __restrict | 
| 86 | #  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) | 
| 87 | #endif | 
| 88 |  | 
| 89 |  | 
| 90 | /* | 
  vector support macros: the rest of the code is independent of
  SSE/Altivec/NEON -- adding support for other platforms with 4-element
  vectors should be limited to these macros
| 94 | */ | 
| 95 |  | 
| 96 |  | 
| 97 | // define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code | 
| 98 | //#define PFFFT_SIMD_DISABLE | 
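
/*
  A minimal sketch (illustration only, not part of the build) of what such a new
  4-lane backend could look like, here written with the GCC/Clang generic vector
  extension. Any real port would also have to provide the INTERLEAVE2 /
  UNINTERLEAVE2 / VTRANSPOSE4 / VSWAPHL shuffles with the intrinsics of its
  target ISA, as the three backends below do.
*/
#if 0
typedef float v4sf __attribute__((vector_size(16)));
#  define SIMD_SZ 4
#  define VZERO() ((v4sf){0.f,0.f,0.f,0.f})
#  define VMUL(a,b) ((a)*(b))
#  define VADD(a,b) ((a)+(b))
#  define VMADD(a,b,c) ((a)*(b)+(c))
#  define VSUB(a,b) ((a)-(b))
#  define LD_PS1(p) ((v4sf){(p),(p),(p),(p)}) /* note: evaluates p four times */
#  define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0)
#endif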
| 99 |  | 
| 100 | /* select which SIMD intrinsics will be used */ | 
| 101 | #if !defined(PFFFT_SIMD_DISABLE) | 
| 102 | #  if (defined(__ppc__) || defined(__ppc64__) || defined(__powerpc__) || defined(__powerpc64__)) \ | 
| 103 |    && (defined(__VEC__) || defined(__ALTIVEC__)) | 
| 104 | #    define PFFFT_SIMD_ALTIVEC | 
| 105 | #  elif defined(__ARM_NEON) || defined(__aarch64__) || defined(__arm64)  \ | 
| 106 |    || defined(_M_ARM64) || defined(_M_ARM64EC) || defined(__wasm_simd128__) | 
     // we test _M_ARM64EC before _M_X64 because when _M_ARM64EC is defined, the Microsoft compiler also defines _M_X64
| 108 | #    define PFFFT_SIMD_NEON | 
| 109 | #  elif defined(__x86_64__) || defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) | 
| 110 | #    define PFFFT_SIMD_SSE | 
| 111 | #   endif | 
| 112 | #endif // PFFFT_SIMD_DISABLE | 
| 113 |  | 
| 114 | /* | 
| 115 |   Altivec support macros | 
| 116 | */ | 
| 117 | #ifdef PFFFT_SIMD_ALTIVEC | 
| 118 | #include <altivec.h> | 
| 119 | typedef vector float v4sf; | 
| 120 | #  define SIMD_SZ 4 | 
| 121 | #  define VZERO() ((vector float) vec_splat_u8(0)) | 
| 122 | #  define VMUL(a,b) vec_madd(a,b, VZERO()) | 
| 123 | #  define VADD(a,b) vec_add(a,b) | 
| 124 | #  define VMADD(a,b,c) vec_madd(a,b,c) | 
| 125 | #  define VSUB(a,b) vec_sub(a,b) | 
| 126 | inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } | 
| 127 | #  define LD_PS1(p) ld_ps1(&p) | 
| 128 | #  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } | 
| 129 | #  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \ | 
| 130 |     vector unsigned char vperm1 =  (vector unsigned char){0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27}; \ | 
| 131 |     vector unsigned char vperm2 =  (vector unsigned char){4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31}; \ | 
| 132 |     v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \ | 
| 133 |   } | 
| 134 | #  define VTRANSPOSE4(x0,x1,x2,x3) {            \ | 
| 135 |     v4sf y0 = vec_mergeh(x0, x2);               \ | 
| 136 |     v4sf y1 = vec_mergel(x0, x2);               \ | 
| 137 |     v4sf y2 = vec_mergeh(x1, x3);               \ | 
| 138 |     v4sf y3 = vec_mergel(x1, x3);               \ | 
| 139 |     x0 = vec_mergeh(y0, y2);                    \ | 
| 140 |     x1 = vec_mergel(y0, y2);                    \ | 
| 141 |     x2 = vec_mergeh(y1, y3);                    \ | 
| 142 |     x3 = vec_mergel(y1, y3);                    \ | 
| 143 |   } | 
| 144 | #  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char){16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15}) | 
| 145 | #  define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0) | 
| 146 |  | 
| 147 | /* | 
| 148 |   SSE1 support macros | 
| 149 | */ | 
| 150 | #elif defined(PFFFT_SIMD_SSE) | 
| 151 |  | 
| 152 | #include <xmmintrin.h> | 
| 153 | typedef __m128 v4sf; | 
#  define SIMD_SZ 4 // 4 floats per SIMD vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors.
| 155 | #  define VZERO() _mm_setzero_ps() | 
| 156 | #  define VMUL(a,b) _mm_mul_ps(a,b) | 
| 157 | #  define VADD(a,b) _mm_add_ps(a,b) | 
| 158 | #  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) | 
| 159 | #  define VSUB(a,b) _mm_sub_ps(a,b) | 
| 160 | #  define LD_PS1(p) _mm_set1_ps(p) | 
| 161 | #  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } | 
| 162 | #  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } | 
| 163 | #  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) | 
| 164 | #  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) | 
| 165 | #  define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0) | 
| 166 |  | 
| 167 | /* | 
| 168 |   ARM NEON support macros | 
| 169 | */ | 
| 170 | #elif defined(PFFFT_SIMD_NEON) | 
| 171 | #  include <arm_neon.h> | 
| 172 | typedef float32x4_t v4sf; | 
| 173 | #  define SIMD_SZ 4 | 
| 174 | #  define VZERO() vdupq_n_f32(0) | 
| 175 | #  define VMUL(a,b) vmulq_f32(a,b) | 
| 176 | #  define VADD(a,b) vaddq_f32(a,b) | 
| 177 | #  define VMADD(a,b,c) vmlaq_f32(c,a,b) | 
| 178 | #  define VSUB(a,b) vsubq_f32(a,b) | 
| 179 | #  define LD_PS1(p) vld1q_dup_f32(&(p)) | 
| 180 | #  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } | 
| 181 | #  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } | 
| 182 | #  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \ | 
| 183 |     float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \ | 
| 184 |     float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \ | 
| 185 |     float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \ | 
| 186 |     float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \ | 
| 187 |     x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ | 
| 188 |   } | 
| 189 | // marginally faster version | 
| 190 | //#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } | 
| 191 | #  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) | 
| 192 | #  define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0) | 
| 193 | #else | 
| 194 | #  if !defined(PFFFT_SIMD_DISABLE) | 
| 195 | #    warning "building with simd disabled !\n"; | 
| 196 | #    define PFFFT_SIMD_DISABLE // fallback to scalar code | 
| 197 | #  endif | 
| 198 | #endif | 
| 199 |  | 
// fallback mode for situations where SSE/Altivec/NEON are not available: use scalar code instead
| 201 | #ifdef PFFFT_SIMD_DISABLE | 
| 202 | typedef float v4sf; | 
| 203 | #  define SIMD_SZ 1 | 
| 204 | #  define VZERO() 0.f | 
| 205 | #  define VMUL(a,b) ((a)*(b)) | 
| 206 | #  define VADD(a,b) ((a)+(b)) | 
| 207 | #  define VMADD(a,b,c) ((a)*(b)+(c)) | 
| 208 | #  define VSUB(a,b) ((a)-(b)) | 
| 209 | #  define LD_PS1(p) (p) | 
| 210 | #  define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0) | 
| 211 | #endif | 
| 212 |  | 
// shortcuts for complex multiplications
| 214 | #define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } | 
| 215 | #define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } | 
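/* For reference, writing a = ar + i*ai and b = br + i*bi (one complex value per lane):
     VCPLXMUL     computes a*b       : ar <- ar*br - ai*bi,   ai <- ai*br + ar*bi
     VCPLXMULCONJ computes a*conj(b) : ar <- ar*br + ai*bi,   ai <- ai*br - ar*bi */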
| 216 | #ifndef SVMUL | 
| 217 | // multiply a scalar with a vector | 
| 218 | #define SVMUL(f,v) VMUL(LD_PS1(f),v) | 
| 219 | #endif | 
| 220 |  | 
| 221 | #if !defined(PFFFT_SIMD_DISABLE) | 
| 222 | typedef union v4sf_union { | 
| 223 |   v4sf  v; | 
| 224 |   float f[4]; | 
| 225 | } v4sf_union; | 
| 226 |  | 
| 227 | #include <string.h> | 
| 228 |  | 
| 229 | #define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) | 
| 230 |  | 
| 231 | /* detect bugs with the vector support macros */ | 
void validate_pffft_simd(void) {
  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
  v4sf_union a0, a1, a2, a3, t, u;
  memcpy(a0.f, f, 4*sizeof(float));
  memcpy(a1.f, f+4, 4*sizeof(float));
  memcpy(a2.f, f+8, 4*sizeof(float));
  memcpy(a3.f, f+12, 4*sizeof(float));

  t = a0; u = a1; t.v = VZERO();
  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
  t.v = VADD(a1.v, a2.v);
  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
  t.v = VMUL(a1.v, a2.v);
  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
  t.v = VMADD(a1.v, a2.v,a0.v);
  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);

  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);

  t.v=LD_PS1(f[15]);
  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 15, 15, 15, 15);
  t.v = VSWAPHL(a1.v, a2.v);
  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
  assertv4(t, 8, 9, 6, 7);
  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v);
  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
}
| 268 | #else | 
void validate_pffft_simd(void) {} // allow test_pffft.c to call this function even when simd is not available.
| 270 | #endif //!PFFFT_SIMD_DISABLE | 
| 271 |  | 
| 272 | /* SSE and co like 16-bytes aligned pointers */ | 
| 273 | #define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines... | 
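/* Over-allocate by MALLOC_V4SF_ALIGNMENT bytes, round the pointer up to the next
   aligned boundary, and stash the pointer obtained from malloc just before the
   aligned block so that pffft_aligned_free can retrieve and free it. */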
void *pffft_aligned_malloc(size_t nb_bytes) {
  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
  if (!p0) return (void *) 0;
  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
  *((void **) p - 1) = p0;
  return p;
}

void pffft_aligned_free(void *p) {
  if (p) free(*((void **) p - 1));
}
| 285 |  | 
| 286 | int pffft_simd_size(void) { return SIMD_SZ; } | 
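
/*
  Usage sketch (illustration only; it relies on the public API declared in
  pffft.h): all buffers handed to the transforms should come from
  pffft_aligned_malloc, or at least satisfy VALIGNED.
*/
#if 0
static void example_forward_real_fft(void) {
  int N = 1024; /* for a real transform, N must be a multiple of 32 when SIMD_SZ == 4 */
  PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
  float *in   = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *out  = (float*)pffft_aligned_malloc(N * sizeof(float));
  float *work = (float*)pffft_aligned_malloc(N * sizeof(float));
  /* ... fill 'in' with N samples ... */
  pffft_transform_ordered(s, in, out, work, PFFFT_FORWARD);
  /* 'out' now holds the spectrum in ordered layout */
  pffft_aligned_free(work); pffft_aligned_free(out); pffft_aligned_free(in);
  pffft_destroy_setup(s);
}
#endif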
| 287 |  | 
| 288 | /* | 
  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
| 290 | */ | 
| 291 | static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) { | 
| 292 |   int k, i; | 
| 293 |   int l1ido = l1*ido; | 
| 294 |   if (ido <= 2) { | 
| 295 |     for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) { | 
| 296 |       ch[0]         = VADD(cc[0], cc[ido+0]); | 
| 297 |       ch[l1ido]     = VSUB(cc[0], cc[ido+0]); | 
| 298 |       ch[1]         = VADD(cc[1], cc[ido+1]); | 
| 299 |       ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]); | 
| 300 |     } | 
| 301 |   } else { | 
| 302 |     for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) { | 
| 303 |       for (i=0; i<ido-1; i+=2) { | 
| 304 |         v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]); | 
| 305 |         v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]); | 
| 306 |         v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1])); | 
| 307 |         ch[i]   = VADD(cc[i+0], cc[i+ido+0]); | 
| 308 |         ch[i+1] = VADD(cc[i+1], cc[i+ido+1]); | 
| 309 |         VCPLXMUL(tr2, ti2, wr, wi); | 
| 310 |         ch[i+l1ido]   = tr2; | 
| 311 |         ch[i+l1ido+1] = ti2; | 
| 312 |       } | 
| 313 |     } | 
| 314 |   } | 
| 315 | } | 
| 316 |  | 
| 317 | /* | 
  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
| 319 | */ | 
| 320 | static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch, | 
| 321 |                                     const float *wa1, const float *wa2, float fsign) { | 
| 322 |   static const float taur = -0.5f; | 
| 323 |   float taui = 0.866025403784439f*fsign; | 
| 324 |   int i, k; | 
| 325 |   v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3; | 
| 326 |   int l1ido = l1*ido; | 
| 327 |   float wr1, wi1, wr2, wi2; | 
| 328 |   assert(ido > 2); | 
| 329 |   for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) { | 
| 330 |     for (i=0; i<ido-1; i+=2) { | 
| 331 |       tr2 = VADD(cc[i+ido], cc[i+2*ido]); | 
| 332 |       cr2 = VADD(cc[i], SVMUL(taur,tr2)); | 
| 333 |       ch[i]    = VADD(cc[i], tr2); | 
| 334 |       ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]); | 
| 335 |       ci2 = VADD(cc[i    +1], SVMUL(taur,ti2)); | 
| 336 |       ch[i+1]  = VADD(cc[i+1], ti2); | 
| 337 |       cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido])); | 
| 338 |       ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1])); | 
| 339 |       dr2 = VSUB(cr2, ci3); | 
| 340 |       dr3 = VADD(cr2, ci3); | 
| 341 |       di2 = VADD(ci2, cr3); | 
| 342 |       di3 = VSUB(ci2, cr3); | 
| 343 |       wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1]; | 
| 344 |       VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); | 
| 345 |       ch[i+l1ido] = dr2; | 
| 346 |       ch[i+l1ido + 1] = di2; | 
| 347 |       VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); | 
| 348 |       ch[i+2*l1ido] = dr3; | 
| 349 |       ch[i+2*l1ido+1] = di3; | 
| 350 |     } | 
| 351 |   } | 
| 352 | } /* passf3 */ | 
| 353 |  | 
| 354 | static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch, | 
| 355 |                                     const float *wa1, const float *wa2, const float *wa3, float fsign) { | 
| 356 |   /* isign == -1 for forward transform and +1 for backward transform */ | 
| 357 |  | 
| 358 |   int i, k; | 
| 359 |   v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4; | 
| 360 |   int l1ido = l1*ido; | 
| 361 |   if (ido == 2) { | 
| 362 |     for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) { | 
| 363 |       tr1 = VSUB(cc[0], cc[2*ido + 0]); | 
| 364 |       tr2 = VADD(cc[0], cc[2*ido + 0]); | 
| 365 |       ti1 = VSUB(cc[1], cc[2*ido + 1]); | 
| 366 |       ti2 = VADD(cc[1], cc[2*ido + 1]); | 
| 367 |       ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign)); | 
| 368 |       tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign)); | 
| 369 |       tr3 = VADD(cc[ido + 0], cc[3*ido + 0]); | 
| 370 |       ti3 = VADD(cc[ido + 1], cc[3*ido + 1]); | 
| 371 |  | 
| 372 |       ch[0*l1ido + 0] = VADD(tr2, tr3); | 
| 373 |       ch[0*l1ido + 1] = VADD(ti2, ti3); | 
| 374 |       ch[1*l1ido + 0] = VADD(tr1, tr4); | 
| 375 |       ch[1*l1ido + 1] = VADD(ti1, ti4); | 
| 376 |       ch[2*l1ido + 0] = VSUB(tr2, tr3); | 
| 377 |       ch[2*l1ido + 1] = VSUB(ti2, ti3); | 
| 378 |       ch[3*l1ido + 0] = VSUB(tr1, tr4); | 
| 379 |       ch[3*l1ido + 1] = VSUB(ti1, ti4); | 
| 380 |     } | 
| 381 |   } else { | 
| 382 |     for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) { | 
| 383 |       for (i=0; i<ido-1; i+=2) { | 
| 384 |         float wr1, wi1, wr2, wi2, wr3, wi3; | 
| 385 |         tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]); | 
| 386 |         tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]); | 
| 387 |         ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]); | 
| 388 |         ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]); | 
| 389 |         tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign)); | 
| 390 |         ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign)); | 
| 391 |         tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]); | 
| 392 |         ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]); | 
| 393 |  | 
| 394 |         ch[i] = VADD(tr2, tr3); | 
| 395 |         cr3    = VSUB(tr2, tr3); | 
| 396 |         ch[i + 1] = VADD(ti2, ti3); | 
| 397 |         ci3 = VSUB(ti2, ti3); | 
| 398 |  | 
| 399 |         cr2 = VADD(tr1, tr4); | 
| 400 |         cr4 = VSUB(tr1, tr4); | 
| 401 |         ci2 = VADD(ti1, ti4); | 
| 402 |         ci4 = VSUB(ti1, ti4); | 
| 403 |         wr1=wa1[i]; wi1=fsign*wa1[i+1]; | 
| 404 |         VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1)); | 
| 405 |         wr2=wa2[i]; wi2=fsign*wa2[i+1]; | 
| 406 |         ch[i + l1ido] = cr2; | 
| 407 |         ch[i + l1ido + 1] = ci2; | 
| 408 |  | 
| 409 |         VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2)); | 
| 410 |         wr3=wa3[i]; wi3=fsign*wa3[i+1]; | 
| 411 |         ch[i + 2*l1ido] = cr3; | 
| 412 |         ch[i + 2*l1ido + 1] = ci3; | 
| 413 |  | 
| 414 |         VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3)); | 
| 415 |         ch[i + 3*l1ido] = cr4; | 
| 416 |         ch[i + 3*l1ido + 1] = ci4; | 
| 417 |       } | 
| 418 |     } | 
| 419 |   } | 
| 420 | } /* passf4 */ | 
| 421 |  | 
| 422 | /* | 
  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
| 424 | */ | 
| 425 | static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch, | 
| 426 |                                     const float *wa1, const float *wa2, | 
| 427 |                                     const float *wa3, const float *wa4, float fsign) { | 
| 428 |   static const float tr11 = .309016994374947f; | 
| 429 |   const float ti11 = .951056516295154f*fsign; | 
| 430 |   static const float tr12 = -.809016994374947f; | 
| 431 |   const float ti12 = .587785252292473f*fsign; | 
| 432 |  | 
| 433 |   /* Local variables */ | 
| 434 |   int i, k; | 
| 435 |   v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3, | 
| 436 |     ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5; | 
| 437 |  | 
| 438 |   float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4; | 
| 439 |  | 
| 440 | #define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1] | 
| 441 | #define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1] | 
| 442 |  | 
| 443 |   assert(ido > 2); | 
| 444 |   for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) { | 
| 445 |     for (i = 0; i < ido-1; i += 2) { | 
| 446 |       ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5)); | 
| 447 |       ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5)); | 
| 448 |       ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4)); | 
| 449 |       ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4)); | 
| 450 |       tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); | 
| 451 |       tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); | 
| 452 |       tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); | 
| 453 |       tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); | 
| 454 |       ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); | 
| 455 |       ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3)); | 
| 456 |       cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); | 
| 457 |       ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); | 
| 458 |       cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); | 
| 459 |       ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); | 
| 460 |       cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); | 
| 461 |       ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); | 
| 462 |       cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); | 
| 463 |       ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); | 
| 464 |       dr3 = VSUB(cr3, ci4); | 
| 465 |       dr4 = VADD(cr3, ci4); | 
| 466 |       di3 = VADD(ci3, cr4); | 
| 467 |       di4 = VSUB(ci3, cr4); | 
| 468 |       dr5 = VADD(cr2, ci5); | 
| 469 |       dr2 = VSUB(cr2, ci5); | 
| 470 |       di5 = VSUB(ci2, cr5); | 
| 471 |       di2 = VADD(ci2, cr5); | 
| 472 |       wr1=wa1[i]; wi1=fsign*wa1[i+1]; wr2=wa2[i]; wi2=fsign*wa2[i+1]; | 
| 473 |       wr3=wa3[i]; wi3=fsign*wa3[i+1]; wr4=wa4[i]; wi4=fsign*wa4[i+1]; | 
| 474 |       VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); | 
| 475 |       ch_ref(i - 1, 2) = dr2; | 
| 476 |       ch_ref(i, 2)     = di2; | 
| 477 |       VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); | 
| 478 |       ch_ref(i - 1, 3) = dr3; | 
| 479 |       ch_ref(i, 3)     = di3; | 
| 480 |       VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); | 
| 481 |       ch_ref(i - 1, 4) = dr4; | 
| 482 |       ch_ref(i, 4)     = di4; | 
| 483 |       VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); | 
| 484 |       ch_ref(i - 1, 5) = dr5; | 
| 485 |       ch_ref(i, 5)     = di5; | 
| 486 |     } | 
| 487 |   } | 
| 488 | #undef ch_ref | 
| 489 | #undef cc_ref | 
| 490 | } | 
| 491 |  | 
| 492 | static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) { | 
| 493 |   static const float minus_one = -1.f; | 
| 494 |   int i, k, l1ido = l1*ido; | 
| 495 |   for (k=0; k < l1ido; k += ido) { | 
| 496 |     v4sf a = cc[k], b = cc[k + l1ido]; | 
| 497 |     ch[2*k] = VADD(a, b); | 
| 498 |     ch[2*(k+ido)-1] = VSUB(a, b); | 
| 499 |   } | 
| 500 |   if (ido < 2) return; | 
| 501 |   if (ido != 2) { | 
| 502 |     for (k=0; k < l1ido; k += ido) { | 
| 503 |       for (i=2; i<ido; i+=2) { | 
| 504 |         v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido]; | 
| 505 |         v4sf br = cc[i - 1 + k], bi = cc[i + k]; | 
| 506 |         VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); | 
| 507 |         ch[i + 2*k] = VADD(bi, ti2); | 
| 508 |         ch[2*(k+ido) - i] = VSUB(ti2, bi); | 
| 509 |         ch[i - 1 + 2*k] = VADD(br, tr2); | 
| 510 |         ch[2*(k+ido) - i -1] = VSUB(br, tr2); | 
| 511 |       } | 
| 512 |     } | 
| 513 |     if (ido % 2 == 1) return; | 
| 514 |   } | 
| 515 |   for (k=0; k < l1ido; k += ido) { | 
| 516 |     ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]); | 
| 517 |     ch[2*k + ido-1] = cc[k + ido-1]; | 
| 518 |   } | 
| 519 | } /* radf2 */ | 
| 520 |  | 
| 521 |  | 
| 522 | static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) { | 
| 523 |   static const float minus_two=-2; | 
| 524 |   int i, k, l1ido = l1*ido; | 
| 525 |   v4sf a,b,c,d, tr2, ti2; | 
| 526 |   for (k=0; k < l1ido; k += ido) { | 
| 527 |     a = cc[2*k]; b = cc[2*(k+ido) - 1]; | 
| 528 |     ch[k] = VADD(a, b); | 
| 529 |     ch[k + l1ido] =VSUB(a, b); | 
| 530 |   } | 
| 531 |   if (ido < 2) return; | 
| 532 |   if (ido != 2) { | 
| 533 |     for (k = 0; k < l1ido; k += ido) { | 
| 534 |       for (i = 2; i < ido; i += 2) { | 
| 535 |         a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1]; | 
| 536 |         c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0]; | 
| 537 |         ch[i-1 + k] = VADD(a, b); | 
| 538 |         tr2 = VSUB(a, b); | 
| 539 |         ch[i+0 + k] = VSUB(c, d); | 
| 540 |         ti2 = VADD(c, d); | 
| 541 |         VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1])); | 
| 542 |         ch[i-1 + k + l1ido] = tr2; | 
| 543 |         ch[i+0 + k + l1ido] = ti2; | 
| 544 |       } | 
| 545 |     } | 
| 546 |     if (ido % 2 == 1) return; | 
| 547 |   } | 
| 548 |   for (k = 0; k < l1ido; k += ido) { | 
| 549 |     a = cc[2*k + ido-1]; b = cc[2*k + ido]; | 
| 550 |     ch[k + ido-1] = VADD(a,a); | 
| 551 |     ch[k + ido-1 + l1ido] = SVMUL(minus_two, b); | 
| 552 |   } | 
| 553 | } /* radb2 */ | 
| 554 |  | 
| 555 | static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, | 
| 556 |                      const float *wa1, const float *wa2) { | 
| 557 |   static const float taur = -0.5f; | 
| 558 |   static const float taui = 0.866025403784439f; | 
| 559 |   int i, k, ic; | 
| 560 |   v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2; | 
| 561 |   for (k=0; k<l1; k++) { | 
| 562 |     cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]); | 
| 563 |     ch[3*k*ido] = VADD(cc[k*ido], cr2); | 
| 564 |     ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido])); | 
| 565 |     ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2)); | 
| 566 |   } | 
| 567 |   if (ido == 1) return; | 
| 568 |   for (k=0; k<l1; k++) { | 
| 569 |     for (i=2; i<ido; i+=2) { | 
| 570 |       ic = ido - i; | 
| 571 |       wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]); | 
| 572 |       dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido]; | 
| 573 |       VCPLXMULCONJ(dr2, di2, wr1, wi1); | 
| 574 |  | 
| 575 |       wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]); | 
| 576 |       dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido]; | 
| 577 |       VCPLXMULCONJ(dr3, di3, wr2, wi2); | 
| 578 |  | 
| 579 |       cr2 = VADD(dr2, dr3); | 
| 580 |       ci2 = VADD(di2, di3); | 
| 581 |       ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2); | 
| 582 |       ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2); | 
| 583 |       tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2)); | 
| 584 |       ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2)); | 
| 585 |       tr3 = SVMUL(taui, VSUB(di2, di3)); | 
| 586 |       ti3 = SVMUL(taui, VSUB(dr3, dr2)); | 
| 587 |       ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3); | 
| 588 |       ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3); | 
| 589 |       ch[i + (3*k + 2)*ido] = VADD(ti2, ti3); | 
| 590 |       ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2); | 
| 591 |     } | 
| 592 |   } | 
| 593 | } /* radf3 */ | 
| 594 |  | 
| 595 |  | 
| 596 | static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, | 
| 597 |                      const float *wa1, const float *wa2) | 
| 598 | { | 
| 599 |   static const float taur = -0.5f; | 
| 600 |   static const float taui = 0.866025403784439f; | 
| 601 |   static const float taui_2 = 0.866025403784439f*2; | 
| 602 |   int i, k, ic; | 
| 603 |   v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2; | 
| 604 |   for (k=0; k<l1; k++) { | 
| 605 |     tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2); | 
| 606 |     cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]); | 
| 607 |     ch[k*ido] = VADD(cc[3*k*ido], tr2); | 
| 608 |     ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]); | 
| 609 |     ch[(k + l1)*ido] = VSUB(cr2, ci3); | 
| 610 |     ch[(k + 2*l1)*ido] = VADD(cr2, ci3); | 
| 611 |   } | 
| 612 |   if (ido == 1) return; | 
| 613 |   for (k=0; k<l1; k++) { | 
| 614 |     for (i=2; i<ido; i+=2) { | 
| 615 |       ic = ido - i; | 
| 616 |       tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]); | 
| 617 |       cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]); | 
| 618 |       ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2); | 
| 619 |       ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]); | 
| 620 |       ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]); | 
| 621 |       ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2); | 
| 622 |       cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido])); | 
| 623 |       ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido])); | 
| 624 |       dr2 = VSUB(cr2, ci3); | 
| 625 |       dr3 = VADD(cr2, ci3); | 
| 626 |       di2 = VADD(ci2, cr3); | 
| 627 |       di3 = VSUB(ci2, cr3); | 
| 628 |       VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); | 
| 629 |       ch[i - 1 + (k + l1)*ido] = dr2; | 
| 630 |       ch[i + (k + l1)*ido] = di2; | 
| 631 |       VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1])); | 
| 632 |       ch[i - 1 + (k + 2*l1)*ido] = dr3; | 
| 633 |       ch[i + (k + 2*l1)*ido] = di3; | 
| 634 |     } | 
| 635 |   } | 
| 636 | } /* radb3 */ | 
| 637 |  | 
| 638 | static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch, | 
| 639 |                                    const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3) | 
| 640 | { | 
| 641 |   static const float minus_hsqt2 = (float)-0.7071067811865475; | 
| 642 |   int i, k, l1ido = l1*ido; | 
| 643 |   { | 
| 644 |     const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido; | 
| 645 |     v4sf * RESTRICT ch_ = ch; | 
| 646 |     while (cc < cc_end) { | 
| 647 |       // this loop represents between 25% and 40% of total radf4_ps cost ! | 
| 648 |       v4sf a0 = cc[0], a1 = cc[l1ido]; | 
| 649 |       v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido]; | 
| 650 |       v4sf tr1 = VADD(a1, a3); | 
| 651 |       v4sf tr2 = VADD(a0, a2); | 
| 652 |       ch[2*ido-1] = VSUB(a0, a2); | 
| 653 |       ch[2*ido  ] = VSUB(a3, a1); | 
| 654 |       ch[0      ] = VADD(tr1, tr2); | 
| 655 |       ch[4*ido-1] = VSUB(tr2, tr1); | 
| 656 |       cc += ido; ch += 4*ido; | 
| 657 |     } | 
| 658 |     cc = cc_; ch = ch_; | 
| 659 |   } | 
| 660 |   if (ido < 2) return; | 
| 661 |   if (ido != 2) { | 
| 662 |     for (k = 0; k < l1ido; k += ido) { | 
| 663 |       const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k); | 
| 664 |       for (i=2; i<ido; i += 2, pc += 2) { | 
| 665 |         int ic = ido - i; | 
| 666 |         v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4; | 
| 667 |         v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4; | 
| 668 |  | 
| 669 |         cr2 = pc[1*l1ido+0]; | 
| 670 |         ci2 = pc[1*l1ido+1]; | 
| 671 |         wr=LD_PS1(wa1[i - 2]); | 
| 672 |         wi=LD_PS1(wa1[i - 1]); | 
| 673 |         VCPLXMULCONJ(cr2,ci2,wr,wi); | 
| 674 |  | 
| 675 |         cr3 = pc[2*l1ido+0]; | 
| 676 |         ci3 = pc[2*l1ido+1]; | 
| 677 |         wr = LD_PS1(wa2[i-2]); | 
| 678 |         wi = LD_PS1(wa2[i-1]); | 
| 679 |         VCPLXMULCONJ(cr3, ci3, wr, wi); | 
| 680 |  | 
| 681 |         cr4 = pc[3*l1ido]; | 
| 682 |         ci4 = pc[3*l1ido+1]; | 
| 683 |         wr = LD_PS1(wa3[i-2]); | 
| 684 |         wi = LD_PS1(wa3[i-1]); | 
| 685 |         VCPLXMULCONJ(cr4, ci4, wr, wi); | 
| 686 |  | 
| 687 |         /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */ | 
| 688 |  | 
| 689 |         tr1 = VADD(cr2,cr4); | 
| 690 |         tr4 = VSUB(cr4,cr2); | 
| 691 |         tr2 = VADD(pc[0],cr3); | 
| 692 |         tr3 = VSUB(pc[0],cr3); | 
| 693 |         ch[i - 1 + 4*k] = VADD(tr1,tr2); | 
| 694 |         ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); // at this point tr1 and tr2 can be disposed | 
| 695 |         ti1 = VADD(ci2,ci4); | 
| 696 |         ti4 = VSUB(ci2,ci4); | 
| 697 |         ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3); | 
| 698 |         ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); // dispose tr3, ti4 | 
| 699 |         ti2 = VADD(pc[1],ci3); | 
| 700 |         ti3 = VSUB(pc[1],ci3); | 
| 701 |         ch[i + 4*k] = VADD(ti1, ti2); | 
| 702 |         ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2); | 
| 703 |         ch[i + 4*k + 2*ido] = VADD(tr4, ti3); | 
| 704 |         ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3); | 
| 705 |       } | 
| 706 |     } | 
| 707 |     if (ido % 2 == 1) return; | 
| 708 |   } | 
| 709 |   for (k=0; k<l1ido; k += ido) { | 
| 710 |     v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido]; | 
| 711 |     v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido]; | 
| 712 |     v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b)); | 
| 713 |     v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a)); | 
| 714 |     ch[ido-1 + 4*k] = VADD(tr1, c); | 
| 715 |     ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1); | 
| 716 |     ch[4*k + 1*ido] = VSUB(ti1, d); | 
| 717 |     ch[4*k + 3*ido] = VADD(ti1, d); | 
| 718 |   } | 
| 719 | } /* radf4 */ | 
| 720 |  | 
| 721 |  | 
| 722 | static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, | 
| 723 |                                    const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3) | 
| 724 | { | 
| 725 |   static const float minus_sqrt2 = (float)-1.414213562373095; | 
| 726 |   static const float two = 2.f; | 
| 727 |   int i, k, l1ido = l1*ido; | 
| 728 |   v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4; | 
| 729 |   { | 
| 730 |     const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido; | 
| 731 |     v4sf *ch_ = ch; | 
| 732 |     while (ch < ch_end) { | 
| 733 |       v4sf a = cc[0], b = cc[4*ido-1]; | 
| 734 |       v4sf c = cc[2*ido], d = cc[2*ido-1]; | 
| 735 |       tr3 = SVMUL(two,d); | 
| 736 |       tr2 = VADD(a,b); | 
| 737 |       tr1 = VSUB(a,b); | 
| 738 |       tr4 = SVMUL(two,c); | 
| 739 |       ch[0*l1ido] = VADD(tr2, tr3); | 
| 740 |       ch[2*l1ido] = VSUB(tr2, tr3); | 
| 741 |       ch[1*l1ido] = VSUB(tr1, tr4); | 
| 742 |       ch[3*l1ido] = VADD(tr1, tr4); | 
| 743 |  | 
| 744 |       cc += 4*ido; ch += ido; | 
| 745 |     } | 
| 746 |     cc = cc_; ch = ch_; | 
| 747 |   } | 
| 748 |   if (ido < 2) return; | 
| 749 |   if (ido != 2) { | 
| 750 |     for (k = 0; k < l1ido; k += ido) { | 
| 751 |       const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k); | 
| 752 |       v4sf * RESTRICT ph = (v4sf*)(ch + k + 1); | 
| 753 |       for (i = 2; i < ido; i += 2) { | 
| 754 |  | 
| 755 |         tr1 = VSUB(pc[i], pc[4*ido - i]); | 
| 756 |         tr2 = VADD(pc[i], pc[4*ido - i]); | 
| 757 |         ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]); | 
| 758 |         tr3 = VADD(pc[2*ido + i], pc[2*ido - i]); | 
| 759 |         ph[0] = VADD(tr2, tr3); | 
| 760 |         cr3 = VSUB(tr2, tr3); | 
| 761 |  | 
| 762 |         ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]); | 
| 763 |         tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]); | 
| 764 |         cr2 = VSUB(tr1, tr4); | 
| 765 |         cr4 = VADD(tr1, tr4); | 
| 766 |  | 
| 767 |         ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]); | 
| 768 |         ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]); | 
| 769 |  | 
| 770 |         ph[1] = VADD(ti2, ti3); ph += l1ido; | 
| 771 |         ci3 = VSUB(ti2, ti3); | 
| 772 |         ci2 = VADD(ti1, ti4); | 
| 773 |         ci4 = VSUB(ti1, ti4); | 
| 774 |         VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1])); | 
| 775 |         ph[0] = cr2; | 
| 776 |         ph[1] = ci2; ph += l1ido; | 
| 777 |         VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1])); | 
| 778 |         ph[0] = cr3; | 
| 779 |         ph[1] = ci3; ph += l1ido; | 
| 780 |         VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1])); | 
| 781 |         ph[0] = cr4; | 
| 782 |         ph[1] = ci4; ph = ph - 3*l1ido + 2; | 
| 783 |       } | 
| 784 |     } | 
| 785 |     if (ido % 2 == 1) return; | 
| 786 |   } | 
| 787 |   for (k=0; k < l1ido; k+=ido) { | 
| 788 |     int i0 = 4*k + ido; | 
| 789 |     v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1]; | 
| 790 |     v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0]; | 
| 791 |     tr1 = VSUB(c,d); | 
| 792 |     tr2 = VADD(c,d); | 
| 793 |     ti1 = VADD(b,a); | 
| 794 |     ti2 = VSUB(b,a); | 
| 795 |     ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2); | 
| 796 |     ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1)); | 
| 797 |     ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2); | 
| 798 |     ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1)); | 
| 799 |   } | 
| 800 | } /* radb4 */ | 
| 801 |  | 
| 802 | static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, | 
| 803 |                      const float *wa1, const float *wa2, const float *wa3, const float *wa4) | 
| 804 | { | 
| 805 |   static const float tr11 = .309016994374947f; | 
| 806 |   static const float ti11 = .951056516295154f; | 
| 807 |   static const float tr12 = -.809016994374947f; | 
| 808 |   static const float ti12 = .587785252292473f; | 
| 809 |  | 
| 810 |   /* System generated locals */ | 
| 811 |   int cc_offset, ch_offset; | 
| 812 |  | 
| 813 |   /* Local variables */ | 
| 814 |   int i, k, ic; | 
| 815 |   v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5, | 
| 816 |     cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5; | 
| 817 |   int idp2; | 
| 818 |  | 
| 819 |  | 
| 820 | #define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1] | 
| 821 | #define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1] | 
| 822 |  | 
| 823 |   /* Parameter adjustments */ | 
| 824 |   ch_offset = 1 + ido * 6; | 
| 825 |   ch -= ch_offset; | 
| 826 |   cc_offset = 1 + ido * (1 + l1); | 
| 827 |   cc -= cc_offset; | 
| 828 |  | 
| 829 |   /* Function Body */ | 
| 830 |   for (k = 1; k <= l1; ++k) { | 
| 831 |     cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2)); | 
| 832 |     ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2)); | 
| 833 |     cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3)); | 
| 834 |     ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3)); | 
| 835 |     ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3)); | 
| 836 |     ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3))); | 
| 837 |     ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4)); | 
| 838 |     ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3))); | 
| 839 |     ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4)); | 
| 840 |     //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4); | 
| 841 |   } | 
| 842 |   if (ido == 1) { | 
| 843 |     return; | 
| 844 |   } | 
| 845 |   idp2 = ido + 2; | 
| 846 |   for (k = 1; k <= l1; ++k) { | 
| 847 |     for (i = 3; i <= ido; i += 2) { | 
| 848 |       ic = idp2 - i; | 
| 849 |       dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]); | 
| 850 |       dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]); | 
| 851 |       dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]); | 
| 852 |       dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]); | 
| 853 |       VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2)); | 
| 854 |       VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3)); | 
| 855 |       VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4)); | 
| 856 |       VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5)); | 
| 857 |       cr2 = VADD(dr2, dr5); | 
| 858 |       ci5 = VSUB(dr5, dr2); | 
| 859 |       cr5 = VSUB(di2, di5); | 
| 860 |       ci2 = VADD(di2, di5); | 
| 861 |       cr3 = VADD(dr3, dr4); | 
| 862 |       ci4 = VSUB(dr4, dr3); | 
| 863 |       cr4 = VSUB(di3, di4); | 
| 864 |       ci3 = VADD(di3, di4); | 
| 865 |       ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3)); | 
| 866 |       ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3));// | 
| 867 |       tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3))); | 
| 868 |       ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3)));// | 
| 869 |       tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3))); | 
| 870 |       ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3)));// | 
| 871 |       tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4)); | 
| 872 |       ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4)); | 
| 873 |       tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4)); | 
| 874 |       ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4)); | 
| 875 |       ch_ref(i - 1, 3, k) = VSUB(tr2, tr5); | 
| 876 |       ch_ref(ic - 1, 2, k) = VADD(tr2, tr5); | 
| 877 |       ch_ref(i, 3, k) = VADD(ti2, ti5); | 
| 878 |       ch_ref(ic, 2, k) = VSUB(ti5, ti2); | 
| 879 |       ch_ref(i - 1, 5, k) = VSUB(tr3, tr4); | 
| 880 |       ch_ref(ic - 1, 4, k) = VADD(tr3, tr4); | 
| 881 |       ch_ref(i, 5, k) = VADD(ti3, ti4); | 
| 882 |       ch_ref(ic, 4, k) = VSUB(ti4, ti3); | 
| 883 |     } | 
| 884 |   } | 
| 885 | #undef cc_ref | 
| 886 | #undef ch_ref | 
| 887 | } /* radf5 */ | 
| 888 |  | 
| 889 | static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch, | 
| 890 |                      const float *wa1, const float *wa2, const float *wa3, const float *wa4) | 
| 891 | { | 
| 892 |   static const float tr11 = .309016994374947f; | 
| 893 |   static const float ti11 = .951056516295154f; | 
| 894 |   static const float tr12 = -.809016994374947f; | 
| 895 |   static const float ti12 = .587785252292473f; | 
| 896 |  | 
| 897 |   int cc_offset, ch_offset; | 
| 898 |  | 
| 899 |   /* Local variables */ | 
| 900 |   int i, k, ic; | 
| 901 |   v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3, | 
| 902 |     ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5; | 
| 903 |   int idp2; | 
| 904 |  | 
| 905 | #define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1] | 
| 906 | #define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] | 
| 907 |  | 
| 908 |   /* Parameter adjustments */ | 
| 909 |   ch_offset = 1 + ido * (1 + l1); | 
| 910 |   ch -= ch_offset; | 
| 911 |   cc_offset = 1 + ido * 6; | 
| 912 |   cc -= cc_offset; | 
| 913 |  | 
| 914 |   /* Function Body */ | 
| 915 |   for (k = 1; k <= l1; ++k) { | 
| 916 |     ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k)); | 
| 917 |     ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k)); | 
| 918 |     tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k)); | 
| 919 |     tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k)); | 
| 920 |     ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3)); | 
| 921 |     cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3))); | 
| 922 |     cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3))); | 
| 923 |     ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); | 
| 924 |     ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); | 
| 925 |     ch_ref(1, k, 2) = VSUB(cr2, ci5); | 
| 926 |     ch_ref(1, k, 3) = VSUB(cr3, ci4); | 
| 927 |     ch_ref(1, k, 4) = VADD(cr3, ci4); | 
| 928 |     ch_ref(1, k, 5) = VADD(cr2, ci5); | 
| 929 |   } | 
| 930 |   if (ido == 1) { | 
| 931 |     return; | 
| 932 |   } | 
| 933 |   idp2 = ido + 2; | 
| 934 |   for (k = 1; k <= l1; ++k) { | 
| 935 |     for (i = 3; i <= ido; i += 2) { | 
| 936 |       ic = idp2 - i; | 
| 937 |       ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k)); | 
| 938 |       ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k)); | 
| 939 |       ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k)); | 
| 940 |       ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k)); | 
| 941 |       tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k)); | 
| 942 |       tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k)); | 
| 943 |       tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k)); | 
| 944 |       tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k)); | 
| 945 |       ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3)); | 
| 946 |       ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3)); | 
| 947 |       cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3))); | 
| 948 |       ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3))); | 
| 949 |       cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3))); | 
| 950 |       ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3))); | 
| 951 |       cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); | 
| 952 |       ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); | 
| 953 |       cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); | 
| 954 |       ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); | 
| 955 |       dr3 = VSUB(cr3, ci4); | 
| 956 |       dr4 = VADD(cr3, ci4); | 
| 957 |       di3 = VADD(ci3, cr4); | 
| 958 |       di4 = VSUB(ci3, cr4); | 
| 959 |       dr5 = VADD(cr2, ci5); | 
| 960 |       dr2 = VSUB(cr2, ci5); | 
| 961 |       di5 = VSUB(ci2, cr5); | 
| 962 |       di2 = VADD(ci2, cr5); | 
| 963 |       VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2])); | 
| 964 |       VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2])); | 
| 965 |       VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2])); | 
| 966 |       VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2])); | 
| 967 |  | 
| 968 |       ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2; | 
| 969 |       ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3; | 
| 970 |       ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4; | 
| 971 |       ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5; | 
| 972 |     } | 
| 973 |   } | 
| 974 | #undef cc_ref | 
| 975 | #undef ch_ref | 
| 976 | } /* radb5 */ | 
| 977 |  | 
| 978 | static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, | 
| 979 |                                       const float *wa, const int *ifac) { | 
| 980 |   v4sf *in  = (v4sf*)input_readonly; | 
| 981 |   v4sf *out = (in == work2 ? work1 : work2); | 
| 982 |   int nf = ifac[1], k1; | 
| 983 |   int l2 = n; | 
| 984 |   int iw = n-1; | 
| 985 |   assert(in != out && work1 != work2); | 
| 986 |   for (k1 = 1; k1 <= nf; ++k1) { | 
| 987 |     int kh = nf - k1; | 
| 988 |     int ip = ifac[kh + 2]; | 
| 989 |     int l1 = l2 / ip; | 
| 990 |     int ido = n / l2; | 
| 991 |     iw -= (ip - 1)*ido; | 
| 992 |     switch (ip) { | 
| 993 |       case 5: { | 
| 994 |         int ix2 = iw + ido; | 
| 995 |         int ix3 = ix2 + ido; | 
| 996 |         int ix4 = ix3 + ido; | 
        radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
| 998 |       } break; | 
| 999 |       case 4: { | 
| 1000 |         int ix2 = iw + ido; | 
| 1001 |         int ix3 = ix2 + ido; | 
        radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
| 1003 |       } break; | 
| 1004 |       case 3: { | 
| 1005 |         int ix2 = iw + ido; | 
        radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
| 1007 |       } break; | 
| 1008 |       case 2: | 
        radf2_ps(ido, l1, in, out, &wa[iw]);
| 1010 |         break; | 
| 1011 |       default: | 
| 1012 |         assert(0); | 
| 1013 |         break; | 
| 1014 |     } | 
| 1015 |     l2 = l1; | 
| 1016 |     if (out == work2) { | 
| 1017 |       out = work1; in = work2; | 
| 1018 |     } else { | 
| 1019 |       out = work2; in = work1; | 
| 1020 |     } | 
| 1021 |   } | 
| 1022 |   return in; /* this is in fact the output .. */ | 
| 1023 | } /* rfftf1 */ | 
| 1024 |  | 
| 1025 | static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, | 
| 1026 |                                       const float *wa, const int *ifac) { | 
| 1027 |   v4sf *in  = (v4sf*)input_readonly; | 
| 1028 |   v4sf *out = (in == work2 ? work1 : work2); | 
| 1029 |   int nf = ifac[1], k1; | 
| 1030 |   int l1 = 1; | 
| 1031 |   int iw = 0; | 
| 1032 |   assert(in != out); | 
| 1033 |   for (k1=1; k1<=nf; k1++) { | 
| 1034 |     int ip = ifac[k1 + 1]; | 
| 1035 |     int l2 = ip*l1; | 
| 1036 |     int ido = n / l2; | 
| 1037 |     switch (ip) { | 
| 1038 |       case 5: { | 
| 1039 |         int ix2 = iw + ido; | 
| 1040 |         int ix3 = ix2 + ido; | 
| 1041 |         int ix4 = ix3 + ido; | 
        radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4]);
| 1043 |       } break; | 
| 1044 |       case 4: { | 
| 1045 |         int ix2 = iw + ido; | 
| 1046 |         int ix3 = ix2 + ido; | 
        radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3]);
| 1048 |       } break; | 
| 1049 |       case 3: { | 
| 1050 |         int ix2 = iw + ido; | 
        radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]);
| 1052 |       } break; | 
| 1053 |       case 2: | 
        radb2_ps(ido, l1, in, out, &wa[iw]);
| 1055 |         break; | 
| 1056 |       default: | 
| 1057 |         assert(0); | 
| 1058 |         break; | 
| 1059 |     } | 
| 1060 |     l1 = l2; | 
| 1061 |     iw += (ip - 1)*ido; | 
| 1062 |  | 
| 1063 |     if (out == work2) { | 
| 1064 |       out = work1; in = work2; | 
| 1065 |     } else { | 
| 1066 |       out = work2; in = work1; | 
| 1067 |     } | 
| 1068 |   } | 
| 1069 |   return in; /* this is in fact the output .. */ | 
| 1070 | } | 
| 1071 |  | 
| 1072 | #define IFAC_MAX_SIZE 25 /* max number of integer factors for the decomposition, +2 */ | 
| 1073 | static int decompose(int n, int *ifac, const int *ntryh) { | 
| 1074 |   int nl = n, nf = 0, i, j = 0; | 
| 1075 |   for (j=0; ntryh[j]; ++j) { | 
| 1076 |     int ntry = ntryh[j]; | 
| 1077 |     while (nl != 1) { | 
| 1078 |       int nq = nl / ntry; | 
| 1079 |       int nr = nl - ntry * nq; | 
| 1080 |       if (nr == 0) { | 
| 1081 |         assert(2 + nf < IFAC_MAX_SIZE); | 
| 1082 |         ifac[2+nf++] = ntry; | 
| 1083 |         nl = nq; | 
| 1084 |         if (ntry == 2 && nf != 1) { | 
| 1085 |           for (i = 2; i <= nf; ++i) { | 
| 1086 |             int ib = nf - i + 2; | 
| 1087 |             ifac[ib + 1] = ifac[ib]; | 
| 1088 |           } | 
| 1089 |           ifac[2] = 2; | 
| 1090 |         } | 
| 1091 |       } else break; | 
| 1092 |     } | 
| 1093 |   } | 
| 1094 |   ifac[0] = n; | 
| 1095 |   ifac[1] = nf; | 
| 1096 |   return nf; | 
| 1097 | } | 
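
/* Example: decompose(48, ifac, {4,2,3,5,0}) tries the factor 4 first, so
   48 = 4*4*3 and ifac ends up as { 48, 3, 4, 4, 3 }:
   ifac[0] = n, ifac[1] = number of factors, then the factors themselves. */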
| 1098 |  | 
| 1099 |  | 
| 1100 |  | 
| 1101 | static void rffti1_ps(int n, float *wa, int *ifac) | 
| 1102 | { | 
| 1103 |   static const int ntryh[] = { 4,2,3,5,0 }; | 
| 1104 |   int k1, j, ii; | 
| 1105 |  | 
| 1106 |   int nf = decompose(n,ifac,ntryh); | 
| 1107 |   float argh = (2*M_PI) / n; | 
| 1108 |   int is = 0; | 
| 1109 |   int nfm1 = nf - 1; | 
| 1110 |   int l1 = 1; | 
| 1111 |   for (k1 = 1; k1 <= nfm1; k1++) { | 
| 1112 |     int ip = ifac[k1 + 1]; | 
| 1113 |     int ld = 0; | 
| 1114 |     int l2 = l1*ip; | 
| 1115 |     int ido = n / l2; | 
| 1116 |     int ipm = ip - 1; | 
| 1117 |     for (j = 1; j <= ipm; ++j) { | 
| 1118 |       float argld; | 
| 1119 |       int i = is, fi=0; | 
| 1120 |       ld += l1; | 
| 1121 |       argld = ld*argh; | 
| 1122 |       for (ii = 3; ii <= ido; ii += 2) { | 
| 1123 |         i += 2; | 
| 1124 |         fi += 1; | 
        wa[i - 2] = cos(fi*argld);
        wa[i - 1] = sin(fi*argld);
| 1127 |       } | 
| 1128 |       is += ido; | 
| 1129 |     } | 
| 1130 |     l1 = l2; | 
| 1131 |   } | 
| 1132 | } /* rffti1 */ | 
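
/* The table built above stores, for each radix ip and each j = 1..ip-1, the pairs
   cos(2*pi*fi*ld/n), sin(2*pi*fi*ld/n) that the radfN/radbN butterflies read back
   through their wa1..wa4 arguments. */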
| 1133 |  | 
| 1134 | void cffti1_ps(int n, float *wa, int *ifac) | 
| 1135 | { | 
| 1136 |   static const int ntryh[] = { 5,3,4,2,0 }; | 
| 1137 |   int k1, j, ii; | 
| 1138 |  | 
| 1139 |   int nf = decompose(n,ifac,ntryh); | 
| 1140 |   float argh = (2*M_PI)/(float)n; | 
| 1141 |   int i = 1; | 
| 1142 |   int l1 = 1; | 
| 1143 |   for (k1=1; k1<=nf; k1++) { | 
| 1144 |     int ip = ifac[k1+1]; | 
| 1145 |     int ld = 0; | 
| 1146 |     int l2 = l1*ip; | 
| 1147 |     int ido = n / l2; | 
| 1148 |     int idot = ido + ido + 2; | 
| 1149 |     int ipm = ip - 1; | 
| 1150 |     for (j=1; j<=ipm; j++) { | 
| 1151 |       float argld; | 
| 1152 |       int i1 = i, fi = 0; | 
| 1153 |       wa[i-1] = 1; | 
| 1154 |       wa[i] = 0; | 
| 1155 |       ld += l1; | 
| 1156 |       argld = ld*argh; | 
| 1157 |       for (ii = 4; ii <= idot; ii += 2) { | 
| 1158 |         i += 2; | 
| 1159 |         fi += 1; | 
        wa[i-1] = cos(fi*argld);
        wa[i] = sin(fi*argld);
| 1162 |       } | 
| 1163 |       if (ip > 5) { | 
| 1164 |         wa[i1-1] = wa[i-1]; | 
| 1165 |         wa[i1] = wa[i]; | 
| 1166 |       } | 
| 1167 |     } | 
| 1168 |     l1 = l2; | 
| 1169 |   } | 
| 1170 | } /* cffti1 */ | 
| 1171 |  | 
| 1172 |  | 
| 1173 | v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) { | 
| 1174 |   v4sf *in  = (v4sf*)input_readonly; | 
| 1175 |   v4sf *out = (in == work2 ? work1 : work2); | 
| 1176 |   int nf = ifac[1], k1; | 
| 1177 |   int l1 = 1; | 
| 1178 |   int iw = 0; | 
| 1179 |   assert(in != out && work1 != work2); | 
| 1180 |   for (k1=2; k1<=nf+1; k1++) { | 
| 1181 |     int ip = ifac[k1]; | 
| 1182 |     int l2 = ip*l1; | 
| 1183 |     int ido = n / l2; | 
| 1184 |     int idot = ido + ido; | 
| 1185 |     switch (ip) { | 
| 1186 |       case 5: { | 
| 1187 |         int ix2 = iw + idot; | 
| 1188 |         int ix3 = ix2 + idot; | 
| 1189 |         int ix4 = ix3 + idot; | 
        passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], &wa[ix4], isign);
| 1191 |       } break; | 
| 1192 |       case 4: { | 
| 1193 |         int ix2 = iw + idot; | 
| 1194 |         int ix3 = ix2 + idot; | 
        passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], &wa[ix3], isign);
| 1196 |       } break; | 
| 1197 |       case 2: { | 
        passf2_ps(idot, l1, in, out, &wa[iw], isign);
| 1199 |       } break; | 
| 1200 |       case 3: { | 
| 1201 |         int ix2 = iw + idot; | 
        passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], isign);
| 1203 |       } break; | 
| 1204 |       default: | 
| 1205 |         assert(0); | 
| 1206 |     } | 
| 1207 |     l1 = l2; | 
| 1208 |     iw += (ip - 1)*idot; | 
| 1209 |     if (out == work2) { | 
| 1210 |       out = work1; in = work2; | 
| 1211 |     } else { | 
| 1212 |       out = work2; in = work1; | 
| 1213 |     } | 
| 1214 |   } | 
| 1215 |  | 
| 1216 |   return in; /* this is in fact the output .. */ | 
| 1217 | } | 
| 1218 |  | 
| 1219 |  | 
| 1220 | struct PFFFT_Setup { | 
| 1221 |   int     N; | 
| 1222 |   int     Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) | 
| 1223 |   // hold the decomposition into small integers of N | 
  int ifac[IFAC_MAX_SIZE]; // N, the number of factors, then the factors themselves (allowed values: 2, 3, 4 or 5)
| 1225 |   pffft_transform_t transform; | 
| 1226 |   v4sf *data; // allocated room for twiddle coefs | 
| 1227 |   float *e;    // points into 'data' , N/4*3 elements | 
| 1228 |   float *twiddle; // points into 'data', N/4 elements | 
| 1229 | }; | 
| 1230 |  | 
| 1231 | PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) { | 
| 1232 |   // validate N for negative values or potential int overflow | 
| 1233 |   if (N < 0) { | 
| 1234 |     return 0; | 
| 1235 |   } | 
| 1236 |   if (N > (1<<26)) { | 
    // larger values of N would lead to integer overflow in the index computations below
| 1238 |     assert(0); | 
| 1239 |     return 0; | 
| 1240 |   } | 
  PFFFT_Setup *s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
| 1242 |   int k, m; | 
| 1243 |   /* unfortunately, the fft size must be a multiple of 16 for complex FFTs | 
| 1244 |      and 32 for real FFTs -- a lot of stuff would need to be rewritten to | 
| 1245 |      handle other cases (or maybe just switch to a scalar fft, I don't know..) */ | 
| 1246 |   if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); } | 
| 1247 |   if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); } | 
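  /* concretely, with SIMD_SZ == 4 this means N must be a multiple of 32 for PFFFT_REAL
     (32, 64, 96, ...) and a multiple of 16 for PFFFT_COMPLEX (16, 32, 48, ...); in addition
     N/SIMD_SZ must factor into 2, 3 and 5 only, which is checked at the end of this function. */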
| 1248 |   //assert((N % 32) == 0); | 
| 1249 |   s->N = N; | 
| 1250 |   s->transform = transform; | 
| 1251 |   /* nb of complex simd vectors */ | 
| 1252 |   s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; | 
  s->data = (v4sf*)pffft_aligned_malloc(2*s->Ncvec * sizeof(v4sf));
| 1254 |   s->e = (float*)s->data; | 
| 1255 |   s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); | 
| 1256 |  | 
| 1257 |   for (k=0; k < s->Ncvec; ++k) { | 
| 1258 |     int i = k/SIMD_SZ; | 
| 1259 |     int j = k%SIMD_SZ; | 
| 1260 |     for (m=0; m < SIMD_SZ-1; ++m) { | 
| 1261 |       float A = -2*M_PI*(m+1)*k / N; | 
      s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
      s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
| 1264 |     } | 
| 1265 |   } | 
| 1266 |  | 
| 1267 |   if (transform == PFFFT_REAL) { | 
    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
| 1269 |   } else { | 
    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
| 1271 |   } | 
| 1272 |  | 
| 1273 |   /* check that N is decomposable with allowed prime factors */ | 
| 1274 |   for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; } | 
| 1275 |   if (m != N/SIMD_SZ) { | 
| 1276 |     pffft_destroy_setup(s); s = 0; | 
| 1277 |   } | 
| 1278 |  | 
| 1279 |   return s; | 
| 1280 | } | 
| 1281 |  | 
| 1282 |  | 
| 1283 | void pffft_destroy_setup(PFFFT_Setup *s) { | 
  pffft_aligned_free(s->data);
  free(s);
| 1286 | } | 
| 1287 |  | 
| 1288 | #if !defined(PFFFT_SIMD_DISABLE) | 
| 1289 |  | 
| 1290 | /* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ | 
| 1291 | static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) { | 
| 1292 |   v4sf g0, g1; | 
| 1293 |   int k; | 
| 1294 |   INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride; | 
| 1295 |  | 
| 1296 |   *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] | 
| 1297 |   for (k=1; k < N; ++k) { | 
| 1298 |     v4sf h0, h1; | 
| 1299 |     INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride; | 
| 1300 |     *--out = VSWAPHL(g1, h0); | 
| 1301 |     *--out = VSWAPHL(h0, h1); | 
| 1302 |     g1 = h1; | 
| 1303 |   } | 
| 1304 |   *--out = VSWAPHL(g1, g0); | 
| 1305 | } | 
| 1306 |  | 
| 1307 | static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) { | 
| 1308 |   v4sf g0, g1, h0, h1; | 
| 1309 |   int k; | 
| 1310 |   g0 = g1 = in[0]; ++in; | 
| 1311 |   for (k=1; k < N; ++k) { | 
| 1312 |     h0 = *in++; h1 = *in++; | 
| 1313 |     g1 = VSWAPHL(g1, h0); | 
| 1314 |     h0 = VSWAPHL(h0, h1); | 
| 1315 |     UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride; | 
| 1316 |     g1 = h1; | 
| 1317 |   } | 
| 1318 |   h0 = *in++; h1 = g0; | 
| 1319 |   g1 = VSWAPHL(g1, h0); | 
| 1320 |   h0 = VSWAPHL(h0, h1); | 
| 1321 |   UNINTERLEAVE2(h0, g1, out[0], out[1]); | 
| 1322 | } | 
| 1323 |  | 
| 1324 | void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { | 
| 1325 |   int k, N = setup->N, Ncvec = setup->Ncvec; | 
| 1326 |   const v4sf *vin = (const v4sf*)in; | 
| 1327 |   v4sf *vout = (v4sf*)out; | 
| 1328 |   assert(in != out); | 
| 1329 |   if (setup->transform == PFFFT_REAL) { | 
| 1330 |     int k, dk = N/32; | 
| 1331 |     if (direction == PFFFT_FORWARD) { | 
| 1332 |       for (k=0; k < dk; ++k) { | 
| 1333 |         INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*(0*dk + k) + 0], vout[2*(0*dk + k) + 1]); | 
| 1334 |         INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]); | 
| 1335 |       } | 
      reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
      reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
| 1338 |     } else { | 
| 1339 |       for (k=0; k < dk; ++k) { | 
| 1340 |         UNINTERLEAVE2(vin[2*(0*dk + k) + 0], vin[2*(0*dk + k) + 1], vout[k*8 + 0], vout[k*8 + 1]); | 
| 1341 |         UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]); | 
| 1342 |       } | 
      unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
      unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
| 1345 |     } | 
| 1346 |   } else { | 
| 1347 |     if (direction == PFFFT_FORWARD) { | 
| 1348 |       for (k=0; k < Ncvec; ++k) { | 
| 1349 |         int kk = (k/4) + (k%4)*(Ncvec/4); | 
| 1350 |         INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]); | 
| 1351 |       } | 
| 1352 |     } else { | 
| 1353 |       for (k=0; k < Ncvec; ++k) { | 
| 1354 |         int kk = (k/4) + (k%4)*(Ncvec/4); | 
| 1355 |         UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]); | 
| 1356 |       } | 
| 1357 |     } | 
| 1358 |   } | 
| 1359 | } | 
| 1360 |  | 
| 1361 | void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { | 
| 1362 |   int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks | 
| 1363 |   v4sf r0, i0, r1, i1, r2, i2, r3, i3; | 
| 1364 |   v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; | 
| 1365 |   assert(in != out); | 
| 1366 |   for (k=0; k < dk; ++k) { | 
| 1367 |     r0 = in[8*k+0]; i0 = in[8*k+1]; | 
| 1368 |     r1 = in[8*k+2]; i1 = in[8*k+3]; | 
| 1369 |     r2 = in[8*k+4]; i2 = in[8*k+5]; | 
| 1370 |     r3 = in[8*k+6]; i3 = in[8*k+7]; | 
| 1371 |     VTRANSPOSE4(r0,r1,r2,r3); | 
| 1372 |     VTRANSPOSE4(i0,i1,i2,i3); | 
| 1373 |     VCPLXMUL(r1,i1,e[k*6+0],e[k*6+1]); | 
| 1374 |     VCPLXMUL(r2,i2,e[k*6+2],e[k*6+3]); | 
| 1375 |     VCPLXMUL(r3,i3,e[k*6+4],e[k*6+5]); | 
| 1376 |  | 
| 1377 |     sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); | 
| 1378 |     sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); | 
| 1379 |     si0 = VADD(i0,i2); di0 = VSUB(i0, i2); | 
| 1380 |     si1 = VADD(i1,i3); di1 = VSUB(i1, i3); | 
| 1381 |  | 
| 1382 |     /* | 
| 1383 |       transformation for each column is: | 
| 1384 |  | 
| 1385 |       [1   1   1   1   0   0   0   0]   [r0] | 
| 1386 |       [1   0  -1   0   0  -1   0   1]   [r1] | 
| 1387 |       [1  -1   1  -1   0   0   0   0]   [r2] | 
| 1388 |       [1   0  -1   0   0   1   0  -1]   [r3] | 
| 1389 |       [0   0   0   0   1   1   1   1] * [i0] | 
| 1390 |       [0   1   0  -1   1   0  -1   0]   [i1] | 
| 1391 |       [0   0   0   0   1  -1   1  -1]   [i2] | 
| 1392 |       [0  -1   0   1   1   0  -1   0]   [i3] | 
| 1393 |     */ | 
| 1394 |  | 
| 1395 |     r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); | 
| 1396 |     r1 = VADD(dr0, di1); i1 = VSUB(di0, dr1); | 
| 1397 |     r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); | 
| 1398 |     r3 = VSUB(dr0, di1); i3 = VADD(di0, dr1); | 
| 1399 |  | 
| 1400 |     *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; | 
| 1401 |     *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; | 
| 1402 |   } | 
| 1403 | } | 
| 1404 |  | 
| 1405 | void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { | 
| 1406 |   int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks | 
| 1407 |   v4sf r0, i0, r1, i1, r2, i2, r3, i3; | 
| 1408 |   v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; | 
| 1409 |   assert(in != out); | 
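  /* this undoes pffft_cplx_finalize: each column gets the (un-normalized) inverse 4-point DFT
     (with xk = rk + i*ik: X1 = x0 + i*x1 - x2 - i*x3, X3 = x0 - i*x1 - x2 + i*x3), then the
     conjugate twiddles, then the 4x4 transpose back to the input layout. */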
| 1410 |   for (k=0; k < dk; ++k) { | 
| 1411 |     r0 = in[8*k+0]; i0 = in[8*k+1]; | 
| 1412 |     r1 = in[8*k+2]; i1 = in[8*k+3]; | 
| 1413 |     r2 = in[8*k+4]; i2 = in[8*k+5]; | 
| 1414 |     r3 = in[8*k+6]; i3 = in[8*k+7]; | 
| 1415 |  | 
| 1416 |     sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2); | 
| 1417 |     sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3); | 
| 1418 |     si0 = VADD(i0,i2); di0 = VSUB(i0, i2); | 
| 1419 |     si1 = VADD(i1,i3); di1 = VSUB(i1, i3); | 
| 1420 |  | 
| 1421 |     r0 = VADD(sr0, sr1); i0 = VADD(si0, si1); | 
| 1422 |     r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1); | 
| 1423 |     r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1); | 
| 1424 |     r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1); | 
| 1425 |  | 
| 1426 |     VCPLXMULCONJ(r1,i1,e[k*6+0],e[k*6+1]); | 
| 1427 |     VCPLXMULCONJ(r2,i2,e[k*6+2],e[k*6+3]); | 
| 1428 |     VCPLXMULCONJ(r3,i3,e[k*6+4],e[k*6+5]); | 
| 1429 |  | 
| 1430 |     VTRANSPOSE4(r0,r1,r2,r3); | 
| 1431 |     VTRANSPOSE4(i0,i1,i2,i3); | 
| 1432 |  | 
| 1433 |     *out++ = r0; *out++ = i0; *out++ = r1; *out++ = i1; | 
| 1434 |     *out++ = r2; *out++ = i2; *out++ = r3; *out++ = i3; | 
| 1435 |   } | 
| 1436 | } | 
| 1437 |  | 
| 1438 |  | 
| 1439 | static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in, | 
| 1440 |                                                    const v4sf *e, v4sf *out) { | 
| 1441 |   v4sf r0, i0, r1, i1, r2, i2, r3, i3; | 
| 1442 |   v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; | 
| 1443 |   r0 = *in0; i0 = *in1; | 
| 1444 |   r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; | 
| 1445 |   VTRANSPOSE4(r0,r1,r2,r3); | 
| 1446 |   VTRANSPOSE4(i0,i1,i2,i3); | 
| 1447 |  | 
| 1448 |   /* | 
| 1449 |     transformation for each column is: | 
| 1450 |  | 
| 1451 |     [1   1   1   1   0   0   0   0]   [r0] | 
| 1452 |     [1   0  -1   0   0  -1   0   1]   [r1] | 
| 1453 |     [1   0  -1   0   0   1   0  -1]   [r2] | 
| 1454 |     [1  -1   1  -1   0   0   0   0]   [r3] | 
| 1455 |     [0   0   0   0   1   1   1   1] * [i0] | 
| 1456 |     [0  -1   0   1  -1   0   1   0]   [i1] | 
| 1457 |     [0  -1   0   1   1   0  -1   0]   [i2] | 
| 1458 |     [0   0   0   0  -1   1  -1   1]   [i3] | 
| 1459 |   */ | 
| 1460 |  | 
| 1461 |   //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; | 
| 1462 |   //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; | 
| 1463 |  | 
| 1464 |   VCPLXMUL(r1,i1,e[0],e[1]); | 
| 1465 |   VCPLXMUL(r2,i2,e[2],e[3]); | 
| 1466 |   VCPLXMUL(r3,i3,e[4],e[5]); | 
| 1467 |  | 
| 1468 |   //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; | 
| 1469 |   //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; | 
| 1470 |  | 
| 1471 |   sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2); | 
| 1472 |   sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1); | 
| 1473 |   si0 = VADD(i0,i2); di0 = VSUB(i0,i2); | 
| 1474 |   si1 = VADD(i1,i3); di1 = VSUB(i3,i1); | 
| 1475 |  | 
| 1476 |   r0 = VADD(sr0, sr1); | 
| 1477 |   r3 = VSUB(sr0, sr1); | 
| 1478 |   i0 = VADD(si0, si1); | 
| 1479 |   i3 = VSUB(si1, si0); | 
| 1480 |   r1 = VADD(dr0, di1); | 
| 1481 |   r2 = VSUB(dr0, di1); | 
| 1482 |   i1 = VSUB(dr1, di0); | 
| 1483 |   i2 = VADD(dr1, di0); | 
| 1484 |  | 
| 1485 |   *out++ = r0; | 
| 1486 |   *out++ = i0; | 
| 1487 |   *out++ = r1; | 
| 1488 |   *out++ = i1; | 
| 1489 |   *out++ = r2; | 
| 1490 |   *out++ = i2; | 
| 1491 |   *out++ = r3; | 
| 1492 |   *out++ = i3; | 
| 1493 |  | 
| 1494 | } | 
| 1495 |  | 
| 1496 | static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { | 
| 1497 |   int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks | 
| 1498 |   /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ | 
| 1499 |  | 
| 1500 |   v4sf_union cr, ci, *uout = (v4sf_union*)out; | 
| 1501 |   v4sf save = in[7], zero=VZERO(); | 
| 1502 |   float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3; | 
| 1503 |   static const float s = (float)(M_SQRT2/2); | 
| 1504 |  | 
| 1505 |   cr.v = in[0]; ci.v = in[Ncvec*2-1]; | 
| 1506 |   assert(in != out); | 
  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
| 1508 |  | 
| 1509 |   /* | 
| 1510 |     [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] | 
| 1511 |  | 
    [Xr(1)   ] [1   1   1   1   0   0   0   0]
| 1513 |     [Xr(N/4) ] [0   0   0   0   1   s   0  -s] | 
| 1514 |     [Xr(N/2) ] [1   0  -1   0   0   0   0   0] | 
| 1515 |     [Xr(3N/4)] [0   0   0   0   1  -s   0   s] | 
| 1516 |     [Xi(1)   ] [1  -1   1  -1   0   0   0   0] | 
| 1517 |     [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s] | 
| 1518 |     [Xi(N/2) ] [0  -1   0   1   0   0   0   0] | 
| 1519 |     [Xi(3N/4)] [0   0   0   0   0  -s   1  -s] | 
| 1520 |   */ | 
| 1521 |  | 
| 1522 |   xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0; | 
| 1523 |   xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0; | 
| 1524 |   xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2; | 
| 1525 |   xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2; | 
| 1526 |   xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1; | 
| 1527 |   xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1; | 
| 1528 |   xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3; | 
| 1529 |   xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3; | 
| 1530 |  | 
| 1531 |   for (k=1; k < dk; ++k) { | 
| 1532 |     v4sf save_next = in[8*k+7]; | 
    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
                            e + k*6, out + k*8);
| 1535 |     save = save_next; | 
| 1536 |   } | 
| 1537 |  | 
| 1538 | } | 
| 1539 |  | 
| 1540 | static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in, | 
| 1541 |                                                      const v4sf *e, v4sf *out, int first) { | 
| 1542 |   v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7]; | 
| 1543 |   /* | 
| 1544 |     transformation for each column is: | 
| 1545 |  | 
| 1546 |     [1   1   1   1   0   0   0   0]   [r0] | 
| 1547 |     [1   0   0  -1   0  -1  -1   0]   [r1] | 
| 1548 |     [1  -1  -1   1   0   0   0   0]   [r2] | 
| 1549 |     [1   0   0  -1   0   1   1   0]   [r3] | 
| 1550 |     [0   0   0   0   1  -1   1  -1] * [i0] | 
| 1551 |     [0  -1   1   0   1   0   0   1]   [i1] | 
| 1552 |     [0   0   0   0   1   1  -1  -1]   [i2] | 
| 1553 |     [0   1  -1   0   1   0   0   1]   [i3] | 
| 1554 |   */ | 
| 1555 |  | 
| 1556 |   v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3); | 
| 1557 |   v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2); | 
| 1558 |   v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3); | 
| 1559 |   v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2); | 
| 1560 |  | 
| 1561 |   r0 = VADD(sr0, sr1); | 
| 1562 |   r2 = VSUB(sr0, sr1); | 
| 1563 |   r1 = VSUB(dr0, si1); | 
| 1564 |   r3 = VADD(dr0, si1); | 
| 1565 |   i0 = VSUB(di0, di1); | 
| 1566 |   i2 = VADD(di0, di1); | 
| 1567 |   i1 = VSUB(si0, dr1); | 
| 1568 |   i3 = VADD(si0, dr1); | 
| 1569 |  | 
| 1570 |   VCPLXMULCONJ(r1,i1,e[0],e[1]); | 
| 1571 |   VCPLXMULCONJ(r2,i2,e[2],e[3]); | 
| 1572 |   VCPLXMULCONJ(r3,i3,e[4],e[5]); | 
| 1573 |  | 
| 1574 |   VTRANSPOSE4(r0,r1,r2,r3); | 
| 1575 |   VTRANSPOSE4(i0,i1,i2,i3); | 
| 1576 |  | 
| 1577 |   if (!first) { | 
| 1578 |     *out++ = r0; | 
| 1579 |     *out++ = i0; | 
| 1580 |   } | 
| 1581 |   *out++ = r1; | 
| 1582 |   *out++ = i1; | 
| 1583 |   *out++ = r2; | 
| 1584 |   *out++ = i2; | 
| 1585 |   *out++ = r3; | 
| 1586 |   *out++ = i3; | 
| 1587 | } | 
| 1588 |  | 
| 1589 | static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { | 
| 1590 |   int k, dk = Ncvec/SIMD_SZ; // number of 4x4 matrix blocks | 
| 1591 |   /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */ | 
| 1592 |  | 
| 1593 |   v4sf_union Xr, Xi, *uout = (v4sf_union*)out; | 
| 1594 |   float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3; | 
| 1595 |   static const float s = (float)M_SQRT2; | 
| 1596 |   assert(in != out); | 
| 1597 |   for (k=0; k < 4; ++k) { | 
| 1598 |     Xr.f[k] = ((float*)in)[8*k]; | 
| 1599 |     Xi.f[k] = ((float*)in)[8*k+4]; | 
| 1600 |   } | 
| 1601 |  | 
  pffft_real_preprocess_4x4(in, e, out+1, 1); // will write only 6 values
| 1603 |  | 
| 1604 |   /* | 
| 1605 |     [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] | 
| 1606 |  | 
| 1607 |     [cr0] [1   0   2   0   1   0   0   0] | 
| 1608 |     [cr1] [1   0   0   0  -1   0  -2   0] | 
| 1609 |     [cr2] [1   0  -2   0   1   0   0   0] | 
| 1610 |     [cr3] [1   0   0   0  -1   0   2   0] | 
| 1611 |     [ci0] [0   2   0   2   0   0   0   0] | 
| 1612 |     [ci1] [0   s   0  -s   0  -s   0  -s] | 
| 1613 |     [ci2] [0   0   0   0   0  -2   0   2] | 
| 1614 |     [ci3] [0  -s   0   s   0  -s   0  -s] | 
| 1615 |   */ | 
| 1616 |   for (k=1; k < dk; ++k) { | 
    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
| 1618 |   } | 
| 1619 |  | 
| 1620 |   cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0; | 
| 1621 |   cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1; | 
| 1622 |   cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2; | 
| 1623 |   cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3; | 
| 1624 |   ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0; | 
| 1625 |   ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1; | 
| 1626 |   ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2; | 
| 1627 |   ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3; | 
| 1628 | } | 
| 1629 |  | 
| 1630 |  | 
| 1631 | void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch, | 
| 1632 |                               pffft_direction_t direction, int ordered) { | 
| 1633 |   int k, Ncvec   = setup->Ncvec; | 
| 1634 |   int nf_odd = (setup->ifac[1] & 1); | 
| 1635 |  | 
| 1636 |   // temporary buffer is allocated on the stack if the scratch pointer is NULL | 
| 1637 |   int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); | 
| 1638 |   VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); | 
| 1639 |  | 
| 1640 |   const v4sf *vinput = (const v4sf*)finput; | 
| 1641 |   v4sf *voutput      = (v4sf*)foutput; | 
| 1642 |   v4sf *buff[2]      = { voutput, scratch ? scratch : scratch_on_stack }; | 
| 1643 |   int ib = (nf_odd ^ ordered ? 1 : 0); | 
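  /* ib selects which of the two buffers receives the first fftpack pass; it is derived from the
     parity of the number of passes and from whether a final reorder will take place, so that in
     the common case the last write of the pipeline lands directly in 'voutput' (the copy at the
     end of this function is only a fallback). */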
| 1644 |  | 
| 1645 |   assert(VALIGNED(finput) && VALIGNED(foutput)); | 
| 1646 |  | 
| 1647 |   //assert(finput != foutput); | 
| 1648 |   if (direction == PFFFT_FORWARD) { | 
| 1649 |     ib = !ib; | 
| 1650 |     if (setup->transform == PFFFT_REAL) { | 
      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
| 1654 |     } else { | 
| 1655 |       v4sf *tmp = buff[ib]; | 
| 1656 |       for (k=0; k < Ncvec; ++k) { | 
| 1657 |         UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]); | 
| 1658 |       } | 
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
| 1662 |     } | 
| 1663 |     if (ordered) { | 
      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
| 1665 |     } else ib = !ib; | 
| 1666 |   } else { | 
| 1667 |     if (vinput == buff[ib]) { | 
| 1668 |       ib = !ib; // may happen when finput == foutput | 
| 1669 |     } | 
| 1670 |     if (ordered) { | 
      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
| 1672 |       vinput = buff[ib]; ib = !ib; | 
| 1673 |     } | 
| 1674 |     if (setup->transform == PFFFT_REAL) { | 
      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
| 1678 |     } else { | 
      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
| 1682 |       for (k=0; k < Ncvec; ++k) { | 
| 1683 |         INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]); | 
| 1684 |       } | 
| 1685 |     } | 
| 1686 |   } | 
| 1687 |  | 
| 1688 |   if (buff[ib] != voutput) { | 
| 1689 |     /* extra copy required -- this situation should only happen when finput == foutput */ | 
| 1690 |     assert(finput==foutput); | 
| 1691 |     for (k=0; k < Ncvec; ++k) { | 
| 1692 |       v4sf a = buff[ib][2*k], b = buff[ib][2*k+1]; | 
| 1693 |       voutput[2*k] = a; voutput[2*k+1] = b; | 
| 1694 |     } | 
| 1695 |     ib = !ib; | 
| 1696 |   } | 
| 1697 |   assert(buff[ib] == voutput); | 
| 1698 | } | 
| 1699 |  | 
| 1700 | void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { | 
| 1701 |   int Ncvec = s->Ncvec; | 
| 1702 |   const v4sf * RESTRICT va = (const v4sf*)a; | 
| 1703 |   const v4sf * RESTRICT vb = (const v4sf*)b; | 
| 1704 |   v4sf * RESTRICT vab = (v4sf*)ab; | 
| 1705 |  | 
| 1706 | #ifdef __arm__ | 
| 1707 |   __builtin_prefetch(va); | 
| 1708 |   __builtin_prefetch(vb); | 
| 1709 |   __builtin_prefetch(vab); | 
| 1710 |   __builtin_prefetch(va+2); | 
| 1711 |   __builtin_prefetch(vb+2); | 
| 1712 |   __builtin_prefetch(vab+2); | 
| 1713 |   __builtin_prefetch(va+4); | 
| 1714 |   __builtin_prefetch(vb+4); | 
| 1715 |   __builtin_prefetch(vab+4); | 
| 1716 |   __builtin_prefetch(va+6); | 
| 1717 |   __builtin_prefetch(vb+6); | 
| 1718 |   __builtin_prefetch(vab+6); | 
| 1719 | # ifndef __clang__ | 
#   define ZCONVOLVE_USING_INLINE_ASM
| 1721 | # endif | 
| 1722 | #endif | 
| 1723 |  | 
| 1724 |   float ar0, ai0, br0, bi0, abr0, abi0; | 
| 1725 | #ifndef ZCONVOLVE_USING_INLINE_ASM | 
| 1726 |   v4sf vscal = LD_PS1(scaling); | 
| 1727 |   int i; | 
| 1728 | #endif | 
| 1729 |  | 
| 1730 |   assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); | 
| 1731 |   ar0 = ((v4sf_union*)va)[0].f[0]; | 
| 1732 |   ai0 = ((v4sf_union*)va)[1].f[0]; | 
| 1733 |   br0 = ((v4sf_union*)vb)[0].f[0]; | 
| 1734 |   bi0 = ((v4sf_union*)vb)[1].f[0]; | 
| 1735 |   abr0 = ((v4sf_union*)vab)[0].f[0]; | 
| 1736 |   abi0 = ((v4sf_union*)vab)[1].f[0]; | 
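  /* when the transform is PFFFT_REAL, lane 0 of the first two vectors holds the (purely real)
     DC and Nyquist coefficients; they are saved here and patched after the complex-multiply
     loop in the PFFFT_REAL fix-up at the end of this function. */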
| 1737 |   | 
| 1738 | #ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc | 
| 1739 |   const float *a_ = a, *b_ = b; float *ab_ = ab; | 
| 1740 |   int N = Ncvec; | 
| 1741 |   asm volatile("mov         r8, %2                  \n"  | 
| 1742 |                "vdup.f32    q15, %4                 \n"  | 
| 1743 |                "1:                                  \n"  | 
| 1744 |                "pld         [%0,#64]                \n"  | 
| 1745 |                "pld         [%1,#64]                \n"  | 
| 1746 |                "pld         [%2,#64]                \n"  | 
| 1747 |                "pld         [%0,#96]                \n"  | 
| 1748 |                "pld         [%1,#96]                \n"  | 
| 1749 |                "pld         [%2,#96]                \n"  | 
| 1750 |                "vld1.f32    {q0,q1},   [%0,:128]!         \n"  | 
| 1751 |                "vld1.f32    {q4,q5},   [%1,:128]!         \n"  | 
| 1752 |                "vld1.f32    {q2,q3},   [%0,:128]!         \n"  | 
| 1753 |                "vld1.f32    {q6,q7},   [%1,:128]!         \n"  | 
| 1754 |                "vld1.f32    {q8,q9},   [r8,:128]!          \n"  | 
| 1755 |  | 
| 1756 |                "vmul.f32    q10, q0, q4             \n"  | 
| 1757 |                "vmul.f32    q11, q0, q5             \n"  | 
| 1758 |                "vmul.f32    q12, q2, q6             \n"  | 
| 1759 |                "vmul.f32    q13, q2, q7             \n"  | 
| 1760 |                "vmls.f32    q10, q1, q5             \n"  | 
| 1761 |                "vmla.f32    q11, q1, q4             \n"  | 
| 1762 |                "vld1.f32    {q0,q1}, [r8,:128]!     \n"  | 
| 1763 |                "vmls.f32    q12, q3, q7             \n"  | 
| 1764 |                "vmla.f32    q13, q3, q6             \n"  | 
| 1765 |                "vmla.f32    q8, q10, q15            \n"  | 
| 1766 |                "vmla.f32    q9, q11, q15            \n"  | 
| 1767 |                "vmla.f32    q0, q12, q15            \n"  | 
| 1768 |                "vmla.f32    q1, q13, q15            \n"  | 
| 1769 |                "vst1.f32    {q8,q9},[%2,:128]!    \n"  | 
| 1770 |                "vst1.f32    {q0,q1},[%2,:128]!    \n"  | 
| 1771 |                "subs        %3, #2                  \n"  | 
| 1772 |                "bne         1b                      \n"  | 
| 1773 |                : "+r" (a_), "+r" (b_), "+r" (ab_), "+r" (N) : "r" (scaling) : "r8" , "q0" ,"q1" ,"q2" ,"q3" ,"q4" ,"q5" ,"q6" ,"q7" ,"q8" ,"q9" , "q10" ,"q11" ,"q12" ,"q13" ,"q15" ,"memory" ); | 
| 1774 | #else // default routine, works fine for non-arm cpus with current compilers | 
| 1775 |   for (i=0; i < Ncvec; i += 2) { | 
| 1776 |     v4sf ar, ai, br, bi; | 
| 1777 |     ar = va[2*i+0]; ai = va[2*i+1]; | 
| 1778 |     br = vb[2*i+0]; bi = vb[2*i+1]; | 
| 1779 |     VCPLXMUL(ar, ai, br, bi); | 
| 1780 |     vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]); | 
| 1781 |     vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]); | 
| 1782 |     ar = va[2*i+2]; ai = va[2*i+3]; | 
| 1783 |     br = vb[2*i+2]; bi = vb[2*i+3]; | 
| 1784 |     VCPLXMUL(ar, ai, br, bi); | 
| 1785 |     vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]); | 
| 1786 |     vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]); | 
| 1787 |   } | 
| 1788 | #endif | 
| 1789 |   if (s->transform == PFFFT_REAL) { | 
| 1790 |     ((v4sf_union*)vab)[0].f[0] = abr0 + ar0*br0*scaling; | 
| 1791 |     ((v4sf_union*)vab)[1].f[0] = abi0 + ai0*bi0*scaling; | 
| 1792 |   } | 
| 1793 | } | 
| 1794 |  | 
| 1795 |  | 
| 1796 | #else // defined(PFFFT_SIMD_DISABLE) | 
| 1797 |  | 
| 1798 | // standard routine using scalar floats, without SIMD stuff. | 
| 1799 |  | 
| 1800 | #define pffft_zreorder_nosimd pffft_zreorder | 
| 1801 | void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { | 
| 1802 |   int k, N = setup->N; | 
| 1803 |   if (setup->transform == PFFFT_COMPLEX) { | 
| 1804 |     for (k=0; k < 2*N; ++k) out[k] = in[k]; | 
| 1805 |     return; | 
| 1806 |   } | 
| 1807 |   else if (direction == PFFFT_FORWARD) { | 
| 1808 |     float x_N = in[N-1]; | 
| 1809 |     for (k=N-1; k > 1; --k) out[k] = in[k-1]; | 
| 1810 |     out[0] = in[0]; | 
| 1811 |     out[1] = x_N; | 
| 1812 |   } else { | 
| 1813 |     float x_N = in[1]; | 
| 1814 |     for (k=1; k < N-1; ++k) out[k] = in[k+1]; | 
| 1815 |     out[0] = in[0]; | 
| 1816 |     out[N-1] = x_N; | 
| 1817 |   } | 
| 1818 | } | 
| 1819 |  | 
| 1820 | #define pffft_transform_internal_nosimd pffft_transform_internal | 
| 1821 | void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch, | 
| 1822 |                                      pffft_direction_t direction, int ordered) { | 
| 1823 |   int Ncvec   = setup->Ncvec; | 
| 1824 |   int nf_odd = (setup->ifac[1] & 1); | 
| 1825 |  | 
| 1826 |   // temporary buffer is allocated on the stack if the scratch pointer is NULL | 
| 1827 |   int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); | 
| 1828 |   VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); | 
| 1829 |   float *buff[2]; | 
| 1830 |   int ib; | 
| 1831 |   if (scratch == 0) scratch = scratch_on_stack; | 
| 1832 |   buff[0] = output; buff[1] = scratch; | 
| 1833 |  | 
| 1834 |   if (setup->transform == PFFFT_COMPLEX) ordered = 0; // it is always ordered. | 
| 1835 |   ib = (nf_odd ^ ordered ? 1 : 0); | 
| 1836 |  | 
| 1837 |   if (direction == PFFFT_FORWARD) { | 
| 1838 |     if (setup->transform == PFFFT_REAL) { | 
| 1839 |       ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib], | 
| 1840 |                       setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); | 
| 1841 |     } else { | 
| 1842 |       ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], | 
| 1843 |                       setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1); | 
| 1844 |     } | 
| 1845 |     if (ordered) { | 
| 1846 |       pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib; | 
| 1847 |     } | 
| 1848 |   } else { | 
| 1849 |     if (input == buff[ib]) { | 
| 1850 |       ib = !ib; // may happen when finput == foutput | 
| 1851 |     } | 
| 1852 |     if (ordered) { | 
| 1853 |       pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); | 
| 1854 |       input = buff[!ib]; | 
| 1855 |     } | 
| 1856 |     if (setup->transform == PFFFT_REAL) { | 
| 1857 |       ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib], | 
| 1858 |                       setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); | 
| 1859 |     } else { | 
| 1860 |       ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], | 
| 1861 |                       setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1); | 
| 1862 |     } | 
| 1863 |   } | 
| 1864 |   if (buff[ib] != output) { | 
| 1865 |     int k; | 
    // extra copy required -- this situation should only happen when input == output
| 1867 |     assert(input==output); | 
| 1868 |     for (k=0; k < Ncvec; ++k) { | 
| 1869 |       float a = buff[ib][2*k], b = buff[ib][2*k+1]; | 
| 1870 |       output[2*k] = a; output[2*k+1] = b; | 
| 1871 |     } | 
| 1872 |     ib = !ib; | 
| 1873 |   } | 
| 1874 |   assert(buff[ib] == output); | 
| 1875 | } | 
| 1876 |  | 
| 1877 | #define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate | 
| 1878 | void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, | 
| 1879 |                                        float *ab, float scaling) { | 
| 1880 |   int i, Ncvec = s->Ncvec; | 
| 1881 |  | 
| 1882 |   if (s->transform == PFFFT_REAL) { | 
| 1883 |     // take care of the fftpack ordering | 
| 1884 |     ab[0] += a[0]*b[0]*scaling; | 
| 1885 |     ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling; | 
| 1886 |     ++ab; ++a; ++b; --Ncvec; | 
| 1887 |   } | 
| 1888 |   for (i=0; i < Ncvec; ++i) { | 
| 1889 |     float ar, ai, br, bi; | 
| 1890 |     ar = a[2*i+0]; ai = a[2*i+1]; | 
| 1891 |     br = b[2*i+0]; bi = b[2*i+1]; | 
| 1892 |     VCPLXMUL(ar, ai, br, bi); | 
| 1893 |     ab[2*i+0] += ar*scaling; | 
| 1894 |     ab[2*i+1] += ai*scaling; | 
| 1895 |   } | 
| 1896 | } | 
| 1897 |  | 
| 1898 | #endif // defined(PFFFT_SIMD_DISABLE) | 
| 1899 |  | 
| 1900 | void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { | 
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
| 1902 | } | 
| 1903 |  | 
| 1904 | void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { | 
  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
| 1906 | } | 
| 1907 |  |