ITK 6.0.0
Insight Toolkit
 
Loading...
Searching...
No Matches
pocketfft_hdronly.h
Go to the documentation of this file.
1/*
2This file is part of pocketfft.
3
4Copyright (C) 2010-2024 Max-Planck-Society
5Copyright (C) 2019-2020 Peter Bell
6
7For the odd-sized DCT-IV transforms:
8 Copyright (C) 2003, 2007-14 Matteo Frigo
9 Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology
10
11For the prev_good_size search:
12 Copyright (C) 2024 Tan Ping Liang, Peter Bell
13
14For the safeguards against integer overflow in good_size search:
15 Copyright (C) 2024 Cris Luengo
16
17Authors: Martin Reinecke, Peter Bell
18
19All rights reserved.
20
21Redistribution and use in source and binary forms, with or without modification,
22are permitted provided that the following conditions are met:
23
24* Redistributions of source code must retain the above copyright notice, this
25 list of conditions and the following disclaimer.
26* Redistributions in binary form must reproduce the above copyright notice, this
27 list of conditions and the following disclaimer in the documentation and/or
28 other materials provided with the distribution.
29* Neither the name of the copyright holder nor the names of its contributors may
30 be used to endorse or promote products derived from this software without
31 specific prior written permission.
32
33THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
34ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
35WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
36DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
37ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
38(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
39LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
40ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
41(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43*/
44
45#ifndef POCKETFFT_HDRONLY_H
46#define POCKETFFT_HDRONLY_H
47
48#ifndef __cplusplus
49#error This file is C++ and requires a C++ compiler.
50#endif
51
52#if !(__cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L))
53#error This file requires at least C++11 support.
54#endif
55
56#ifndef POCKETFFT_NAMESPACE
57#define POCKETFFT_NAMESPACE pocketfft
58#endif
59
60#ifndef POCKETFFT_CACHE_SIZE
61// ITK default: save up to 16 plans so repeated same-size transforms (e.g. the
62// per-iteration N3/N4 histogram FFT) reuse a plan instead of rebuilding it.
63// Any nonzero size is a robust choice: it gives a significant speedup over 0
64// across every tested ITK environment (including the ANTs and BRAINSTools
65// test suites); 16 covers typical multi-size pipelines with no measured cost.
66#define POCKETFFT_CACHE_SIZE 16
67#endif
68
69#include <cmath>
70#include <cstdlib>
71#include <cstddef>
72#include <cstdint>
73#include <exception>
74#include <stdexcept>
75#include <memory>
76#include <vector>
77#include <complex>
78#include <algorithm>
79#include <limits>
80#if POCKETFFT_CACHE_SIZE!=0
81#include <array>
82#include <mutex>
83#endif
84
85#ifndef POCKETFFT_NO_MULTITHREADING
86#include <mutex>
87#include <condition_variable>
88#include <thread>
89#include <queue>
90#include <atomic>
91#include <functional>
92#include <new>
93
94#ifdef POCKETFFT_PTHREADS
95# include <pthread.h>
96#endif
97#endif
98
99#if defined(__GNUC__)
100#define POCKETFFT_NOINLINE __attribute__((noinline))
101#define POCKETFFT_RESTRICT __restrict__
102#elif defined(_MSC_VER)
103#define POCKETFFT_NOINLINE __declspec(noinline)
104#define POCKETFFT_RESTRICT __restrict
105#else
106#define POCKETFFT_NOINLINE
107#define POCKETFFT_RESTRICT
108#endif
109
111
112namespace detail {
113using std::size_t;
114using std::ptrdiff_t;
115
116// Always use std:: for <cmath> functions
117template <typename T> T cos(T) = delete;
118template <typename T> T sin(T) = delete;
119template <typename T> T sqrt(T) = delete;
120
121using shape_t = std::vector<size_t>;
122using stride_t = std::vector<ptrdiff_t>;
123
124constexpr bool FORWARD = true,
125 BACKWARD = false;
126
127// only enable vector support for gcc>=5.0 and clang>=5.0
128#ifndef POCKETFFT_NO_VECTORS
129#define POCKETFFT_NO_VECTORS
130#if defined(__INTEL_COMPILER)
131// do nothing. This is necessary because this compiler also sets __GNUC__.
132#elif defined(__clang__)
133// AppleClang has their own version numbering
134#ifdef __apple_build_version__
135# if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1)
136# undef POCKETFFT_NO_VECTORS
137# endif
138#elif __clang_major__ >= 5
139# undef POCKETFFT_NO_VECTORS
140#endif
141#elif defined(__GNUC__)
142#if __GNUC__>=5
143#undef POCKETFFT_NO_VECTORS
144#endif
145#endif
146#endif
147
148template<typename T> struct VLEN { static constexpr size_t val=1; };
149
150#ifndef POCKETFFT_NO_VECTORS
151#if (defined(__AVX512F__))
152template<> struct VLEN<float> { static constexpr size_t val=16; };
153template<> struct VLEN<double> { static constexpr size_t val=8; };
154#elif (defined(__AVX__))
155template<> struct VLEN<float> { static constexpr size_t val=8; };
156template<> struct VLEN<double> { static constexpr size_t val=4; };
157#elif (defined(__SSE2__))
158template<> struct VLEN<float> { static constexpr size_t val=4; };
159template<> struct VLEN<double> { static constexpr size_t val=2; };
160#elif (defined(__VSX__))
161template<> struct VLEN<float> { static constexpr size_t val=4; };
162template<> struct VLEN<double> { static constexpr size_t val=2; };
163#elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
164template<> struct VLEN<float> { static constexpr size_t val=4; };
165template<> struct VLEN<double> { static constexpr size_t val=2; };
166#elif (defined(__riscv_vector) || defined(__riscv_v_intrinsic))
167template<> struct VLEN<float> { static constexpr size_t val=8; };
168template<> struct VLEN<double> { static constexpr size_t val=4; };
169#else
170#define POCKETFFT_NO_VECTORS
171#endif
172#endif
173
174// std::aligned_alloc is a bit cursed ... it doesn't exist on MacOS < 10.15
175// and in musl, and other OSes seem to have even more peculiarities.
176// Let's unconditionally work around it for now.
177#if defined(POCKETFFT_USE_POSIX_MEMALIGN) && (defined(__APPLE__) || defined(__unix__))
178// Use posix_memalign on POSIX systems when explicitly enabled.
179// The portable aligned_alloc below stores metadata at ptr[-1], which conflicts
180// with ASAN's heap redzone and causes intermittent bus errors.
181inline void *aligned_alloc(size_t align, size_t size)
182 {
183 align = std::max(align, sizeof(void*)); // posix_memalign requires align >= sizeof(void*)
184 void *ptr = nullptr;
185 if (posix_memalign(&ptr, align, size) != 0)
186 throw std::bad_alloc();
187 return ptr;
188 }
189inline void aligned_dealloc(void *ptr)
190 { free(ptr); }
191#else
192# if 0
193//#if (__cplusplus >= 201703L) && (!defined(__MINGW32__)) && (!defined(_MSC_VER)) && (__MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_X_VERSION_10_15)
194inline void *aligned_alloc(size_t align, size_t size)
195 {
196 // aligned_alloc() requires that the requested size is a multiple of "align"
197 void *ptr = ::aligned_alloc(align,(size+align-1)&(~(align-1)));
198 if (!ptr) throw std::bad_alloc();
199 return ptr;
200 }
201inline void aligned_dealloc(void *ptr)
202 { free(ptr); }
203#else // portable emulation
204inline void *aligned_alloc(size_t align, size_t size)
205 {
206 align = std::max(align, alignof(max_align_t));
207 void *ptr = malloc(size+align);
208 if (!ptr) throw std::bad_alloc();
209 void *res = reinterpret_cast<void *>
210 ((reinterpret_cast<uintptr_t>(ptr) & ~(uintptr_t(align-1))) + uintptr_t(align));
211 (reinterpret_cast<void**>(res))[-1] = ptr;
212 return res;
213 }
214inline void aligned_dealloc(void *ptr)
215 { if (ptr) free((reinterpret_cast<void**>(ptr))[-1]); }
216# endif
217#endif
218
219template<typename T> class arr
220 {
221 private:
222 T *p;
223 size_t sz;
224
225#if defined(POCKETFFT_NO_VECTORS)
226 static T *ralloc(size_t num)
227 {
228 if (num==0) return nullptr;
229 void *res = malloc(num*sizeof(T));
230 if (!res) throw std::bad_alloc();
231 return reinterpret_cast<T *>(res);
232 }
233 static void dealloc(T *ptr)
234 { free(ptr); }
235#else
236 static T *ralloc(size_t num)
237 {
238 if (num==0) return nullptr;
239 void *ptr = aligned_alloc(64, num*sizeof(T));
240 return static_cast<T*>(ptr);
241 }
242 static void dealloc(T *ptr)
243 { aligned_dealloc(ptr); }
244#endif
245
246 public:
247 arr() : p(0), sz(0) {}
248 explicit arr(size_t n) : p(ralloc(n)), sz(n) {}
249 arr(arr &&other) noexcept
250 : p(other.p), sz(other.sz)
251 { other.p=nullptr; other.sz=0; }
252 ~arr() { dealloc(p); }
253
254 void resize(size_t n)
255 {
256 if (n==sz) return;
257 dealloc(p);
258 p = ralloc(n);
259 sz = n;
260 }
261
262 T &operator[](size_t idx) { return p[idx]; }
263 const T &operator[](size_t idx) const { return p[idx]; }
264
265 T *data() { return p; }
266 const T *data() const { return p; }
267
268 size_t size() const { return sz; }
269 };
270
271template<typename T> struct cmplx {
272 T r, i;
273 cmplx() = default;
274 cmplx(T r_, T i_) : r(r_), i(i_) {}
275 void Set(T r_, T i_) { r=r_; i=i_; }
276 void Set(T r_) { r=r_; i=T(0); }
277 cmplx &operator+= (const cmplx &other)
278 { r+=other.r; i+=other.i; return *this; }
279 template<typename T2>cmplx &operator*= (T2 other)
280 { r*=other; i*=other; return *this; }
281 template<typename T2>cmplx &operator*= (const cmplx<T2> &other)
282 {
283 T tmp = r*other.r - i*other.i;
284 i = r*other.i + i*other.r;
285 r = tmp;
286 return *this;
287 }
288 template<typename T2>cmplx &operator+= (const cmplx<T2> &other)
289 { r+=other.r; i+=other.i; return *this; }
290 template<typename T2>cmplx &operator-= (const cmplx<T2> &other)
291 { r-=other.r; i-=other.i; return *this; }
292 template<typename T2> auto operator* (const T2 &other) const
293 -> cmplx<decltype(r*other)>
294 { return {r*other, i*other}; }
295 template<typename T2> auto operator+ (const cmplx<T2> &other) const
296 -> cmplx<decltype(r+other.r)>
297 { return {r+other.r, i+other.i}; }
298 template<typename T2> auto operator- (const cmplx<T2> &other) const
299 -> cmplx<decltype(r+other.r)>
300 { return {r-other.r, i-other.i}; }
301 template<typename T2> auto operator* (const cmplx<T2> &other) const
302 -> cmplx<decltype(r+other.r)>
303 { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
304 template<bool fwd, typename T2> auto special_mul (const cmplx<T2> &other) const
305 -> cmplx<decltype(r+other.r)>
306 {
307 using Tres = cmplx<decltype(r+other.r)>;
308 return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
309 : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
310 }
311};
312template<typename T> inline void PM(T &a, T &b, T c, T d)
313 { a=c+d; b=c-d; }
314template<typename T> inline void PMINPLACE(T &a, T &b)
315 { T t = a; a+=b; b=t-b; }
316template<typename T> inline void MPINPLACE(T &a, T &b)
317 { T t = a; a-=b; b=t+b; }
318template<typename T> cmplx<T> conj(const cmplx<T> &a)
319 { return {a.r, -a.i}; }
320template<bool fwd, typename T, typename T2> void special_mul (const cmplx<T> &v1, const cmplx<T2> &v2, cmplx<T> &res)
321 {
322 res = fwd ? cmplx<T>(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i)
323 : cmplx<T>(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r);
324 }
325
326template<typename T> void ROT90(cmplx<T> &a)
327 { auto tmp_=a.r; a.r=-a.i; a.i=tmp_; }
328template<bool fwd, typename T> void ROTX90(cmplx<T> &a)
329 { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; }
330
331//
332// twiddle factor section
333//
334template<typename T> class sincos_2pibyn
335 {
336 private:
337 using Thigh = typename std::conditional<(sizeof(T)>sizeof(double)), T, double>::type;
338 size_t N, mask, shift;
340
341 static cmplx<Thigh> calc(size_t x, size_t n, Thigh ang)
342 {
343 x<<=3;
344 if (x<4*n) // first half
345 {
346 if (x<2*n) // first quadrant
347 {
348 if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), std::sin(Thigh(x)*ang));
349 return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), std::cos(Thigh(2*n-x)*ang));
350 }
351 else // second quadrant
352 {
353 x-=2*n;
354 if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), std::cos(Thigh(x)*ang));
355 return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), std::sin(Thigh(2*n-x)*ang));
356 }
357 }
358 else
359 {
360 x=8*n-x;
361 if (x<2*n) // third quadrant
362 {
363 if (x<n) return cmplx<Thigh>(std::cos(Thigh(x)*ang), -std::sin(Thigh(x)*ang));
364 return cmplx<Thigh>(std::sin(Thigh(2*n-x)*ang), -std::cos(Thigh(2*n-x)*ang));
365 }
366 else // fourth quadrant
367 {
368 x-=2*n;
369 if (x<n) return cmplx<Thigh>(-std::sin(Thigh(x)*ang), -std::cos(Thigh(x)*ang));
370 return cmplx<Thigh>(-std::cos(Thigh(2*n-x)*ang), -std::sin(Thigh(2*n-x)*ang));
371 }
372 }
373 }
374
375 public:
377 : N(n)
378 {
379 constexpr auto pi = 3.141592653589793238462643383279502884197L;
380 Thigh ang = Thigh(0.25L*pi/n);
381 size_t nval = (n+2)/2;
382 shift = 1;
383 while((size_t(1)<<shift)*(size_t(1)<<shift) < nval) ++shift;
384 mask = (size_t(1)<<shift)-1;
385 v1.resize(mask+1);
386 v1[0].Set(Thigh(1), Thigh(0));
387 for (size_t i=1; i<v1.size(); ++i)
388 v1[i]=calc(i,n,ang);
389 v2.resize((nval+mask)/(mask+1));
390 v2[0].Set(Thigh(1), Thigh(0));
391 for (size_t i=1; i<v2.size(); ++i)
392 v2[i]=calc(i*(mask+1),n,ang);
393 }
394
395 cmplx<T> operator[](size_t idx) const
396 {
397 if (2*idx<=N)
398 {
399 auto x1=v1[idx&mask], x2=v2[idx>>shift];
400 return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
401 }
402 idx = N-idx;
403 auto x1=v1[idx&mask], x2=v2[idx>>shift];
404 return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r));
405 }
406 };
407
408struct util // hack to avoid duplicate symbols
409 {
411 {
412 size_t res=1;
413 while ((n&1)==0)
414 { res=2; n>>=1; }
415 for (size_t x=3; x*x<=n; x+=2)
416 while ((n%x)==0)
417 { res=x; n/=x; }
418 if (n>1) res=n;
419 return res;
420 }
421
422 static POCKETFFT_NOINLINE double cost_guess (size_t n)
423 {
424 constexpr double lfp=1.1; // penalty for non-hardcoded larger factors
425 size_t ni=n;
426 double result=0.;
427 while ((n&1)==0)
428 { result+=2; n>>=1; }
429 for (size_t x=3; x*x<=n; x+=2)
430 while ((n%x)==0)
431 {
432 result+= (x<=5) ? double(x) : lfp*double(x); // penalize larger prime factors
433 n/=x;
434 }
435 if (n>1) result+=(n<=5) ? double(n) : lfp*double(n);
436 return result*double(ni);
437 }
438
439 /* inner workings of good_size_cmplx() */
440 template<typename UIntT>
442 {
443 static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
444 "type must be unsigned integer");
445 if (n<=12) return n;
446 if (n>std::numeric_limits<UIntT>::max()/11/2)
447 {
448 // The algorithm below doesn't work for this value, the multiplication can overflow.
449 if (sizeof(UIntT)<sizeof(std::uint64_t))
450 {
451 // We can try using this algorithm with 64-bit integers:
453 if (res<=std::numeric_limits<UIntT>::max())
454 return static_cast<UIntT>(res);
455 }
456 // Otherwise, this size is ridiculously large, people shouldn't be computing FFTs this large.
457 throw std::runtime_error("FFT size is too large.");
458 }
459
460 UIntT bestfac=2*n;
461 for (UIntT f11=1; f11<bestfac; f11*=11)
462 for (UIntT f117=f11; f117<bestfac; f117*=7)
463 for (UIntT f1175=f117; f1175<bestfac; f1175*=5)
464 {
465 UIntT x=f1175;
466 while (x<n) x*=2;
467 for (;;)
468 {
469 if (x<n)
470 x*=3;
471 else if (x>n)
472 {
473 if (x<bestfac) bestfac=x;
474 if (x&1) break;
475 x>>=1;
476 }
477 else
478 return n;
479 }
480 }
481 return bestfac;
482 }
483 /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
484 static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n)
485 {
486 return good_size_cmplx_typed(n);
487 }
488 /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n
489 and a multiple of required_factor. */
490 static POCKETFFT_NOINLINE size_t good_size_cmplx(size_t n,
491 size_t required_factor)
492 {
493 if (required_factor<1)
494 throw std::runtime_error("required factor must not be 0");
495 return good_size_cmplx((n+required_factor-1)/required_factor) * required_factor;
496 }
497
498 /* inner workings of good_size_real() */
499 template<typename UIntT>
501 {
502 static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
503 "type must be unsigned integer");
504 if (n<=6) return n;
505 if (n>std::numeric_limits<UIntT>::max()/5/2)
506 {
507 // The algorithm below doesn't work for this value, the multiplication can overflow.
508 if (sizeof(UIntT)<sizeof(std::uint64_t))
509 {
510 // We can try using this algorithm with 64-bit integers:
511 std::uint64_t res = good_size_real_typed<std::uint64_t>(n);
512 if (res<=std::numeric_limits<UIntT>::max())
513 return static_cast<UIntT>(res);
514 }
515 // Otherwise, this size is ridiculously large, people shouldn't be computing FFTs this large.
516 throw std::runtime_error("FFT size is too large.");
517 }
518
519 UIntT bestfac=2*n;
520 for (UIntT f5=1; f5<bestfac; f5*=5)
521 {
522 UIntT x = f5;
523 while (x<n) x *= 2;
524 for (;;)
525 {
526 if (x<n)
527 x*=3;
528 else if (x>n)
529 {
530 if (x<bestfac) bestfac=x;
531 if (x&1) break;
532 x>>=1;
533 }
534 else
535 return n;
536 }
537 }
538 return bestfac;
539 }
540 /* returns the smallest composite of 2, 3, 5 which is >= n */
541 static POCKETFFT_NOINLINE size_t good_size_real(size_t n)
542 {
543 return good_size_real_typed(n);
544 }
545 /* returns the smallest composite of 2, 3, 5 which is >= n
546 and a multiple of required_factor. */
547 static POCKETFFT_NOINLINE size_t good_size_real(size_t n,
548 size_t required_factor)
549 {
550 if (required_factor<1)
551 throw std::runtime_error("required factor must not be 0");
552 return good_size_real((n+required_factor-1)/required_factor) * required_factor;
553 }
554
555 /* inner workings of prev_good_size_cmplx() */
556 template<typename UIntT>
558 {
559 static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
560 "type must be unsigned integer");
561 if (n<=12) return n;
562 if (n>std::numeric_limits<UIntT>::max()/11)
563 {
564 // The algorithm below doesn't work for this value, the multiplication can overflow.
565 if (sizeof(UIntT)<sizeof(std::uint64_t))
566 {
567 // We can try using this algorithm with 64-bit integers:
569 if (res<=std::numeric_limits<UIntT>::max())
570 return static_cast<UIntT>(res);
571 }
572 // Otherwise, this size is ridiculously large, people shouldn't be computing FFTs this large.
573 throw std::runtime_error("FFT size is too large.");
574 }
575
576 UIntT bestfound = 1;
577 for (UIntT f11 = 1;f11 <= n; f11 *= 11)
578 for (UIntT f117 = f11; f117 <= n; f117 *= 7)
579 for (UIntT f1175 = f117; f1175 <= n; f1175 *= 5)
580 {
581 UIntT x = f1175;
582 while (x*2 <= n) x *= 2;
583 if (x > bestfound) bestfound = x;
584 while (true)
585 {
586 if (x * 3 <= n) x *= 3;
587 else if (x % 2 == 0) x /= 2;
588 else break;
589
590 if (x > bestfound) bestfound = x;
591 }
592 }
593 return bestfound;
594 }
595 /* returns the largest composite of 2, 3, 5, 7 and 11 which is <= n */
597 {
599 }
600
601 /* inner workings of prev_good_size_real() */
602 template<typename UIntT>
604 {
605 static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
606 "type must be unsigned integer");
607 if (n<=6) return n;
608 if (n>std::numeric_limits<UIntT>::max()/5)
609 {
610 // The algorithm below doesn't work for this value, the multiplication can overflow.
611 if (sizeof(UIntT)<sizeof(std::uint64_t))
612 {
613 // We can try using this algorithm with 64-bit integers:
615 if (res<=std::numeric_limits<UIntT>::max())
616 return static_cast<UIntT>(res);
617 }
618 // Otherwise, this size is ridiculously large, people shouldn't be computing FFTs this large.
619 throw std::runtime_error("FFT size is too large.");
620 }
621
622 UIntT bestfound = 1;
623 for (UIntT f5 = 1; f5 <= n; f5 *= 5)
624 {
625 UIntT x = f5;
626 while (x*2 <= n) x *= 2;
627 if (x > bestfound) bestfound = x;
628 while (true)
629 {
630 if (x * 3 <= n) x *= 3;
631 else if (x % 2 == 0) x /= 2;
632 else break;
633
634 if (x > bestfound) bestfound = x;
635 }
636 }
637 return bestfound;
638 }
639 /* returns the largest composite of 2, 3, 5 which is <= n */
641 {
643 }
644
645 static size_t prod(const shape_t &shape)
646 {
647 size_t res=1;
648 for (auto sz: shape)
649 res*=sz;
650 return res;
651 }
652
653 static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
654 const stride_t &stride_in, const stride_t &stride_out, bool inplace)
655 {
656 auto ndim = shape.size();
657 if (ndim<1) throw std::runtime_error("ndim must be >= 1");
658 if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
659 throw std::runtime_error("stride dimension mismatch");
660 if (inplace && (stride_in!=stride_out))
661 throw std::runtime_error("stride mismatch");
662 }
663
664 static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
665 const stride_t &stride_in, const stride_t &stride_out, bool inplace,
666 const shape_t &axes)
667 {
668 sanity_check(shape, stride_in, stride_out, inplace);
669 auto ndim = shape.size();
670 shape_t tmp(ndim,0);
671 for (auto ax : axes)
672 {
673 if (ax>=ndim) throw std::invalid_argument("bad axis number");
674 if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly");
675 }
676 }
677
678 static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape,
679 const stride_t &stride_in, const stride_t &stride_out, bool inplace,
680 size_t axis)
681 {
682 sanity_check(shape, stride_in, stride_out, inplace);
683 if (axis>=shape.size()) throw std::invalid_argument("bad axis number");
684 }
685
686#ifdef POCKETFFT_NO_MULTITHREADING
687 static size_t thread_count (size_t /*nthreads*/, const shape_t &/*shape*/,
688 size_t /*axis*/, size_t /*vlen*/)
689 { return 1; }
690#else
691 static size_t thread_count (size_t nthreads, const shape_t &shape,
692 size_t axis, size_t vlen)
693 {
694 if (nthreads==1) return 1;
695 size_t size = prod(shape);
696 size_t parallel = size / (shape[axis] * vlen);
697 if (shape[axis] < 1000)
698 parallel /= 4;
699 size_t max_threads = nthreads == 0 ?
700 std::thread::hardware_concurrency() : nthreads;
701 return std::max(size_t(1), std::min(parallel, max_threads));
702 }
703#endif
704 };
705
706namespace threading {
707
708#ifdef POCKETFFT_NO_MULTITHREADING
709
710constexpr inline size_t thread_id() { return 0; }
711constexpr inline size_t num_threads() { return 1; }
712
713template <typename Func>
714void thread_map(size_t /* nthreads */, Func f)
715 { f(); }
716
717#else
718
719inline size_t &thread_id()
720 {
721 static thread_local size_t thread_id_=0;
722 return thread_id_;
723 }
724inline size_t &num_threads()
725 {
726 static thread_local size_t num_threads_=1;
727 return num_threads_;
728 }
729static const size_t max_threads = std::max(1u, std::thread::hardware_concurrency());
730
731class latch
732 {
733 std::atomic<size_t> num_left_;
734 std::mutex mut_;
735 std::condition_variable completed_;
736 using lock_t = std::unique_lock<std::mutex>;
737
738 public:
739 explicit latch(size_t n): num_left_(n) {}
740
742 {
743 lock_t lock(mut_);
744 if (--num_left_)
745 return;
746 completed_.notify_all();
747 }
748
749 void wait()
750 {
751 lock_t lock(mut_);
752 completed_.wait(lock, [this]{ return is_ready(); });
753 }
754 bool is_ready() { return num_left_ == 0; }
755 };
756
757template <typename T> class concurrent_queue
758 {
759 std::queue<T> q_;
760 std::mutex mut_;
761 std::atomic<size_t> size_;
762 using lock_t = std::lock_guard<std::mutex>;
763
764 public:
765
766 void push(T val)
767 {
768 lock_t lock(mut_);
769 ++size_;
770 q_.push(std::move(val));
771 }
772
773 bool try_pop(T &val)
774 {
775 if (size_ == 0) return false;
776 lock_t lock(mut_);
777 // Queue might have been emptied while we acquired the lock
778 if (q_.empty()) return false;
779
780 val = std::move(q_.front());
781 --size_;
782 q_.pop();
783 return true;
784 }
785
786 bool empty() const { return size_==0; }
787 };
788
789// C++ allocator with support for over-aligned types
790template <typename T> struct aligned_allocator
791 {
792 using value_type = T;
793 template <class U>
795 aligned_allocator() = default;
796
797 T *allocate(size_t n)
798 {
799 void* mem = aligned_alloc(alignof(T), n*sizeof(T));
800 return static_cast<T*>(mem);
801 }
802
803 void deallocate(T *p, size_t /*n*/)
804 { aligned_dealloc(p); }
805 };
806
808 {
809 // A reasonable guess, probably close enough for most hardware
810 static constexpr size_t cache_line_size = 64;
811 struct alignas(cache_line_size) worker
812 {
813 std::thread thread;
814 std::condition_variable work_ready;
815 std::mutex mut;
816 std::atomic_flag busy_flag = ATOMIC_FLAG_INIT;
817 std::function<void()> work;
818
820 std::atomic<bool> &shutdown_flag,
821 std::atomic<size_t> &unscheduled_tasks,
822 concurrent_queue<std::function<void()>> &overflow_work)
823 {
824 using lock_t_inner = std::unique_lock<std::mutex>;
825 bool expect_work = true;
826 while (!shutdown_flag || expect_work)
827 {
828 std::function<void()> local_work;
829 if (expect_work || unscheduled_tasks == 0)
830 {
831 lock_t_inner lock(mut);
832 // Wait until there is work to be executed
833 work_ready.wait(lock, [&]{ return (work || shutdown_flag); });
834 local_work.swap(work);
835 expect_work = false;
836 }
837
838 bool marked_busy = false;
839 if (local_work)
840 {
841 marked_busy = true;
842 local_work();
843 }
844
845 if (!overflow_work.empty())
846 {
847 if (!marked_busy && busy_flag.test_and_set())
848 {
849 expect_work = true;
850 continue;
851 }
852 marked_busy = true;
853
854 while (overflow_work.try_pop(local_work))
855 {
856 --unscheduled_tasks;
857 local_work();
858 }
859 }
860
861 if (marked_busy) busy_flag.clear();
862 }
863 }
864 };
865
866 concurrent_queue<std::function<void()>> overflow_work_;
867 std::mutex mut_;
868 std::vector<worker, aligned_allocator<worker>> workers_;
869 std::atomic<bool> shutdown_;
870 std::atomic<size_t> unscheduled_tasks_;
871 using lock_t = std::lock_guard<std::mutex>;
872
874 {
875 lock_t lock(mut_);
876 size_t nthreads=workers_.size();
877 for (size_t i=0; i<nthreads; ++i)
878 {
879 try
880 {
881 auto *worker_i = &workers_[i];
882 worker_i->busy_flag.clear();
883 worker_i->work = nullptr;
884 worker_i->thread = std::thread([worker_i, this]
885 {
886 worker_i->worker_main(shutdown_, unscheduled_tasks_, overflow_work_);
887 });
888 }
889 catch (...)
890 {
892 throw;
893 }
894 }
895 }
896
898 {
899 shutdown_ = true;
900 for (auto &worker_i : workers_)
901 worker_i.work_ready.notify_all();
902
903 for (auto &worker_i : workers_)
904 if (worker_i.thread.joinable())
905 worker_i.thread.join();
906 }
907
908 public:
909 explicit thread_pool(size_t nthreads):
910 workers_(nthreads)
911 { create_threads(); }
912
914
916
917 void submit(std::function<void()> work)
918 {
919 lock_t lock(mut_);
920 if (shutdown_)
921 throw std::runtime_error("Work item submitted after shutdown");
922
924
925 // First check for any idle workers and wake those
926 for (auto &worker_i : workers_)
927 if (!worker_i.busy_flag.test_and_set())
928 {
930 {
931 lock_t lock_inner(worker_i.mut);
932 worker_i.work = std::move(work);
933 }
934 worker_i.work_ready.notify_one();
935 return;
936 }
937
938 // If no workers were idle, push onto the overflow queue for later
939 overflow_work_.push(std::move(work));
940 }
941
942 void shutdown()
943 {
944 lock_t lock(mut_);
946 }
947
948 void restart()
949 {
950 shutdown_ = false;
952 }
953 };
954
956 {
957 static thread_pool pool;
958#ifdef POCKETFFT_PTHREADS
959 static std::once_flag f;
960 std::call_once(f,
961 []{
962 pthread_atfork(
963 +[]{ get_pool().shutdown(); }, // prepare
964 +[]{ get_pool().restart(); }, // parent
965 +[]{ get_pool().restart(); } // child
966 );
967 });
968#endif
969
970 return pool;
971 }
972
974template <typename Func>
975void thread_map(size_t nthreads, Func f)
976 {
977 if (nthreads == 0)
978 nthreads = max_threads;
979
980 if (nthreads == 1)
981 { f(); return; }
982
983 auto & pool = get_pool();
984 latch counter(nthreads);
985 std::exception_ptr ex;
986 std::mutex ex_mut;
987 for (size_t i=0; i<nthreads; ++i)
988 {
989 pool.submit(
990 [&f, &counter, &ex, &ex_mut, i, nthreads] {
991 thread_id() = i;
992 num_threads() = nthreads;
993 try { f(); }
994 catch (...)
995 {
996 std::lock_guard<std::mutex> lock(ex_mut);
997 ex = std::current_exception();
998 }
999 counter.count_down();
1000 });
1001 }
1002 counter.wait();
1003 if (ex)
1004 std::rethrow_exception(ex);
1005 }
1006
1007#endif
1008
1009}
1010
1011//
1012// complex FFTPACK transforms
1013//
1014
1015template<typename T0> class cfftp
1016 {
1017 private:
1018 struct fctdata
1019 {
1020 size_t fct;
1022 };
1023
1024 size_t length;
1026 std::vector<fctdata> fact;
1027
1028 void add_factor(size_t factor)
1029 { fact.push_back({factor, nullptr, nullptr}); }
1030
1031template<bool fwd, typename T> void pass2 (size_t ido, size_t l1,
1032 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1033 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1034 {
1035 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1036 { return ch[a+ido*(b+l1*c)]; };
1037 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1038 { return cc[a+ido*(b+2*c)]; };
1039 auto WA = [wa, ido](size_t x, size_t i)
1040 { return wa[i-1+x*(ido-1)]; };
1041
1042 if (ido==1)
1043 for (size_t k=0; k<l1; ++k)
1044 {
1045 CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
1046 CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
1047 }
1048 else
1049 for (size_t k=0; k<l1; ++k)
1050 {
1051 CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
1052 CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
1053 for (size_t i=1; i<ido; ++i)
1054 {
1055 CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
1056 special_mul<fwd>(CC(i,0,k)-CC(i,1,k),WA(0,i),CH(i,k,1));
1057 }
1058 }
1059 }
1060
1061#define POCKETFFT_PREP3(idx) \
1062 T t0 = CC(idx,0,k), t1, t2; \
1063 PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
1064 CH(idx,k,0)=t0+t1;
1065#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
1066 { \
1067 T ca=t0+t1*twr; \
1068 T cb{-t2.i*twi, t2.r*twi}; \
1069 PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
1070 }
1071#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
1072 { \
1073 T ca=t0+t1*twr; \
1074 T cb{-t2.i*twi, t2.r*twi}; \
1075 special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
1076 special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
1077 }
1078template<bool fwd, typename T> void pass3 (size_t ido, size_t l1,
1079 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1080 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1081 {
1082 constexpr T0 tw1r=-0.5,
1083 tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
1084
1085 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1086 { return ch[a+ido*(b+l1*c)]; };
1087 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1088 { return cc[a+ido*(b+3*c)]; };
1089 auto WA = [wa, ido](size_t x, size_t i)
1090 { return wa[i-1+x*(ido-1)]; };
1091
1092 if (ido==1)
1093 for (size_t k=0; k<l1; ++k)
1094 {
1096 POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
1097 }
1098 else
1099 for (size_t k=0; k<l1; ++k)
1100 {
1101 {
1103 POCKETFFT_PARTSTEP3a(1,2,tw1r,tw1i)
1104 }
1105 for (size_t i=1; i<ido; ++i)
1106 {
1108 POCKETFFT_PARTSTEP3b(1,2,tw1r,tw1i)
1109 }
1110 }
1111 }
1112
1113#undef POCKETFFT_PARTSTEP3b
1114#undef POCKETFFT_PARTSTEP3a
1115#undef POCKETFFT_PREP3
1116
1117template<bool fwd, typename T> void pass4 (size_t ido, size_t l1,
1118 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1119 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1120 {
1121 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1122 { return ch[a+ido*(b+l1*c)]; };
1123 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1124 { return cc[a+ido*(b+4*c)]; };
1125 auto WA = [wa, ido](size_t x, size_t i)
1126 { return wa[i-1+x*(ido-1)]; };
1127
1128 if (ido==1)
1129 for (size_t k=0; k<l1; ++k)
1130 {
1131 T t1, t2, t3, t4;
1132 PM(t2,t1,CC(0,0,k),CC(0,2,k));
1133 PM(t3,t4,CC(0,1,k),CC(0,3,k));
1134 ROTX90<fwd>(t4);
1135 PM(CH(0,k,0),CH(0,k,2),t2,t3);
1136 PM(CH(0,k,1),CH(0,k,3),t1,t4);
1137 }
1138 else
1139 for (size_t k=0; k<l1; ++k)
1140 {
1141 {
1142 T t1, t2, t3, t4;
1143 PM(t2,t1,CC(0,0,k),CC(0,2,k));
1144 PM(t3,t4,CC(0,1,k),CC(0,3,k));
1145 ROTX90<fwd>(t4);
1146 PM(CH(0,k,0),CH(0,k,2),t2,t3);
1147 PM(CH(0,k,1),CH(0,k,3),t1,t4);
1148 }
1149 for (size_t i=1; i<ido; ++i)
1150 {
1151 T t1, t2, t3, t4;
1152 T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
1153 PM(t2,t1,cc0,cc2);
1154 PM(t3,t4,cc1,cc3);
1155 ROTX90<fwd>(t4);
1156 CH(i,k,0) = t2+t3;
1157 special_mul<fwd>(t1+t4,WA(0,i),CH(i,k,1));
1158 special_mul<fwd>(t2-t3,WA(1,i),CH(i,k,2));
1159 special_mul<fwd>(t1-t4,WA(2,i),CH(i,k,3));
1160 }
1161 }
1162 }
1163
1164#define POCKETFFT_PREP5(idx) \
1165 T t0 = CC(idx,0,k), t1, t2, t3, t4; \
1166 PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
1167 PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
1168 CH(idx,k,0).r=t0.r+t1.r+t2.r; \
1169 CH(idx,k,0).i=t0.i+t1.i+t2.i;
1170
1171#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
1172 { \
1173 T ca,cb; \
1174 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
1175 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
1176 cb.i=twai*t4.r twbi*t3.r; \
1177 cb.r=-(twai*t4.i twbi*t3.i); \
1178 PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \
1179 }
1180
1181#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
1182 { \
1183 T ca,cb; \
1184 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
1185 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
1186 cb.i=twai*t4.r twbi*t3.r; \
1187 cb.r=-(twai*t4.i twbi*t3.i); \
1188 special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
1189 special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
1190 }
1191template<bool fwd, typename T> void pass5 (size_t ido, size_t l1,
1192 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1193 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1194 {
1195 constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
1196 tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
1197 tw2r= T0(-0.8090169943749474241022934171828191L),
1198 tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
1199
1200 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1201 { return ch[a+ido*(b+l1*c)]; };
1202 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1203 { return cc[a+ido*(b+5*c)]; };
1204 auto WA = [wa, ido](size_t x, size_t i)
1205 { return wa[i-1+x*(ido-1)]; };
1206
1207 if (ido==1)
1208 for (size_t k=0; k<l1; ++k)
1209 {
1211 POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
1212 POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
1213 }
1214 else
1215 for (size_t k=0; k<l1; ++k)
1216 {
1217 {
1219 POCKETFFT_PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
1220 POCKETFFT_PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
1221 }
1222 for (size_t i=1; i<ido; ++i)
1223 {
1225 POCKETFFT_PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
1226 POCKETFFT_PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
1227 }
1228 }
1229 }
1230
1231#undef POCKETFFT_PARTSTEP5b
1232#undef POCKETFFT_PARTSTEP5a
1233#undef POCKETFFT_PREP5
1234
1235#define POCKETFFT_PREP7(idx) \
1236 T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
1237 PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
1238 PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
1239 PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
1240 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
1241 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
1242
1243#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
1244 { \
1245 T ca,cb; \
1246 ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
1247 ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
1248 cb.i=y1*t7.r y2*t6.r y3*t5.r; \
1249 cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
1250 PM(out1,out2,ca,cb); \
1251 }
1252#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
1253 POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
1254#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
1255 { \
1256 T da,db; \
1257 POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
1258 special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
1259 special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
1260 }
1261
1262template<bool fwd, typename T> void pass7(size_t ido, size_t l1,
1263 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1264 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1265 {
1266 constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
1267 tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
1268 tw2r= T0(-0.2225209339563144042889025644967948L),
1269 tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
1270 tw3r= T0(-0.9009688679024191262361023195074451L),
1271 tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
1272
1273 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1274 { return ch[a+ido*(b+l1*c)]; };
1275 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1276 { return cc[a+ido*(b+7*c)]; };
1277 auto WA = [wa, ido](size_t x, size_t i)
1278 { return wa[i-1+x*(ido-1)]; };
1279
1280 if (ido==1)
1281 for (size_t k=0; k<l1; ++k)
1282 {
1284 POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1285 POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1286 POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1287 }
1288 else
1289 for (size_t k=0; k<l1; ++k)
1290 {
1291 {
1293 POCKETFFT_PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1294 POCKETFFT_PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1295 POCKETFFT_PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1296 }
1297 for (size_t i=1; i<ido; ++i)
1298 {
1300 POCKETFFT_PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
1301 POCKETFFT_PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
1302 POCKETFFT_PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
1303 }
1304 }
1305 }
1306
1307#undef POCKETFFT_PARTSTEP7
1308#undef POCKETFFT_PARTSTEP7a0
1309#undef POCKETFFT_PARTSTEP7a
1310#undef POCKETFFT_PREP7
1311
1312template <bool fwd, typename T> void ROTX45(T &a) const
1313 {
1314 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1315 if (fwd)
1316 { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
1317 else
1318 { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
1319 }
1320template <bool fwd, typename T> void ROTX135(T &a) const
1321 {
1322 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1323 if (fwd)
1324 { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
1325 else
1326 { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
1327 }
1328
1329template<bool fwd, typename T> void pass8 (size_t ido, size_t l1,
1330 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1331 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1332 {
1333 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1334 { return ch[a+ido*(b+l1*c)]; };
1335 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1336 { return cc[a+ido*(b+8*c)]; };
1337 auto WA = [wa, ido](size_t x, size_t i)
1338 { return wa[i-1+x*(ido-1)]; };
1339
1340 if (ido==1)
1341 for (size_t k=0; k<l1; ++k)
1342 {
1343 T a0, a1, a2, a3, a4, a5, a6, a7;
1344 PM(a1,a5,CC(0,1,k),CC(0,5,k));
1345 PM(a3,a7,CC(0,3,k),CC(0,7,k));
1346 PMINPLACE(a1,a3);
1347 ROTX90<fwd>(a3);
1348
1349 ROTX90<fwd>(a7);
1350 PMINPLACE(a5,a7);
1351 ROTX45<fwd>(a5);
1352 ROTX135<fwd>(a7);
1353
1354 PM(a0,a4,CC(0,0,k),CC(0,4,k));
1355 PM(a2,a6,CC(0,2,k),CC(0,6,k));
1356 PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
1357 PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
1358 ROTX90<fwd>(a6);
1359 PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
1360 PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
1361 }
1362 else
1363 for (size_t k=0; k<l1; ++k)
1364 {
1365 {
1366 T a0, a1, a2, a3, a4, a5, a6, a7;
1367 PM(a1,a5,CC(0,1,k),CC(0,5,k));
1368 PM(a3,a7,CC(0,3,k),CC(0,7,k));
1369 PMINPLACE(a1,a3);
1370 ROTX90<fwd>(a3);
1371
1372 ROTX90<fwd>(a7);
1373 PMINPLACE(a5,a7);
1374 ROTX45<fwd>(a5);
1375 ROTX135<fwd>(a7);
1376
1377 PM(a0,a4,CC(0,0,k),CC(0,4,k));
1378 PM(a2,a6,CC(0,2,k),CC(0,6,k));
1379 PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
1380 PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
1381 ROTX90<fwd>(a6);
1382 PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
1383 PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
1384 }
1385 for (size_t i=1; i<ido; ++i)
1386 {
1387 T a0, a1, a2, a3, a4, a5, a6, a7;
1388 PM(a1,a5,CC(i,1,k),CC(i,5,k));
1389 PM(a3,a7,CC(i,3,k),CC(i,7,k));
1390 ROTX90<fwd>(a7);
1391 PMINPLACE(a1,a3);
1392 ROTX90<fwd>(a3);
1393 PMINPLACE(a5,a7);
1394 ROTX45<fwd>(a5);
1395 ROTX135<fwd>(a7);
1396 PM(a0,a4,CC(i,0,k),CC(i,4,k));
1397 PM(a2,a6,CC(i,2,k),CC(i,6,k));
1398 PMINPLACE(a0,a2);
1399 CH(i,k,0) = a0+a1;
1400 special_mul<fwd>(a0-a1,WA(3,i),CH(i,k,4));
1401 special_mul<fwd>(a2+a3,WA(1,i),CH(i,k,2));
1402 special_mul<fwd>(a2-a3,WA(5,i),CH(i,k,6));
1403 ROTX90<fwd>(a6);
1404 PMINPLACE(a4,a6);
1405 special_mul<fwd>(a4+a5,WA(0,i),CH(i,k,1));
1406 special_mul<fwd>(a4-a5,WA(4,i),CH(i,k,5));
1407 special_mul<fwd>(a6+a7,WA(2,i),CH(i,k,3));
1408 special_mul<fwd>(a6-a7,WA(6,i),CH(i,k,7));
1409 }
1410 }
1411 }
1412
1413
1414#define POCKETFFT_PREP11(idx) \
1415 T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
1416 PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
1417 PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
1418 PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
1419 PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
1420 PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
1421 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
1422 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
1423
1424#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
1425 { \
1426 T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \
1427 cb; \
1428 cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
1429 cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
1430 PM(out1,out2,ca,cb); \
1431 }
1432#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
1433 POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
1434#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
1435 { \
1436 T da,db; \
1437 POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
1438 special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
1439 special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
1440 }
1441
1442template<bool fwd, typename T> void pass11 (size_t ido, size_t l1,
1443 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1444 const cmplx<T0> * POCKETFFT_RESTRICT wa) const
1445 {
1446 constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L),
1447 tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L),
1448 tw2r= T0(0.4154150130018864255292741492296232L),
1449 tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L),
1450 tw3r= T0(-0.1423148382732851404437926686163697L),
1451 tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L),
1452 tw4r= T0(-0.6548607339452850640569250724662936L),
1453 tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L),
1454 tw5r= T0(-0.9594929736144973898903680570663277L),
1455 tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L);
1456
1457 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1458 { return ch[a+ido*(b+l1*c)]; };
1459 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
1460 { return cc[a+ido*(b+11*c)]; };
1461 auto WA = [wa, ido](size_t x, size_t i)
1462 { return wa[i-1+x*(ido-1)]; };
1463
1464 if (ido==1)
1465 for (size_t k=0; k<l1; ++k)
1466 {
1468 POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1469 POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1470 POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1471 POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1472 POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1473 }
1474 else
1475 for (size_t k=0; k<l1; ++k)
1476 {
1477 {
1479 POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1480 POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1481 POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1482 POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1483 POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1484 }
1485 for (size_t i=1; i<ido; ++i)
1486 {
1488 POCKETFFT_PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1489 POCKETFFT_PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1490 POCKETFFT_PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1491 POCKETFFT_PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1492 POCKETFFT_PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1493 }
1494 }
1495 }
1496
1497#undef POCKETFFT_PARTSTEP11
1498#undef POCKETFFT_PARTSTEP11a0
1499#undef POCKETFFT_PARTSTEP11a
1500#undef POCKETFFT_PREP11
1501
1502template<bool fwd, typename T> void passg (size_t ido, size_t ip,
1503 size_t l1, T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1504 const cmplx<T0> * POCKETFFT_RESTRICT wa,
1505 const cmplx<T0> * POCKETFFT_RESTRICT csarr) const
1506 {
1507 const size_t cdim=ip;
1508 size_t ipph = (ip+1)/2;
1509 size_t idl1 = ido*l1;
1510
1511 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
1512 { return ch[a+ido*(b+l1*c)]; };
1513 auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
1514 { return cc[a+ido*(b+cdim*c)]; };
1515 auto CX = [cc, ido, l1](size_t a, size_t b, size_t c) -> T&
1516 { return cc[a+ido*(b+l1*c)]; };
1517 auto CX2 = [cc, idl1](size_t a, size_t b) -> T&
1518 { return cc[a+idl1*b]; };
1519 auto CH2 = [ch, idl1](size_t a, size_t b) -> const T&
1520 { return ch[a+idl1*b]; };
1521
1522 arr<cmplx<T0>> wal(ip);
1523 wal[0] = cmplx<T0>(1., 0.);
1524 for (size_t i=1; i<ip; ++i)
1525 wal[i]=cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i);
1526
1527 for (size_t k=0; k<l1; ++k)
1528 for (size_t i=0; i<ido; ++i)
1529 CH(i,k,0) = CC(i,0,k);
1530 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
1531 for (size_t k=0; k<l1; ++k)
1532 for (size_t i=0; i<ido; ++i)
1533 PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k));
1534 for (size_t k=0; k<l1; ++k)
1535 for (size_t i=0; i<ido; ++i)
1536 {
1537 T tmp = CH(i,k,0);
1538 for (size_t j=1; j<ipph; ++j)
1539 tmp+=CH(i,k,j);
1540 CX(i,k,0) = tmp;
1541 }
1542 for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
1543 {
1544 // j=0
1545 for (size_t ik=0; ik<idl1; ++ik)
1546 {
1547 CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
1548 CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
1549 CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
1550 CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
1551 }
1552
1553 size_t iwal=2*l;
1554 size_t j=3, jc=ip-3;
1555 for (; j<ipph-1; j+=2, jc-=2)
1556 {
1557 iwal+=l; if (iwal>ip) iwal-=ip;
1558 cmplx<T0> xwal=wal[iwal];
1559 iwal+=l; if (iwal>ip) iwal-=ip;
1560 cmplx<T0> xwal2=wal[iwal];
1561 for (size_t ik=0; ik<idl1; ++ik)
1562 {
1563 CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
1564 CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
1565 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
1566 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
1567 }
1568 }
1569 for (; j<ipph; ++j, --jc)
1570 {
1571 iwal+=l; if (iwal>ip) iwal-=ip;
1572 cmplx<T0> xwal=wal[iwal];
1573 for (size_t ik=0; ik<idl1; ++ik)
1574 {
1575 CX2(ik,l).r += CH2(ik,j).r*xwal.r;
1576 CX2(ik,l).i += CH2(ik,j).i*xwal.r;
1577 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
1578 CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
1579 }
1580 }
1581 }
1582
1583 // shuffling and twiddling
1584 if (ido==1)
1585 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
1586 for (size_t ik=0; ik<idl1; ++ik)
1587 {
1588 T t1=CX2(ik,j), t2=CX2(ik,jc);
1589 PM(CX2(ik,j),CX2(ik,jc),t1,t2);
1590 }
1591 else
1592 {
1593 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
1594 for (size_t k=0; k<l1; ++k)
1595 {
1596 T t1=CX(0,k,j), t2=CX(0,k,jc);
1597 PM(CX(0,k,j),CX(0,k,jc),t1,t2);
1598 for (size_t i=1; i<ido; ++i)
1599 {
1600 T x1, x2;
1601 PM(x1,x2,CX(i,k,j),CX(i,k,jc));
1602 size_t idij=(j-1)*(ido-1)+i-1;
1603 special_mul<fwd>(x1,wa[idij],CX(i,k,j));
1604 idij=(jc-1)*(ido-1)+i-1;
1605 special_mul<fwd>(x2,wa[idij],CX(i,k,jc));
1606 }
1607 }
1608 }
1609 }
1610
1611template<bool fwd, typename T> void pass_all(T c[], T0 fct) const
1612 {
1613 if (length==1) { c[0]*=fct; return; }
1614 size_t l1=1;
1615 arr<T> ch(length);
1616 T *p1=c, *p2=ch.data();
1617
1618 for(size_t k1=0; k1<fact.size(); k1++)
1619 {
1620 size_t ip=fact[k1].fct;
1621 size_t l2=ip*l1;
1622 size_t ido = length/l2;
1623 if (ip==4)
1624 pass4<fwd> (ido, l1, p1, p2, fact[k1].tw);
1625 else if(ip==8)
1626 pass8<fwd>(ido, l1, p1, p2, fact[k1].tw);
1627 else if(ip==2)
1628 pass2<fwd>(ido, l1, p1, p2, fact[k1].tw);
1629 else if(ip==3)
1630 pass3<fwd> (ido, l1, p1, p2, fact[k1].tw);
1631 else if(ip==5)
1632 pass5<fwd> (ido, l1, p1, p2, fact[k1].tw);
1633 else if(ip==7)
1634 pass7<fwd> (ido, l1, p1, p2, fact[k1].tw);
1635 else if(ip==11)
1636 pass11<fwd> (ido, l1, p1, p2, fact[k1].tw);
1637 else
1638 {
1639 passg<fwd>(ido, ip, l1, p1, p2, fact[k1].tw, fact[k1].tws);
1640 std::swap(p1,p2);
1641 }
1642 std::swap(p1,p2);
1643 l1=l2;
1644 }
1645 if (p1!=c)
1646 {
1647 if (fct!=1.)
1648 for (size_t i=0; i<length; ++i)
1649 c[i] = ch[i]*fct;
1650 else
1651 std::copy_n (p1, length, c);
1652 }
1653 else
1654 if (fct!=1.)
1655 for (size_t i=0; i<length; ++i)
1656 c[i] *= fct;
1657 }
1658
1659 public:
1660 template<typename T> void exec(T c[], T0 fct, bool fwd) const
1661 { fwd ? pass_all<true>(c, fct) : pass_all<false>(c, fct); }
1662
1663 private:
1665 {
1666 size_t len=length;
1667 while ((len&7)==0)
1668 { add_factor(8); len>>=3; }
1669 while ((len&3)==0)
1670 { add_factor(4); len>>=2; }
1671 if ((len&1)==0)
1672 {
1673 len>>=1;
1674 // factor 2 should be at the front of the factor list
1675 add_factor(2);
1676 std::swap(fact[0].fct, fact.back().fct);
1677 }
1678 for (size_t divisor=3; divisor*divisor<=len; divisor+=2)
1679 while ((len%divisor)==0)
1680 {
1681 add_factor(divisor);
1682 len/=divisor;
1683 }
1684 if (len>1) add_factor(len);
1685 }
1686
1687 size_t twsize() const
1688 {
1689 size_t twsize=0, l1=1;
1690 for (size_t k=0; k<fact.size(); ++k)
1691 {
1692 size_t ip=fact[k].fct, ido= length/(l1*ip);
1693 twsize+=(ip-1)*(ido-1);
1694 if (ip>11)
1695 twsize+=ip;
1696 l1*=ip;
1697 }
1698 return twsize;
1699 }
1700
1702 {
1703 sincos_2pibyn<T0> twiddle(length);
1704 size_t l1=1;
1705 size_t memofs=0;
1706 for (size_t k=0; k<fact.size(); ++k)
1707 {
1708 size_t ip=fact[k].fct, ido=length/(l1*ip);
1709 fact[k].tw=mem.data()+memofs;
1710 memofs+=(ip-1)*(ido-1);
1711 for (size_t j=1; j<ip; ++j)
1712 for (size_t i=1; i<ido; ++i)
1713 fact[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
1714 if (ip>11)
1715 {
1716 fact[k].tws=mem.data()+memofs;
1717 memofs+=ip;
1718 for (size_t j=0; j<ip; ++j)
1719 fact[k].tws[j] = twiddle[j*l1*ido];
1720 }
1721 l1*=ip;
1722 }
1723 }
1724
1725 public:
1726 POCKETFFT_NOINLINE explicit cfftp(size_t length_)
1727 : length(length_)
1728 {
1729 if (length==0) throw std::runtime_error("zero-length FFT requested");
1730 if (length==1) return;
1731 factorize();
1732 mem.resize(twsize());
1733 comp_twiddle();
1734 }
1735 };
1736
1737//
1738// real-valued FFTPACK transforms
1739//
1740
1741template<typename T0> class rfftp
1742 {
1743 private:
1744 struct fctdata
1745 {
1746 size_t fct;
1747 T0 *tw, *tws;
1748 };
1749
1750 size_t length;
1752 std::vector<fctdata> fact;
1753
1754 void add_factor(size_t factor)
1755 { fact.push_back({factor, nullptr, nullptr}); }
1756
1757/* (a+ib) = conj(c+id) * (e+if) */
1758template<typename T1, typename T2, typename T3> inline void MULPM
1759 (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) const
1760 { a=c*e+d*f; b=c*f-d*e; }
1761
1762template<typename T> void radf2 (size_t ido, size_t l1,
1763 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1764 const T0 * POCKETFFT_RESTRICT wa) const
1765 {
1766 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1767 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1768 { return cc[a+ido*(b+l1*c)]; };
1769 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1770 { return ch[a+ido*(b+2*c)]; };
1771
1772 for (size_t k=0; k<l1; k++)
1773 PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1));
1774 if ((ido&1)==0)
1775 for (size_t k=0; k<l1; k++)
1776 {
1777 CH( 0,1,k) = -CC(ido-1,k,1);
1778 CH(ido-1,0,k) = CC(ido-1,k,0);
1779 }
1780 if (ido<=2) return;
1781 for (size_t k=0; k<l1; k++)
1782 for (size_t i=2; i<ido; i+=2)
1783 {
1784 size_t ic=ido-i;
1785 T tr2, ti2;
1786 MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1787 PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2);
1788 PM (CH(i ,0,k),CH(ic ,1,k),ti2,CC(i ,k,0));
1789 }
1790 }
1791
1792// a2=a+b; b2=i*(b-a);
1793#define POCKETFFT_REARRANGE(rx, ix, ry, iy) \
1794 {\
1795 auto t1=rx+ry, t2=ry-rx, t3=ix+iy, t4=ix-iy; \
1796 rx=t1; ix=t3; ry=t4; iy=t2; \
1797 }
1798
1799template<typename T> void radf3(size_t ido, size_t l1,
1800 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1801 const T0 * POCKETFFT_RESTRICT wa) const
1802 {
1803 constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
1804
1805 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1806 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1807 { return cc[a+ido*(b+l1*c)]; };
1808 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1809 { return ch[a+ido*(b+3*c)]; };
1810
1811 for (size_t k=0; k<l1; k++)
1812 {
1813 T cr2=CC(0,k,1)+CC(0,k,2);
1814 CH(0,0,k) = CC(0,k,0)+cr2;
1815 CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1816 CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1817 }
1818 if (ido==1) return;
1819 for (size_t k=0; k<l1; k++)
1820 for (size_t i=2; i<ido; i+=2)
1821 {
1822 size_t ic=ido-i;
1823 T di2, di3, dr2, dr3;
1824 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)); // d2=conj(WA0)*CC1
1825 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)); // d3=conj(WA1)*CC2
1826 POCKETFFT_REARRANGE(dr2, di2, dr3, di3);
1827 CH(i-1,0,k) = CC(i-1,k,0)+dr2; // c add
1828 CH(i ,0,k) = CC(i ,k,0)+di2;
1829 T tr2 = CC(i-1,k,0)+taur*dr2; // c add
1830 T ti2 = CC(i ,k,0)+taur*di2;
1831 T tr3 = taui*dr3; // t3 = taui*i*(d3-d2)?
1832 T ti3 = taui*di3;
1833 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3); // PM(i) = t2+t3
1834 PM(CH(i ,2,k),CH(ic ,1,k),ti3,ti2); // PM(ic) = conj(t2-t3)
1835 }
1836 }
1837
1838template<typename T> void radf4(size_t ido, size_t l1,
1839 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1840 const T0 * POCKETFFT_RESTRICT wa) const
1841 {
1842 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1843
1844 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1845 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1846 { return cc[a+ido*(b+l1*c)]; };
1847 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1848 { return ch[a+ido*(b+4*c)]; };
1849
1850 for (size_t k=0; k<l1; k++)
1851 {
1852 T tr1,tr2;
1853 PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1));
1854 PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2));
1855 PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1);
1856 }
1857 if ((ido&1)==0)
1858 for (size_t k=0; k<l1; k++)
1859 {
1860 T ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
1861 T tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
1862 PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1);
1863 PM (CH( 0,3,k),CH( 0,1,k),ti1,CC(ido-1,k,2));
1864 }
1865 if (ido<=2) return;
1866 for (size_t k=0; k<l1; k++)
1867 for (size_t i=2; i<ido; i+=2)
1868 {
1869 size_t ic=ido-i;
1870 T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1871 MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1872 MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
1873 MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
1874 PM(tr1,tr4,cr4,cr2);
1875 PM(ti1,ti4,ci2,ci4);
1876 PM(tr2,tr3,CC(i-1,k,0),cr3);
1877 PM(ti2,ti3,CC(i ,k,0),ci3);
1878 PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1);
1879 PM(CH(i ,0,k),CH(ic ,3,k),ti1,ti2);
1880 PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4);
1881 PM(CH(i ,2,k),CH(ic ,1,k),tr4,ti3);
1882 }
1883 }
1884
1885template<typename T> void radf5(size_t ido, size_t l1,
1886 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
1887 const T0 * POCKETFFT_RESTRICT wa) const
1888 {
1889 constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
1890 ti11= T0(0.9510565162951535721164393333793821L),
1891 tr12= T0(-0.8090169943749474241022934171828191L),
1892 ti12= T0(0.5877852522924731291687059546390728L);
1893
1894 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
1895 auto CC = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
1896 { return cc[a+ido*(b+l1*c)]; };
1897 auto CH = [ch,ido](size_t a, size_t b, size_t c) -> T&
1898 { return ch[a+ido*(b+5*c)]; };
1899
1900 for (size_t k=0; k<l1; k++)
1901 {
1902 T cr2, cr3, ci4, ci5;
1903 PM (cr2,ci5,CC(0,k,4),CC(0,k,1));
1904 PM (cr3,ci4,CC(0,k,3),CC(0,k,2));
1905 CH(0,0,k)=CC(0,k,0)+cr2+cr3;
1906 CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
1907 CH(0,2,k)=ti11*ci5+ti12*ci4;
1908 CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
1909 CH(0,4,k)=ti12*ci5-ti11*ci4;
1910 }
1911 if (ido==1) return;
1912 for (size_t k=0; k<l1;++k)
1913 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
1914 {
1915 T di2, di3, di4, di5, dr2, dr3, dr4, dr5;
1916 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1917 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
1918 MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
1919 MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4));
1920 POCKETFFT_REARRANGE(dr2, di2, dr5, di5);
1921 POCKETFFT_REARRANGE(dr3, di3, dr4, di4);
1922 CH(i-1,0,k)=CC(i-1,k,0)+dr2+dr3;
1923 CH(i ,0,k)=CC(i ,k,0)+di2+di3;
1924 T tr2=CC(i-1,k,0)+tr11*dr2+tr12*dr3;
1925 T ti2=CC(i ,k,0)+tr11*di2+tr12*di3;
1926 T tr3=CC(i-1,k,0)+tr12*dr2+tr11*dr3;
1927 T ti3=CC(i ,k,0)+tr12*di2+tr11*di3;
1928 T tr5 = ti11*dr5 + ti12*dr4;
1929 T ti5 = ti11*di5 + ti12*di4;
1930 T tr4 = ti12*dr5 - ti11*dr4;
1931 T ti4 = ti12*di5 - ti11*di4;
1932 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5);
1933 PM(CH(i ,2,k),CH(ic ,1,k),ti5,ti2);
1934 PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4);
1935 PM(CH(i ,4,k),CH(ic ,3,k),ti4,ti3);
1936 }
1937 }
1938
1939#undef POCKETFFT_REARRANGE
1940
1941template<typename T> void radfg(size_t ido, size_t ip, size_t l1,
1943 const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const
1944 {
1945 const size_t cdim=ip;
1946 size_t ipph=(ip+1)/2;
1947 size_t idl1 = ido*l1;
1948
1949 auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> T&
1950 { return cc[a+ido*(b+cdim*c)]; };
1951 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> const T&
1952 { return ch[a+ido*(b+l1*c)]; };
1953 auto C1 = [cc,ido,l1] (size_t a, size_t b, size_t c) -> T&
1954 { return cc[a+ido*(b+l1*c)]; };
1955 auto C2 = [cc,idl1] (size_t a, size_t b) -> T&
1956 { return cc[a+idl1*b]; };
1957 auto CH2 = [ch,idl1] (size_t a, size_t b) -> T&
1958 { return ch[a+idl1*b]; };
1959
1960 if (ido>1)
1961 {
1962 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 114
1963 {
1964 size_t is=(j-1)*(ido-1),
1965 is2=(jc-1)*(ido-1);
1966 for (size_t k=0; k<l1; ++k) // 113
1967 {
1968 size_t idij=is;
1969 size_t idij2=is2;
1970 for (size_t i=1; i<=ido-2; i+=2) // 112
1971 {
1972 T t1=C1(i,k,j ), t2=C1(i+1,k,j ),
1973 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
1974 T x1=wa[idij]*t1 + wa[idij+1]*t2,
1975 x2=wa[idij]*t2 - wa[idij+1]*t1,
1976 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
1977 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
1978 PM(C1(i,k,j),C1(i+1,k,jc),x3,x1);
1979 PM(C1(i+1,k,j),C1(i,k,jc),x2,x4);
1980 idij+=2;
1981 idij2+=2;
1982 }
1983 }
1984 }
1985 }
1986
1987 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 123
1988 for (size_t k=0; k<l1; ++k) // 122
1989 MPINPLACE(C1(0,k,jc), C1(0,k,j));
1990
1991//everything in C
1992//memset(ch,0,ip*l1*ido*sizeof(double));
1993
1994 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc) // 127
1995 {
1996 for (size_t ik=0; ik<idl1; ++ik) // 124
1997 {
1998 CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
1999 CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
2000 }
2001 size_t iang = 2*l;
2002 size_t j=3, jc=ip-3;
2003 for (; j<ipph-3; j+=4,jc-=4) // 126
2004 {
2005 iang+=l; if (iang>=ip) iang-=ip;
2006 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2007 iang+=l; if (iang>=ip) iang-=ip;
2008 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2009 iang+=l; if (iang>=ip) iang-=ip;
2010 T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
2011 iang+=l; if (iang>=ip) iang-=ip;
2012 T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
2013 for (size_t ik=0; ik<idl1; ++ik) // 125
2014 {
2015 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
2016 +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
2017 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
2018 +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
2019 }
2020 }
2021 for (; j<ipph-1; j+=2,jc-=2) // 126
2022 {
2023 iang+=l; if (iang>=ip) iang-=ip;
2024 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2025 iang+=l; if (iang>=ip) iang-=ip;
2026 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2027 for (size_t ik=0; ik<idl1; ++ik) // 125
2028 {
2029 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
2030 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
2031 }
2032 }
2033 for (; j<ipph; ++j,--jc) // 126
2034 {
2035 iang+=l; if (iang>=ip) iang-=ip;
2036 T0 ar=csarr[2*iang], ai=csarr[2*iang+1];
2037 for (size_t ik=0; ik<idl1; ++ik) // 125
2038 {
2039 CH2(ik,l ) += ar*C2(ik,j );
2040 CH2(ik,lc) += ai*C2(ik,jc);
2041 }
2042 }
2043 }
2044 for (size_t ik=0; ik<idl1; ++ik) // 101
2045 CH2(ik,0) = C2(ik,0);
2046 for (size_t j=1; j<ipph; ++j) // 129
2047 for (size_t ik=0; ik<idl1; ++ik) // 128
2048 CH2(ik,0) += C2(ik,j);
2049
2050// everything in CH at this point!
2051//memset(cc,0,ip*l1*ido*sizeof(double));
2052
2053 for (size_t k=0; k<l1; ++k) // 131
2054 for (size_t i=0; i<ido; ++i) // 130
2055 CC(i,0,k) = CH(i,k,0);
2056
2057 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 137
2058 {
2059 size_t j2=2*j-1;
2060 for (size_t k=0; k<l1; ++k) // 136
2061 {
2062 CC(ido-1,j2,k) = CH(0,k,j);
2063 CC(0,j2+1,k) = CH(0,k,jc);
2064 }
2065 }
2066
2067 if (ido==1) return;
2068
2069 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 140
2070 {
2071 size_t j2=2*j-1;
2072 for(size_t k=0; k<l1; ++k) // 139
2073 for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 138
2074 {
2075 CC(i ,j2+1,k) = CH(i ,k,j )+CH(i ,k,jc);
2076 CC(ic ,j2 ,k) = CH(i ,k,j )-CH(i ,k,jc);
2077 CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
2078 CC(ic+1,j2 ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
2079 }
2080 }
2081 }
2082
2083template<typename T> void radb2(size_t ido, size_t l1,
2084 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2085 const T0 * POCKETFFT_RESTRICT wa) const
2086 {
2087 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
2088 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
2089 { return cc[a+ido*(b+2*c)]; };
2090 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2091 { return ch[a+ido*(b+l1*c)]; };
2092
2093 for (size_t k=0; k<l1; k++)
2094 PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k));
2095 if ((ido&1)==0)
2096 for (size_t k=0; k<l1; k++)
2097 {
2098 CH(ido-1,k,0) = 2*CC(ido-1,0,k);
2099 CH(ido-1,k,1) =-2*CC(0 ,1,k);
2100 }
2101 if (ido<=2) return;
2102 for (size_t k=0; k<l1;++k)
2103 for (size_t i=2; i<ido; i+=2)
2104 {
2105 size_t ic=ido-i;
2106 T ti2, tr2;
2107 PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k));
2108 PM (ti2,CH(i ,k,0),CC(i ,0,k),CC(ic ,1,k));
2109 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2);
2110 }
2111 }
2112
2113template<typename T> void radb3(size_t ido, size_t l1,
2114 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2115 const T0 * POCKETFFT_RESTRICT wa) const
2116 {
2117 constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
2118
2119 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
2120 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
2121 { return cc[a+ido*(b+3*c)]; };
2122 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2123 { return ch[a+ido*(b+l1*c)]; };
2124
2125 for (size_t k=0; k<l1; k++)
2126 {
2127 T tr2=2*CC(ido-1,1,k);
2128 T cr2=CC(0,0,k)+taur*tr2;
2129 CH(0,k,0)=CC(0,0,k)+tr2;
2130 T ci3=2*taui*CC(0,2,k);
2131 PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
2132 }
2133 if (ido==1) return;
2134 for (size_t k=0; k<l1; k++)
2135 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2136 {
2137 T tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
2138 T ti2=CC(i ,2,k)-CC(ic ,1,k);
2139 T cr2=CC(i-1,0,k)+taur*tr2; // c2=CC +taur*t2
2140 T ci2=CC(i ,0,k)+taur*ti2;
2141 CH(i-1,k,0)=CC(i-1,0,k)+tr2; // CH=CC+t2
2142 CH(i ,k,0)=CC(i ,0,k)+ti2;
2143 T cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
2144 T ci3=taui*(CC(i ,2,k)+CC(ic ,1,k));
2145 T di2, di3, dr2, dr3;
2146 PM(dr3,dr2,cr2,ci3); // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
2147 PM(di2,di3,ci2,cr3); // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
2148 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2); // ch = WA*d2
2149 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
2150 }
2151 }
2152
2153template<typename T> void radb4(size_t ido, size_t l1,
2154 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2155 const T0 * POCKETFFT_RESTRICT wa) const
2156 {
2157 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2158
2159 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
2160 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
2161 { return cc[a+ido*(b+4*c)]; };
2162 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2163 { return ch[a+ido*(b+l1*c)]; };
2164
2165 for (size_t k=0; k<l1; k++)
2166 {
2167 T tr1, tr2;
2168 PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k));
2169 T tr3=2*CC(ido-1,1,k);
2170 T tr4=2*CC(0,2,k);
2171 PM (CH(0,k,0),CH(0,k,2),tr2,tr3);
2172 PM (CH(0,k,3),CH(0,k,1),tr1,tr4);
2173 }
2174 if ((ido&1)==0)
2175 for (size_t k=0; k<l1; k++)
2176 {
2177 T tr1,tr2,ti1,ti2;
2178 PM (ti1,ti2,CC(0 ,3,k),CC(0 ,1,k));
2179 PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k));
2180 CH(ido-1,k,0)=tr2+tr2;
2181 CH(ido-1,k,1)=sqrt2*(tr1-ti1);
2182 CH(ido-1,k,2)=ti2+ti2;
2183 CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
2184 }
2185 if (ido<=2) return;
2186 for (size_t k=0; k<l1;++k)
2187 for (size_t i=2; i<ido; i+=2)
2188 {
2189 T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
2190 size_t ic=ido-i;
2191 PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k));
2192 PM (ti1,ti2,CC(i ,0,k),CC(ic ,3,k));
2193 PM (tr4,ti3,CC(i ,2,k),CC(ic ,1,k));
2194 PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k));
2195 PM (CH(i-1,k,0),cr3,tr2,tr3);
2196 PM (CH(i ,k,0),ci3,ti2,ti3);
2197 PM (cr4,cr2,tr1,tr4);
2198 PM (ci2,ci4,ti1,ti4);
2199 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2);
2200 MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3);
2201 MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4);
2202 }
2203 }
2204
2205template<typename T> void radb5(size_t ido, size_t l1,
2206 const T * POCKETFFT_RESTRICT cc, T * POCKETFFT_RESTRICT ch,
2207 const T0 * POCKETFFT_RESTRICT wa) const
2208 {
2209 constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
2210 ti11= T0(0.9510565162951535721164393333793821L),
2211 tr12= T0(-0.8090169943749474241022934171828191L),
2212 ti12= T0(0.5877852522924731291687059546390728L);
2213
2214 auto WA = [wa,ido](size_t x, size_t i) { return wa[i+x*(ido-1)]; };
2215 auto CC = [cc,ido](size_t a, size_t b, size_t c) -> const T&
2216 { return cc[a+ido*(b+5*c)]; };
2217 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2218 { return ch[a+ido*(b+l1*c)]; };
2219
2220 for (size_t k=0; k<l1; k++)
2221 {
2222 T ti5=CC(0,2,k)+CC(0,2,k);
2223 T ti4=CC(0,4,k)+CC(0,4,k);
2224 T tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
2225 T tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
2226 CH(0,k,0)=CC(0,0,k)+tr2+tr3;
2227 T cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
2228 T cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
2229 T ci4, ci5;
2230 MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2231 PM(CH(0,k,4),CH(0,k,1),cr2,ci5);
2232 PM(CH(0,k,3),CH(0,k,2),cr3,ci4);
2233 }
2234 if (ido==1) return;
2235 for (size_t k=0; k<l1;++k)
2236 for (size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2237 {
2238 T tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
2239 PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k));
2240 PM(ti5,ti2,CC(i ,2,k),CC(ic ,1,k));
2241 PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k));
2242 PM(ti4,ti3,CC(i ,4,k),CC(ic ,3,k));
2243 CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
2244 CH(i ,k,0)=CC(i ,0,k)+ti2+ti3;
2245 T cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
2246 T ci2=CC(i ,0,k)+tr11*ti2+tr12*ti3;
2247 T cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
2248 T ci3=CC(i ,0,k)+tr12*ti2+tr11*ti3;
2249 T ci4, ci5, cr5, cr4;
2250 MULPM(cr5,cr4,tr5,tr4,ti11,ti12);
2251 MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2252 T dr2, dr3, dr4, dr5, di2, di3, di4, di5;
2253 PM(dr4,dr3,cr3,ci4);
2254 PM(di3,di4,ci3,cr4);
2255 PM(dr5,dr2,cr2,ci5);
2256 PM(di2,di5,ci2,cr5);
2257 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2);
2258 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
2259 MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4);
2260 MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5);
2261 }
2262 }
2263
2264template<typename T> void radbg(size_t ido, size_t ip, size_t l1,
2266 const T0 * POCKETFFT_RESTRICT wa, const T0 * POCKETFFT_RESTRICT csarr) const
2267 {
2268 const size_t cdim=ip;
2269 size_t ipph=(ip+1)/ 2;
2270 size_t idl1 = ido*l1;
2271
2272 auto CC = [cc,ido,cdim](size_t a, size_t b, size_t c) -> const T&
2273 { return cc[a+ido*(b+cdim*c)]; };
2274 auto CH = [ch,ido,l1](size_t a, size_t b, size_t c) -> T&
2275 { return ch[a+ido*(b+l1*c)]; };
2276 auto C1 = [cc,ido,l1](size_t a, size_t b, size_t c) -> const T&
2277 { return cc[a+ido*(b+l1*c)]; };
2278 auto C2 = [cc,idl1](size_t a, size_t b) -> T&
2279 { return cc[a+idl1*b]; };
2280 auto CH2 = [ch,idl1](size_t a, size_t b) -> T&
2281 { return ch[a+idl1*b]; };
2282
2283 for (size_t k=0; k<l1; ++k) // 102
2284 for (size_t i=0; i<ido; ++i) // 101
2285 CH(i,k,0) = CC(i,0,k);
2286 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 108
2287 {
2288 size_t j2=2*j-1;
2289 for (size_t k=0; k<l1; ++k)
2290 {
2291 CH(0,k,j ) = 2*CC(ido-1,j2,k);
2292 CH(0,k,jc) = 2*CC(0,j2+1,k);
2293 }
2294 }
2295
2296 if (ido!=1)
2297 {
2298 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 111
2299 {
2300 size_t j2=2*j-1;
2301 for (size_t k=0; k<l1; ++k)
2302 for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 109
2303 {
2304 CH(i ,k,j ) = CC(i ,j2+1,k)+CC(ic ,j2,k);
2305 CH(i ,k,jc) = CC(i ,j2+1,k)-CC(ic ,j2,k);
2306 CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
2307 CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
2308 }
2309 }
2310 }
2311 for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
2312 {
2313 for (size_t ik=0; ik<idl1; ++ik)
2314 {
2315 C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
2316 C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
2317 }
2318 size_t iang=2*l;
2319 size_t j=3,jc=ip-3;
2320 for(; j<ipph-3; j+=4,jc-=4)
2321 {
2322 iang+=l; if(iang>ip) iang-=ip;
2323 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2324 iang+=l; if(iang>ip) iang-=ip;
2325 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2326 iang+=l; if(iang>ip) iang-=ip;
2327 T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
2328 iang+=l; if(iang>ip) iang-=ip;
2329 T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
2330 for (size_t ik=0; ik<idl1; ++ik)
2331 {
2332 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
2333 +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
2334 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
2335 +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
2336 }
2337 }
2338 for(; j<ipph-1; j+=2,jc-=2)
2339 {
2340 iang+=l; if(iang>ip) iang-=ip;
2341 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2342 iang+=l; if(iang>ip) iang-=ip;
2343 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2344 for (size_t ik=0; ik<idl1; ++ik)
2345 {
2346 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
2347 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
2348 }
2349 }
2350 for(; j<ipph; ++j,--jc)
2351 {
2352 iang+=l; if(iang>ip) iang-=ip;
2353 T0 war=csarr[2*iang], wai=csarr[2*iang+1];
2354 for (size_t ik=0; ik<idl1; ++ik)
2355 {
2356 C2(ik,l ) += war*CH2(ik,j );
2357 C2(ik,lc) += wai*CH2(ik,jc);
2358 }
2359 }
2360 }
2361 for (size_t j=1; j<ipph; ++j)
2362 for (size_t ik=0; ik<idl1; ++ik)
2363 CH2(ik,0) += CH2(ik,j);
2364 for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 124
2365 for (size_t k=0; k<l1; ++k)
2366 PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc));
2367
2368 if (ido==1) return;
2369
2370 for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 127
2371 for (size_t k=0; k<l1; ++k)
2372 for (size_t i=1; i<=ido-2; i+=2)
2373 {
2374 CH(i ,k,j ) = C1(i ,k,j)-C1(i+1,k,jc);
2375 CH(i ,k,jc) = C1(i ,k,j)+C1(i+1,k,jc);
2376 CH(i+1,k,j ) = C1(i+1,k,j)+C1(i ,k,jc);
2377 CH(i+1,k,jc) = C1(i+1,k,j)-C1(i ,k,jc);
2378 }
2379
2380// All in CH
2381
2382 for (size_t j=1; j<ip; ++j)
2383 {
2384 size_t is = (j-1)*(ido-1);
2385 for (size_t k=0; k<l1; ++k)
2386 {
2387 size_t idij = is;
2388 for (size_t i=1; i<=ido-2; i+=2)
2389 {
2390 T t1=CH(i,k,j), t2=CH(i+1,k,j);
2391 CH(i ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
2392 CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
2393 idij+=2;
2394 }
2395 }
2396 }
2397 }
2398
2399 template<typename T> void copy_and_norm(T *c, T *p1, T0 fct) const
2400 {
2401 if (p1!=c)
2402 {
2403 if (fct!=1.)
2404 for (size_t i=0; i<length; ++i)
2405 c[i] = fct*p1[i];
2406 else
2407 std::copy_n (p1, length, c);
2408 }
2409 else
2410 if (fct!=1.)
2411 for (size_t i=0; i<length; ++i)
2412 c[i] *= fct;
2413 }
2414
2415 public:
2416 template<typename T> void exec(T c[], T0 fct, bool r2hc) const
2417 {
2418 if (length==1) { c[0]*=fct; return; }
2419 size_t nf=fact.size();
2420 arr<T> ch(length);
2421 T *p1=c, *p2=ch.data();
2422
2423 if (r2hc)
2424 for(size_t k1=0, l1=length; k1<nf;++k1)
2425 {
2426 size_t k=nf-k1-1;
2427 size_t ip=fact[k].fct;
2428 size_t ido=length / l1;
2429 l1 /= ip;
2430 if(ip==4)
2431 radf4(ido, l1, p1, p2, fact[k].tw);
2432 else if(ip==2)
2433 radf2(ido, l1, p1, p2, fact[k].tw);
2434 else if(ip==3)
2435 radf3(ido, l1, p1, p2, fact[k].tw);
2436 else if(ip==5)
2437 radf5(ido, l1, p1, p2, fact[k].tw);
2438 else
2439 { radfg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws); std::swap (p1,p2); }
2440 std::swap (p1,p2);
2441 }
2442 else
2443 for(size_t k=0, l1=1; k<nf; k++)
2444 {
2445 size_t ip = fact[k].fct,
2446 ido= length/(ip*l1);
2447 if(ip==4)
2448 radb4(ido, l1, p1, p2, fact[k].tw);
2449 else if(ip==2)
2450 radb2(ido, l1, p1, p2, fact[k].tw);
2451 else if(ip==3)
2452 radb3(ido, l1, p1, p2, fact[k].tw);
2453 else if(ip==5)
2454 radb5(ido, l1, p1, p2, fact[k].tw);
2455 else
2456 radbg(ido, ip, l1, p1, p2, fact[k].tw, fact[k].tws);
2457 std::swap (p1,p2);
2458 l1*=ip;
2459 }
2460
2461 copy_and_norm(c,p1,fct);
2462 }
2463
2464 private:
2466 {
2467 size_t len=length;
2468 while ((len%4)==0)
2469 { add_factor(4); len>>=2; }
2470 if ((len%2)==0)
2471 {
2472 len>>=1;
2473 // factor 2 should be at the front of the factor list
2474 add_factor(2);
2475 std::swap(fact[0].fct, fact.back().fct);
2476 }
2477 for (size_t divisor=3; divisor*divisor<=len; divisor+=2)
2478 while ((len%divisor)==0)
2479 {
2480 add_factor(divisor);
2481 len/=divisor;
2482 }
2483 if (len>1) add_factor(len);
2484 }
2485
2486 size_t twsize() const
2487 {
2488 size_t twsz=0, l1=1;
2489 for (size_t k=0; k<fact.size(); ++k)
2490 {
2491 size_t ip=fact[k].fct, ido=length/(l1*ip);
2492 twsz+=(ip-1)*(ido-1);
2493 if (ip>5) twsz+=2*ip;
2494 l1*=ip;
2495 }
2496 return twsz;
2497 }
2498
2500 {
2502 size_t l1=1;
2503 T0 *ptr=mem.data();
2504 for (size_t k=0; k<fact.size(); ++k)
2505 {
2506 size_t ip=fact[k].fct, ido=length/(l1*ip);
2507 if (k<fact.size()-1) // last factor doesn't need twiddles
2508 {
2509 fact[k].tw=ptr; ptr+=(ip-1)*(ido-1);
2510 for (size_t j=1; j<ip; ++j)
2511 for (size_t i=1; i<=(ido-1)/2; ++i)
2512 {
2513 fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r;
2514 fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i;
2515 }
2516 }
2517 if (ip>5) // special factors required by *g functions
2518 {
2519 fact[k].tws=ptr; ptr+=2*ip;
2520 fact[k].tws[0] = 1.;
2521 fact[k].tws[1] = 0.;
2522 for (size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2)
2523 {
2524 fact[k].tws[i ] = twid[i/2*(length/ip)].r;
2525 fact[k].tws[i+1] = twid[i/2*(length/ip)].i;
2526 fact[k].tws[ic] = twid[i/2*(length/ip)].r;
2527 fact[k].tws[ic+1] = -twid[i/2*(length/ip)].i;
2528 }
2529 }
2530 l1*=ip;
2531 }
2532 }
2533
2534 public:
2535 POCKETFFT_NOINLINE explicit rfftp(size_t length_)
2536 : length(length_)
2537 {
2538 if (length==0) throw std::runtime_error("zero-length FFT requested");
2539 if (length==1) return;
2540 factorize();
2541 mem.resize(twsize());
2542 comp_twiddle();
2543 }
2544};
2545
2546//
2547// complex Bluestein transforms
2548//
2549
2550template<typename T0> class fftblue
2551 {
2552 private:
2553 size_t n, n2;
2557
2558 template<bool fwd, typename T> void fft(cmplx<T> c[], T0 fct) const
2559 {
2560 arr<cmplx<T>> akf(n2);
2561
2562 /* initialize a_k and FFT it */
2563 for (size_t m=0; m<n; ++m)
2564 special_mul<fwd>(c[m],bk[m],akf[m]);
2565 auto zero = akf[0]*T0(0);
2566 for (size_t m=n; m<n2; ++m)
2567 akf[m]=zero;
2568
2569 plan.exec (akf.data(),1.,true);
2570
2571 /* do the convolution */
2572 akf[0] = akf[0].template special_mul<!fwd>(bkf[0]);
2573 for (size_t m=1; m<(n2+1)/2; ++m)
2574 {
2575 akf[m] = akf[m].template special_mul<!fwd>(bkf[m]);
2576 akf[n2-m] = akf[n2-m].template special_mul<!fwd>(bkf[m]);
2577 }
2578 if ((n2&1)==0)
2579 akf[n2/2] = akf[n2/2].template special_mul<!fwd>(bkf[n2/2]);
2580
2581 /* inverse FFT */
2582 plan.exec (akf.data(),1.,false);
2583
2584 /* multiply by b_k */
2585 for (size_t m=0; m<n; ++m)
2586 c[m] = akf[m].template special_mul<fwd>(bk[m])*fct;
2587 }
2588
2589 public:
2590 POCKETFFT_NOINLINE explicit fftblue(size_t length)
2591 : n(length), n2(util::good_size_cmplx(n*2-1)), plan(n2), mem(n+n2/2+1),
2592 bk(mem.data()), bkf(mem.data()+n)
2593 {
2594 /* initialize b_k */
2595 sincos_2pibyn<T0> tmp(2*n);
2596 bk[0].Set(1, 0);
2597
2598 size_t coeff=0;
2599 for (size_t m=1; m<n; ++m)
2600 {
2601 coeff+=2*m-1;
2602 if (coeff>=2*n) coeff-=2*n;
2603 bk[m] = tmp[coeff];
2604 }
2605
2606 /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
2607 arr<cmplx<T0>> tbkf(n2);
2608 T0 xn2 = T0(1)/T0(n2);
2609 tbkf[0] = bk[0]*xn2;
2610 for (size_t m=1; m<n; ++m)
2611 tbkf[m] = tbkf[n2-m] = bk[m]*xn2;
2612 for (size_t m=n;m<=(n2-n);++m)
2613 tbkf[m].Set(0.,0.);
2614 plan.exec(tbkf.data(),1.,true);
2615 for (size_t i=0; i<n2/2+1; ++i)
2616 bkf[i] = tbkf[i];
2617 }
2618
2619 template<typename T> void exec(cmplx<T> c[], T0 fct, bool fwd) const
2620 { fwd ? fft<true>(c,fct) : fft<false>(c,fct); }
2621
2622 template<typename T> void exec_r(T c[], T0 fct, bool fwd)
2623 {
2624 arr<cmplx<T>> tmp(n);
2625 if (fwd)
2626 {
2627 auto zero = T0(0)*c[0];
2628 for (size_t m=0; m<n; ++m)
2629 tmp[m].Set(c[m], zero);
2630 fft<true>(tmp.data(),fct);
2631 c[0] = tmp[0].r;
2632 std::copy_n (&tmp[1].r, n-1, &c[1]);
2633 }
2634 else
2635 {
2636 tmp[0].Set(c[0],c[0]*0);
2637 std::copy_n (c+1, n-1, &tmp[1].r);
2638 if ((n&1)==0) tmp[n/2].i=T0(0)*c[0];
2639 for (size_t m=1; 2*m<n; ++m)
2640 tmp[n-m].Set(tmp[m].r, -tmp[m].i);
2641 fft<false>(tmp.data(),fct);
2642 for (size_t m=0; m<n; ++m)
2643 c[m] = tmp[m].r;
2644 }
2645 }
2646 };
2647
2648//
2649// flexible (FFTPACK/Bluestein) complex 1D transform
2650//
2651
2652template<typename T0> class pocketfft_c
2653 {
2654 private:
2655 std::unique_ptr<cfftp<T0>> packplan;
2656 std::unique_ptr<fftblue<T0>> blueplan;
2657 size_t len;
2658
2659 public:
2661 : len(length)
2662 {
2663 if (length==0) throw std::runtime_error("zero-length FFT requested");
2664 size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length);
2665 if (tmp*tmp <= length)
2666 {
2667 packplan=std::unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
2668 return;
2669 }
2670 double comp1 = util::cost_guess(length);
2671 double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
2672 comp2*=1.5; /* fudge factor that appears to give good overall performance */
2673 if (comp2<comp1) // use Bluestein
2674 blueplan=std::unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
2675 else
2676 packplan=std::unique_ptr<cfftp<T0>>(new cfftp<T0>(length));
2677 }
2678
2679 template<typename T> POCKETFFT_NOINLINE void exec(cmplx<T> c[], T0 fct, bool fwd) const
2680 { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec(c,fct,fwd); }
2681
2682 size_t length() const { return len; }
2683 };
2684
2685//
2686// flexible (FFTPACK/Bluestein) real-valued 1D transform
2687//
2688
2689template<typename T0> class pocketfft_r
2690 {
2691 private:
2692 std::unique_ptr<rfftp<T0>> packplan;
2693 std::unique_ptr<fftblue<T0>> blueplan;
2694 size_t len;
2695
2696 public:
2698 : len(length)
2699 {
2700 if (length==0) throw std::runtime_error("zero-length FFT requested");
2701 size_t tmp = (length<50) ? 0 : util::largest_prime_factor(length);
2702 if (tmp*tmp <= length)
2703 {
2704 packplan=std::unique_ptr<rfftp<T0>>(new rfftp<T0>(length));
2705 return;
2706 }
2707 double comp1 = 0.5*util::cost_guess(length);
2708 double comp2 = 2*util::cost_guess(util::good_size_cmplx(2*length-1));
2709 comp2*=1.5; /* fudge factor that appears to give good overall performance */
2710 if (comp2<comp1) // use Bluestein
2711 blueplan=std::unique_ptr<fftblue<T0>>(new fftblue<T0>(length));
2712 else
2713 packplan=std::unique_ptr<rfftp<T0>>(new rfftp<T0>(length));
2714 }
2715
2716 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool fwd) const
2717 { packplan ? packplan->exec(c,fct,fwd) : blueplan->exec_r(c,fct,fwd); }
2718
2719 size_t length() const { return len; }
2720 };
2721
2722
2723//
2724// sine/cosine transforms
2725//
2726
2727template<typename T0> class T_dct1
2728 {
2729 private:
2731
2732 public:
2734 : fftplan(2*(length-1)) {}
2735
2736 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
2737 int /*type*/, bool /*cosine*/) const
2738 {
2739 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2740 size_t N=fftplan.length(), n=N/2+1;
2741 if (ortho)
2742 { c[0]*=sqrt2; c[n-1]*=sqrt2; }
2743 arr<T> tmp(N);
2744 tmp[0] = c[0];
2745 for (size_t i=1; i<n; ++i)
2746 tmp[i] = tmp[N-i] = c[i];
2747 fftplan.exec(tmp.data(), fct, true);
2748 c[0] = tmp[0];
2749 for (size_t i=1; i<n; ++i)
2750 c[i] = tmp[2*i-1];
2751 if (ortho)
2752 { c[0]*=sqrt2*T0(0.5); c[n-1]*=sqrt2*T0(0.5); }
2753 }
2754
2755 size_t length() const { return fftplan.length()/2+1; }
2756 };
2757
2758template<typename T0> class T_dst1
2759 {
2760 private:
2762
2763 public:
2765 : fftplan(2*(length+1)) {}
2766
2767 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
2768 bool /*ortho*/, int /*type*/, bool /*cosine*/) const
2769 {
2770 size_t N=fftplan.length(), n=N/2-1;
2771 arr<T> tmp(N);
2772 tmp[0] = tmp[n+1] = c[0]*0;
2773 for (size_t i=0; i<n; ++i)
2774 { tmp[i+1]=c[i]; tmp[N-1-i]=-c[i]; }
2775 fftplan.exec(tmp.data(), fct, true);
2776 for (size_t i=0; i<n; ++i)
2777 c[i] = -tmp[2*i+2];
2778 }
2779
2780 size_t length() const { return fftplan.length()/2-1; }
2781 };
2782
2783template<typename T0> class T_dcst23
2784 {
2785 private:
2787 std::vector<T0> twiddle;
2788
2789 public:
2792 {
2794 for (size_t i=0; i<length; ++i)
2795 twiddle[i] = tw[i+1].r;
2796 }
2797
2798 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho,
2799 int type, bool cosine) const
2800 {
2801 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2802 size_t N=length();
2803 size_t NS2 = (N+1)/2;
2804 if (type==2)
2805 {
2806 if (!cosine)
2807 for (size_t k=1; k<N; k+=2)
2808 c[k] = -c[k];
2809 c[0] *= 2;
2810 if ((N&1)==0) c[N-1]*=2;
2811 for (size_t k=1; k<N-1; k+=2)
2812 MPINPLACE(c[k+1], c[k]);
2813 fftplan.exec(c, fct, false);
2814 for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
2815 {
2816 T t1 = twiddle[k-1]*c[kc]+twiddle[kc-1]*c[k];
2817 T t2 = twiddle[k-1]*c[k]-twiddle[kc-1]*c[kc];
2818 c[k] = T0(0.5)*(t1+t2); c[kc]=T0(0.5)*(t1-t2);
2819 }
2820 if ((N&1)==0)
2821 c[NS2] *= twiddle[NS2-1];
2822 if (!cosine)
2823 for (size_t k=0, kc=N-1; k<kc; ++k, --kc)
2824 std::swap(c[k], c[kc]);
2825 if (ortho)
2826 cosine ? c[0]*=sqrt2*T0(0.5) : c[N-1]*=sqrt2*T0(0.5);
2827 }
2828 else
2829 {
2830 if (ortho)
2831 cosine ? c[0]*=sqrt2 : c[N-1]*=sqrt2;
2832 if (!cosine)
2833 for (size_t k=0, kc=N-1; k<NS2; ++k, --kc)
2834 std::swap(c[k], c[kc]);
2835 for (size_t k=1, kc=N-1; k<NS2; ++k, --kc)
2836 {
2837 T t1=c[k]+c[kc], t2=c[k]-c[kc];
2838 c[k] = twiddle[k-1]*t2+twiddle[kc-1]*t1;
2839 c[kc]= twiddle[k-1]*t1-twiddle[kc-1]*t2;
2840 }
2841 if ((N&1)==0)
2842 c[NS2] *= 2*twiddle[NS2-1];
2843 fftplan.exec(c, fct, true);
2844 for (size_t k=1; k<N-1; k+=2)
2845 MPINPLACE(c[k], c[k+1]);
2846 if (!cosine)
2847 for (size_t k=1; k<N; k+=2)
2848 c[k] = -c[k];
2849 }
2850 }
2851
2852 size_t length() const { return fftplan.length(); }
2853 };
2854
2855template<typename T0> class T_dcst4
2856 {
2857 private:
2858 size_t N;
2859 std::unique_ptr<pocketfft_c<T0>> fft;
2860 std::unique_ptr<pocketfft_r<T0>> rfft;
2862
2863 public:
2865 : N(length),
2866 fft((N&1) ? nullptr : new pocketfft_c<T0>(N/2)),
2867 rfft((N&1)? new pocketfft_r<T0>(N) : nullptr),
2868 C2((N&1) ? 0 : N/2)
2869 {
2870 if ((N&1)==0)
2871 {
2872 sincos_2pibyn<T0> tw(16*N);
2873 for (size_t i=0; i<N/2; ++i)
2874 C2[i] = conj(tw[8*i+1]);
2875 }
2876 }
2877
2878 template<typename T> POCKETFFT_NOINLINE void exec(T c[], T0 fct,
2879 bool /*ortho*/, int /*type*/, bool cosine) const
2880 {
2881 size_t n2 = N/2;
2882 if (!cosine)
2883 for (size_t k=0, kc=N-1; k<n2; ++k, --kc)
2884 std::swap(c[k], c[kc]);
2885 if (N&1)
2886 {
2887 // The following code is derived from the FFTW3 function apply_re11()
2888 // and is released under the 3-clause BSD license with friendly
2889 // permission of Matteo Frigo and Steven G. Johnson.
2890
2891 arr<T> y(N);
2892 {
2893 size_t i=0, m=n2;
2894 for (; m<N; ++i, m+=4)
2895 y[i] = c[m];
2896 for (; m<2*N; ++i, m+=4)
2897 y[i] = -c[2*N-m-1];
2898 for (; m<3*N; ++i, m+=4)
2899 y[i] = -c[m-2*N];
2900 for (; m<4*N; ++i, m+=4)
2901 y[i] = c[4*N-m-1];
2902 for (; i<N; ++i, m+=4)
2903 y[i] = c[m-4*N];
2904 }
2905 rfft->exec(y.data(), fct, true);
2906 {
2907 auto SGN = [](size_t i)
2908 {
2909 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2910 return (i&2) ? -sqrt2 : sqrt2;
2911 };
2912 c[n2] = y[0]*SGN(n2+1);
2913 size_t i=0, i1=1, k=1;
2914 for (; k<n2; ++i, ++i1, k+=2)
2915 {
2916 c[i ] = y[2*k-1]*SGN(i1) + y[2*k ]*SGN(i);
2917 c[N -i1] = y[2*k-1]*SGN(N -i) - y[2*k ]*SGN(N -i1);
2918 c[n2-i1] = y[2*k+1]*SGN(n2-i) - y[2*k+2]*SGN(n2-i1);
2919 c[n2+i1] = y[2*k+1]*SGN(n2+i+2) + y[2*k+2]*SGN(n2+i1);
2920 }
2921 if (k == n2)
2922 {
2923 c[i ] = y[2*k-1]*SGN(i+1) + y[2*k]*SGN(i);
2924 c[N-i1] = y[2*k-1]*SGN(i+2) + y[2*k]*SGN(i1);
2925 }
2926 }
2927
2928 // FFTW-derived code ends here
2929 }
2930 else
2931 {
2932 // even length algorithm from
2933 // https://www.appletonaudio.com/blog/2013/derivation-of-fast-dct-4-algorithm-based-on-dft/
2934 arr<cmplx<T>> y(n2);
2935 for(size_t i=0; i<n2; ++i)
2936 {
2937 y[i].Set(c[2*i],c[N-1-2*i]);
2938 y[i] *= C2[i];
2939 }
2940 fft->exec(y.data(), fct, true);
2941 for(size_t i=0, ic=n2-1; i<n2; ++i, --ic)
2942 {
2943 c[2*i ] = 2*(y[i ].r*C2[i ].r-y[i ].i*C2[i ].i);
2944 c[2*i+1] = -2*(y[ic].i*C2[ic].r+y[ic].r*C2[ic].i);
2945 }
2946 }
2947 if (!cosine)
2948 for (size_t k=1; k<N; k+=2)
2949 c[k] = -c[k];
2950 }
2951
2952 size_t length() const { return N; }
2953 };
2954
2955
2956//
2957// multi-D infrastructure
2958//
2959
2960template<typename T> std::shared_ptr<T> get_plan(size_t length)
2961 {
2962#if POCKETFFT_CACHE_SIZE==0
2963 return std::make_shared<T>(length);
2964#else
2965 constexpr size_t nmax=POCKETFFT_CACHE_SIZE;
2966 static std::array<std::shared_ptr<T>, nmax> cache;
2967 static std::array<size_t, nmax> last_access{{0}};
2968 static size_t access_counter = 0;
2969 static std::mutex mut;
2970
2971 auto find_in_cache = [&]() -> std::shared_ptr<T>
2972 {
2973 for (size_t i=0; i<nmax; ++i)
2974 if (cache[i] && (cache[i]->length()==length))
2975 {
2976 // no need to update if this is already the most recent entry
2977 if (last_access[i]!=access_counter)
2978 {
2979 last_access[i] = ++access_counter;
2980 // Guard against overflow
2981 if (access_counter == 0)
2982 last_access.fill(0);
2983 }
2984 return cache[i];
2985 }
2986
2987 return nullptr;
2988 };
2989
2990 {
2991 std::lock_guard<std::mutex> lock(mut);
2992 auto p = find_in_cache();
2993 if (p) return p;
2994 }
2995 auto plan = std::make_shared<T>(length);
2996 {
2997 std::lock_guard<std::mutex> lock(mut);
2998 auto p = find_in_cache();
2999 if (p) return p;
3000
3001 size_t lru = 0;
3002 for (size_t i=1; i<nmax; ++i)
3003 if (last_access[i] < last_access[lru])
3004 lru = i;
3005
3006 cache[lru] = plan;
3007 last_access[lru] = ++access_counter;
3008 }
3009 return plan;
3010#endif
3011 }
3012
3014 {
3015 protected:
3018
3019 public:
3020 arr_info(shape_t shape_, stride_t stride_)
3021 : shp(std::move(shape_)), str(std::move(stride_)) {}
3022 size_t ndim() const { return shp.size(); }
3023 size_t size() const { return util::prod(shp); }
3024 const shape_t &shape() const { return shp; }
3025 size_t shape(size_t i) const { return shp[i]; }
3026 const stride_t &stride() const { return str; }
3027 const ptrdiff_t &stride(size_t i) const { return str[i]; }
3028 };
3029
3030template<typename T> class cndarr: public arr_info
3031 {
3032 protected:
3033 const char *d;
3034
3035 public:
3036 cndarr(const void *data_, const shape_t &shape_, const stride_t &stride_)
3037 : arr_info(shape_, stride_),
3038 d(reinterpret_cast<const char *>(data_)) {}
3039 const T &operator[](ptrdiff_t ofs) const
3040 { return *reinterpret_cast<const T *>(d+ofs); }
3041 };
3042
3043template<typename T> class ndarr: public cndarr<T>
3044 {
3045 public:
3046 ndarr(void *data_, const shape_t &shape_, const stride_t &stride_)
3047 : cndarr<T>::cndarr(const_cast<const void *>(data_), shape_, stride_)
3048 {}
3049 T &operator[](ptrdiff_t ofs)
3050 { return *reinterpret_cast<T *>(const_cast<char *>(cndarr<T>::d+ofs)); }
3051 };
3052
3053template<size_t N> class multi_iter
3054 {
3055 private:
3058 ptrdiff_t p_ii, p_i[N], str_i, p_oi, p_o[N], str_o;
3059 size_t idim, rem;
3060
3062 {
3063 for (int i_=int(pos.size())-1; i_>=0; --i_)
3064 {
3065 auto i = size_t(i_);
3066 if (i==idim) continue;
3067 p_ii += iarr.stride(i);
3068 p_oi += oarr.stride(i);
3069 if (++pos[i] < iarr.shape(i))
3070 return;
3071 pos[i] = 0;
3072 p_ii -= ptrdiff_t(iarr.shape(i))*iarr.stride(i);
3073 p_oi -= ptrdiff_t(oarr.shape(i))*oarr.stride(i);
3074 }
3075 }
3076
3077 public:
3078 multi_iter(const arr_info &iarr_, const arr_info &oarr_, size_t idim_)
3079 : pos(iarr_.ndim(), 0), iarr(iarr_), oarr(oarr_), p_ii(0),
3080 str_i(iarr.stride(idim_)), p_oi(0), str_o(oarr.stride(idim_)),
3081 idim(idim_), rem(iarr.size()/iarr.shape(idim))
3082 {
3083 auto nshares = threading::num_threads();
3084 if (nshares==1) return;
3085 if (nshares==0) throw std::runtime_error("can't run with zero threads");
3086 auto myshare = threading::thread_id();
3087 if (myshare>=nshares) throw std::runtime_error("impossible share requested");
3088 size_t nbase = rem/nshares;
3089 size_t additional = rem%nshares;
3090 size_t lo = myshare*nbase + ((myshare<additional) ? myshare : additional);
3091 size_t hi = lo+nbase+(myshare<additional);
3092 size_t todo = hi-lo;
3093
3094 size_t chunk = rem;
3095 for (size_t i=0; i<pos.size(); ++i)
3096 {
3097 if (i==idim) continue;
3098 chunk /= iarr.shape(i);
3099 size_t n_advance = lo/chunk;
3100 pos[i] += n_advance;
3101 p_ii += ptrdiff_t(n_advance)*iarr.stride(i);
3102 p_oi += ptrdiff_t(n_advance)*oarr.stride(i);
3103 lo -= n_advance*chunk;
3104 }
3105 rem = todo;
3106 }
3107 void advance(size_t n)
3108 {
3109 if (rem<n) throw std::runtime_error("underrun");
3110 for (size_t i=0; i<n; ++i)
3111 {
3112 p_i[i] = p_ii;
3113 p_o[i] = p_oi;
3114 advance_i();
3115 }
3116 rem -= n;
3117 }
3118 ptrdiff_t iofs(size_t i) const { return p_i[0] + ptrdiff_t(i)*str_i; }
3119 ptrdiff_t iofs(size_t j, size_t i) const { return p_i[j] + ptrdiff_t(i)*str_i; }
3120 ptrdiff_t oofs(size_t i) const { return p_o[0] + ptrdiff_t(i)*str_o; }
3121 ptrdiff_t oofs(size_t j, size_t i) const { return p_o[j] + ptrdiff_t(i)*str_o; }
3122 size_t length_in() const { return iarr.shape(idim); }
3123 size_t length_out() const { return oarr.shape(idim); }
3124 ptrdiff_t stride_in() const { return str_i; }
3125 ptrdiff_t stride_out() const { return str_o; }
3126 size_t remaining() const { return rem; }
3127 };
3128
3130 {
3131 private:
3134 ptrdiff_t p;
3135 size_t rem;
3136
3137 public:
3138 explicit simple_iter(const arr_info &arr_)
3139 : pos(arr_.ndim(), 0), arr(arr_), p(0), rem(arr_.size()) {}
3140 void advance()
3141 {
3142 --rem;
3143 for (int i_=int(pos.size())-1; i_>=0; --i_)
3144 {
3145 auto i = size_t(i_);
3146 p += arr.stride(i);
3147 if (++pos[i] < arr.shape(i))
3148 return;
3149 pos[i] = 0;
3150 p -= ptrdiff_t(arr.shape(i))*arr.stride(i);
3151 }
3152 }
3153 ptrdiff_t ofs() const { return p; }
3154 size_t remaining() const { return rem; }
3155 };
3156
3158 {
3159 private:
3162 std::vector<char> rev_axis;
3163 std::vector<char> rev_jump;
3166 ptrdiff_t p, rp;
3167 size_t rem;
3168
3169 public:
3170 rev_iter(const arr_info &arr_, const shape_t &axes)
3171 : pos(arr_.ndim(), 0), arr(arr_), rev_axis(arr_.ndim(), 0),
3172 rev_jump(arr_.ndim(), 1), p(0), rp(0)
3173 {
3174 for (auto ax: axes)
3175 rev_axis[ax]=1;
3176 last_axis = axes.back();
3177 last_size = arr.shape(last_axis)/2 + 1;
3178 shp = arr.shape();
3180 rem=1;
3181 for (auto i: shp)
3182 rem *= i;
3183 }
3184 void advance()
3185 {
3186 --rem;
3187 for (int i_=int(pos.size())-1; i_>=0; --i_)
3188 {
3189 auto i = size_t(i_);
3190 p += arr.stride(i);
3191 if (!rev_axis[i])
3192 rp += arr.stride(i);
3193 else
3194 {
3195 rp -= arr.stride(i);
3196 if (rev_jump[i])
3197 {
3198 rp += ptrdiff_t(arr.shape(i))*arr.stride(i);
3199 rev_jump[i] = 0;
3200 }
3201 }
3202 if (++pos[i] < shp[i])
3203 return;
3204 pos[i] = 0;
3205 p -= ptrdiff_t(shp[i])*arr.stride(i);
3206 if (rev_axis[i])
3207 {
3208 rp -= ptrdiff_t(arr.shape(i)-shp[i])*arr.stride(i);
3209 rev_jump[i] = 1;
3210 }
3211 else
3212 rp -= ptrdiff_t(shp[i])*arr.stride(i);
3213 }
3214 }
3215 ptrdiff_t ofs() const { return p; }
3216 ptrdiff_t rev_ofs() const { return rp; }
3217 size_t remaining() const { return rem; }
3218 };
3219
3220template<typename T> struct VTYPE {};
3221template <typename T> using vtype_t = typename VTYPE<T>::type;
3222
3223#ifndef POCKETFFT_NO_VECTORS
3224template<> struct VTYPE<float>
3225 {
3226 using type = float __attribute__ ((vector_size (VLEN<float>::val*sizeof(float))));
3227 };
3228template<> struct VTYPE<double>
3229 {
3230 using type = double __attribute__ ((vector_size (VLEN<double>::val*sizeof(double))));
3231 };
3232template<> struct VTYPE<long double>
3233 {
3234 using type = long double __attribute__ ((vector_size (VLEN<long double>::val*sizeof(long double))));
3235 };
3236#endif
3237
3238template<typename T> arr<char> alloc_tmp(const shape_t &shape,
3239 size_t axsize, size_t elemsize)
3240 {
3241 auto othersize = util::prod(shape)/axsize;
3242 auto tmpsize = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
3243 return arr<char>(tmpsize*elemsize);
3244 }
3245template<typename T> arr<char> alloc_tmp(const shape_t &shape,
3246 const shape_t &axes, size_t elemsize)
3247 {
3248 size_t fullsize=util::prod(shape);
3249 size_t tmpsize=0;
3250 for (size_t i=0; i<axes.size(); ++i)
3251 {
3252 auto axsize = shape[axes[i]];
3253 auto othersize = fullsize/axsize;
3254 auto sz = axsize*((othersize>=VLEN<T>::val) ? VLEN<T>::val : 1);
3255 if (sz>tmpsize) tmpsize=sz;
3256 }
3257 return arr<char>(tmpsize*elemsize);
3258 }
3259
3260template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3262 {
3263 for (size_t i=0; i<it.length_in(); ++i)
3264 for (size_t j=0; j<vlen; ++j)
3265 {
3266 dst[i].r[j] = src[it.iofs(j,i)].r;
3267 dst[i].i[j] = src[it.iofs(j,i)].i;
3268 }
3269 }
3270
3271template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3273 {
3274 for (size_t i=0; i<it.length_in(); ++i)
3275 for (size_t j=0; j<vlen; ++j)
3276 dst[i][j] = src[it.iofs(j,i)];
3277 }
3278
3279template <typename T, size_t vlen> void copy_input(const multi_iter<vlen> &it,
3280 const cndarr<T> &src, T *POCKETFFT_RESTRICT dst)
3281 {
3282 if (dst == &src[it.iofs(0)]) return; // in-place
3283 for (size_t i=0; i<it.length_in(); ++i)
3284 dst[i] = src[it.iofs(i)];
3285 }
3286
3287template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3289 {
3290 for (size_t i=0; i<it.length_out(); ++i)
3291 for (size_t j=0; j<vlen; ++j)
3292 dst[it.oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
3293 }
3294
3295template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3297 {
3298 for (size_t i=0; i<it.length_out(); ++i)
3299 for (size_t j=0; j<vlen; ++j)
3300 dst[it.oofs(j,i)] = src[i][j];
3301 }
3302
3303template<typename T, size_t vlen> void copy_output(const multi_iter<vlen> &it,
3304 const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3305 {
3306 if (src == &dst[it.oofs(0)]) return; // in-place
3307 for (size_t i=0; i<it.length_out(); ++i)
3308 dst[it.oofs(i)] = src[i];
3309 }
3310
3311template <typename T> struct add_vec { using type = vtype_t<T>; };
3312template <typename T> struct add_vec<cmplx<T>>
3313 { using type = cmplx<vtype_t<T>>; };
3314template <typename T> using add_vec_t = typename add_vec<T>::type;
3315
3316template<typename Tplan, typename T, typename T0, typename Exec>
3318 const shape_t &axes, T0 fct, size_t nthreads, const Exec & exec,
3319 const bool allow_inplace=true)
3320 {
3321 std::shared_ptr<Tplan> plan;
3322
3323 for (size_t iax=0; iax<axes.size(); ++iax)
3324 {
3325 size_t len=in.shape(axes[iax]);
3326 if ((!plan) || (len!=plan->length()))
3327 plan = get_plan<Tplan>(len);
3328
3330 util::thread_count(nthreads, in.shape(), axes[iax], VLEN<T>::val),
3331 [&] {
3332 constexpr auto vlen = VLEN<T0>::val;
3333 auto storage = alloc_tmp<T0>(in.shape(), len, sizeof(T));
3334 const auto &tin(iax==0? in : out);
3335 multi_iter<vlen> it(tin, out, axes[iax]);
3336#ifndef POCKETFFT_NO_VECTORS
3337 if (vlen>1)
3338 while (it.remaining()>=vlen)
3339 {
3340 it.advance(vlen);
3341 auto tdatav = reinterpret_cast<add_vec_t<T> *>(storage.data());
3342 exec(it, tin, out, tdatav, *plan, fct);
3343 }
3344#endif
3345 while (it.remaining()>0)
3346 {
3347 it.advance(1);
3348 auto buf = allow_inplace && it.stride_out() == sizeof(T) ?
3349 &out[it.oofs(0)] : reinterpret_cast<T *>(storage.data());
3350 exec(it, tin, out, buf, *plan, fct);
3351 }
3352 }); // end of parallel region
3353 fct = T0(1); // factor has been applied, use 1 for remaining axes
3354 }
3355 }
3356
3358 {
3360
3361 template <typename T0, typename T, size_t vlen> void operator () (
3362 const multi_iter<vlen> &it, const cndarr<cmplx<T0>> &in,
3363 ndarr<cmplx<T0>> &out, T * buf, const pocketfft_c<T0> &plan, T0 fct) const
3364 {
3365 copy_input(it, in, buf);
3366 plan.exec(buf, fct, forward);
3367 copy_output(it, buf, out);
3368 }
3369 };
3370
3371template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
3373 {
3374 for (size_t j=0; j<vlen; ++j)
3375 dst[it.oofs(j,0)] = src[0][j];
3376 size_t i=1, i1=1, i2=it.length_out()-1;
3377 for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
3378 for (size_t j=0; j<vlen; ++j)
3379 {
3380 dst[it.oofs(j,i1)] = src[i][j]+src[i+1][j];
3381 dst[it.oofs(j,i2)] = src[i][j]-src[i+1][j];
3382 }
3383 if (i<it.length_out())
3384 for (size_t j=0; j<vlen; ++j)
3385 dst[it.oofs(j,i1)] = src[i][j];
3386 }
3387
3388template <typename T, size_t vlen> void copy_hartley(const multi_iter<vlen> &it,
3389 const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3390 {
3391 dst[it.oofs(0)] = src[0];
3392 size_t i=1, i1=1, i2=it.length_out()-1;
3393 for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
3394 {
3395 dst[it.oofs(i1)] = src[i]+src[i+1];
3396 dst[it.oofs(i2)] = src[i]-src[i+1];
3397 }
3398 if (i<it.length_out())
3399 dst[it.oofs(i1)] = src[i];
3400 }
3401
3402template <typename T, size_t vlen> void copy_FHT(const multi_iter<vlen> &it,
3404 {
3405 for (size_t j=0; j<vlen; ++j)
3406 dst[it.oofs(j,0)] = src[0][j];
3407 size_t i=1, i1=1, i2=it.length_out()-1;
3408 for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
3409 for (size_t j=0; j<vlen; ++j)
3410 {
3411 dst[it.oofs(j,i1)] = src[i][j]-src[i+1][j];
3412 dst[it.oofs(j,i2)] = src[i][j]+src[i+1][j];
3413 }
3414 if (i<it.length_out())
3415 for (size_t j=0; j<vlen; ++j)
3416 dst[it.oofs(j,i1)] = src[i][j];
3417 }
3418
3419template <typename T, size_t vlen> void copy_FHT(const multi_iter<vlen> &it,
3420 const T *POCKETFFT_RESTRICT src, ndarr<T> &dst)
3421 {
3422 dst[it.oofs(0)] = src[0];
3423 size_t i=1, i1=1, i2=it.length_out()-1;
3424 for (i=1; i<it.length_out()-1; i+=2, ++i1, --i2)
3425 {
3426 dst[it.oofs(i1)] = src[i]-src[i+1];
3427 dst[it.oofs(i2)] = src[i]+src[i+1];
3428 }
3429 if (i<it.length_out())
3430 dst[it.oofs(i1)] = src[i];
3431 }
3432
3434 {
3435 template <typename T0, typename T, size_t vlen> void operator () (
3436 const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out,
3437 T * buf, const pocketfft_r<T0> &plan, T0 fct) const
3438 {
3439 copy_input(it, in, buf);
3440 plan.exec(buf, fct, true);
3441 copy_hartley(it, buf, out);
3442 }
3443 };
3445 {
3446 template <typename T0, typename T, size_t vlen> void operator () (
3447 const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out,
3448 T * buf, const pocketfft_r<T0> &plan, T0 fct) const
3449 {
3450 copy_input(it, in, buf);
3451 plan.exec(buf, fct, true);
3452 copy_FHT(it, buf, out);
3453 }
3454 };
3455
3457 {
3458 bool ortho;
3459 int type;
3461
3462 template <typename T0, typename T, typename Tplan, size_t vlen>
3463 void operator () (const multi_iter<vlen> &it, const cndarr<T0> &in,
3464 ndarr<T0> &out, T * buf, const Tplan &plan, T0 fct) const
3465 {
3466 copy_input(it, in, buf);
3467 plan.exec(buf, fct, ortho, type, cosine);
3468 copy_output(it, buf, out);
3469 }
3470 };
3471
3472template<typename T> POCKETFFT_NOINLINE void general_r2c(
3473 const cndarr<T> &in, ndarr<cmplx<T>> &out, size_t axis, bool forward, T fct,
3474 size_t nthreads)
3475 {
3476 auto plan = get_plan<pocketfft_r<T>>(in.shape(axis));
3477 size_t len=in.shape(axis);
3479 util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
3480 [&] {
3481 constexpr auto vlen = VLEN<T>::val;
3482 auto storage = alloc_tmp<T>(in.shape(), len, sizeof(T));
3483 multi_iter<vlen> it(in, out, axis);
3484#ifndef POCKETFFT_NO_VECTORS
3485 if (vlen>1)
3486 while (it.remaining()>=vlen)
3487 {
3488 it.advance(vlen);
3489 auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
3490 copy_input(it, in, tdatav);
3491 plan->exec(tdatav, fct, true);
3492 for (size_t j=0; j<vlen; ++j)
3493 out[it.oofs(j,0)].Set(tdatav[0][j]);
3494 size_t i=1, ii=1;
3495 if (forward)
3496 for (; i<len-1; i+=2, ++ii)
3497 for (size_t j=0; j<vlen; ++j)
3498 out[it.oofs(j,ii)].Set(tdatav[i][j], tdatav[i+1][j]);
3499 else
3500 for (; i<len-1; i+=2, ++ii)
3501 for (size_t j=0; j<vlen; ++j)
3502 out[it.oofs(j,ii)].Set(tdatav[i][j], -tdatav[i+1][j]);
3503 if (i<len)
3504 for (size_t j=0; j<vlen; ++j)
3505 out[it.oofs(j,ii)].Set(tdatav[i][j]);
3506 }
3507#endif
3508 while (it.remaining()>0)
3509 {
3510 it.advance(1);
3511 auto tdata = reinterpret_cast<T *>(storage.data());
3512 copy_input(it, in, tdata);
3513 plan->exec(tdata, fct, true);
3514 out[it.oofs(0)].Set(tdata[0]);
3515 size_t i=1, ii=1;
3516 if (forward)
3517 for (; i<len-1; i+=2, ++ii)
3518 out[it.oofs(ii)].Set(tdata[i], tdata[i+1]);
3519 else
3520 for (; i<len-1; i+=2, ++ii)
3521 out[it.oofs(ii)].Set(tdata[i], -tdata[i+1]);
3522 if (i<len)
3523 out[it.oofs(ii)].Set(tdata[i]);
3524 }
3525 }); // end of parallel region
3526 }
3527template<typename T> POCKETFFT_NOINLINE void general_c2r(
3528 const cndarr<cmplx<T>> &in, ndarr<T> &out, size_t axis, bool forward, T fct,
3529 size_t nthreads)
3530 {
3531 auto plan = get_plan<pocketfft_r<T>>(out.shape(axis));
3532 size_t len=out.shape(axis);
3534 util::thread_count(nthreads, in.shape(), axis, VLEN<T>::val),
3535 [&] {
3536 constexpr auto vlen = VLEN<T>::val;
3537 auto storage = alloc_tmp<T>(out.shape(), len, sizeof(T));
3538 multi_iter<vlen> it(in, out, axis);
3539#ifndef POCKETFFT_NO_VECTORS
3540 if (vlen>1)
3541 while (it.remaining()>=vlen)
3542 {
3543 it.advance(vlen);
3544 auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
3545 for (size_t j=0; j<vlen; ++j)
3546 tdatav[0][j]=in[it.iofs(j,0)].r;
3547 {
3548 size_t i=1, ii=1;
3549 if (forward)
3550 for (; i<len-1; i+=2, ++ii)
3551 for (size_t j=0; j<vlen; ++j)
3552 {
3553 tdatav[i ][j] = in[it.iofs(j,ii)].r;
3554 tdatav[i+1][j] = -in[it.iofs(j,ii)].i;
3555 }
3556 else
3557 for (; i<len-1; i+=2, ++ii)
3558 for (size_t j=0; j<vlen; ++j)
3559 {
3560 tdatav[i ][j] = in[it.iofs(j,ii)].r;
3561 tdatav[i+1][j] = in[it.iofs(j,ii)].i;
3562 }
3563 if (i<len)
3564 for (size_t j=0; j<vlen; ++j)
3565 tdatav[i][j] = in[it.iofs(j,ii)].r;
3566 }
3567 plan->exec(tdatav, fct, false);
3568 copy_output(it, tdatav, out);
3569 }
3570#endif
3571 while (it.remaining()>0)
3572 {
3573 it.advance(1);
3574 auto tdata = reinterpret_cast<T *>(storage.data());
3575 tdata[0]=in[it.iofs(0)].r;
3576 {
3577 size_t i=1, ii=1;
3578 if (forward)
3579 for (; i<len-1; i+=2, ++ii)
3580 {
3581 tdata[i ] = in[it.iofs(ii)].r;
3582 tdata[i+1] = -in[it.iofs(ii)].i;
3583 }
3584 else
3585 for (; i<len-1; i+=2, ++ii)
3586 {
3587 tdata[i ] = in[it.iofs(ii)].r;
3588 tdata[i+1] = in[it.iofs(ii)].i;
3589 }
3590 if (i<len)
3591 tdata[i] = in[it.iofs(ii)].r;
3592 }
3593 plan->exec(tdata, fct, false);
3594 copy_output(it, tdata, out);
3595 }
3596 }); // end of parallel region
3597 }
3598
3600 {
3602
3603 template <typename T0, typename T, size_t vlen> void operator () (
3604 const multi_iter<vlen> &it, const cndarr<T0> &in, ndarr<T0> &out, T * buf,
3605 const pocketfft_r<T0> &plan, T0 fct) const
3606 {
3607 copy_input(it, in, buf);
3608 if ((!r2h) && forward)
3609 for (size_t i=2; i<it.length_out(); i+=2)
3610 buf[i] = -buf[i];
3611 plan.exec(buf, fct, r2h);
3612 if (r2h && (!forward))
3613 for (size_t i=2; i<it.length_out(); i+=2)
3614 buf[i] = -buf[i];
3615 copy_output(it, buf, out);
3616 }
3617 };
3618
3619template<typename T> void c2c(const shape_t &shape, const stride_t &stride_in,
3620 const stride_t &stride_out, const shape_t &axes, bool forward,
3621 const std::complex<T> *data_in, std::complex<T> *data_out, T fct,
3622 size_t nthreads=1)
3623 {
3624 if (util::prod(shape)==0) return;
3625 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3626 cndarr<cmplx<T>> ain(data_in, shape, stride_in);
3627 ndarr<cmplx<T>> aout(data_out, shape, stride_out);
3628 general_nd<pocketfft_c<T>>(ain, aout, axes, fct, nthreads, ExecC2C{forward});
3629 }
3630
3631template<typename T> void dct(const shape_t &shape,
3632 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3633 int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1)
3634 {
3635 if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type");
3636 if (util::prod(shape)==0) return;
3637 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3638 cndarr<T> ain(data_in, shape, stride_in);
3639 ndarr<T> aout(data_out, shape, stride_out);
3640 const ExecDcst exec{ortho, type, true};
3641 if (type==1)
3642 general_nd<T_dct1<T>>(ain, aout, axes, fct, nthreads, exec);
3643 else if (type==4)
3644 general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec);
3645 else
3646 general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec);
3647 }
3648
3649template<typename T> void dst(const shape_t &shape,
3650 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3651 int type, const T *data_in, T *data_out, T fct, bool ortho, size_t nthreads=1)
3652 {
3653 if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type");
3654 if (util::prod(shape)==0) return;
3655 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3656 cndarr<T> ain(data_in, shape, stride_in);
3657 ndarr<T> aout(data_out, shape, stride_out);
3658 const ExecDcst exec{ortho, type, false};
3659 if (type==1)
3660 general_nd<T_dst1<T>>(ain, aout, axes, fct, nthreads, exec);
3661 else if (type==4)
3662 general_nd<T_dcst4<T>>(ain, aout, axes, fct, nthreads, exec);
3663 else
3664 general_nd<T_dcst23<T>>(ain, aout, axes, fct, nthreads, exec);
3665 }
3666
3667template<typename T> void r2c(const shape_t &shape_in,
3668 const stride_t &stride_in, const stride_t &stride_out, size_t axis,
3669 bool forward, const T *data_in, std::complex<T> *data_out, T fct,
3670 size_t nthreads=1)
3671 {
3672 if (util::prod(shape_in)==0) return;
3673 util::sanity_check(shape_in, stride_in, stride_out, false, axis);
3674 cndarr<T> ain(data_in, shape_in, stride_in);
3675 shape_t shape_out(shape_in);
3676 shape_out[axis] = shape_in[axis]/2 + 1;
3677 ndarr<cmplx<T>> aout(data_out, shape_out, stride_out);
3678 general_r2c(ain, aout, axis, forward, fct, nthreads);
3679 }
3680
3681template<typename T> void r2c(const shape_t &shape_in,
3682 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3683 bool forward, const T *data_in, std::complex<T> *data_out, T fct,
3684 size_t nthreads=1)
3685 {
3686 if (util::prod(shape_in)==0) return;
3687 util::sanity_check(shape_in, stride_in, stride_out, false, axes);
3688 r2c(shape_in, stride_in, stride_out, axes.back(), forward, data_in, data_out,
3689 fct, nthreads);
3690 if (axes.size()==1) return;
3691
3692 shape_t shape_out(shape_in);
3693 shape_out[axes.back()] = shape_in[axes.back()]/2 + 1;
3694 auto newaxes = shape_t{axes.begin(), --axes.end()};
3695 c2c(shape_out, stride_out, stride_out, newaxes, forward, data_out, data_out,
3696 T(1), nthreads);
3697 }
3698
3699template<typename T> void c2r(const shape_t &shape_out,
3700 const stride_t &stride_in, const stride_t &stride_out, size_t axis,
3701 bool forward, const std::complex<T> *data_in, T *data_out, T fct,
3702 size_t nthreads=1)
3703 {
3704 if (util::prod(shape_out)==0) return;
3705 util::sanity_check(shape_out, stride_in, stride_out, false, axis);
3706 shape_t shape_in(shape_out);
3707 shape_in[axis] = shape_out[axis]/2 + 1;
3708 cndarr<cmplx<T>> ain(data_in, shape_in, stride_in);
3709 ndarr<T> aout(data_out, shape_out, stride_out);
3710 general_c2r(ain, aout, axis, forward, fct, nthreads);
3711 }
3712
3713template<typename T> void c2r(const shape_t &shape_out,
3714 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3715 bool forward, const std::complex<T> *data_in, T *data_out, T fct,
3716 size_t nthreads=1)
3717 {
3718 if (util::prod(shape_out)==0) return;
3719 if (axes.size()==1)
3720 return c2r(shape_out, stride_in, stride_out, axes[0], forward,
3721 data_in, data_out, fct, nthreads);
3722 util::sanity_check(shape_out, stride_in, stride_out, false, axes);
3723 auto shape_in = shape_out;
3724 shape_in[axes.back()] = shape_out[axes.back()]/2 + 1;
3725 auto nval = util::prod(shape_in);
3726 stride_t stride_inter(shape_in.size());
3727 stride_inter.back() = sizeof(cmplx<T>);
3728 for (int i=int(shape_in.size())-2; i>=0; --i)
3729 stride_inter[size_t(i)] =
3730 stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[size_t(i+1)]);
3731 arr<std::complex<T>> tmp(nval);
3732 auto newaxes = shape_t{axes.begin(), --axes.end()};
3733 c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.data(),
3734 T(1), nthreads);
3735 c2r(shape_out, stride_inter, stride_out, axes.back(), forward,
3736 tmp.data(), data_out, fct, nthreads);
3737 }
3738
3739template<typename T> void r2r_fftpack(const shape_t &shape,
3740 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3741 bool real2hermitian, bool forward, const T *data_in, T *data_out, T fct,
3742 size_t nthreads=1)
3743 {
3744 if (util::prod(shape)==0) return;
3745 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3746 cndarr<T> ain(data_in, shape, stride_in);
3747 ndarr<T> aout(data_out, shape, stride_out);
3748 general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads,
3749 ExecR2R{real2hermitian, forward});
3750 }
3751
3752template<typename T> void r2r_separable_hartley(const shape_t &shape,
3753 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3754 const T *data_in, T *data_out, T fct, size_t nthreads=1)
3755 {
3756 if (util::prod(shape)==0) return;
3757 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3758 cndarr<T> ain(data_in, shape, stride_in);
3759 ndarr<T> aout(data_out, shape, stride_out);
3760 general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, ExecHartley{},
3761 false);
3762 }
3763
3764template<typename T> void r2r_genuine_hartley(const shape_t &shape,
3765 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3766 const T *data_in, T *data_out, T fct, size_t nthreads=1)
3767 {
3768 if (util::prod(shape)==0) return;
3769 if (axes.size()==1)
3770 return r2r_separable_hartley(shape, stride_in, stride_out, axes, data_in,
3771 data_out, fct, nthreads);
3772 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3773 shape_t tshp(shape);
3774 tshp[axes.back()] = tshp[axes.back()]/2+1;
3775 arr<std::complex<T>> tdata(util::prod(tshp));
3776 stride_t tstride(shape.size());
3777 tstride.back()=sizeof(std::complex<T>);
3778 for (size_t i=tstride.size()-1; i>0; --i)
3779 tstride[i-1]=tstride[i]*ptrdiff_t(tshp[i]);
3780 r2c(shape, stride_in, tstride, axes, true, data_in, tdata.data(), fct, nthreads);
3781 cndarr<cmplx<T>> atmp(tdata.data(), tshp, tstride);
3782 ndarr<T> aout(data_out, shape, stride_out);
3783 simple_iter iin(atmp);
3784 rev_iter iout(aout, axes);
3785 while(iin.remaining()>0)
3786 {
3787 auto v = atmp[iin.ofs()];
3788 aout[iout.ofs()] = v.r+v.i;
3789 aout[iout.rev_ofs()] = v.r-v.i;
3790 iin.advance(); iout.advance();
3791 }
3792 }
3793
3794template<typename T> void r2r_separable_fht(const shape_t &shape,
3795 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3796 const T *data_in, T *data_out, T fct, size_t nthreads=1)
3797 {
3798 if (util::prod(shape)==0) return;
3799 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3800 cndarr<T> ain(data_in, shape, stride_in);
3801 ndarr<T> aout(data_out, shape, stride_out);
3802 general_nd<pocketfft_r<T>>(ain, aout, axes, fct, nthreads, ExecFHT{},
3803 false);
3804 }
3805
3806template<typename T> void r2r_genuine_fht(const shape_t &shape,
3807 const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes,
3808 const T *data_in, T *data_out, T fct, size_t nthreads=1)
3809 {
3810 if (util::prod(shape)==0) return;
3811 if (axes.size()==1)
3812 return r2r_separable_fht(shape, stride_in, stride_out, axes, data_in,
3813 data_out, fct, nthreads);
3814 util::sanity_check(shape, stride_in, stride_out, data_in==data_out, axes);
3815 shape_t tshp(shape);
3816 tshp[axes.back()] = tshp[axes.back()]/2+1;
3817 arr<std::complex<T>> tdata(util::prod(tshp));
3818 stride_t tstride(shape.size());
3819 tstride.back()=sizeof(std::complex<T>);
3820 for (size_t i=tstride.size()-1; i>0; --i)
3821 tstride[i-1]=tstride[i]*ptrdiff_t(tshp[i]);
3822 r2c(shape, stride_in, tstride, axes, true, data_in, tdata.data(), fct, nthreads);
3823 cndarr<cmplx<T>> atmp(tdata.data(), tshp, tstride);
3824 ndarr<T> aout(data_out, shape, stride_out);
3825 simple_iter iin(atmp);
3826 rev_iter iout(aout, axes);
3827 while(iin.remaining()>0)
3828 {
3829 auto v = atmp[iin.ofs()];
3830 aout[iout.ofs()] = v.r-v.i;
3831 aout[iout.rev_ofs()] = v.r+v.i;
3832 iin.advance(); iout.advance();
3833 }
3834 }
3835
3836} // namespace detail
3837
3838using detail::FORWARD;
3839using detail::BACKWARD;
3840using detail::shape_t;
3841using detail::stride_t;
3842using detail::c2c;
3843using detail::c2r;
3844using detail::r2c;
3845using detail::r2r_fftpack;
3846using detail::r2r_separable_hartley;
3847using detail::r2r_genuine_hartley;
3848using detail::r2r_separable_fht;
3849using detail::r2r_genuine_fht;
3850using detail::dct;
3851using detail::dst;
3852
3853} // namespace pocketfft
3854
3855#undef POCKETFFT_NOINLINE
3856#undef POCKETFFT_RESTRICT
3857
3858#endif // POCKETFFT_HDRONLY_H
POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, int type, bool cosine) const
POCKETFFT_NOINLINE T_dcst23(vcl_size_t length)
POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool, int, bool cosine) const
POCKETFFT_NOINLINE T_dcst4(vcl_size_t length)
std::unique_ptr< pocketfft_c< T0 > > fft
std::unique_ptr< pocketfft_r< T0 > > rfft
POCKETFFT_NOINLINE T_dct1(vcl_size_t length)
POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, int, bool) const
POCKETFFT_NOINLINE T_dst1(vcl_size_t length)
POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool, int, bool) const
vcl_size_t shape(vcl_size_t i) const
arr_info(shape_t shape_, stride_t stride_)
const ptrdiff_t & stride(vcl_size_t i) const
static T * ralloc(vcl_size_t num)
const T & operator[](vcl_size_t idx) const
void passg(vcl_size_t ido, vcl_size_t ip, vcl_size_t l1, T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa, const cmplx< T0 > *POCKETFFT_RESTRICT csarr) const
void pass2(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
POCKETFFT_NOINLINE void factorize()
void pass11(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
void pass7(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
void pass3(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
POCKETFFT_NOINLINE cfftp(vcl_size_t length_)
void pass8(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
void exec(T c[], T0 fct, bool fwd) const
void pass4(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
void pass5(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
const T & operator[](ptrdiff_t ofs) const
cndarr(const void *data_, const shape_t &shape_, const stride_t &stride_)
void exec(cmplx< T > c[], T0 fct, bool fwd) const
void fft(cmplx< T > c[], T0 fct) const
POCKETFFT_NOINLINE fftblue(vcl_size_t length)
void exec_r(T c[], T0 fct, bool fwd)
ptrdiff_t oofs(vcl_size_t j, vcl_size_t i) const
ptrdiff_t iofs(vcl_size_t j, vcl_size_t i) const
multi_iter(const arr_info &iarr_, const arr_info &oarr_, vcl_size_t idim_)
ndarr(void *data_, const shape_t &shape_, const stride_t &stride_)
std::unique_ptr< fftblue< T0 > > blueplan
POCKETFFT_NOINLINE void exec(cmplx< T > c[], T0 fct, bool fwd) const
std::unique_ptr< cfftp< T0 > > packplan
POCKETFFT_NOINLINE pocketfft_c(vcl_size_t length)
POCKETFFT_NOINLINE pocketfft_r(vcl_size_t length)
std::unique_ptr< rfftp< T0 > > packplan
POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool fwd) const
std::unique_ptr< fftblue< T0 > > blueplan
rev_iter(const arr_info &arr_, const shape_t &axes)
void copy_and_norm(T *c, T *p1, T0 fct) const
void exec(T c[], T0 fct, bool r2hc) const
void radf5(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radf4(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radb3(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radf3(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radb4(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radf2(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radb2(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radb5(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radbg(vcl_size_t ido, vcl_size_t ip, vcl_size_t l1, T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa, const T0 *POCKETFFT_RESTRICT csarr) const
void MULPM(T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) const
void radfg(vcl_size_t ido, vcl_size_t ip, vcl_size_t l1, T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa, const T0 *POCKETFFT_RESTRICT csarr) const
POCKETFFT_NOINLINE rfftp(vcl_size_t length_)
static cmplx< Thigh > calc(vcl_size_t x, vcl_size_t n, Thigh ang)
typename std::conditional<(sizeof(T)>sizeof(double)), T, double >::type Thigh
POCKETFFT_NOINLINE sincos_2pibyn(vcl_size_t n)
cmplx< T > operator[](vcl_size_t idx) const
concurrent_queue< std::function< void()> > overflow_work_
std::vector< worker, aligned_allocator< worker > > workers_
void thread_map(vcl_size_t nthreads, Func f)
void r2c(const shape_t &shape_in, const stride_t &stride_in, const stride_t &stride_out, vcl_size_t axis, bool forward, const T *data_in, std::complex< T > *data_out, T fct, vcl_size_t nthreads=1)
void dst(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, int type, const T *data_in, T *data_out, T fct, bool ortho, vcl_size_t nthreads=1)
POCKETFFT_NOINLINE void general_r2c(const cndarr< T > &in, ndarr< cmplx< T > > &out, vcl_size_t axis, bool forward, T fct, vcl_size_t nthreads)
void c2c(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, bool forward, const std::complex< T > *data_in, std::complex< T > *data_out, T fct, vcl_size_t nthreads=1)
void * aligned_alloc(vcl_size_t align, vcl_size_t size)
cmplx< T > conj(const cmplx< T > &a)
std::vector< vcl_size_t > shape_t
void r2r_genuine_fht(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
typename add_vec< T >::type add_vec_t
void copy_output(const multi_iter< vlen > &it, const cmplx< vtype_t< T > > *POCKETFFT_RESTRICT src, ndarr< cmplx< T > > &dst)
void dct(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, int type, const T *data_in, T *data_out, T fct, bool ortho, vcl_size_t nthreads=1)
arr< char > alloc_tmp(const shape_t &shape, vcl_size_t axsize, vcl_size_t elemsize)
void copy_hartley(const multi_iter< vlen > &it, const vtype_t< T > *POCKETFFT_RESTRICT src, ndarr< T > &dst)
void r2r_separable_hartley(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
void r2r_fftpack(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, bool real2hermitian, bool forward, const T *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
std::shared_ptr< T > get_plan(vcl_size_t length)
POCKETFFT_NOINLINE void general_c2r(const cndarr< cmplx< T > > &in, ndarr< T > &out, vcl_size_t axis, bool forward, T fct, vcl_size_t nthreads)
POCKETFFT_NOINLINE void general_nd(const cndarr< T > &in, ndarr< T > &out, const shape_t &axes, T0 fct, vcl_size_t nthreads, const Exec &exec, const bool allow_inplace=true)
void c2r(const shape_t &shape_out, const stride_t &stride_in, const stride_t &stride_out, vcl_size_t axis, bool forward, const std::complex< T > *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
void PM(T &a, T &b, T c, T d)
void r2r_separable_fht(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
void copy_FHT(const multi_iter< vlen > &it, const vtype_t< T > *POCKETFFT_RESTRICT src, ndarr< T > &dst)
typename VTYPE< T >::type vtype_t
void special_mul(const cmplx< T > &v1, const cmplx< T2 > &v2, cmplx< T > &res)
std::vector< ptrdiff_t > stride_t
void r2r_genuine_hartley(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
void copy_input(const multi_iter< vlen > &it, const cndarr< cmplx< T > > &src, cmplx< vtype_t< T > > *POCKETFFT_RESTRICT dst)
STL namespace.
#define POCKETFFT_PARTSTEP5a(u1, u2, twar, twbr, twai, twbi)
#define POCKETFFT_PARTSTEP11(u1, u2, x1, x2, x3, x4, x5, y1, y2, y3, y4, y5)
#define POCKETFFT_PARTSTEP7a(u1, u2, x1, x2, x3, y1, y2, y3)
#define POCKETFFT_PARTSTEP3b(u1, u2, twr, twi)
#define POCKETFFT_PREP11(idx)
#define POCKETFFT_NOINLINE
#define POCKETFFT_PREP5(idx)
#define POCKETFFT_CACHE_SIZE
#define POCKETFFT_PARTSTEP5b(u1, u2, twar, twbr, twai, twbi)
#define POCKETFFT_RESTRICT
#define POCKETFFT_PARTSTEP3a(u1, u2, twr, twi)
#define POCKETFFT_PARTSTEP11a(u1, u2, x1, x2, x3, x4, x5, y1, y2, y3, y4, y5)
#define POCKETFFT_REARRANGE(rx, ix, ry, iy)
#define POCKETFFT_PREP3(idx)
#define POCKETFFT_PREP7(idx)
#define POCKETFFT_PARTSTEP7(u1, u2, x1, x2, x3, y1, y2, y3)
static constexpr vcl_size_t val
auto special_mul(const cmplx< T2 > &other) const -> cmplx< decltype(r+other.r)>
void worker_main(std::atomic< bool > &shutdown_flag, std::atomic< vcl_size_t > &unscheduled_tasks, concurrent_queue< std::function< void()> > &overflow_work)
static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace)
static POCKETFFT_NOINLINE vcl_size_t prev_good_size_cmplx(vcl_size_t n)
static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace, const shape_t &axes)
static POCKETFFT_NOINLINE UIntT prev_good_size_cmplx_typed(UIntT n)
static POCKETFFT_NOINLINE vcl_size_t good_size_cmplx(vcl_size_t n)
static vcl_size_t prod(const shape_t &shape)
static POCKETFFT_NOINLINE double cost_guess(vcl_size_t n)
static POCKETFFT_NOINLINE vcl_size_t largest_prime_factor(vcl_size_t n)
static POCKETFFT_NOINLINE vcl_size_t good_size_real(vcl_size_t n, vcl_size_t required_factor)
static POCKETFFT_NOINLINE vcl_size_t good_size_real(vcl_size_t n)
static POCKETFFT_NOINLINE vcl_size_t good_size_cmplx(vcl_size_t n, vcl_size_t required_factor)
static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace, vcl_size_t axis)
static POCKETFFT_NOINLINE vcl_size_t prev_good_size_real(vcl_size_t n)
static vcl_size_t thread_count(vcl_size_t nthreads, const shape_t &shape, vcl_size_t axis, vcl_size_t vlen)
static POCKETFFT_NOINLINE UIntT good_size_cmplx_typed(UIntT n)
static POCKETFFT_NOINLINE UIntT prev_good_size_real_typed(UIntT n)
static POCKETFFT_NOINLINE UIntT good_size_real_typed(UIntT n)