45#ifndef POCKETFFT_HDRONLY_H
46#define POCKETFFT_HDRONLY_H
49#error This file is C++ and requires a C++ compiler.
52#if !(__cplusplus >= 201103L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201103L))
53#error This file requires at least C++11 support.
56#ifndef POCKETFFT_NAMESPACE
57#define POCKETFFT_NAMESPACE pocketfft
60#ifndef POCKETFFT_CACHE_SIZE
66#define POCKETFFT_CACHE_SIZE 16
80#if POCKETFFT_CACHE_SIZE!=0
85#ifndef POCKETFFT_NO_MULTITHREADING
87#include <condition_variable>
94#ifdef POCKETFFT_PTHREADS
100#define POCKETFFT_NOINLINE __attribute__((noinline))
101#define POCKETFFT_RESTRICT __restrict__
102#elif defined(_MSC_VER)
103#define POCKETFFT_NOINLINE __declspec(noinline)
104#define POCKETFFT_RESTRICT __restrict
106#define POCKETFFT_NOINLINE
107#define POCKETFFT_RESTRICT
117template <
typename T> T
cos(T) =
delete;
118template <
typename T> T
sin(T) =
delete;
119template <
typename T> T
sqrt(T) =
delete;
128#ifndef POCKETFFT_NO_VECTORS
129#define POCKETFFT_NO_VECTORS
130#if defined(__INTEL_COMPILER)
132#elif defined(__clang__)
134#ifdef __apple_build_version__
135# if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1)
136# undef POCKETFFT_NO_VECTORS
138#elif __clang_major__ >= 5
139# undef POCKETFFT_NO_VECTORS
141#elif defined(__GNUC__)
143#undef POCKETFFT_NO_VECTORS
148template<
typename T>
struct VLEN {
static constexpr size_t val=1; };
150#ifndef POCKETFFT_NO_VECTORS
151#if (defined(__AVX512F__))
152template<>
struct VLEN<float> {
static constexpr size_t val=16; };
153template<>
struct VLEN<double> {
static constexpr size_t val=8; };
154#elif (defined(__AVX__))
155template<>
struct VLEN<float> {
static constexpr size_t val=8; };
156template<>
struct VLEN<double> {
static constexpr size_t val=4; };
157#elif (defined(__SSE2__))
158template<>
struct VLEN<float> {
static constexpr size_t val=4; };
159template<>
struct VLEN<double> {
static constexpr size_t val=2; };
160#elif (defined(__VSX__))
161template<>
struct VLEN<float> {
static constexpr size_t val=4; };
162template<>
struct VLEN<double> {
static constexpr size_t val=2; };
163#elif (defined(__ARM_NEON__) || defined(__ARM_NEON))
164template<>
struct VLEN<float> {
static constexpr size_t val=4; };
165template<>
struct VLEN<double> {
static constexpr size_t val=2; };
166#elif (defined(__riscv_vector) || defined(__riscv_v_intrinsic))
167template<>
struct VLEN<float> {
static constexpr size_t val=8; };
168template<>
struct VLEN<double> {
static constexpr size_t val=4; };
170#define POCKETFFT_NO_VECTORS
177#if defined(POCKETFFT_USE_POSIX_MEMALIGN) && (defined(__APPLE__) || defined(__unix__))
183 align = std::max(align,
sizeof(
void*));
185 if (posix_memalign(&ptr, align, size) != 0)
186 throw std::bad_alloc();
197 void *ptr = ::aligned_alloc(align,(size+align-1)&(~(align-1)));
198 if (!ptr)
throw std::bad_alloc();
206 align = std::max(align,
alignof(max_align_t));
207 void *ptr = malloc(size+align);
208 if (!ptr)
throw std::bad_alloc();
209 void *res =
reinterpret_cast<void *
>
210 ((
reinterpret_cast<uintptr_t
>(ptr) & ~(uintptr_t(align-1))) + uintptr_t(align));
211 (
reinterpret_cast<void**
>(res))[-1] = ptr;
215 {
if (ptr) free((
reinterpret_cast<void**
>(ptr))[-1]); }
219template<
typename T>
class arr
225#if defined(POCKETFFT_NO_VECTORS)
228 if (num==0)
return nullptr;
229 void *res = malloc(num*
sizeof(T));
230 if (!res)
throw std::bad_alloc();
231 return reinterpret_cast<T *
>(res);
236 static T *ralloc(
size_t num)
238 if (num==0)
return nullptr;
240 return static_cast<T*
>(ptr);
242 static void dealloc(T *ptr)
243 { aligned_dealloc(ptr); }
250 :
p(other.p),
sz(other.sz)
251 { other.p=
nullptr; other.sz=0; }
275 void Set(T r_, T i_) {
r=r_;
i=i_; }
278 {
r+=other.
r;
i+=other.
i;
return *
this; }
279 template<
typename T2>
cmplx &operator*= (T2 other)
280 {
r*=other;
i*=other;
return *
this; }
283 T tmp =
r*other.
r -
i*other.
i;
284 i =
r*other.
i +
i*other.
r;
289 {
r+=other.
r;
i+=other.
i;
return *
this; }
291 {
r-=other.
r;
i-=other.
i;
return *
this; }
292 template<
typename T2>
auto operator* (
const T2 &other)
const
293 ->
cmplx<
decltype(
r*other)>
294 {
return {
r*other,
i*other}; }
295 template<
typename T2>
auto operator+ (
const cmplx<T2> &other)
const
296 ->
cmplx<
decltype(
r+other.r)>
297 {
return {
r+other.r,
i+other.i}; }
298 template<
typename T2>
auto operator- (
const cmplx<T2> &other)
const
299 ->
cmplx<
decltype(
r+other.r)>
300 {
return {
r-other.r,
i-other.i}; }
301 template<
typename T2>
auto operator* (
const cmplx<T2> &other)
const
302 ->
cmplx<
decltype(
r+other.r)>
303 {
return {
r*other.r-
i*other.i,
r*other.i +
i*other.r}; }
305 ->
cmplx<
decltype(
r+other.r)>
307 using Tres =
cmplx<
decltype(
r+other.r)>;
308 return fwd ? Tres(
r*other.r+
i*other.i,
i*other.r-
r*other.i)
309 : Tres(
r*other.r-
i*other.i,
r*other.i+
i*other.r);
312template<
typename T>
inline void PM(T &a, T &b, T c, T d)
315 { T t = a; a+=b; b=t-b; }
317 { T t = a; a-=b; b=t+b; }
319 {
return {a.
r, -a.
i}; }
327 {
auto tmp_=a.
r; a.
r=-a.
i; a.
i=tmp_; }
329 {
auto tmp_= fwd ? -a.
r : a.
r; a.
r = fwd ? a.
i : -a.
i; a.
i=tmp_; }
337 using Thigh =
typename std::conditional<(
sizeof(T)>
sizeof(double)), T,
double>::type;
379 constexpr auto pi = 3.141592653589793238462643383279502884197L;
381 size_t nval = (n+2)/2;
387 for (
size_t i=1; i<
v1.size(); ++i)
391 for (
size_t i=1; i<
v2.size(); ++i)
400 return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r));
404 return cmplx<T>(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r));
415 for (
size_t x=3; x*x<=n; x+=2)
424 constexpr double lfp=1.1;
428 { result+=2; n>>=1; }
429 for (
size_t x=3; x*x<=n; x+=2)
432 result+= (x<=5) ?
double(x) : lfp*double(x);
435 if (n>1) result+=(n<=5) ?
double(n) : lfp*double(n);
436 return result*double(ni);
440 template<
typename UIntT>
443 static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
444 "type must be unsigned integer");
446 if (n>std::numeric_limits<UIntT>::max()/11/2)
449 if (
sizeof(UIntT)<
sizeof(std::uint64_t))
453 if (res<=std::numeric_limits<UIntT>::max())
454 return static_cast<UIntT
>(res);
457 throw std::runtime_error(
"FFT size is too large.");
461 for (UIntT f11=1; f11<bestfac; f11*=11)
462 for (UIntT f117=f11; f117<bestfac; f117*=7)
463 for (UIntT f1175=f117; f1175<bestfac; f1175*=5)
473 if (x<bestfac) bestfac=x;
491 size_t required_factor)
493 if (required_factor<1)
494 throw std::runtime_error(
"required factor must not be 0");
495 return good_size_cmplx((n+required_factor-1)/required_factor) * required_factor;
499 template<
typename UIntT>
502 static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
503 "type must be unsigned integer");
505 if (n>std::numeric_limits<UIntT>::max()/5/2)
508 if (
sizeof(UIntT)<
sizeof(std::uint64_t))
512 if (res<=std::numeric_limits<UIntT>::max())
513 return static_cast<UIntT
>(res);
516 throw std::runtime_error(
"FFT size is too large.");
520 for (UIntT f5=1; f5<bestfac; f5*=5)
530 if (x<bestfac) bestfac=x;
548 size_t required_factor)
550 if (required_factor<1)
551 throw std::runtime_error(
"required factor must not be 0");
552 return good_size_real((n+required_factor-1)/required_factor) * required_factor;
556 template<
typename UIntT>
559 static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
560 "type must be unsigned integer");
562 if (n>std::numeric_limits<UIntT>::max()/11)
565 if (
sizeof(UIntT)<
sizeof(std::uint64_t))
569 if (res<=std::numeric_limits<UIntT>::max())
570 return static_cast<UIntT
>(res);
573 throw std::runtime_error(
"FFT size is too large.");
577 for (UIntT f11 = 1;f11 <= n; f11 *= 11)
578 for (UIntT f117 = f11; f117 <= n; f117 *= 7)
579 for (UIntT f1175 = f117; f1175 <= n; f1175 *= 5)
582 while (x*2 <= n) x *= 2;
583 if (x > bestfound) bestfound = x;
586 if (x * 3 <= n) x *= 3;
587 else if (x % 2 == 0) x /= 2;
590 if (x > bestfound) bestfound = x;
602 template<
typename UIntT>
605 static_assert(std::numeric_limits<UIntT>::is_integer && (!std::numeric_limits<UIntT>::is_signed),
606 "type must be unsigned integer");
608 if (n>std::numeric_limits<UIntT>::max()/5)
611 if (
sizeof(UIntT)<
sizeof(std::uint64_t))
615 if (res<=std::numeric_limits<UIntT>::max())
616 return static_cast<UIntT
>(res);
619 throw std::runtime_error(
"FFT size is too large.");
623 for (UIntT f5 = 1; f5 <= n; f5 *= 5)
626 while (x*2 <= n) x *= 2;
627 if (x > bestfound) bestfound = x;
630 if (x * 3 <= n) x *= 3;
631 else if (x % 2 == 0) x /= 2;
634 if (x > bestfound) bestfound = x;
656 auto ndim = shape.size();
657 if (ndim<1)
throw std::runtime_error(
"ndim must be >= 1");
658 if ((stride_in.size()!=ndim) || (stride_out.size()!=ndim))
659 throw std::runtime_error(
"stride dimension mismatch");
660 if (inplace && (stride_in!=stride_out))
661 throw std::runtime_error(
"stride mismatch");
669 auto ndim = shape.size();
673 if (ax>=ndim)
throw std::invalid_argument(
"bad axis number");
674 if (++tmp[ax]>1)
throw std::invalid_argument(
"axis specified repeatedly");
683 if (axis>=shape.size())
throw std::invalid_argument(
"bad axis number");
686#ifdef POCKETFFT_NO_MULTITHREADING
687 static size_t thread_count (
size_t ,
const shape_t &,
692 size_t axis,
size_t vlen)
694 if (nthreads==1)
return 1;
695 size_t size =
prod(shape);
696 size_t parallel = size / (shape[axis] * vlen);
697 if (shape[axis] < 1000)
699 size_t max_threads = nthreads == 0 ?
700 std::thread::hardware_concurrency() : nthreads;
701 return std::max(
size_t(1), std::min(parallel, max_threads));
708#ifdef POCKETFFT_NO_MULTITHREADING
710constexpr inline size_t thread_id() {
return 0; }
713template <
typename Func>
721 static thread_local size_t thread_id_=0;
726 static thread_local size_t num_threads_=1;
729static const size_t max_threads = std::max(1u, std::thread::hardware_concurrency());
736 using lock_t = std::unique_lock<std::mutex>;
762 using lock_t = std::lock_guard<std::mutex>;
770 q_.push(std::move(val));
775 if (
size_ == 0)
return false;
778 if (
q_.empty())
return false;
780 val = std::move(
q_.front());
800 return static_cast<T*
>(mem);
820 std::atomic<bool> &shutdown_flag,
821 std::atomic<size_t> &unscheduled_tasks,
824 using lock_t_inner = std::unique_lock<std::mutex>;
825 bool expect_work =
true;
826 while (!shutdown_flag || expect_work)
828 std::function<void()> local_work;
829 if (expect_work || unscheduled_tasks == 0)
831 lock_t_inner lock(
mut);
834 local_work.swap(
work);
838 bool marked_busy =
false;
845 if (!overflow_work.empty())
847 if (!marked_busy &&
busy_flag.test_and_set())
854 while (overflow_work.try_pop(local_work))
868 std::vector<worker, aligned_allocator<worker>>
workers_;
871 using lock_t = std::lock_guard<std::mutex>;
877 for (
size_t i=0; i<nthreads; ++i)
882 worker_i->busy_flag.clear();
883 worker_i->work =
nullptr;
884 worker_i->thread = std::thread([worker_i,
this]
901 worker_i.work_ready.notify_all();
904 if (worker_i.thread.joinable())
905 worker_i.thread.join();
921 throw std::runtime_error(
"Work item submitted after shutdown");
927 if (!worker_i.busy_flag.test_and_set())
931 lock_t lock_inner(worker_i.mut);
932 worker_i.work = std::move(work);
934 worker_i.work_ready.notify_one();
958#ifdef POCKETFFT_PTHREADS
959 static std::once_flag f;
974template <
typename Func>
984 latch counter(nthreads);
985 std::exception_ptr ex;
987 for (
size_t i=0; i<nthreads; ++i)
990 [&f, &counter, &ex, &ex_mut, i, nthreads] {
996 std::lock_guard<std::mutex> lock(ex_mut);
997 ex = std::current_exception();
1004 std::rethrow_exception(ex);
1029 {
fact.push_back({factor,
nullptr,
nullptr}); }
1031template<
bool fwd,
typename T>
void pass2 (
size_t ido,
size_t l1,
1035 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
1036 {
return ch[a+ido*(b+l1*c)]; };
1037 auto CC = [cc,ido](
size_t a,
size_t b,
size_t c) ->
const T&
1038 {
return cc[a+ido*(b+2*c)]; };
1039 auto WA = [wa, ido](
size_t x,
size_t i)
1040 {
return wa[i-1+x*(ido-1)]; };
1043 for (
size_t k=0; k<l1; ++k)
1045 CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
1046 CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
1049 for (
size_t k=0; k<l1; ++k)
1051 CH(0,k,0) = CC(0,0,k)+CC(0,1,k);
1052 CH(0,k,1) = CC(0,0,k)-CC(0,1,k);
1053 for (
size_t i=1; i<ido; ++i)
1055 CH(i,k,0) = CC(i,0,k)+CC(i,1,k);
1061#define POCKETFFT_PREP3(idx) \
1062 T t0 = CC(idx,0,k), t1, t2; \
1063 PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \
1065#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \
1068 T cb{-t2.i*twi, t2.r*twi}; \
1069 PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\
1071#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \
1074 T cb{-t2.i*twi, t2.r*twi}; \
1075 special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
1076 special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
1078template<
bool fwd,
typename T>
void pass3 (
size_t ido,
size_t l1,
1082 constexpr T0 tw1r=-0.5,
1083 tw1i= (fwd ? -1: 1) * T0(0.8660254037844386467637231707529362L);
1085 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
1086 {
return ch[a+ido*(b+l1*c)]; };
1087 auto CC = [cc,ido](
size_t a,
size_t b,
size_t c) ->
const T&
1088 {
return cc[a+ido*(b+3*c)]; };
1089 auto WA = [wa, ido](
size_t x,
size_t i)
1090 {
return wa[i-1+x*(ido-1)]; };
1093 for (
size_t k=0; k<l1; ++k)
1099 for (
size_t k=0; k<l1; ++k)
1105 for (
size_t i=1; i<ido; ++i)
1113#undef POCKETFFT_PARTSTEP3b
1114#undef POCKETFFT_PARTSTEP3a
1115#undef POCKETFFT_PREP3
1117template<
bool fwd,
typename T>
void pass4 (
size_t ido,
size_t l1,
1121 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
1122 {
return ch[a+ido*(b+l1*c)]; };
1123 auto CC = [cc,ido](
size_t a,
size_t b,
size_t c) ->
const T&
1124 {
return cc[a+ido*(b+4*c)]; };
1125 auto WA = [wa, ido](
size_t x,
size_t i)
1126 {
return wa[i-1+x*(ido-1)]; };
1129 for (
size_t k=0; k<l1; ++k)
1132 PM(t2,t1,CC(0,0,k),CC(0,2,k));
1133 PM(t3,t4,CC(0,1,k),CC(0,3,k));
1135 PM(CH(0,k,0),CH(0,k,2),t2,t3);
1136 PM(CH(0,k,1),CH(0,k,3),t1,t4);
1139 for (
size_t k=0; k<l1; ++k)
1143 PM(t2,t1,CC(0,0,k),CC(0,2,k));
1144 PM(t3,t4,CC(0,1,k),CC(0,3,k));
1146 PM(CH(0,k,0),CH(0,k,2),t2,t3);
1147 PM(CH(0,k,1),CH(0,k,3),t1,t4);
1149 for (
size_t i=1; i<ido; ++i)
1152 T cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
1164#define POCKETFFT_PREP5(idx) \
1165 T t0 = CC(idx,0,k), t1, t2, t3, t4; \
1166 PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \
1167 PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \
1168 CH(idx,k,0).r=t0.r+t1.r+t2.r; \
1169 CH(idx,k,0).i=t0.i+t1.i+t2.i;
1171#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
1174 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
1175 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
1176 cb.i=twai*t4.r twbi*t3.r; \
1177 cb.r=-(twai*t4.i twbi*t3.i); \
1178 PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \
1181#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
1184 ca.r=t0.r+twar*t1.r+twbr*t2.r; \
1185 ca.i=t0.i+twar*t1.i+twbr*t2.i; \
1186 cb.i=twai*t4.r twbi*t3.r; \
1187 cb.r=-(twai*t4.i twbi*t3.i); \
1188 special_mul<fwd>(ca+cb,WA(u1-1,i),CH(i,k,u1)); \
1189 special_mul<fwd>(ca-cb,WA(u2-1,i),CH(i,k,u2)); \
1191template<
bool fwd,
typename T>
void pass5 (
size_t ido,
size_t l1,
1195 constexpr T0 tw1r= T0(0.3090169943749474241022934171828191L),
1196 tw1i= (fwd ? -1: 1) * T0(0.9510565162951535721164393333793821L),
1197 tw2r= T0(-0.8090169943749474241022934171828191L),
1198 tw2i= (fwd ? -1: 1) * T0(0.5877852522924731291687059546390728L);
1200 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
1201 {
return ch[a+ido*(b+l1*c)]; };
1202 auto CC = [cc,ido](
size_t a,
size_t b,
size_t c) ->
const T&
1203 {
return cc[a+ido*(b+5*c)]; };
1204 auto WA = [wa, ido](
size_t x,
size_t i)
1205 {
return wa[i-1+x*(ido-1)]; };
1208 for (
size_t k=0; k<l1; ++k)
1215 for (
size_t k=0; k<l1; ++k)
1222 for (
size_t i=1; i<ido; ++i)
1231#undef POCKETFFT_PARTSTEP5b
1232#undef POCKETFFT_PARTSTEP5a
1233#undef POCKETFFT_PREP5
1235#define POCKETFFT_PREP7(idx) \
1236 T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
1237 PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \
1238 PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \
1239 PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \
1240 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
1241 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
1243#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
1246 ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
1247 ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
1248 cb.i=y1*t7.r y2*t6.r y3*t5.r; \
1249 cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
1250 PM(out1,out2,ca,cb); \
1252#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
1253 POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
1254#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
1257 POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
1258 special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
1259 special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
1262template<
bool fwd,
typename T>
void pass7(
size_t ido,
size_t l1,
1266 constexpr T0 tw1r= T0(0.6234898018587335305250048840042398L),
1267 tw1i= (fwd ? -1 : 1) * T0(0.7818314824680298087084445266740578L),
1268 tw2r= T0(-0.2225209339563144042889025644967948L),
1269 tw2i= (fwd ? -1 : 1) * T0(0.9749279121818236070181316829939312L),
1270 tw3r= T0(-0.9009688679024191262361023195074451L),
1271 tw3i= (fwd ? -1 : 1) * T0(0.433883739117558120475768332848359L);
1273 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
1274 {
return ch[a+ido*(b+l1*c)]; };
1275 auto CC = [cc,ido](
size_t a,
size_t b,
size_t c) ->
const T&
1276 {
return cc[a+ido*(b+7*c)]; };
1277 auto WA = [wa, ido](
size_t x,
size_t i)
1278 {
return wa[i-1+x*(ido-1)]; };
1281 for (
size_t k=0; k<l1; ++k)
1289 for (
size_t k=0; k<l1; ++k)
1297 for (
size_t i=1; i<ido; ++i)
1307#undef POCKETFFT_PARTSTEP7
1308#undef POCKETFFT_PARTSTEP7a0
1309#undef POCKETFFT_PARTSTEP7a
1310#undef POCKETFFT_PREP7
1312template <
bool fwd,
typename T>
void ROTX45(T &a)
const
1314 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1316 {
auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); }
1318 {
auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); }
1320template <
bool fwd,
typename T>
void ROTX135(T &a)
const
1322 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1324 {
auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); }
1326 {
auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); }
1329template<
bool fwd,
typename T>
void pass8 (
size_t ido,
size_t l1,
1333 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
1334 {
return ch[a+ido*(b+l1*c)]; };
1335 auto CC = [cc,ido](
size_t a,
size_t b,
size_t c) ->
const T&
1336 {
return cc[a+ido*(b+8*c)]; };
1337 auto WA = [wa, ido](
size_t x,
size_t i)
1338 {
return wa[i-1+x*(ido-1)]; };
1341 for (
size_t k=0; k<l1; ++k)
1343 T a0, a1, a2, a3, a4, a5, a6, a7;
1344 PM(a1,a5,CC(0,1,k),CC(0,5,k));
1345 PM(a3,a7,CC(0,3,k),CC(0,7,k));
1354 PM(a0,a4,CC(0,0,k),CC(0,4,k));
1355 PM(a2,a6,CC(0,2,k),CC(0,6,k));
1356 PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
1357 PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
1359 PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
1360 PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
1363 for (
size_t k=0; k<l1; ++k)
1366 T a0, a1, a2, a3, a4, a5, a6, a7;
1367 PM(a1,a5,CC(0,1,k),CC(0,5,k));
1368 PM(a3,a7,CC(0,3,k),CC(0,7,k));
1377 PM(a0,a4,CC(0,0,k),CC(0,4,k));
1378 PM(a2,a6,CC(0,2,k),CC(0,6,k));
1379 PM(CH(0,k,0),CH(0,k,4),a0+a2,a1);
1380 PM(CH(0,k,2),CH(0,k,6),a0-a2,a3);
1382 PM(CH(0,k,1),CH(0,k,5),a4+a6,a5);
1383 PM(CH(0,k,3),CH(0,k,7),a4-a6,a7);
1385 for (
size_t i=1; i<ido; ++i)
1387 T a0, a1, a2, a3, a4, a5, a6, a7;
1388 PM(a1,a5,CC(i,1,k),CC(i,5,k));
1389 PM(a3,a7,CC(i,3,k),CC(i,7,k));
1396 PM(a0,a4,CC(i,0,k),CC(i,4,k));
1397 PM(a2,a6,CC(i,2,k),CC(i,6,k));
1414#define POCKETFFT_PREP11(idx) \
1415 T t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
1416 PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \
1417 PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \
1418 PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \
1419 PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \
1420 PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \
1421 CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
1422 CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
1424#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
1426 T ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \
1428 cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
1429 cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
1430 PM(out1,out2,ca,cb); \
1432#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
1433 POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
1434#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
1437 POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
1438 special_mul<fwd>(da,WA(u1-1,i),CH(i,k,u1)); \
1439 special_mul<fwd>(db,WA(u2-1,i),CH(i,k,u2)); \
1442template<
bool fwd,
typename T>
void pass11 (
size_t ido,
size_t l1,
1446 constexpr T0 tw1r= T0(0.8412535328311811688618116489193677L),
1447 tw1i= (fwd ? -1 : 1) * T0(0.5406408174555975821076359543186917L),
1448 tw2r= T0(0.4154150130018864255292741492296232L),
1449 tw2i= (fwd ? -1 : 1) * T0(0.9096319953545183714117153830790285L),
1450 tw3r= T0(-0.1423148382732851404437926686163697L),
1451 tw3i= (fwd ? -1 : 1) * T0(0.9898214418809327323760920377767188L),
1452 tw4r= T0(-0.6548607339452850640569250724662936L),
1453 tw4i= (fwd ? -1 : 1) * T0(0.7557495743542582837740358439723444L),
1454 tw5r= T0(-0.9594929736144973898903680570663277L),
1455 tw5i= (fwd ? -1 : 1) * T0(0.2817325568414296977114179153466169L);
1457 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
1458 {
return ch[a+ido*(b+l1*c)]; };
1459 auto CC = [cc,ido](
size_t a,
size_t b,
size_t c) ->
const T&
1460 {
return cc[a+ido*(b+11*c)]; };
1461 auto WA = [wa, ido](
size_t x,
size_t i)
1462 {
return wa[i-1+x*(ido-1)]; };
1465 for (
size_t k=0; k<l1; ++k)
1468 POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1469 POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1470 POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1471 POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1472 POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1475 for (
size_t k=0; k<l1; ++k)
1479 POCKETFFT_PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1480 POCKETFFT_PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1481 POCKETFFT_PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1482 POCKETFFT_PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1483 POCKETFFT_PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1485 for (
size_t i=1; i<ido; ++i)
1488 POCKETFFT_PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
1489 POCKETFFT_PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
1490 POCKETFFT_PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
1491 POCKETFFT_PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
1492 POCKETFFT_PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
1497#undef POCKETFFT_PARTSTEP11
1498#undef POCKETFFT_PARTSTEP11a0
1499#undef POCKETFFT_PARTSTEP11a
1500#undef POCKETFFT_PREP11
1502template<
bool fwd,
typename T>
void passg (
size_t ido,
size_t ip,
1507 const size_t cdim=ip;
1508 size_t ipph = (ip+1)/2;
1509 size_t idl1 = ido*l1;
1511 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
1512 {
return ch[a+ido*(b+l1*c)]; };
1513 auto CC = [cc,ido,cdim](
size_t a,
size_t b,
size_t c) ->
const T&
1514 {
return cc[a+ido*(b+cdim*c)]; };
1515 auto CX = [cc, ido, l1](
size_t a,
size_t b,
size_t c) -> T&
1516 {
return cc[a+ido*(b+l1*c)]; };
1517 auto CX2 = [cc, idl1](
size_t a,
size_t b) -> T&
1518 {
return cc[a+idl1*b]; };
1519 auto CH2 = [ch, idl1](
size_t a,
size_t b) ->
const T&
1520 {
return ch[a+idl1*b]; };
1524 for (
size_t i=1; i<ip; ++i)
1525 wal[i]=
cmplx<T0>(csarr[i].r,fwd ? -csarr[i].i : csarr[i].i);
1527 for (
size_t k=0; k<l1; ++k)
1528 for (
size_t i=0; i<ido; ++i)
1529 CH(i,k,0) = CC(i,0,k);
1530 for (
size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
1531 for (
size_t k=0; k<l1; ++k)
1532 for (
size_t i=0; i<ido; ++i)
1533 PM(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k));
1534 for (
size_t k=0; k<l1; ++k)
1535 for (
size_t i=0; i<ido; ++i)
1538 for (
size_t j=1; j<ipph; ++j)
1542 for (
size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
1545 for (
size_t ik=0; ik<idl1; ++ik)
1547 CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
1548 CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
1549 CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
1550 CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
1554 size_t j=3, jc=ip-3;
1555 for (; j<ipph-1; j+=2, jc-=2)
1557 iwal+=l;
if (iwal>ip) iwal-=ip;
1559 iwal+=l;
if (iwal>ip) iwal-=ip;
1561 for (
size_t ik=0; ik<idl1; ++ik)
1563 CX2(ik,l).r += CH2(ik,j).r*xwal.
r+CH2(ik,j+1).r*xwal2.
r;
1564 CX2(ik,l).i += CH2(ik,j).i*xwal.
r+CH2(ik,j+1).i*xwal2.
r;
1565 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.
i+CH2(ik,jc-1).i*xwal2.
i;
1566 CX2(ik,lc).i += CH2(ik,jc).r*xwal.
i+CH2(ik,jc-1).r*xwal2.
i;
1569 for (; j<ipph; ++j, --jc)
1571 iwal+=l;
if (iwal>ip) iwal-=ip;
1573 for (
size_t ik=0; ik<idl1; ++ik)
1575 CX2(ik,l).r += CH2(ik,j).r*xwal.
r;
1576 CX2(ik,l).i += CH2(ik,j).i*xwal.
r;
1577 CX2(ik,lc).r -= CH2(ik,jc).i*xwal.
i;
1578 CX2(ik,lc).i += CH2(ik,jc).r*xwal.
i;
1585 for (
size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
1586 for (
size_t ik=0; ik<idl1; ++ik)
1588 T t1=CX2(ik,j), t2=CX2(ik,jc);
1589 PM(CX2(ik,j),CX2(ik,jc),t1,t2);
1593 for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
1594 for (
size_t k=0; k<l1; ++k)
1596 T t1=CX(0,k,j), t2=CX(0,k,jc);
1597 PM(CX(0,k,j),CX(0,k,jc),t1,t2);
1598 for (
size_t i=1; i<ido; ++i)
1601 PM(x1,x2,CX(i,k,j),CX(i,k,jc));
1602 size_t idij=(j-1)*(ido-1)+i-1;
1604 idij=(jc-1)*(ido-1)+i-1;
1611template<
bool fwd,
typename T>
void pass_all(T c[], T0 fct)
const
1613 if (
length==1) { c[0]*=fct;
return; }
1616 T *p1=c, *p2=ch.
data();
1618 for(
size_t k1=0; k1<
fact.size(); k1++)
1620 size_t ip=
fact[k1].fct;
1648 for (
size_t i=0; i<
length; ++i)
1651 std::copy_n (p1,
length, c);
1655 for (
size_t i=0; i<
length; ++i)
1660 template<
typename T>
void exec(T c[], T0 fct,
bool fwd)
const
1676 std::swap(
fact[0].fct,
fact.back().fct);
1678 for (
size_t divisor=3; divisor*divisor<=len; divisor+=2)
1679 while ((len%divisor)==0)
1690 for (
size_t k=0; k<
fact.size(); ++k)
1706 for (
size_t k=0; k<
fact.size(); ++k)
1710 memofs+=(ip-1)*(ido-1);
1711 for (
size_t j=1; j<ip; ++j)
1712 for (
size_t i=1; i<ido; ++i)
1713 fact[k].tw[(j-1)*(ido-1)+i-1] = twiddle[j*l1*i];
1716 fact[k].tws=
mem.data()+memofs;
1718 for (
size_t j=0; j<ip; ++j)
1719 fact[k].tws[j] = twiddle[j*l1*ido];
1729 if (
length==0)
throw std::runtime_error(
"zero-length FFT requested");
1755 {
fact.push_back({factor,
nullptr,
nullptr}); }
1758template<
typename T1,
typename T2,
typename T3>
inline void MULPM
1759 (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f)
const
1760 { a=c*e+d*f; b=c*f-d*e; }
1762template<
typename T>
void radf2 (
size_t ido,
size_t l1,
1766 auto WA = [wa,ido](
size_t x,
size_t i) {
return wa[i+x*(ido-1)]; };
1767 auto CC = [cc,ido,l1](
size_t a,
size_t b,
size_t c) ->
const T&
1768 {
return cc[a+ido*(b+l1*c)]; };
1769 auto CH = [ch,ido](
size_t a,
size_t b,
size_t c) -> T&
1770 {
return ch[a+ido*(b+2*c)]; };
1772 for (
size_t k=0; k<l1; k++)
1773 PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1));
1775 for (
size_t k=0; k<l1; k++)
1777 CH( 0,1,k) = -CC(ido-1,k,1);
1778 CH(ido-1,0,k) = CC(ido-1,k,0);
1781 for (
size_t k=0; k<l1; k++)
1782 for (
size_t i=2; i<ido; i+=2)
1786 MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1787 PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2);
1788 PM (CH(i ,0,k),CH(ic ,1,k),ti2,CC(i ,k,0));
1793#define POCKETFFT_REARRANGE(rx, ix, ry, iy) \
1795 auto t1=rx+ry, t2=ry-rx, t3=ix+iy, t4=ix-iy; \
1796 rx=t1; ix=t3; ry=t4; iy=t2; \
1799template<
typename T>
void radf3(
size_t ido,
size_t l1,
1803 constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
1805 auto WA = [wa,ido](
size_t x,
size_t i) {
return wa[i+x*(ido-1)]; };
1806 auto CC = [cc,ido,l1](
size_t a,
size_t b,
size_t c) ->
const T&
1807 {
return cc[a+ido*(b+l1*c)]; };
1808 auto CH = [ch,ido](
size_t a,
size_t b,
size_t c) -> T&
1809 {
return ch[a+ido*(b+3*c)]; };
1811 for (
size_t k=0; k<l1; k++)
1813 T cr2=CC(0,k,1)+CC(0,k,2);
1814 CH(0,0,k) = CC(0,k,0)+cr2;
1815 CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1816 CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1819 for (
size_t k=0; k<l1; k++)
1820 for (
size_t i=2; i<ido; i+=2)
1823 T di2, di3, dr2, dr3;
1824 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1825 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
1827 CH(i-1,0,k) = CC(i-1,k,0)+dr2;
1828 CH(i ,0,k) = CC(i ,k,0)+di2;
1829 T tr2 = CC(i-1,k,0)+taur*dr2;
1830 T ti2 = CC(i ,k,0)+taur*di2;
1833 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3);
1834 PM(CH(i ,2,k),CH(ic ,1,k),ti3,ti2);
1838template<
typename T>
void radf4(
size_t ido,
size_t l1,
1842 constexpr T0 hsqt2=T0(0.707106781186547524400844362104849L);
1844 auto WA = [wa,ido](
size_t x,
size_t i) {
return wa[i+x*(ido-1)]; };
1845 auto CC = [cc,ido,l1](
size_t a,
size_t b,
size_t c) ->
const T&
1846 {
return cc[a+ido*(b+l1*c)]; };
1847 auto CH = [ch,ido](
size_t a,
size_t b,
size_t c) -> T&
1848 {
return ch[a+ido*(b+4*c)]; };
1850 for (
size_t k=0; k<l1; k++)
1853 PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1));
1854 PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2));
1855 PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1);
1858 for (
size_t k=0; k<l1; k++)
1860 T ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
1861 T tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
1862 PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1);
1863 PM (CH( 0,3,k),CH( 0,1,k),ti1,CC(ido-1,k,2));
1866 for (
size_t k=0; k<l1; k++)
1867 for (
size_t i=2; i<ido; i+=2)
1870 T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1871 MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1872 MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
1873 MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
1874 PM(tr1,tr4,cr4,cr2);
1875 PM(ti1,ti4,ci2,ci4);
1876 PM(tr2,tr3,CC(i-1,k,0),cr3);
1877 PM(ti2,ti3,CC(i ,k,0),ci3);
1878 PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1);
1879 PM(CH(i ,0,k),CH(ic ,3,k),ti1,ti2);
1880 PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4);
1881 PM(CH(i ,2,k),CH(ic ,1,k),tr4,ti3);
1885template<
typename T>
void radf5(
size_t ido,
size_t l1,
1889 constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
1890 ti11= T0(0.9510565162951535721164393333793821L),
1891 tr12= T0(-0.8090169943749474241022934171828191L),
1892 ti12= T0(0.5877852522924731291687059546390728L);
1894 auto WA = [wa,ido](
size_t x,
size_t i) {
return wa[i+x*(ido-1)]; };
1895 auto CC = [cc,ido,l1](
size_t a,
size_t b,
size_t c) ->
const T&
1896 {
return cc[a+ido*(b+l1*c)]; };
1897 auto CH = [ch,ido](
size_t a,
size_t b,
size_t c) -> T&
1898 {
return ch[a+ido*(b+5*c)]; };
1900 for (
size_t k=0; k<l1; k++)
1902 T cr2, cr3, ci4, ci5;
1903 PM (cr2,ci5,CC(0,k,4),CC(0,k,1));
1904 PM (cr3,ci4,CC(0,k,3),CC(0,k,2));
1905 CH(0,0,k)=CC(0,k,0)+cr2+cr3;
1906 CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
1907 CH(0,2,k)=ti11*ci5+ti12*ci4;
1908 CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
1909 CH(0,4,k)=ti12*ci5-ti11*ci4;
1912 for (
size_t k=0; k<l1;++k)
1913 for (
size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
1915 T di2, di3, di4, di5, dr2, dr3, dr4, dr5;
1916 MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1));
1917 MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2));
1918 MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3));
1919 MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4));
1922 CH(i-1,0,k)=CC(i-1,k,0)+dr2+dr3;
1923 CH(i ,0,k)=CC(i ,k,0)+di2+di3;
1924 T tr2=CC(i-1,k,0)+tr11*dr2+tr12*dr3;
1925 T ti2=CC(i ,k,0)+tr11*di2+tr12*di3;
1926 T tr3=CC(i-1,k,0)+tr12*dr2+tr11*dr3;
1927 T ti3=CC(i ,k,0)+tr12*di2+tr11*di3;
1928 T tr5 = ti11*dr5 + ti12*dr4;
1929 T ti5 = ti11*di5 + ti12*di4;
1930 T tr4 = ti12*dr5 - ti11*dr4;
1931 T ti4 = ti12*di5 - ti11*di4;
1932 PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5);
1933 PM(CH(i ,2,k),CH(ic ,1,k),ti5,ti2);
1934 PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4);
1935 PM(CH(i ,4,k),CH(ic ,3,k),ti4,ti3);
1939#undef POCKETFFT_REARRANGE
1941template<
typename T>
void radfg(
size_t ido,
size_t ip,
size_t l1,
1945 const size_t cdim=ip;
1946 size_t ipph=(ip+1)/2;
1947 size_t idl1 = ido*l1;
1949 auto CC = [cc,ido,cdim](
size_t a,
size_t b,
size_t c) -> T&
1950 {
return cc[a+ido*(b+cdim*c)]; };
1951 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) ->
const T&
1952 {
return ch[a+ido*(b+l1*c)]; };
1953 auto C1 = [cc,ido,l1] (
size_t a,
size_t b,
size_t c) -> T&
1954 {
return cc[a+ido*(b+l1*c)]; };
1955 auto C2 = [cc,idl1] (
size_t a,
size_t b) -> T&
1956 {
return cc[a+idl1*b]; };
1957 auto CH2 = [ch,idl1] (
size_t a,
size_t b) -> T&
1958 {
return ch[a+idl1*b]; };
1962 for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
1964 size_t is=(j-1)*(ido-1),
1966 for (
size_t k=0; k<l1; ++k)
1970 for (
size_t i=1; i<=ido-2; i+=2)
1972 T t1=C1(i,k,j ), t2=C1(i+1,k,j ),
1973 t3=C1(i,k,jc), t4=C1(i+1,k,jc);
1974 T x1=wa[idij]*t1 + wa[idij+1]*t2,
1975 x2=wa[idij]*t2 - wa[idij+1]*t1,
1976 x3=wa[idij2]*t3 + wa[idij2+1]*t4,
1977 x4=wa[idij2]*t4 - wa[idij2+1]*t3;
1978 PM(C1(i,k,j),C1(i+1,k,jc),x3,x1);
1979 PM(C1(i+1,k,j),C1(i,k,jc),x2,x4);
1987 for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
1988 for (
size_t k=0; k<l1; ++k)
1994 for (
size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
1996 for (
size_t ik=0; ik<idl1; ++ik)
1998 CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
1999 CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
2002 size_t j=3, jc=ip-3;
2003 for (; j<ipph-3; j+=4,jc-=4)
2005 iang+=l;
if (iang>=ip) iang-=ip;
2006 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2007 iang+=l;
if (iang>=ip) iang-=ip;
2008 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2009 iang+=l;
if (iang>=ip) iang-=ip;
2010 T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
2011 iang+=l;
if (iang>=ip) iang-=ip;
2012 T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
2013 for (
size_t ik=0; ik<idl1; ++ik)
2015 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
2016 +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
2017 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
2018 +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
2021 for (; j<ipph-1; j+=2,jc-=2)
2023 iang+=l;
if (iang>=ip) iang-=ip;
2024 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2025 iang+=l;
if (iang>=ip) iang-=ip;
2026 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2027 for (
size_t ik=0; ik<idl1; ++ik)
2029 CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
2030 CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
2033 for (; j<ipph; ++j,--jc)
2035 iang+=l;
if (iang>=ip) iang-=ip;
2036 T0 ar=csarr[2*iang], ai=csarr[2*iang+1];
2037 for (
size_t ik=0; ik<idl1; ++ik)
2039 CH2(ik,l ) += ar*C2(ik,j );
2040 CH2(ik,lc) += ai*C2(ik,jc);
2044 for (
size_t ik=0; ik<idl1; ++ik)
2045 CH2(ik,0) = C2(ik,0);
2046 for (
size_t j=1; j<ipph; ++j)
2047 for (
size_t ik=0; ik<idl1; ++ik)
2048 CH2(ik,0) += C2(ik,j);
2053 for (
size_t k=0; k<l1; ++k)
2054 for (
size_t i=0; i<ido; ++i)
2055 CC(i,0,k) = CH(i,k,0);
2057 for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
2060 for (
size_t k=0; k<l1; ++k)
2062 CC(ido-1,j2,k) = CH(0,k,j);
2063 CC(0,j2+1,k) = CH(0,k,jc);
2069 for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
2072 for(
size_t k=0; k<l1; ++k)
2073 for(
size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)
2075 CC(i ,j2+1,k) = CH(i ,k,j )+CH(i ,k,jc);
2076 CC(ic ,j2 ,k) = CH(i ,k,j )-CH(i ,k,jc);
2077 CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
2078 CC(ic+1,j2 ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
2083template<
typename T>
void radb2(
size_t ido,
size_t l1,
2087 auto WA = [wa,ido](
size_t x,
size_t i) {
return wa[i+x*(ido-1)]; };
2088 auto CC = [cc,ido](
size_t a,
size_t b,
size_t c) ->
const T&
2089 {
return cc[a+ido*(b+2*c)]; };
2090 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
2091 {
return ch[a+ido*(b+l1*c)]; };
2093 for (
size_t k=0; k<l1; k++)
2094 PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k));
2096 for (
size_t k=0; k<l1; k++)
2098 CH(ido-1,k,0) = 2*CC(ido-1,0,k);
2099 CH(ido-1,k,1) =-2*CC(0 ,1,k);
2102 for (
size_t k=0; k<l1;++k)
2103 for (
size_t i=2; i<ido; i+=2)
2107 PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k));
2108 PM (ti2,CH(i ,k,0),CC(i ,0,k),CC(ic ,1,k));
2109 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2);
2113template<
typename T>
void radb3(
size_t ido,
size_t l1,
2117 constexpr T0 taur=-0.5, taui=T0(0.8660254037844386467637231707529362L);
2119 auto WA = [wa,ido](
size_t x,
size_t i) {
return wa[i+x*(ido-1)]; };
2120 auto CC = [cc,ido](
size_t a,
size_t b,
size_t c) ->
const T&
2121 {
return cc[a+ido*(b+3*c)]; };
2122 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
2123 {
return ch[a+ido*(b+l1*c)]; };
2125 for (
size_t k=0; k<l1; k++)
2127 T tr2=2*CC(ido-1,1,k);
2128 T cr2=CC(0,0,k)+taur*tr2;
2129 CH(0,k,0)=CC(0,0,k)+tr2;
2130 T ci3=2*taui*CC(0,2,k);
2131 PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
2134 for (
size_t k=0; k<l1; k++)
2135 for (
size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2137 T tr2=CC(i-1,2,k)+CC(ic-1,1,k);
2138 T ti2=CC(i ,2,k)-CC(ic ,1,k);
2139 T cr2=CC(i-1,0,k)+taur*tr2;
2140 T ci2=CC(i ,0,k)+taur*ti2;
2141 CH(i-1,k,0)=CC(i-1,0,k)+tr2;
2142 CH(i ,k,0)=CC(i ,0,k)+ti2;
2143 T cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));
2144 T ci3=taui*(CC(i ,2,k)+CC(ic ,1,k));
2145 T di2, di3, dr2, dr3;
2146 PM(dr3,dr2,cr2,ci3);
2147 PM(di2,di3,ci2,cr3);
2148 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2);
2149 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
2153template<
typename T>
void radb4(
size_t ido,
size_t l1,
2157 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2159 auto WA = [wa,ido](
size_t x,
size_t i) {
return wa[i+x*(ido-1)]; };
2160 auto CC = [cc,ido](
size_t a,
size_t b,
size_t c) ->
const T&
2161 {
return cc[a+ido*(b+4*c)]; };
2162 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
2163 {
return ch[a+ido*(b+l1*c)]; };
2165 for (
size_t k=0; k<l1; k++)
2168 PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k));
2169 T tr3=2*CC(ido-1,1,k);
2171 PM (CH(0,k,0),CH(0,k,2),tr2,tr3);
2172 PM (CH(0,k,3),CH(0,k,1),tr1,tr4);
2175 for (
size_t k=0; k<l1; k++)
2178 PM (ti1,ti2,CC(0 ,3,k),CC(0 ,1,k));
2179 PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k));
2180 CH(ido-1,k,0)=tr2+tr2;
2181 CH(ido-1,k,1)=sqrt2*(tr1-ti1);
2182 CH(ido-1,k,2)=ti2+ti2;
2183 CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
2186 for (
size_t k=0; k<l1;++k)
2187 for (
size_t i=2; i<ido; i+=2)
2189 T ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
2191 PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k));
2192 PM (ti1,ti2,CC(i ,0,k),CC(ic ,3,k));
2193 PM (tr4,ti3,CC(i ,2,k),CC(ic ,1,k));
2194 PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k));
2195 PM (CH(i-1,k,0),cr3,tr2,tr3);
2196 PM (CH(i ,k,0),ci3,ti2,ti3);
2197 PM (cr4,cr2,tr1,tr4);
2198 PM (ci2,ci4,ti1,ti4);
2199 MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2);
2200 MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3);
2201 MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4);
2205template<
typename T>
void radb5(
size_t ido,
size_t l1,
2209 constexpr T0 tr11= T0(0.3090169943749474241022934171828191L),
2210 ti11= T0(0.9510565162951535721164393333793821L),
2211 tr12= T0(-0.8090169943749474241022934171828191L),
2212 ti12= T0(0.5877852522924731291687059546390728L);
2214 auto WA = [wa,ido](
size_t x,
size_t i) {
return wa[i+x*(ido-1)]; };
2215 auto CC = [cc,ido](
size_t a,
size_t b,
size_t c) ->
const T&
2216 {
return cc[a+ido*(b+5*c)]; };
2217 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
2218 {
return ch[a+ido*(b+l1*c)]; };
2220 for (
size_t k=0; k<l1; k++)
2222 T ti5=CC(0,2,k)+CC(0,2,k);
2223 T ti4=CC(0,4,k)+CC(0,4,k);
2224 T tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
2225 T tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
2226 CH(0,k,0)=CC(0,0,k)+tr2+tr3;
2227 T cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
2228 T cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
2230 MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2231 PM(CH(0,k,4),CH(0,k,1),cr2,ci5);
2232 PM(CH(0,k,3),CH(0,k,2),cr3,ci4);
2235 for (
size_t k=0; k<l1;++k)
2236 for (
size_t i=2, ic=ido-2; i<ido; i+=2, ic-=2)
2238 T tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
2239 PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k));
2240 PM(ti5,ti2,CC(i ,2,k),CC(ic ,1,k));
2241 PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k));
2242 PM(ti4,ti3,CC(i ,4,k),CC(ic ,3,k));
2243 CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
2244 CH(i ,k,0)=CC(i ,0,k)+ti2+ti3;
2245 T cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
2246 T ci2=CC(i ,0,k)+tr11*ti2+tr12*ti3;
2247 T cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
2248 T ci3=CC(i ,0,k)+tr12*ti2+tr11*ti3;
2249 T ci4, ci5, cr5, cr4;
2250 MULPM(cr5,cr4,tr5,tr4,ti11,ti12);
2251 MULPM(ci5,ci4,ti5,ti4,ti11,ti12);
2252 T dr2, dr3, dr4, dr5, di2, di3, di4, di5;
2253 PM(dr4,dr3,cr3,ci4);
2254 PM(di3,di4,ci3,cr4);
2255 PM(dr5,dr2,cr2,ci5);
2256 PM(di2,di5,ci2,cr5);
2257 MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2);
2258 MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3);
2259 MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4);
2260 MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5);
2264template<
typename T>
void radbg(
size_t ido,
size_t ip,
size_t l1,
2268 const size_t cdim=ip;
2269 size_t ipph=(ip+1)/ 2;
2270 size_t idl1 = ido*l1;
2272 auto CC = [cc,ido,cdim](
size_t a,
size_t b,
size_t c) ->
const T&
2273 {
return cc[a+ido*(b+cdim*c)]; };
2274 auto CH = [ch,ido,l1](
size_t a,
size_t b,
size_t c) -> T&
2275 {
return ch[a+ido*(b+l1*c)]; };
2276 auto C1 = [cc,ido,l1](
size_t a,
size_t b,
size_t c) ->
const T&
2277 {
return cc[a+ido*(b+l1*c)]; };
2278 auto C2 = [cc,idl1](
size_t a,
size_t b) -> T&
2279 {
return cc[a+idl1*b]; };
2280 auto CH2 = [ch,idl1](
size_t a,
size_t b) -> T&
2281 {
return ch[a+idl1*b]; };
2283 for (
size_t k=0; k<l1; ++k)
2284 for (
size_t i=0; i<ido; ++i)
2285 CH(i,k,0) = CC(i,0,k);
2286 for (
size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
2289 for (
size_t k=0; k<l1; ++k)
2291 CH(0,k,j ) = 2*CC(ido-1,j2,k);
2292 CH(0,k,jc) = 2*CC(0,j2+1,k);
2298 for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
2301 for (
size_t k=0; k<l1; ++k)
2302 for (
size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2)
2304 CH(i ,k,j ) = CC(i ,j2+1,k)+CC(ic ,j2,k);
2305 CH(i ,k,jc) = CC(i ,j2+1,k)-CC(ic ,j2,k);
2306 CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
2307 CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
2311 for (
size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
2313 for (
size_t ik=0; ik<idl1; ++ik)
2315 C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
2316 C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
2320 for(; j<ipph-3; j+=4,jc-=4)
2322 iang+=l;
if(iang>ip) iang-=ip;
2323 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2324 iang+=l;
if(iang>ip) iang-=ip;
2325 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2326 iang+=l;
if(iang>ip) iang-=ip;
2327 T0 ar3=csarr[2*iang], ai3=csarr[2*iang+1];
2328 iang+=l;
if(iang>ip) iang-=ip;
2329 T0 ar4=csarr[2*iang], ai4=csarr[2*iang+1];
2330 for (
size_t ik=0; ik<idl1; ++ik)
2332 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
2333 +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
2334 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
2335 +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
2338 for(; j<ipph-1; j+=2,jc-=2)
2340 iang+=l;
if(iang>ip) iang-=ip;
2341 T0 ar1=csarr[2*iang], ai1=csarr[2*iang+1];
2342 iang+=l;
if(iang>ip) iang-=ip;
2343 T0 ar2=csarr[2*iang], ai2=csarr[2*iang+1];
2344 for (
size_t ik=0; ik<idl1; ++ik)
2346 C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
2347 C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
2350 for(; j<ipph; ++j,--jc)
2352 iang+=l;
if(iang>ip) iang-=ip;
2353 T0 war=csarr[2*iang], wai=csarr[2*iang+1];
2354 for (
size_t ik=0; ik<idl1; ++ik)
2356 C2(ik,l ) += war*CH2(ik,j );
2357 C2(ik,lc) += wai*CH2(ik,jc);
2361 for (
size_t j=1; j<ipph; ++j)
2362 for (
size_t ik=0; ik<idl1; ++ik)
2363 CH2(ik,0) += CH2(ik,j);
2364 for (
size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
2365 for (
size_t k=0; k<l1; ++k)
2366 PM(CH(0,k,jc),CH(0,k,j),C1(0,k,j),C1(0,k,jc));
2370 for (
size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
2371 for (
size_t k=0; k<l1; ++k)
2372 for (
size_t i=1; i<=ido-2; i+=2)
2374 CH(i ,k,j ) = C1(i ,k,j)-C1(i+1,k,jc);
2375 CH(i ,k,jc) = C1(i ,k,j)+C1(i+1,k,jc);
2376 CH(i+1,k,j ) = C1(i+1,k,j)+C1(i ,k,jc);
2377 CH(i+1,k,jc) = C1(i+1,k,j)-C1(i ,k,jc);
2382 for (
size_t j=1; j<ip; ++j)
2384 size_t is = (j-1)*(ido-1);
2385 for (
size_t k=0; k<l1; ++k)
2388 for (
size_t i=1; i<=ido-2; i+=2)
2390 T t1=CH(i,k,j), t2=CH(i+1,k,j);
2391 CH(i ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
2392 CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
2404 for (
size_t i=0; i<
length; ++i)
2407 std::copy_n (p1,
length, c);
2411 for (
size_t i=0; i<
length; ++i)
2416 template<
typename T>
void exec(T c[], T0 fct,
bool r2hc)
const
2418 if (
length==1) { c[0]*=fct;
return; }
2419 size_t nf=
fact.size();
2421 T *p1=c, *p2=ch.
data();
2424 for(
size_t k1=0, l1=
length; k1<nf;++k1)
2427 size_t ip=
fact[k].fct;
2439 {
radfg(ido, ip, l1, p1, p2,
fact[k].tw,
fact[k].tws); std::swap (p1,p2); }
2443 for(
size_t k=0, l1=1; k<nf; k++)
2445 size_t ip =
fact[k].fct,
2475 std::swap(
fact[0].fct,
fact.back().fct);
2477 for (
size_t divisor=3; divisor*divisor<=len; divisor+=2)
2478 while ((len%divisor)==0)
2488 size_t twsz=0, l1=1;
2489 for (
size_t k=0; k<
fact.size(); ++k)
2492 twsz+=(ip-1)*(ido-1);
2493 if (ip>5) twsz+=2*ip;
2504 for (
size_t k=0; k<
fact.size(); ++k)
2507 if (k<
fact.size()-1)
2509 fact[k].tw=ptr; ptr+=(ip-1)*(ido-1);
2510 for (
size_t j=1; j<ip; ++j)
2511 for (
size_t i=1; i<=(ido-1)/2; ++i)
2513 fact[k].tw[(j-1)*(ido-1)+2*i-2] = twid[j*l1*i].r;
2514 fact[k].tw[(j-1)*(ido-1)+2*i-1] = twid[j*l1*i].i;
2519 fact[k].tws=ptr; ptr+=2*ip;
2520 fact[k].tws[0] = 1.;
2521 fact[k].tws[1] = 0.;
2522 for (
size_t i=2, ic=2*ip-2; i<=ic; i+=2, ic-=2)
2527 fact[k].tws[ic+1] = -twid[i/2*(
length/ip)].i;
2538 if (
length==0)
throw std::runtime_error(
"zero-length FFT requested");
2563 for (
size_t m=0; m<
n; ++m)
2565 auto zero = akf[0]*T0(0);
2566 for (
size_t m=
n; m<
n2; ++m)
2573 for (
size_t m=1; m<(
n2+1)/2; ++m)
2585 for (
size_t m=0; m<
n; ++m)
2599 for (
size_t m=1; m<
n; ++m)
2602 if (coeff>=2*
n) coeff-=2*
n;
2608 T0 xn2 = T0(1)/T0(
n2);
2609 tbkf[0] =
bk[0]*xn2;
2610 for (
size_t m=1; m<
n; ++m)
2611 tbkf[m] = tbkf[
n2-m] =
bk[m]*xn2;
2612 for (
size_t m=
n;m<=(
n2-
n);++m)
2615 for (
size_t i=0; i<
n2/2+1; ++i)
2622 template<
typename T>
void exec_r(T c[], T0 fct,
bool fwd)
2627 auto zero = T0(0)*c[0];
2628 for (
size_t m=0; m<
n; ++m)
2629 tmp[m].Set(c[m], zero);
2632 std::copy_n (&tmp[1].r,
n-1, &c[1]);
2636 tmp[0].Set(c[0],c[0]*0);
2637 std::copy_n (c+1,
n-1, &tmp[1].r);
2638 if ((
n&1)==0) tmp[
n/2].i=T0(0)*c[0];
2639 for (
size_t m=1; 2*m<
n; ++m)
2640 tmp[
n-m].Set(tmp[m].r, -tmp[m].i);
2642 for (
size_t m=0; m<
n; ++m)
2663 if (
length==0)
throw std::runtime_error(
"zero-length FFT requested");
2700 if (
length==0)
throw std::runtime_error(
"zero-length FFT requested");
2739 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2740 size_t N=
fftplan.length(), n=N/2+1;
2742 { c[0]*=sqrt2; c[n-1]*=sqrt2; }
2745 for (
size_t i=1; i<n; ++i)
2746 tmp[i] = tmp[N-i] = c[i];
2749 for (
size_t i=1; i<n; ++i)
2752 { c[0]*=sqrt2*T0(0.5); c[n-1]*=sqrt2*T0(0.5); }
2768 bool ,
int ,
bool )
const
2770 size_t N=
fftplan.length(), n=N/2-1;
2772 tmp[0] = tmp[n+1] = c[0]*0;
2773 for (
size_t i=0; i<n; ++i)
2774 { tmp[i+1]=c[i]; tmp[N-1-i]=-c[i]; }
2776 for (
size_t i=0; i<n; ++i)
2794 for (
size_t i=0; i<
length; ++i)
2799 int type,
bool cosine)
const
2801 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2803 size_t NS2 = (N+1)/2;
2807 for (
size_t k=1; k<N; k+=2)
2810 if ((N&1)==0) c[N-1]*=2;
2811 for (
size_t k=1; k<N-1; k+=2)
2814 for (
size_t k=1, kc=N-1; k<NS2; ++k, --kc)
2818 c[k] = T0(0.5)*(t1+t2); c[kc]=T0(0.5)*(t1-t2);
2823 for (
size_t k=0, kc=N-1; k<kc; ++k, --kc)
2824 std::swap(c[k], c[kc]);
2826 cosine ? c[0]*=sqrt2*T0(0.5) : c[N-1]*=sqrt2*T0(0.5);
2831 cosine ? c[0]*=sqrt2 : c[N-1]*=sqrt2;
2833 for (
size_t k=0, kc=N-1; k<NS2; ++k, --kc)
2834 std::swap(c[k], c[kc]);
2835 for (
size_t k=1, kc=N-1; k<NS2; ++k, --kc)
2837 T t1=c[k]+c[kc], t2=c[k]-c[kc];
2844 for (
size_t k=1; k<N-1; k+=2)
2847 for (
size_t k=1; k<N; k+=2)
2859 std::unique_ptr<pocketfft_c<T0>>
fft;
2860 std::unique_ptr<pocketfft_r<T0>>
rfft;
2873 for (
size_t i=0; i<
N/2; ++i)
2879 bool ,
int ,
bool cosine)
const
2883 for (
size_t k=0, kc=
N-1; k<n2; ++k, --kc)
2884 std::swap(c[k], c[kc]);
2894 for (; m<
N; ++i, m+=4)
2896 for (; m<2*
N; ++i, m+=4)
2898 for (; m<3*
N; ++i, m+=4)
2900 for (; m<4*
N; ++i, m+=4)
2902 for (; i<
N; ++i, m+=4)
2907 auto SGN = [](
size_t i)
2909 constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L);
2910 return (i&2) ? -sqrt2 : sqrt2;
2912 c[n2] = y[0]*SGN(n2+1);
2913 size_t i=0, i1=1, k=1;
2914 for (; k<n2; ++i, ++i1, k+=2)
2916 c[i ] = y[2*k-1]*SGN(i1) + y[2*k ]*SGN(i);
2917 c[
N -i1] = y[2*k-1]*SGN(
N -i) - y[2*k ]*SGN(
N -i1);
2918 c[n2-i1] = y[2*k+1]*SGN(n2-i) - y[2*k+2]*SGN(n2-i1);
2919 c[n2+i1] = y[2*k+1]*SGN(n2+i+2) + y[2*k+2]*SGN(n2+i1);
2923 c[i ] = y[2*k-1]*SGN(i+1) + y[2*k]*SGN(i);
2924 c[
N-i1] = y[2*k-1]*SGN(i+2) + y[2*k]*SGN(i1);
2935 for(
size_t i=0; i<n2; ++i)
2937 y[i].Set(c[2*i],c[
N-1-2*i]);
2940 fft->exec(y.
data(), fct,
true);
2941 for(
size_t i=0, ic=n2-1; i<n2; ++i, --ic)
2943 c[2*i ] = 2*(y[i ].r*
C2[i ].r-y[i ].i*
C2[i ].i);
2944 c[2*i+1] = -2*(y[ic].i*
C2[ic].r+y[ic].r*
C2[ic].i);
2948 for (
size_t k=1; k<
N; k+=2)
2960template<
typename T> std::shared_ptr<T>
get_plan(
size_t length)
2962#if POCKETFFT_CACHE_SIZE==0
2963 return std::make_shared<T>(length);
2966 static std::array<std::shared_ptr<T>, nmax> cache;
2967 static std::array<size_t, nmax> last_access{{0}};
2968 static size_t access_counter = 0;
2969 static std::mutex mut;
2971 auto find_in_cache = [&]() -> std::shared_ptr<T>
2973 for (
size_t i=0; i<nmax; ++i)
2974 if (cache[i] && (cache[i]->length()==length))
2977 if (last_access[i]!=access_counter)
2979 last_access[i] = ++access_counter;
2981 if (access_counter == 0)
2982 last_access.fill(0);
2991 std::lock_guard<std::mutex> lock(mut);
2992 auto p = find_in_cache();
2995 auto plan = std::make_shared<T>(length);
2997 std::lock_guard<std::mutex> lock(mut);
2998 auto p = find_in_cache();
3002 for (
size_t i=1; i<nmax; ++i)
3003 if (last_access[i] < last_access[lru])
3007 last_access[lru] = ++access_counter;
3038 d(reinterpret_cast<const char *>(data_)) {}
3040 {
return *
reinterpret_cast<const T *
>(
d+ofs); }
3047 :
cndarr<T>::
cndarr(const_cast<const void *>(data_), shape_, stride_)
3050 {
return *
reinterpret_cast<T *
>(
const_cast<char *
>(
cndarr<T>::d+ofs)); }
3063 for (
int i_=
int(
pos.size())-1; i_>=0; --i_)
3065 auto i = size_t(i_);
3066 if (i==
idim)
continue;
3084 if (nshares==1)
return;
3085 if (nshares==0)
throw std::runtime_error(
"can't run with zero threads");
3087 if (myshare>=nshares)
throw std::runtime_error(
"impossible share requested");
3088 size_t nbase =
rem/nshares;
3089 size_t additional =
rem%nshares;
3090 size_t lo = myshare*nbase + ((myshare<additional) ? myshare : additional);
3091 size_t hi = lo+nbase+(myshare<additional);
3092 size_t todo = hi-lo;
3095 for (
size_t i=0; i<
pos.size(); ++i)
3097 if (i==
idim)
continue;
3098 chunk /=
iarr.shape(i);
3099 size_t n_advance = lo/chunk;
3100 pos[i] += n_advance;
3101 p_ii += ptrdiff_t(n_advance)*
iarr.stride(i);
3102 p_oi += ptrdiff_t(n_advance)*
oarr.stride(i);
3103 lo -= n_advance*chunk;
3109 if (
rem<n)
throw std::runtime_error(
"underrun");
3110 for (
size_t i=0; i<n; ++i)
3119 ptrdiff_t
iofs(
size_t j,
size_t i)
const {
return p_i[j] + ptrdiff_t(i)*
str_i; }
3121 ptrdiff_t
oofs(
size_t j,
size_t i)
const {
return p_o[j] + ptrdiff_t(i)*
str_o; }
3139 :
pos(arr_.ndim(), 0),
arr(arr_),
p(0),
rem(arr_.size()) {}
3143 for (
int i_=
int(
pos.size())-1; i_>=0; --i_)
3145 auto i = size_t(i_);
3147 if (++
pos[i] <
arr.shape(i))
3150 p -= ptrdiff_t(
arr.shape(i))*
arr.stride(i);
3153 ptrdiff_t
ofs()
const {
return p; }
3187 for (
int i_=
int(
pos.size())-1; i_>=0; --i_)
3189 auto i = size_t(i_);
3192 rp +=
arr.stride(i);
3195 rp -=
arr.stride(i);
3198 rp += ptrdiff_t(
arr.shape(i))*
arr.stride(i);
3205 p -= ptrdiff_t(
shp[i])*
arr.stride(i);
3208 rp -= ptrdiff_t(
arr.shape(i)-
shp[i])*
arr.stride(i);
3212 rp -= ptrdiff_t(
shp[i])*
arr.stride(i);
3215 ptrdiff_t
ofs()
const {
return p; }
3223#ifndef POCKETFFT_NO_VECTORS
3224template<>
struct VTYPE<float>
3226 using type =
float __attribute__ ((vector_size (
VLEN<float>::val*
sizeof(
float))));
3228template<>
struct VTYPE<double>
3230 using type =
double __attribute__ ((vector_size (VLEN<double>::val*
sizeof(
double))));
3232template<>
struct VTYPE<long double>
3234 using type =
long double __attribute__ ((vector_size (VLEN<long double>::val*
sizeof(
long double))));
3239 size_t axsize,
size_t elemsize)
3246 const shape_t &axes,
size_t elemsize)
3250 for (
size_t i=0; i<axes.size(); ++i)
3252 auto axsize = shape[axes[i]];
3253 auto othersize = fullsize/axsize;
3255 if (sz>tmpsize) tmpsize=sz;
3264 for (
size_t j=0; j<vlen; ++j)
3266 dst[i].r[j] = src[it.
iofs(j,i)].r;
3267 dst[i].i[j] = src[it.
iofs(j,i)].i;
3275 for (
size_t j=0; j<vlen; ++j)
3276 dst[i][j] = src[it.
iofs(j,i)];
3282 if (
dst == &src[it.
iofs(0)])
return;
3291 for (
size_t j=0; j<vlen; ++j)
3292 dst[it.
oofs(j,i)].Set(src[i].r[j],src[i].i[j]);
3299 for (
size_t j=0; j<vlen; ++j)
3300 dst[it.
oofs(j,i)] = src[i][j];
3306 if (src == &
dst[it.
oofs(0)])
return;
3316template<
typename Tplan,
typename T,
typename T0,
typename Exec>
3318 const shape_t &axes, T0 fct,
size_t nthreads,
const Exec & exec,
3319 const bool allow_inplace=
true)
3321 std::shared_ptr<Tplan> plan;
3323 for (
size_t iax=0; iax<axes.size(); ++iax)
3325 size_t len=in.
shape(axes[iax]);
3326 if ((!plan) || (len!=plan->length()))
3332 constexpr auto vlen = VLEN<T0>::val;
3333 auto storage = alloc_tmp<T0>(in.shape(), len, sizeof(T));
3334 const auto &tin(iax==0? in : out);
3335 multi_iter<vlen> it(tin, out, axes[iax]);
3336#ifndef POCKETFFT_NO_VECTORS
3338 while (it.remaining()>=vlen)
3341 auto tdatav = reinterpret_cast<add_vec_t<T> *>(storage.data());
3342 exec(it, tin, out, tdatav, *plan, fct);
3345 while (it.remaining()>0)
3348 auto buf = allow_inplace && it.stride_out() == sizeof(T) ?
3349 &out[it.oofs(0)] : reinterpret_cast<T *>(storage.data());
3350 exec(it, tin, out, buf, *plan, fct);
3361 template <
typename T0,
typename T,
size_t vlen>
void operator () (
3374 for (
size_t j=0; j<vlen; ++j)
3375 dst[it.
oofs(j,0)] = src[0][j];
3377 for (i=1; i<it.
length_out()-1; i+=2, ++i1, --i2)
3378 for (
size_t j=0; j<vlen; ++j)
3380 dst[it.
oofs(j,i1)] = src[i][j]+src[i+1][j];
3381 dst[it.
oofs(j,i2)] = src[i][j]-src[i+1][j];
3384 for (
size_t j=0; j<vlen; ++j)
3385 dst[it.
oofs(j,i1)] = src[i][j];
3393 for (i=1; i<it.
length_out()-1; i+=2, ++i1, --i2)
3395 dst[it.
oofs(i1)] = src[i]+src[i+1];
3396 dst[it.
oofs(i2)] = src[i]-src[i+1];
3405 for (
size_t j=0; j<vlen; ++j)
3406 dst[it.
oofs(j,0)] = src[0][j];
3408 for (i=1; i<it.
length_out()-1; i+=2, ++i1, --i2)
3409 for (
size_t j=0; j<vlen; ++j)
3411 dst[it.
oofs(j,i1)] = src[i][j]-src[i+1][j];
3412 dst[it.
oofs(j,i2)] = src[i][j]+src[i+1][j];
3415 for (
size_t j=0; j<vlen; ++j)
3416 dst[it.
oofs(j,i1)] = src[i][j];
3424 for (i=1; i<it.
length_out()-1; i+=2, ++i1, --i2)
3426 dst[it.
oofs(i1)] = src[i]-src[i+1];
3427 dst[it.
oofs(i2)] = src[i]+src[i+1];
3435 template <
typename T0,
typename T,
size_t vlen>
void operator () (
3440 plan.
exec(buf, fct,
true);
3446 template <
typename T0,
typename T,
size_t vlen>
void operator () (
3451 plan.
exec(buf, fct,
true);
3462 template <
typename T0,
typename T,
typename Tplan,
size_t vlen>
3464 ndarr<T0> &out, T * buf,
const Tplan &plan, T0 fct)
const
3477 size_t len=in.
shape(axis);
3481 constexpr auto vlen = VLEN<T>::val;
3482 auto storage = alloc_tmp<T>(in.shape(), len, sizeof(T));
3483 multi_iter<vlen> it(in, out, axis);
3484#ifndef POCKETFFT_NO_VECTORS
3486 while (it.remaining()>=vlen)
3489 auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
3490 copy_input(it, in, tdatav);
3491 plan->exec(tdatav, fct, true);
3492 for (size_t j=0; j<vlen; ++j)
3493 out[it.oofs(j,0)].Set(tdatav[0][j]);
3496 for (; i<len-1; i+=2, ++ii)
3497 for (size_t j=0; j<vlen; ++j)
3498 out[it.oofs(j,ii)].Set(tdatav[i][j], tdatav[i+1][j]);
3500 for (; i<len-1; i+=2, ++ii)
3501 for (size_t j=0; j<vlen; ++j)
3502 out[it.oofs(j,ii)].Set(tdatav[i][j], -tdatav[i+1][j]);
3504 for (size_t j=0; j<vlen; ++j)
3505 out[it.oofs(j,ii)].Set(tdatav[i][j]);
3508 while (it.remaining()>0)
3511 auto tdata = reinterpret_cast<T *>(storage.data());
3512 copy_input(it, in, tdata);
3513 plan->exec(tdata, fct, true);
3514 out[it.oofs(0)].Set(tdata[0]);
3517 for (; i<len-1; i+=2, ++ii)
3518 out[it.oofs(ii)].Set(tdata[i], tdata[i+1]);
3520 for (; i<len-1; i+=2, ++ii)
3521 out[it.oofs(ii)].Set(tdata[i], -tdata[i+1]);
3523 out[it.oofs(ii)].Set(tdata[i]);
3532 size_t len=out.
shape(axis);
3536 constexpr auto vlen = VLEN<T>::val;
3537 auto storage = alloc_tmp<T>(out.shape(), len, sizeof(T));
3538 multi_iter<vlen> it(in, out, axis);
3539#ifndef POCKETFFT_NO_VECTORS
3541 while (it.remaining()>=vlen)
3544 auto tdatav = reinterpret_cast<vtype_t<T> *>(storage.data());
3545 for (size_t j=0; j<vlen; ++j)
3546 tdatav[0][j]=in[it.iofs(j,0)].r;
3550 for (; i<len-1; i+=2, ++ii)
3551 for (size_t j=0; j<vlen; ++j)
3553 tdatav[i ][j] = in[it.iofs(j,ii)].r;
3554 tdatav[i+1][j] = -in[it.iofs(j,ii)].i;
3557 for (; i<len-1; i+=2, ++ii)
3558 for (size_t j=0; j<vlen; ++j)
3560 tdatav[i ][j] = in[it.iofs(j,ii)].r;
3561 tdatav[i+1][j] = in[it.iofs(j,ii)].i;
3564 for (size_t j=0; j<vlen; ++j)
3565 tdatav[i][j] = in[it.iofs(j,ii)].r;
3567 plan->exec(tdatav, fct, false);
3568 copy_output(it, tdatav, out);
3571 while (it.remaining()>0)
3574 auto tdata = reinterpret_cast<T *>(storage.data());
3575 tdata[0]=in[it.iofs(0)].r;
3579 for (; i<len-1; i+=2, ++ii)
3581 tdata[i ] = in[it.iofs(ii)].r;
3582 tdata[i+1] = -in[it.iofs(ii)].i;
3585 for (; i<len-1; i+=2, ++ii)
3587 tdata[i ] = in[it.iofs(ii)].r;
3588 tdata[i+1] = in[it.iofs(ii)].i;
3591 tdata[i] = in[it.iofs(ii)].r;
3593 plan->exec(tdata, fct,
false);
3603 template <
typename T0,
typename T,
size_t vlen>
void operator () (
3621 const std::complex<T> *data_in, std::complex<T> *data_out, T fct,
3633 int type,
const T *data_in, T *data_out, T fct,
bool ortho,
size_t nthreads=1)
3635 if ((type<1) || (type>4))
throw std::invalid_argument(
"invalid DCT type");
3638 cndarr<T> ain(data_in, shape, stride_in);
3639 ndarr<T> aout(data_out, shape, stride_out);
3640 const ExecDcst exec{ortho, type,
true};
3651 int type,
const T *data_in, T *data_out, T fct,
bool ortho,
size_t nthreads=1)
3653 if ((type<1) || (type>4))
throw std::invalid_argument(
"invalid DST type");
3656 cndarr<T> ain(data_in, shape, stride_in);
3657 ndarr<T> aout(data_out, shape, stride_out);
3658 const ExecDcst exec{ortho, type,
false};
3669 bool forward,
const T *data_in, std::complex<T> *data_out, T fct,
3674 cndarr<T> ain(data_in, shape_in, stride_in);
3676 shape_out[axis] = shape_in[axis]/2 + 1;
3678 general_r2c(ain, aout, axis, forward, fct, nthreads);
3683 bool forward,
const T *data_in, std::complex<T> *data_out, T fct,
3688 r2c(shape_in, stride_in, stride_out, axes.back(), forward, data_in, data_out,
3690 if (axes.size()==1)
return;
3693 shape_out[axes.back()] = shape_in[axes.back()]/2 + 1;
3694 auto newaxes =
shape_t{axes.begin(), --axes.end()};
3695 c2c(shape_out, stride_out, stride_out, newaxes, forward, data_out, data_out,
3701 bool forward,
const std::complex<T> *data_in, T *data_out, T fct,
3707 shape_in[axis] = shape_out[axis]/2 + 1;
3709 ndarr<T> aout(data_out, shape_out, stride_out);
3710 general_c2r(ain, aout, axis, forward, fct, nthreads);
3715 bool forward,
const std::complex<T> *data_in, T *data_out, T fct,
3720 return c2r(shape_out, stride_in, stride_out, axes[0], forward,
3721 data_in, data_out, fct, nthreads);
3723 auto shape_in = shape_out;
3724 shape_in[axes.back()] = shape_out[axes.back()]/2 + 1;
3726 stride_t stride_inter(shape_in.size());
3727 stride_inter.back() =
sizeof(
cmplx<T>);
3728 for (
int i=
int(shape_in.size())-2; i>=0; --i)
3729 stride_inter[size_t(i)] =
3730 stride_inter[size_t(i+1)]*ptrdiff_t(shape_in[
size_t(i+1)]);
3732 auto newaxes =
shape_t{axes.begin(), --axes.end()};
3733 c2c(shape_in, stride_in, stride_inter, newaxes, forward, data_in, tmp.
data(),
3735 c2r(shape_out, stride_inter, stride_out, axes.back(), forward,
3736 tmp.
data(), data_out, fct, nthreads);
3741 bool real2hermitian,
bool forward,
const T *data_in, T *data_out, T fct,
3746 cndarr<T> ain(data_in, shape, stride_in);
3747 ndarr<T> aout(data_out, shape, stride_out);
3749 ExecR2R{real2hermitian, forward});
3754 const T *data_in, T *data_out, T fct,
size_t nthreads=1)
3758 cndarr<T> ain(data_in, shape, stride_in);
3759 ndarr<T> aout(data_out, shape, stride_out);
3766 const T *data_in, T *data_out, T fct,
size_t nthreads=1)
3771 data_out, fct, nthreads);
3774 tshp[axes.back()] = tshp[axes.back()]/2+1;
3777 tstride.back()=
sizeof(std::complex<T>);
3778 for (
size_t i=tstride.size()-1; i>0; --i)
3779 tstride[i-1]=tstride[i]*ptrdiff_t(tshp[i]);
3780 r2c(shape, stride_in, tstride, axes,
true, data_in, tdata.
data(), fct, nthreads);
3782 ndarr<T> aout(data_out, shape, stride_out);
3787 auto v = atmp[iin.
ofs()];
3788 aout[iout.
ofs()] = v.r+v.i;
3789 aout[iout.
rev_ofs()] = v.r-v.i;
3796 const T *data_in, T *data_out, T fct,
size_t nthreads=1)
3800 cndarr<T> ain(data_in, shape, stride_in);
3801 ndarr<T> aout(data_out, shape, stride_out);
3808 const T *data_in, T *data_out, T fct,
size_t nthreads=1)
3813 data_out, fct, nthreads);
3816 tshp[axes.back()] = tshp[axes.back()]/2+1;
3819 tstride.back()=
sizeof(std::complex<T>);
3820 for (
size_t i=tstride.size()-1; i>0; --i)
3821 tstride[i-1]=tstride[i]*ptrdiff_t(tshp[i]);
3822 r2c(shape, stride_in, tstride, axes,
true, data_in, tdata.
data(), fct, nthreads);
3824 ndarr<T> aout(data_out, shape, stride_out);
3829 auto v = atmp[iin.
ofs()];
3830 aout[iout.
ofs()] = v.r-v.i;
3831 aout[iout.
rev_ofs()] = v.r+v.i;
3838using detail::FORWARD;
3839using detail::BACKWARD;
3840using detail::shape_t;
3841using detail::stride_t;
3845using detail::r2r_fftpack;
3846using detail::r2r_separable_hartley;
3847using detail::r2r_genuine_hartley;
3848using detail::r2r_separable_fht;
3849using detail::r2r_genuine_fht;
3855#undef POCKETFFT_NOINLINE
3856#undef POCKETFFT_RESTRICT
pocketfft_r< T0 > fftplan
std::vector< T0 > twiddle
vcl_size_t length() const
POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, int type, bool cosine) const
POCKETFFT_NOINLINE T_dcst23(vcl_size_t length)
vcl_size_t length() const
POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool, int, bool cosine) const
POCKETFFT_NOINLINE T_dcst4(vcl_size_t length)
std::unique_ptr< pocketfft_c< T0 > > fft
std::unique_ptr< pocketfft_r< T0 > > rfft
pocketfft_r< T0 > fftplan
vcl_size_t length() const
POCKETFFT_NOINLINE T_dct1(vcl_size_t length)
POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool ortho, int, bool) const
pocketfft_r< T0 > fftplan
vcl_size_t length() const
POCKETFFT_NOINLINE T_dst1(vcl_size_t length)
POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool, int, bool) const
vcl_size_t shape(vcl_size_t i) const
const shape_t & shape() const
const stride_t & stride() const
arr_info(shape_t shape_, stride_t stride_)
const ptrdiff_t & stride(vcl_size_t i) const
static T * ralloc(vcl_size_t num)
arr(arr &&other) noexcept
void resize(vcl_size_t n)
T & operator[](vcl_size_t idx)
const T & operator[](vcl_size_t idx) const
static void dealloc(T *ptr)
void passg(vcl_size_t ido, vcl_size_t ip, vcl_size_t l1, T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa, const cmplx< T0 > *POCKETFFT_RESTRICT csarr) const
void pass2(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
POCKETFFT_NOINLINE void factorize()
vcl_size_t twsize() const
void pass_all(T c[], T0 fct) const
void pass11(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
void pass7(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
void pass3(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
POCKETFFT_NOINLINE cfftp(vcl_size_t length_)
void add_factor(vcl_size_t factor)
void pass8(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
void exec(T c[], T0 fct, bool fwd) const
void pass4(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
std::vector< fctdata > fact
void pass5(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const cmplx< T0 > *POCKETFFT_RESTRICT wa) const
const T & operator[](ptrdiff_t ofs) const
cndarr(const void *data_, const shape_t &shape_, const stride_t &stride_)
void exec(cmplx< T > c[], T0 fct, bool fwd) const
void fft(cmplx< T > c[], T0 fct) const
POCKETFFT_NOINLINE fftblue(vcl_size_t length)
void exec_r(T c[], T0 fct, bool fwd)
ptrdiff_t oofs(vcl_size_t j, vcl_size_t i) const
ptrdiff_t stride_out() const
ptrdiff_t iofs(vcl_size_t i) const
vcl_size_t length_in() const
ptrdiff_t iofs(vcl_size_t j, vcl_size_t i) const
multi_iter(const arr_info &iarr_, const arr_info &oarr_, vcl_size_t idim_)
void advance(vcl_size_t n)
vcl_size_t length_out() const
ptrdiff_t stride_in() const
ptrdiff_t oofs(vcl_size_t i) const
vcl_size_t remaining() const
T & operator[](ptrdiff_t ofs)
ndarr(void *data_, const shape_t &shape_, const stride_t &stride_)
vcl_size_t length() const
std::unique_ptr< fftblue< T0 > > blueplan
POCKETFFT_NOINLINE void exec(cmplx< T > c[], T0 fct, bool fwd) const
std::unique_ptr< cfftp< T0 > > packplan
POCKETFFT_NOINLINE pocketfft_c(vcl_size_t length)
POCKETFFT_NOINLINE pocketfft_r(vcl_size_t length)
std::unique_ptr< rfftp< T0 > > packplan
POCKETFFT_NOINLINE void exec(T c[], T0 fct, bool fwd) const
vcl_size_t length() const
std::unique_ptr< fftblue< T0 > > blueplan
std::vector< char > rev_jump
ptrdiff_t rev_ofs() const
vcl_size_t remaining() const
std::vector< char > rev_axis
rev_iter(const arr_info &arr_, const shape_t &axes)
void copy_and_norm(T *c, T *p1, T0 fct) const
void exec(T c[], T0 fct, bool r2hc) const
void radf5(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radf4(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
std::vector< fctdata > fact
void radb3(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radf3(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radb4(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radf2(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void add_factor(vcl_size_t factor)
void radb2(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radb5(vcl_size_t ido, vcl_size_t l1, const T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa) const
void radbg(vcl_size_t ido, vcl_size_t ip, vcl_size_t l1, T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa, const T0 *POCKETFFT_RESTRICT csarr) const
void MULPM(T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) const
void radfg(vcl_size_t ido, vcl_size_t ip, vcl_size_t l1, T *POCKETFFT_RESTRICT cc, T *POCKETFFT_RESTRICT ch, const T0 *POCKETFFT_RESTRICT wa, const T0 *POCKETFFT_RESTRICT csarr) const
vcl_size_t twsize() const
POCKETFFT_NOINLINE rfftp(vcl_size_t length_)
simple_iter(const arr_info &arr_)
vcl_size_t remaining() const
static cmplx< Thigh > calc(vcl_size_t x, vcl_size_t n, Thigh ang)
typename std::conditional<(sizeof(T)>sizeof(double)), T, double >::type Thigh
POCKETFFT_NOINLINE sincos_2pibyn(vcl_size_t n)
cmplx< T > operator[](vcl_size_t idx) const
std::lock_guard< std::mutex > lock_t
std::atomic< vcl_size_t > size_
std::condition_variable completed_
std::unique_lock< std::mutex > lock_t
std::atomic< vcl_size_t > num_left_
concurrent_queue< std::function< void()> > overflow_work_
std::atomic< bool > shutdown_
std::atomic< vcl_size_t > unscheduled_tasks_
std::lock_guard< std::mutex > lock_t
static constexpr vcl_size_t cache_line_size
thread_pool(vcl_size_t nthreads)
std::vector< worker, aligned_allocator< worker > > workers_
void submit(std::function< void()> work)
vcl_size_t & num_threads()
void thread_map(vcl_size_t nthreads, Func f)
static const vcl_size_t max_threads
void r2c(const shape_t &shape_in, const stride_t &stride_in, const stride_t &stride_out, vcl_size_t axis, bool forward, const T *data_in, std::complex< T > *data_out, T fct, vcl_size_t nthreads=1)
void dst(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, int type, const T *data_in, T *data_out, T fct, bool ortho, vcl_size_t nthreads=1)
POCKETFFT_NOINLINE void general_r2c(const cndarr< T > &in, ndarr< cmplx< T > > &out, vcl_size_t axis, bool forward, T fct, vcl_size_t nthreads)
void PMINPLACE(T &a, T &b)
void c2c(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, bool forward, const std::complex< T > *data_in, std::complex< T > *data_out, T fct, vcl_size_t nthreads=1)
void * aligned_alloc(vcl_size_t align, vcl_size_t size)
cmplx< T > conj(const cmplx< T > &a)
std::vector< vcl_size_t > shape_t
void r2r_genuine_fht(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
typename add_vec< T >::type add_vec_t
void copy_output(const multi_iter< vlen > &it, const cmplx< vtype_t< T > > *POCKETFFT_RESTRICT src, ndarr< cmplx< T > > &dst)
void dct(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, int type, const T *data_in, T *data_out, T fct, bool ortho, vcl_size_t nthreads=1)
arr< char > alloc_tmp(const shape_t &shape, vcl_size_t axsize, vcl_size_t elemsize)
void copy_hartley(const multi_iter< vlen > &it, const vtype_t< T > *POCKETFFT_RESTRICT src, ndarr< T > &dst)
void r2r_separable_hartley(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
void r2r_fftpack(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, bool real2hermitian, bool forward, const T *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
std::shared_ptr< T > get_plan(vcl_size_t length)
POCKETFFT_NOINLINE void general_c2r(const cndarr< cmplx< T > > &in, ndarr< T > &out, vcl_size_t axis, bool forward, T fct, vcl_size_t nthreads)
POCKETFFT_NOINLINE void general_nd(const cndarr< T > &in, ndarr< T > &out, const shape_t &axes, T0 fct, vcl_size_t nthreads, const Exec &exec, const bool allow_inplace=true)
void aligned_dealloc(void *ptr)
void c2r(const shape_t &shape_out, const stride_t &stride_in, const stride_t &stride_out, vcl_size_t axis, bool forward, const std::complex< T > *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
void PM(T &a, T &b, T c, T d)
void MPINPLACE(T &a, T &b)
void r2r_separable_fht(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
void copy_FHT(const multi_iter< vlen > &it, const vtype_t< T > *POCKETFFT_RESTRICT src, ndarr< T > &dst)
void ROTX90(cmplx< T > &a)
typename VTYPE< T >::type vtype_t
void special_mul(const cmplx< T > &v1, const cmplx< T2 > &v2, cmplx< T > &res)
std::vector< ptrdiff_t > stride_t
void r2r_genuine_hartley(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, const shape_t &axes, const T *data_in, T *data_out, T fct, vcl_size_t nthreads=1)
void ROT90(cmplx< T > &a)
void copy_input(const multi_iter< vlen > &it, const cndarr< cmplx< T > > &src, cmplx< vtype_t< T > > *POCKETFFT_RESTRICT dst)
#define POCKETFFT_PARTSTEP5a(u1, u2, twar, twbr, twai, twbi)
#define POCKETFFT_PARTSTEP11(u1, u2, x1, x2, x3, x4, x5, y1, y2, y3, y4, y5)
#define POCKETFFT_PARTSTEP7a(u1, u2, x1, x2, x3, y1, y2, y3)
#define POCKETFFT_PARTSTEP3b(u1, u2, twr, twi)
#define POCKETFFT_PREP11(idx)
#define POCKETFFT_NOINLINE
#define POCKETFFT_PREP5(idx)
#define POCKETFFT_CACHE_SIZE
#define POCKETFFT_PARTSTEP5b(u1, u2, twar, twbr, twai, twbi)
#define POCKETFFT_RESTRICT
#define POCKETFFT_PARTSTEP3a(u1, u2, twr, twi)
#define POCKETFFT_PARTSTEP11a(u1, u2, x1, x2, x3, x4, x5, y1, y2, y3, y4, y5)
#define POCKETFFT_REARRANGE(rx, ix, ry, iy)
#define POCKETFFT_PREP3(idx)
#define POCKETFFT_PREP7(idx)
#define POCKETFFT_PARTSTEP7(u1, u2, x1, x2, x3, y1, y2, y3)
static constexpr vcl_size_t val
cmplx< vtype_t< T > > type
auto special_mul(const cmplx< T2 > &other) const -> cmplx< decltype(r+other.r)>
aligned_allocator(const aligned_allocator< U > &)
void deallocate(T *p, vcl_size_t)
aligned_allocator()=default
T * allocate(vcl_size_t n)
std::atomic_flag busy_flag
std::condition_variable work_ready
std::function< void()> work
void worker_main(std::atomic< bool > &shutdown_flag, std::atomic< vcl_size_t > &unscheduled_tasks, concurrent_queue< std::function< void()> > &overflow_work)
static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace)
static POCKETFFT_NOINLINE vcl_size_t prev_good_size_cmplx(vcl_size_t n)
static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace, const shape_t &axes)
static POCKETFFT_NOINLINE UIntT prev_good_size_cmplx_typed(UIntT n)
static POCKETFFT_NOINLINE vcl_size_t good_size_cmplx(vcl_size_t n)
static vcl_size_t prod(const shape_t &shape)
static POCKETFFT_NOINLINE double cost_guess(vcl_size_t n)
static POCKETFFT_NOINLINE vcl_size_t largest_prime_factor(vcl_size_t n)
static POCKETFFT_NOINLINE vcl_size_t good_size_real(vcl_size_t n, vcl_size_t required_factor)
static POCKETFFT_NOINLINE vcl_size_t good_size_real(vcl_size_t n)
static POCKETFFT_NOINLINE vcl_size_t good_size_cmplx(vcl_size_t n, vcl_size_t required_factor)
static POCKETFFT_NOINLINE void sanity_check(const shape_t &shape, const stride_t &stride_in, const stride_t &stride_out, bool inplace, vcl_size_t axis)
static POCKETFFT_NOINLINE vcl_size_t prev_good_size_real(vcl_size_t n)
static vcl_size_t thread_count(vcl_size_t nthreads, const shape_t &shape, vcl_size_t axis, vcl_size_t vlen)
static POCKETFFT_NOINLINE UIntT good_size_cmplx_typed(UIntT n)
static POCKETFFT_NOINLINE UIntT prev_good_size_real_typed(UIntT n)
static POCKETFFT_NOINLINE UIntT good_size_real_typed(UIntT n)