diff options
author | waker <wakeroid@gmail.com> | 2011-02-13 12:44:16 +0100 |
---|---|---|
committer | waker <wakeroid@gmail.com> | 2011-02-13 12:44:16 +0100 |
commit | b892ac27f3792998ff059d612cdf3d18ac1a8c03 (patch) | |
tree | bc0b01febdbbd4020682669fb27af02c20c71c81 /plugins/supereq | |
parent | 40073ef868e09e86480f5331ef5bc95bf7de3347 (diff) |
fft experiments
Diffstat (limited to 'plugins/supereq')
-rw-r--r-- | plugins/supereq/Equ.cpp | 8 | ||||
-rw-r--r-- | plugins/supereq/Fftsg_fl.cpp | 3 | ||||
-rw-r--r-- | plugins/supereq/Makefile.am | 71 | ||||
l--------- | plugins/supereq/nsfft-1.00/dft/Makefile | 2 | ||||
-rw-r--r-- | plugins/supereq/nsfft-1.00/ooura/fftsg.c | 3314 | ||||
-rw-r--r-- | plugins/supereq/nsfft-1.00/ooura/pi_fft.c | 1616 | ||||
l--------- | plugins/supereq/nsfft-1.00/simd/Makefile | 2 | ||||
-rw-r--r-- | plugins/supereq/nsfft-1.00/simd/SIMDBase.h | 2 | ||||
-rw-r--r-- | plugins/supereq/shibatch_rdft.c | 46 | ||||
-rw-r--r-- | plugins/supereq/supereq.c | 4 |
10 files changed, 5024 insertions, 44 deletions
diff --git a/plugins/supereq/Equ.cpp b/plugins/supereq/Equ.cpp index a3139d9d..0aff4f8a 100644 --- a/plugins/supereq/Equ.cpp +++ b/plugins/supereq/Equ.cpp @@ -24,8 +24,6 @@ #include "paramlist.hpp"
#include "Equ.h"
-#define USE_FFMPEG
-
int _Unwind_Resume_or_Rethrow;
int _Unwind_RaiseException;
int _Unwind_GetLanguageSpecificData;
@@ -39,8 +37,8 @@ int _Unwind_SetGR; int _Unwind_GetIPInfo;
#ifdef USE_OOURA
-void rdft(int, int, REAL *, int *, REAL *);
-void rfft(int n,int isign,REAL x[])
+extern "C" void rdft(int, int, REAL *, int *, REAL *);
+void rfft(int n,int isign,REAL *x)
{
static int ipsize = 0,wsize=0;
static int *ip = NULL;
@@ -71,7 +69,7 @@ void rfft(int n,int isign,REAL x[]) rdft(n,isign,x,ip,w);
}
#elif defined(USE_FFMPEG) || defined(USE_SHIBATCH)
-extern "C" void rfft(int n,int isign,REAL x[]);
+extern "C" void rfft(int n,int isign,REAL *x);
#endif
#if defined(USE_SHIBATCH)
diff --git a/plugins/supereq/Fftsg_fl.cpp b/plugins/supereq/Fftsg_fl.cpp index d31d80f6..636f8b8a 100644 --- a/plugins/supereq/Fftsg_fl.cpp +++ b/plugins/supereq/Fftsg_fl.cpp @@ -285,6 +285,7 @@ Appendix : w[] and ip[] are compatible with all routines.
*/
+extern "C" {
void cdft(int n, int isgn, REAL *a, int *ip, REAL *w)
{
@@ -2649,4 +2650,4 @@ void dstsub(int n, REAL *a, int nc, REAL *c) }
a[m] *= c[0];
}
-
+}
diff --git a/plugins/supereq/Makefile.am b/plugins/supereq/Makefile.am index 40ed5663..45010ec8 100644 --- a/plugins/supereq/Makefile.am +++ b/plugins/supereq/Makefile.am @@ -1,34 +1,51 @@ if HAVE_SUPEREQ supereqdir = $(libdir)/$(PACKAGE) pkglib_LTLIBRARIES = supereq.la -supereq_la_SOURCES = supereq.c supereq.h Equ.cpp Fftsg_fl.cpp paramlist.hpp\ -ffmpeg_fft/libavutil/mem.c\ -ffmpeg_fft/libavutil/mathematics.c\ -ffmpeg_fft/libavutil/rational.c\ -ffmpeg_fft/libavutil/intfloat_readwrite.c\ -ffmpeg_fft/libavcodec/dct.c\ -ffmpeg_fft/libavcodec/avfft.c\ -ffmpeg_fft/libavcodec/fft.c\ -ffmpeg_fft/libavcodec/dct32.c\ -ffmpeg_fft/libavcodec/rdft.c\ -ffmpeg_fft/libavutil/intfloat_readwrite.h\ -ffmpeg_fft/libavutil/avutil.h\ -ffmpeg_fft/libavutil/common.h\ -ffmpeg_fft/libavutil/attributes.h\ -ffmpeg_fft/libavutil/mem.h\ -ffmpeg_fft/libavutil/avconfig.h\ -ffmpeg_fft/libavutil/mathematics.h\ -ffmpeg_fft/libavutil/rational.h\ -ffmpeg_fft/publik.h\ -ffmpeg_fft/ffmpeg_fft.h\ -ffmpeg_fft/libavcodec/dct32.h\ -ffmpeg_fft/libavcodec/fft.h\ -ffmpeg_fft/libavcodec/avfft.h\ -ffmpeg_fft/config.h\ -ff_rdft.c +supereq_la_SOURCES = supereq.c supereq.h Equ.cpp Fftsg_fl.cpp paramlist.hpp -AM_CFLAGS = $(CFLAGS) -I ffmpeg_fft -I ffmpeg_fft/libavcodec -I ffmpeg_fft/libavutil -std=c99 -AM_CPPFLAGS = $(CXXFLAGS) -fno-exceptions -fno-rtti -nostdlib -fno-unwind-tables -I ffmpeg_fft -I ffmpeg_fft/libavcodec -I ffmpeg_fft/libavutil +#nsfft-1.00/simd/SIMDBaseUndiff.c\ +#nsfft-1.00/simd/SIMDBase.c\ +#nsfft-1.00/dft/DFT.c\ +#nsfft-1.00/dft/DFTUndiff.c\ +#nsfft-1.00/simd/SIMDBase.h\ +#nsfft-1.00/simd/SIMDBaseUndiff.h\ +#nsfft-1.00/dft/DFTUndiff.h\ +#nsfft-1.00/dft/DFT.h\ +#shibatch_rdft.c + +#ffmpeg_fft/libavutil/mem.c\ +#ffmpeg_fft/libavutil/mathematics.c\ +#ffmpeg_fft/libavutil/rational.c\ +#ffmpeg_fft/libavutil/intfloat_readwrite.c\ +#ffmpeg_fft/libavcodec/dct.c\ +#ffmpeg_fft/libavcodec/avfft.c\ +#ffmpeg_fft/libavcodec/fft.c\ +#ffmpeg_fft/libavcodec/dct32.c\ +#ffmpeg_fft/libavcodec/rdft.c\ +#ffmpeg_fft/libavutil/intfloat_readwrite.h\ +#ffmpeg_fft/libavutil/avutil.h\ +#ffmpeg_fft/libavutil/common.h\ +#ffmpeg_fft/libavutil/attributes.h\ +#ffmpeg_fft/libavutil/mem.h\ +#ffmpeg_fft/libavutil/avconfig.h\ +#ffmpeg_fft/libavutil/mathematics.h\ +#ffmpeg_fft/libavutil/rational.h\ +#ffmpeg_fft/publik.h\ +#ffmpeg_fft/ffmpeg_fft.h\ +#ffmpeg_fft/libavcodec/dct32.h\ +#ffmpeg_fft/libavcodec/fft.h\ +#ffmpeg_fft/libavcodec/avfft.h\ +#ffmpeg_fft/config.h\ +#ff_rdft.c + +#AM_CFLAGS = $(CFLAGS) -I ffmpeg_fft -I ffmpeg_fft/libavcodec -I ffmpeg_fft/libavutil -std=c99 +#AM_CPPFLAGS = $(CXXFLAGS) -fno-exceptions -fno-rtti -nostdlib -fno-unwind-tables -I ffmpeg_fft -I ffmpeg_fft/libavcodec -I ffmpeg_fft/libavutil + +#AM_CFLAGS = $(CFLAGS) -I nsfft-1.00/dft -I nsfft-1.00/simd -std=c99 -msse -DENABLE_SSE_FLOAT -DUSE_SHIBATCH +#AM_CPPFLAGS = $(CXXFLAGS) -fno-exceptions -fno-rtti -nostdlib -fno-unwind-tables -I nsfft-1.00/dft -I nsfft-1.00/simd -msse -DENABLE_SSE_FLOAT -DUSE_SHIBATCH + +AM_CFLAGS = $(CFLAGS) -std=c99 -DUSE_OOURA +AM_CPPFLAGS = $(CXXFLAGS) -fno-exceptions -fno-rtti -nostdlib -fno-unwind-tables -DUSE_OOURA supereq_la_LDFLAGS = -module -nostdlib -lsupc++ diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile b/plugins/supereq/nsfft-1.00/dft/Makefile index 5d253498..fc484116 120000 --- a/plugins/supereq/nsfft-1.00/dft/Makefile +++ b/plugins/supereq/nsfft-1.00/dft/Makefile @@ -1 +1 @@ -Makefile.x86avx
\ No newline at end of file +Makefile.x86
\ No newline at end of file diff --git a/plugins/supereq/nsfft-1.00/ooura/fftsg.c b/plugins/supereq/nsfft-1.00/ooura/fftsg.c new file mode 100644 index 00000000..43d75344 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/ooura/fftsg.c @@ -0,0 +1,3314 @@ +/* +Fast Fourier/Cosine/Sine Transform + dimension :one + data length :power of 2 + decimation :frequency + radix :split-radix + data :inplace + table :use +functions + cdft: Complex Discrete Fourier Transform + rdft: Real Discrete Fourier Transform + ddct: Discrete Cosine Transform + ddst: Discrete Sine Transform + dfct: Cosine Transform of RDFT (Real Symmetric DFT) + dfst: Sine Transform of RDFT (Real Anti-symmetric DFT) +function prototypes + void cdft(int, int, double *, int *, double *); + void rdft(int, int, double *, int *, double *); + void ddct(int, int, double *, int *, double *); + void ddst(int, int, double *, int *, double *); + void dfct(int, double *, double *, int *, double *); + void dfst(int, double *, double *, int *, double *); +macro definitions + USE_CDFT_PTHREADS : default=not defined + CDFT_THREADS_BEGIN_N : must be >= 512, default=8192 + CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536 + USE_CDFT_WINTHREADS : default=not defined + CDFT_THREADS_BEGIN_N : must be >= 512, default=32768 + CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288 + + +-------- Complex DFT (Discrete Fourier Transform) -------- + [definition] + <case1> + X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n + <case2> + X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n + (notes: sum_j=0^n-1 is a summation from j=0 to n-1) + [usage] + <case1> + ip[0] = 0; // first time only + cdft(2*n, 1, a, ip, w); + <case2> + ip[0] = 0; // first time only + cdft(2*n, -1, a, ip, w); + [parameters] + 2*n :data length (int) + n >= 1, n = power of 2 + a[0...2*n-1] :input/output data (double *) + input data + a[2*j] = Re(x[j]), + a[2*j+1] = Im(x[j]), 0<=j<n + output data + a[2*k] = Re(X[k]), + a[2*k+1] = Im(X[k]), 0<=k<n + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n) + strictly, + length of ip >= + 2+(1<<(int)(log(n+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n/2-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + cdft(2*n, -1, a, ip, w); + is + cdft(2*n, 1, a, ip, w); + for (j = 0; j <= 2 * n - 1; j++) { + a[j] *= 1.0 / n; + } + . + + +-------- Real DFT / Inverse of Real DFT -------- + [definition] + <case1> RDFT + R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2 + I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2 + <case2> IRDFT (excluding scale) + a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + + sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + + sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n + [usage] + <case1> + ip[0] = 0; // first time only + rdft(n, 1, a, ip, w); + <case2> + ip[0] = 0; // first time only + rdft(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + <case1> + output data + a[2*k] = R[k], 0<=k<n/2 + a[2*k+1] = I[k], 0<k<n/2 + a[1] = R[n/2] + <case2> + input data + a[2*j] = R[j], 0<=j<n/2 + a[2*j+1] = I[j], 0<j<n/2 + a[1] = R[n/2] + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n/2-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + rdft(n, 1, a, ip, w); + is + rdft(n, -1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- DCT (Discrete Cosine Transform) / Inverse of DCT -------- + [definition] + <case1> IDCT (excluding scale) + C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n + <case2> DCT + C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n + [usage] + <case1> + ip[0] = 0; // first time only + ddct(n, 1, a, ip, w); + <case2> + ip[0] = 0; // first time only + ddct(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + output data + a[k] = C[k], 0<=k<n + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/4-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + ddct(n, -1, a, ip, w); + is + a[0] *= 0.5; + ddct(n, 1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- DST (Discrete Sine Transform) / Inverse of DST -------- + [definition] + <case1> IDST (excluding scale) + S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n + <case2> DST + S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n + [usage] + <case1> + ip[0] = 0; // first time only + ddst(n, 1, a, ip, w); + <case2> + ip[0] = 0; // first time only + ddst(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + <case1> + input data + a[j] = A[j], 0<j<n + a[0] = A[n] + output data + a[k] = S[k], 0<=k<n + <case2> + output data + a[k] = S[k], 0<k<n + a[0] = S[n] + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/4-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + ddst(n, -1, a, ip, w); + is + a[0] *= 0.5; + ddst(n, 1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- Cosine Transform of RDFT (Real Symmetric DFT) -------- + [definition] + C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n + [usage] + ip[0] = 0; // first time only + dfct(n, a, t, ip, w); + [parameters] + n :data length - 1 (int) + n >= 2, n = power of 2 + a[0...n] :input/output data (double *) + output data + a[k] = C[k], 0<=k<=n + t[0...n/2] :work area (double *) + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/4) + strictly, + length of ip >= + 2+(1<<(int)(log(n/4+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/8-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + a[0] *= 0.5; + a[n] *= 0.5; + dfct(n, a, t, ip, w); + is + a[0] *= 0.5; + a[n] *= 0.5; + dfct(n, a, t, ip, w); + for (j = 0; j <= n; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- Sine Transform of RDFT (Real Anti-symmetric DFT) -------- + [definition] + S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n + [usage] + ip[0] = 0; // first time only + dfst(n, a, t, ip, w); + [parameters] + n :data length + 1 (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + output data + a[k] = S[k], 0<k<n + (a[0] is used for work area) + t[0...n/2-1] :work area (double *) + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/4) + strictly, + length of ip >= + 2+(1<<(int)(log(n/4+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/8-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + dfst(n, a, t, ip, w); + is + dfst(n, a, t, ip, w); + for (j = 1; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +Appendix : + The cos/sin table is recalculated when the larger table required. + w[] and ip[] are compatible with all routines. +*/ + + +void cdft(int n, int isgn, double *a, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + int nw; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + if (isgn >= 0) { + cftfsub(n, a, ip, nw, w); + } else { + cftbsub(n, a, ip, nw, w); + } +} + + +void rdft(int n, int isgn, double *a, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void rftbsub(int n, double *a, int nc, double *c); + int nw, nc; + double xi; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 2)) { + nc = n >> 2; + makect(nc, ip, w + nw); + } + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xi = a[0] - a[1]; + a[0] += a[1]; + a[1] = xi; + } else { + a[1] = 0.5 * (a[0] - a[1]); + a[0] -= a[1]; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } +} + + +void ddct(int n, int isgn, double *a, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void rftbsub(int n, double *a, int nc, double *c); + void dctsub(int n, double *a, int nc, double *c); + int j, nw, nc; + double xr; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + if (isgn < 0) { + xr = a[n - 1]; + for (j = n - 2; j >= 2; j -= 2) { + a[j + 1] = a[j] - a[j - 1]; + a[j] += a[j - 1]; + } + a[1] = a[0] - xr; + a[0] += xr; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } + dctsub(n, a, nc, w + nw); + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xr = a[0] - a[1]; + a[0] += a[1]; + for (j = 2; j < n; j += 2) { + a[j - 1] = a[j] - a[j + 1]; + a[j] += a[j + 1]; + } + a[n - 1] = xr; + } +} + + +void ddst(int n, int isgn, double *a, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void rftbsub(int n, double *a, int nc, double *c); + void dstsub(int n, double *a, int nc, double *c); + int j, nw, nc; + double xr; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + if (isgn < 0) { + xr = a[n - 1]; + for (j = n - 2; j >= 2; j -= 2) { + a[j + 1] = -a[j] - a[j - 1]; + a[j] -= a[j - 1]; + } + a[1] = a[0] + xr; + a[0] -= xr; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } + dstsub(n, a, nc, w + nw); + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xr = a[0] - a[1]; + a[0] += a[1]; + for (j = 2; j < n; j += 2) { + a[j - 1] = -a[j] - a[j + 1]; + a[j] -= a[j + 1]; + } + a[n - 1] = -xr; + } +} + + +void dfct(int n, double *a, double *t, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void dctsub(int n, double *a, int nc, double *c); + int j, k, l, m, mh, nw, nc; + double xr, xi, yr, yi; + + nw = ip[0]; + if (n > (nw << 3)) { + nw = n >> 3; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 1)) { + nc = n >> 1; + makect(nc, ip, w + nw); + } + m = n >> 1; + yi = a[m]; + xi = a[0] + a[n]; + a[0] -= a[n]; + t[0] = xi - yi; + t[m] = xi + yi; + if (n > 2) { + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + xr = a[j] - a[n - j]; + xi = a[j] + a[n - j]; + yr = a[k] - a[n - k]; + yi = a[k] + a[n - k]; + a[j] = xr; + a[k] = yr; + t[j] = xi - yi; + t[k] = xi + yi; + } + t[mh] = a[mh] + a[n - mh]; + a[mh] -= a[n - mh]; + dctsub(m, a, nc, w + nw); + if (m > 4) { + cftfsub(m, a, ip, nw, w); + rftfsub(m, a, nc, w + nw); + } else if (m == 4) { + cftfsub(m, a, ip, nw, w); + } + a[n - 1] = a[0] - a[1]; + a[1] = a[0] + a[1]; + for (j = m - 2; j >= 2; j -= 2) { + a[2 * j + 1] = a[j] + a[j + 1]; + a[2 * j - 1] = a[j] - a[j + 1]; + } + l = 2; + m = mh; + while (m >= 2) { + dctsub(m, t, nc, w + nw); + if (m > 4) { + cftfsub(m, t, ip, nw, w); + rftfsub(m, t, nc, w + nw); + } else if (m == 4) { + cftfsub(m, t, ip, nw, w); + } + a[n - l] = t[0] - t[1]; + a[l] = t[0] + t[1]; + k = 0; + for (j = 2; j < m; j += 2) { + k += l << 2; + a[k - l] = t[j] - t[j + 1]; + a[k + l] = t[j] + t[j + 1]; + } + l <<= 1; + mh = m >> 1; + for (j = 0; j < mh; j++) { + k = m - j; + t[j] = t[m + k] - t[m + j]; + t[k] = t[m + k] + t[m + j]; + } + t[mh] = t[m + mh]; + m = mh; + } + a[l] = t[0]; + a[n] = t[2] - t[1]; + a[0] = t[2] + t[1]; + } else { + a[1] = a[0]; + a[2] = t[0]; + a[0] = t[1]; + } +} + + +void dfst(int n, double *a, double *t, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void dstsub(int n, double *a, int nc, double *c); + int j, k, l, m, mh, nw, nc; + double xr, xi, yr, yi; + + nw = ip[0]; + if (n > (nw << 3)) { + nw = n >> 3; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 1)) { + nc = n >> 1; + makect(nc, ip, w + nw); + } + if (n > 2) { + m = n >> 1; + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + xr = a[j] + a[n - j]; + xi = a[j] - a[n - j]; + yr = a[k] + a[n - k]; + yi = a[k] - a[n - k]; + a[j] = xr; + a[k] = yr; + t[j] = xi + yi; + t[k] = xi - yi; + } + t[0] = a[mh] - a[n - mh]; + a[mh] += a[n - mh]; + a[0] = a[m]; + dstsub(m, a, nc, w + nw); + if (m > 4) { + cftfsub(m, a, ip, nw, w); + rftfsub(m, a, nc, w + nw); + } else if (m == 4) { + cftfsub(m, a, ip, nw, w); + } + a[n - 1] = a[1] - a[0]; + a[1] = a[0] + a[1]; + for (j = m - 2; j >= 2; j -= 2) { + a[2 * j + 1] = a[j] - a[j + 1]; + a[2 * j - 1] = -a[j] - a[j + 1]; + } + l = 2; + m = mh; + while (m >= 2) { + dstsub(m, t, nc, w + nw); + if (m > 4) { + cftfsub(m, t, ip, nw, w); + rftfsub(m, t, nc, w + nw); + } else if (m == 4) { + cftfsub(m, t, ip, nw, w); + } + a[n - l] = t[1] - t[0]; + a[l] = t[0] + t[1]; + k = 0; + for (j = 2; j < m; j += 2) { + k += l << 2; + a[k - l] = -t[j] - t[j + 1]; + a[k + l] = t[j] - t[j + 1]; + } + l <<= 1; + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + t[j] = t[m + k] + t[m + j]; + t[k] = t[m + k] - t[m + j]; + } + t[0] = t[m + mh]; + m = mh; + } + a[l] = t[0]; + } + a[0] = 0; +} + + +/* -------- initializing routines -------- */ + + +#include <math.h> + +void makewt(int nw, int *ip, double *w) +{ + void makeipt(int nw, int *ip); + int j, nwh, nw0, nw1; + double delta, wn4r, wk1r, wk1i, wk3r, wk3i; + + ip[0] = nw; + ip[1] = 1; + if (nw > 2) { + nwh = nw >> 1; + delta = atan(1.0) / nwh; + wn4r = cos(delta * nwh); + w[0] = 1; + w[1] = wn4r; + if (nwh == 4) { + w[2] = cos(delta * 2); + w[3] = sin(delta * 2); + } else if (nwh > 4) { + makeipt(nw, ip); + w[2] = 0.5 / cos(delta * 2); + w[3] = 0.5 / cos(delta * 6); + for (j = 4; j < nwh; j += 4) { + w[j] = cos(delta * j); + w[j + 1] = sin(delta * j); + w[j + 2] = cos(3 * delta * j); + w[j + 3] = -sin(3 * delta * j); + } + } + nw0 = 0; + while (nwh > 2) { + nw1 = nw0 + nwh; + nwh >>= 1; + w[nw1] = 1; + w[nw1 + 1] = wn4r; + if (nwh == 4) { + wk1r = w[nw0 + 4]; + wk1i = w[nw0 + 5]; + w[nw1 + 2] = wk1r; + w[nw1 + 3] = wk1i; + } else if (nwh > 4) { + wk1r = w[nw0 + 4]; + wk3r = w[nw0 + 6]; + w[nw1 + 2] = 0.5 / wk1r; + w[nw1 + 3] = 0.5 / wk3r; + for (j = 4; j < nwh; j += 4) { + wk1r = w[nw0 + 2 * j]; + wk1i = w[nw0 + 2 * j + 1]; + wk3r = w[nw0 + 2 * j + 2]; + wk3i = w[nw0 + 2 * j + 3]; + w[nw1 + j] = wk1r; + w[nw1 + j + 1] = wk1i; + w[nw1 + j + 2] = wk3r; + w[nw1 + j + 3] = wk3i; + } + } + nw0 = nw1; + } + } +} + + +void makeipt(int nw, int *ip) +{ + int j, l, m, m2, p, q; + + ip[2] = 0; + ip[3] = 16; + m = 2; + for (l = nw; l > 32; l >>= 2) { + m2 = m << 1; + q = m2 << 3; + for (j = m; j < m2; j++) { + p = ip[j] << 2; + ip[m + j] = p; + ip[m2 + j] = p + q; + } + m = m2; + } +} + + +void makect(int nc, int *ip, double *c) +{ + int j, nch; + double delta; + + ip[1] = nc; + if (nc > 1) { + nch = nc >> 1; + delta = atan(1.0) / nch; + c[0] = cos(delta * nch); + c[nch] = 0.5 * c[0]; + for (j = 1; j < nch; j++) { + c[j] = 0.5 * cos(delta * j); + c[nc - j] = 0.5 * sin(delta * j); + } + } +} + + +/* -------- child routines -------- */ + + +#ifdef USE_CDFT_PTHREADS +#define USE_CDFT_THREADS +#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 8192 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 65536 +#endif +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#define cdft_thread_t pthread_t +#define cdft_thread_create(thp,func,argp) { \ + if (pthread_create(thp, NULL, func, (void *) argp) != 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ +} +#define cdft_thread_wait(th) { \ + if (pthread_join(th, NULL) != 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ +} +#endif /* USE_CDFT_PTHREADS */ + + +#ifdef USE_CDFT_WINTHREADS +#define USE_CDFT_THREADS +#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 32768 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 524288 +#endif +#include <windows.h> +#include <stdio.h> +#include <stdlib.h> +#define cdft_thread_t HANDLE +#define cdft_thread_create(thp,func,argp) { \ + DWORD thid; \ + *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \ + if (*(thp) == 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ +} +#define cdft_thread_wait(th) { \ + WaitForSingleObject(th, INFINITE); \ + CloseHandle(th); \ +} +#endif /* USE_CDFT_WINTHREADS */ + + +void cftfsub(int n, double *a, int *ip, int nw, double *w) +{ + void bitrv2(int n, int *ip, double *a); + void bitrv216(double *a); + void bitrv208(double *a); + void cftf1st(int n, double *a, double *w); + void cftrec4(int n, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftfx41(int n, double *a, int nw, double *w); + void cftf161(double *a, double *w); + void cftf081(double *a, double *w); + void cftf040(double *a); + void cftx020(double *a); +#ifdef USE_CDFT_THREADS + void cftrec4_th(int n, double *a, int nw, double *w); +#endif /* USE_CDFT_THREADS */ + + if (n > 8) { + if (n > 32) { + cftf1st(n, a, &w[nw - (n >> 2)]); +#ifdef USE_CDFT_THREADS + if (n > CDFT_THREADS_BEGIN_N) { + cftrec4_th(n, a, nw, w); + } else +#endif /* USE_CDFT_THREADS */ + if (n > 512) { + cftrec4(n, a, nw, w); + } else if (n > 128) { + cftleaf(n, 1, a, nw, w); + } else { + cftfx41(n, a, nw, w); + } + bitrv2(n, ip, a); + } else if (n == 32) { + cftf161(a, &w[nw - 8]); + bitrv216(a); + } else { + cftf081(a, w); + bitrv208(a); + } + } else if (n == 8) { + cftf040(a); + } else if (n == 4) { + cftx020(a); + } +} + + +void cftbsub(int n, double *a, int *ip, int nw, double *w) +{ + void bitrv2conj(int n, int *ip, double *a); + void bitrv216neg(double *a); + void bitrv208neg(double *a); + void cftb1st(int n, double *a, double *w); + void cftrec4(int n, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftfx41(int n, double *a, int nw, double *w); + void cftf161(double *a, double *w); + void cftf081(double *a, double *w); + void cftb040(double *a); + void cftx020(double *a); +#ifdef USE_CDFT_THREADS + void cftrec4_th(int n, double *a, int nw, double *w); +#endif /* USE_CDFT_THREADS */ + + if (n > 8) { + if (n > 32) { + cftb1st(n, a, &w[nw - (n >> 2)]); +#ifdef USE_CDFT_THREADS + if (n > CDFT_THREADS_BEGIN_N) { + cftrec4_th(n, a, nw, w); + } else +#endif /* USE_CDFT_THREADS */ + if (n > 512) { + cftrec4(n, a, nw, w); + } else if (n > 128) { + cftleaf(n, 1, a, nw, w); + } else { + cftfx41(n, a, nw, w); + } + bitrv2conj(n, ip, a); + } else if (n == 32) { + cftf161(a, &w[nw - 8]); + bitrv216neg(a); + } else { + cftf081(a, w); + bitrv208neg(a); + } + } else if (n == 8) { + cftb040(a); + } else if (n == 4) { + cftx020(a); + } +} + + +void bitrv2(int n, int *ip, double *a) +{ + int j, j1, k, k1, l, m, nh, nm; + double xr, xi, yr, yi; + + m = 1; + for (l = n >> 2; l > 8; l >>= 2) { + m <<= 1; + } + nh = n >> 1; + nm = 4 * m; + if (l == 8) { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + 2 * ip[m + k]; + k1 = 4 * k + 2 * ip[m + j]; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m + k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + ip[m + k]; + j1 = k1 + 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + } +} + + +void bitrv2conj(int n, int *ip, double *a) +{ + int j, j1, k, k1, l, m, nh, nm; + double xr, xi, yr, yi; + + m = 1; + for (l = n >> 2; l > 8; l >>= 2) { + m <<= 1; + } + nh = n >> 1; + nm = 4 * m; + if (l == 8) { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + 2 * ip[m + k]; + k1 = 4 * k + 2 * ip[m + j]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m + k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += nm; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + } + } +} + + +void bitrv216(double *a) +{ + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, + x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i, + x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + a[2] = x8r; + a[3] = x8i; + a[4] = x4r; + a[5] = x4i; + a[6] = x12r; + a[7] = x12i; + a[8] = x2r; + a[9] = x2i; + a[10] = x10r; + a[11] = x10i; + a[14] = x14r; + a[15] = x14i; + a[16] = x1r; + a[17] = x1i; + a[20] = x5r; + a[21] = x5i; + a[22] = x13r; + a[23] = x13i; + a[24] = x3r; + a[25] = x3i; + a[26] = x11r; + a[27] = x11i; + a[28] = x7r; + a[29] = x7i; +} + + +void bitrv216neg(double *a) +{ + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, + x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i, + x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i, + x13r, x13i, x14r, x14i, x15r, x15i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x9r = a[18]; + x9i = a[19]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + x15r = a[30]; + x15i = a[31]; + a[2] = x15r; + a[3] = x15i; + a[4] = x7r; + a[5] = x7i; + a[6] = x11r; + a[7] = x11i; + a[8] = x3r; + a[9] = x3i; + a[10] = x13r; + a[11] = x13i; + a[12] = x5r; + a[13] = x5i; + a[14] = x9r; + a[15] = x9i; + a[16] = x1r; + a[17] = x1i; + a[18] = x14r; + a[19] = x14i; + a[20] = x6r; + a[21] = x6i; + a[22] = x10r; + a[23] = x10i; + a[24] = x2r; + a[25] = x2i; + a[26] = x12r; + a[27] = x12i; + a[28] = x4r; + a[29] = x4i; + a[30] = x8r; + a[31] = x8i; +} + + +void bitrv208(double *a) +{ + double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i; + + x1r = a[2]; + x1i = a[3]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x6r = a[12]; + x6i = a[13]; + a[2] = x4r; + a[3] = x4i; + a[6] = x6r; + a[7] = x6i; + a[8] = x1r; + a[9] = x1i; + a[12] = x3r; + a[13] = x3i; +} + + +void bitrv208neg(double *a) +{ + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, + x5r, x5i, x6r, x6i, x7r, x7i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + a[2] = x7r; + a[3] = x7i; + a[4] = x3r; + a[5] = x3i; + a[6] = x5r; + a[7] = x5i; + a[8] = x1r; + a[9] = x1i; + a[10] = x6r; + a[11] = x6i; + a[12] = x2r; + a[13] = x2i; + a[14] = x4r; + a[15] = x4i; +} + + +void cftf1st(int n, double *a, double *w) +{ + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, + wd1r, wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + csc1 = w[2]; + csc3 = w[3]; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + k = 0; + for (j = 2; j < mh - 2; j += 4) { + k += 4; + wk1r = csc1 * (wd1r + w[k]); + wk1i = csc1 * (wd1i + w[k + 1]); + wk3r = csc3 * (wd3r + w[k + 2]); + wk3i = csc3 * (wd3i + w[k + 3]); + wd1r = w[k]; + wd1i = w[k + 1]; + wd3r = w[k + 2]; + wd3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + y0r = a[j + 2] + a[j2 + 2]; + y0i = a[j + 3] + a[j2 + 3]; + y1r = a[j + 2] - a[j2 + 2]; + y1i = a[j + 3] - a[j2 + 3]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 + 2] + a[j3 + 2]; + y2i = a[j1 + 3] + a[j3 + 3]; + y3r = a[j1 + 2] - a[j3 + 2]; + y3i = a[j1 + 3] - a[j3 + 3]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = a[j0 - 1] + a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = a[j0 - 1] + a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i + x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = a[j0 + 3] + a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = a[j0 + 3] - a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i + x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + + +void cftb1st(int n, double *a, double *w) +{ + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, + wd1r, wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = -a[1] - a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = -a[1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j2] = x1r + x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r - x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + csc1 = w[2]; + csc3 = w[3]; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + k = 0; + for (j = 2; j < mh - 2; j += 4) { + k += 4; + wk1r = csc1 * (wd1r + w[k]); + wk1i = csc1 * (wd1i + w[k + 1]); + wk3r = csc3 * (wd3r + w[k + 2]); + wk3i = csc3 * (wd3i + w[k + 3]); + wd1r = w[k]; + wd1i = w[k + 1]; + wd3r = w[k + 2]; + wd3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = -a[j + 1] - a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = -a[j + 1] + a[j2 + 1]; + y0r = a[j + 2] + a[j2 + 2]; + y0i = -a[j + 3] - a[j2 + 3]; + y1r = a[j + 2] - a[j2 + 2]; + y1i = -a[j + 3] + a[j2 + 3]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 + 2] + a[j3 + 2]; + y2i = a[j1 + 3] + a[j3 + 3]; + y3r = a[j1 + 2] - a[j3 + 2]; + y3i = a[j1 + 3] - a[j3 + 3]; + a[j] = x0r + x2r; + a[j + 1] = x0i - x2i; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i - y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i + y2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = -a[j0 - 1] - a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i - y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i + y2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = -a[j0 - 1] - a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i - x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = -a[j0 + 3] - a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = -a[j0 + 3] + a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i - x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + + +#ifdef USE_CDFT_THREADS +struct cdft_arg_st { + int n0; + int n; + double *a; + int nw; + double *w; +}; +typedef struct cdft_arg_st cdft_arg_t; + + +void cftrec4_th(int n, double *a, int nw, double *w) +{ + void *cftrec1_th(void *p); + void *cftrec2_th(void *p); + int i, idiv4, m, nthread; + cdft_thread_t th[4]; + cdft_arg_t ag[4]; + + nthread = 2; + idiv4 = 0; + m = n >> 1; + if (n > CDFT_4THREADS_BEGIN_N) { + nthread = 4; + idiv4 = 1; + m >>= 1; + } + for (i = 0; i < nthread; i++) { + ag[i].n0 = n; + ag[i].n = m; + ag[i].a = &a[i * m]; + ag[i].nw = nw; + ag[i].w = w; + if (i != idiv4) { + cdft_thread_create(&th[i], cftrec1_th, &ag[i]); + } else { + cdft_thread_create(&th[i], cftrec2_th, &ag[i]); + } + } + for (i = 0; i < nthread; i++) { + cdft_thread_wait(th[i]); + } +} + + +void *cftrec1_th(void *p) +{ + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl1(int n, double *a, double *w); + int isplt, j, k, m, n, n0, nw; + double *a, *w; + + n0 = ((cdft_arg_t *) p)->n0; + n = ((cdft_arg_t *) p)->n; + a = ((cdft_arg_t *) p)->a; + nw = ((cdft_arg_t *) p)->nw; + w = ((cdft_arg_t *) p)->w; + m = n0; + while (m > 512) { + m >>= 2; + cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); + } + cftleaf(m, 1, &a[n - m], nw, w); + k = 0; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } + return (void *) 0; +} + + +void *cftrec2_th(void *p) +{ + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl2(int n, double *a, double *w); + int isplt, j, k, m, n, n0, nw; + double *a, *w; + + n0 = ((cdft_arg_t *) p)->n0; + n = ((cdft_arg_t *) p)->n; + a = ((cdft_arg_t *) p)->a; + nw = ((cdft_arg_t *) p)->nw; + w = ((cdft_arg_t *) p)->w; + k = 1; + m = n0; + while (m > 512) { + m >>= 2; + k <<= 2; + cftmdl2(m, &a[n - m], &w[nw - m]); + } + cftleaf(m, 0, &a[n - m], nw, w); + k >>= 1; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } + return (void *) 0; +} +#endif /* USE_CDFT_THREADS */ + + +void cftrec4(int n, double *a, int nw, double *w) +{ + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl1(int n, double *a, double *w); + int isplt, j, k, m; + + m = n; + while (m > 512) { + m >>= 2; + cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); + } + cftleaf(m, 1, &a[n - m], nw, w); + k = 0; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } +} + + +int cfttree(int n, int j, int k, double *a, int nw, double *w) +{ + void cftmdl1(int n, double *a, double *w); + void cftmdl2(int n, double *a, double *w); + int i, isplt, m; + + if ((k & 3) != 0) { + isplt = k & 1; + if (isplt != 0) { + cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]); + } else { + cftmdl2(n, &a[j - n], &w[nw - n]); + } + } else { + m = n; + for (i = k; (i & 3) == 0; i >>= 2) { + m <<= 2; + } + isplt = i & 1; + if (isplt != 0) { + while (m > 128) { + cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]); + m >>= 2; + } + } else { + while (m > 128) { + cftmdl2(m, &a[j - m], &w[nw - m]); + m >>= 2; + } + } + } + return isplt; +} + + +void cftleaf(int n, int isplt, double *a, int nw, double *w) +{ + void cftmdl1(int n, double *a, double *w); + void cftmdl2(int n, double *a, double *w); + void cftf161(double *a, double *w); + void cftf162(double *a, double *w); + void cftf081(double *a, double *w); + void cftf082(double *a, double *w); + + if (n == 512) { + cftmdl1(128, a, &w[nw - 64]); + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + cftmdl2(128, &a[128], &w[nw - 128]); + cftf161(&a[128], &w[nw - 8]); + cftf162(&a[160], &w[nw - 32]); + cftf161(&a[192], &w[nw - 8]); + cftf162(&a[224], &w[nw - 32]); + cftmdl1(128, &a[256], &w[nw - 64]); + cftf161(&a[256], &w[nw - 8]); + cftf162(&a[288], &w[nw - 32]); + cftf161(&a[320], &w[nw - 8]); + cftf161(&a[352], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(128, &a[384], &w[nw - 64]); + cftf161(&a[480], &w[nw - 8]); + } else { + cftmdl2(128, &a[384], &w[nw - 128]); + cftf162(&a[480], &w[nw - 32]); + } + cftf161(&a[384], &w[nw - 8]); + cftf162(&a[416], &w[nw - 32]); + cftf161(&a[448], &w[nw - 8]); + } else { + cftmdl1(64, a, &w[nw - 32]); + cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], &w[nw - 8]); + cftmdl2(64, &a[64], &w[nw - 64]); + cftf081(&a[64], &w[nw - 8]); + cftf082(&a[80], &w[nw - 8]); + cftf081(&a[96], &w[nw - 8]); + cftf082(&a[112], &w[nw - 8]); + cftmdl1(64, &a[128], &w[nw - 32]); + cftf081(&a[128], &w[nw - 8]); + cftf082(&a[144], &w[nw - 8]); + cftf081(&a[160], &w[nw - 8]); + cftf081(&a[176], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(64, &a[192], &w[nw - 32]); + cftf081(&a[240], &w[nw - 8]); + } else { + cftmdl2(64, &a[192], &w[nw - 64]); + cftf082(&a[240], &w[nw - 8]); + } + cftf081(&a[192], &w[nw - 8]); + cftf082(&a[208], &w[nw - 8]); + cftf081(&a[224], &w[nw - 8]); + } +} + + +void cftmdl1(int n, double *a, double *w) +{ + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, wk1r, wk1i, wk3r, wk3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + k = 0; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + } + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); +} + + +void cftmdl2(int n, double *a, double *w) +{ + int j, j0, j1, j2, j3, k, kr, m, mh; + double wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i; + + mh = n >> 3; + m = 2 * mh; + wn4r = w[1]; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] - a[j2 + 1]; + x0i = a[1] + a[j2]; + x1r = a[0] + a[j2 + 1]; + x1i = a[1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wn4r * (x2r - x2i); + y0i = wn4r * (x2i + x2r); + a[0] = x0r + y0r; + a[1] = x0i + y0i; + a[j1] = x0r - y0r; + a[j1 + 1] = x0i - y0i; + y0r = wn4r * (x3r - x3i); + y0i = wn4r * (x3i + x3r); + a[j2] = x1r - y0i; + a[j2 + 1] = x1i + y0r; + a[j3] = x1r + y0i; + a[j3 + 1] = x1i - y0r; + k = 0; + kr = 2 * m; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + kr -= 4; + wd1i = w[kr]; + wd1r = w[kr + 1]; + wd3i = w[kr + 2]; + wd3r = w[kr + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] - a[j2 + 1]; + x0i = a[j + 1] + a[j2]; + x1r = a[j] + a[j2 + 1]; + x1i = a[j + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wd1r * x2r - wd1i * x2i; + y2i = wd1r * x2i + wd1i * x2r; + a[j] = y0r + y2r; + a[j + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk3r * x1r + wk3i * x1i; + y0i = wk3r * x1i - wk3i * x1r; + y2r = wd3r * x3r + wd3i * x3i; + y2i = wd3r * x3i - wd3i * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wd1i * x0r - wd1r * x0i; + y0i = wd1i * x0i + wd1r * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wd3i * x1r + wd3r * x1i; + y0i = wd3i * x1i - wd3r * x1r; + y2r = wk3i * x3r + wk3r * x3i; + y2i = wk3i * x3i - wk3r * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + } + wk1r = w[m]; + wk1i = w[m + 1]; + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk1i * x1r - wk1r * x1i; + y0i = wk1i * x1i + wk1r * x1r; + y2r = wk1r * x3r - wk1i * x3i; + y2i = wk1r * x3i + wk1i * x3r; + a[j2] = y0r - y2r; + a[j2 + 1] = y0i - y2i; + a[j3] = y0r + y2r; + a[j3 + 1] = y0i + y2i; +} + + +void cftfx41(int n, double *a, int nw, double *w) +{ + void cftf161(double *a, double *w); + void cftf162(double *a, double *w); + void cftf081(double *a, double *w); + void cftf082(double *a, double *w); + + if (n == 128) { + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + } else { + cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], &w[nw - 8]); + } +} + + +void cftf161(double *a, double *w) +{ + double wn4r, wk1r, wk1i, + x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, + y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, + y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i; + + wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + x0r = a[0] + a[16]; + x0i = a[1] + a[17]; + x1r = a[0] - a[16]; + x1i = a[1] - a[17]; + x2r = a[8] + a[24]; + x2i = a[9] + a[25]; + x3r = a[8] - a[24]; + x3i = a[9] - a[25]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y4r = x0r - x2r; + y4i = x0i - x2i; + y8r = x1r - x3i; + y8i = x1i + x3r; + y12r = x1r + x3i; + y12i = x1i - x3r; + x0r = a[2] + a[18]; + x0i = a[3] + a[19]; + x1r = a[2] - a[18]; + x1i = a[3] - a[19]; + x2r = a[10] + a[26]; + x2i = a[11] + a[27]; + x3r = a[10] - a[26]; + x3i = a[11] - a[27]; + y1r = x0r + x2r; + y1i = x0i + x2i; + y5r = x0r - x2r; + y5i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y9r = wk1r * x0r - wk1i * x0i; + y9i = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y13r = wk1i * x0r - wk1r * x0i; + y13i = wk1i * x0i + wk1r * x0r; + x0r = a[4] + a[20]; + x0i = a[5] + a[21]; + x1r = a[4] - a[20]; + x1i = a[5] - a[21]; + x2r = a[12] + a[28]; + x2i = a[13] + a[29]; + x3r = a[12] - a[28]; + x3i = a[13] - a[29]; + y2r = x0r + x2r; + y2i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y10r = wn4r * (x0r - x0i); + y10i = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + y14r = wn4r * (x0r + x0i); + y14i = wn4r * (x0i - x0r); + x0r = a[6] + a[22]; + x0i = a[7] + a[23]; + x1r = a[6] - a[22]; + x1i = a[7] - a[23]; + x2r = a[14] + a[30]; + x2i = a[15] + a[31]; + x3r = a[14] - a[30]; + x3i = a[15] - a[31]; + y3r = x0r + x2r; + y3i = x0i + x2i; + y7r = x0r - x2r; + y7i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y11r = wk1i * x0r - wk1r * x0i; + y11i = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y15r = wk1r * x0r - wk1i * x0i; + y15i = wk1r * x0i + wk1i * x0r; + x0r = y12r - y14r; + x0i = y12i - y14i; + x1r = y12r + y14r; + x1i = y12i + y14i; + x2r = y13r - y15r; + x2i = y13i - y15i; + x3r = y13r + y15r; + x3i = y13i + y15i; + a[24] = x0r + x2r; + a[25] = x0i + x2i; + a[26] = x0r - x2r; + a[27] = x0i - x2i; + a[28] = x1r - x3i; + a[29] = x1i + x3r; + a[30] = x1r + x3i; + a[31] = x1i - x3r; + x0r = y8r + y10r; + x0i = y8i + y10i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + x3r = y9r - y11r; + x3i = y9i - y11i; + a[16] = x0r + x2r; + a[17] = x0i + x2i; + a[18] = x0r - x2r; + a[19] = x0i - x2i; + a[20] = x1r - x3i; + a[21] = x1i + x3r; + a[22] = x1r + x3i; + a[23] = x1i - x3r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + x0r = y5r + y7i; + x0i = y5i - y7r; + x3r = wn4r * (x0r - x0i); + x3i = wn4r * (x0i + x0r); + x0r = y4r - y6i; + x0i = y4i + y6r; + x1r = y4r + y6i; + x1i = y4i - y6r; + a[8] = x0r + x2r; + a[9] = x0i + x2i; + a[10] = x0r - x2r; + a[11] = x0i - x2i; + a[12] = x1r - x3i; + a[13] = x1i + x3r; + a[14] = x1r + x3i; + a[15] = x1i - x3r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + x3r = y1r - y3r; + x3i = y1i - y3i; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x0r - x2r; + a[3] = x0i - x2i; + a[4] = x1r - x3i; + a[5] = x1i + x3r; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + + +void cftf162(double *a, double *w) +{ + double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, + x0r, x0i, x1r, x1i, x2r, x2i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, + y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, + y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i; + + wn4r = w[1]; + wk1r = w[4]; + wk1i = w[5]; + wk3r = w[6]; + wk3i = -w[7]; + wk2r = w[8]; + wk2i = w[9]; + x1r = a[0] - a[17]; + x1i = a[1] + a[16]; + x0r = a[8] - a[25]; + x0i = a[9] + a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y0r = x1r + x2r; + y0i = x1i + x2i; + y4r = x1r - x2r; + y4i = x1i - x2i; + x1r = a[0] + a[17]; + x1i = a[1] - a[16]; + x0r = a[8] + a[25]; + x0i = a[9] - a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y8r = x1r - x2i; + y8i = x1i + x2r; + y12r = x1r + x2i; + y12i = x1i - x2r; + x0r = a[2] - a[19]; + x0i = a[3] + a[18]; + x1r = wk1r * x0r - wk1i * x0i; + x1i = wk1r * x0i + wk1i * x0r; + x0r = a[10] - a[27]; + x0i = a[11] + a[26]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y1r = x1r + x2r; + y1i = x1i + x2i; + y5r = x1r - x2r; + y5i = x1i - x2i; + x0r = a[2] + a[19]; + x0i = a[3] - a[18]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[10] + a[27]; + x0i = a[11] - a[26]; + x2r = wk1r * x0r + wk1i * x0i; + x2i = wk1r * x0i - wk1i * x0r; + y9r = x1r - x2r; + y9i = x1i - x2i; + y13r = x1r + x2r; + y13i = x1i + x2i; + x0r = a[4] - a[21]; + x0i = a[5] + a[20]; + x1r = wk2r * x0r - wk2i * x0i; + x1i = wk2r * x0i + wk2i * x0r; + x0r = a[12] - a[29]; + x0i = a[13] + a[28]; + x2r = wk2i * x0r - wk2r * x0i; + x2i = wk2i * x0i + wk2r * x0r; + y2r = x1r + x2r; + y2i = x1i + x2i; + y6r = x1r - x2r; + y6i = x1i - x2i; + x0r = a[4] + a[21]; + x0i = a[5] - a[20]; + x1r = wk2i * x0r - wk2r * x0i; + x1i = wk2i * x0i + wk2r * x0r; + x0r = a[12] + a[29]; + x0i = a[13] - a[28]; + x2r = wk2r * x0r - wk2i * x0i; + x2i = wk2r * x0i + wk2i * x0r; + y10r = x1r - x2r; + y10i = x1i - x2i; + y14r = x1r + x2r; + y14i = x1i + x2i; + x0r = a[6] - a[23]; + x0i = a[7] + a[22]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[14] - a[31]; + x0i = a[15] + a[30]; + x2r = wk1i * x0r - wk1r * x0i; + x2i = wk1i * x0i + wk1r * x0r; + y3r = x1r + x2r; + y3i = x1i + x2i; + y7r = x1r - x2r; + y7i = x1i - x2i; + x0r = a[6] + a[23]; + x0i = a[7] - a[22]; + x1r = wk1i * x0r + wk1r * x0i; + x1i = wk1i * x0i - wk1r * x0r; + x0r = a[14] + a[31]; + x0i = a[15] - a[30]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y11r = x1r + x2r; + y11i = x1i + x2i; + y15r = x1r - x2r; + y15i = x1i - x2i; + x1r = y0r + y2r; + x1i = y0i + y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + a[0] = x1r + x2r; + a[1] = x1i + x2i; + a[2] = x1r - x2r; + a[3] = x1i - x2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r - y3r; + x2i = y1i - y3i; + a[4] = x1r - x2i; + a[5] = x1i + x2r; + a[6] = x1r + x2i; + a[7] = x1i - x2r; + x1r = y4r - y6i; + x1i = y4i + y6r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[8] = x1r + x2r; + a[9] = x1i + x2i; + a[10] = x1r - x2r; + a[11] = x1i - x2i; + x1r = y4r + y6i; + x1i = y4i - y6r; + x0r = y5r + y7i; + x0i = y5i - y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[12] = x1r - x2i; + a[13] = x1i + x2r; + a[14] = x1r + x2i; + a[15] = x1i - x2r; + x1r = y8r + y10r; + x1i = y8i + y10i; + x2r = y9r - y11r; + x2i = y9i - y11i; + a[16] = x1r + x2r; + a[17] = x1i + x2i; + a[18] = x1r - x2r; + a[19] = x1i - x2i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + a[20] = x1r - x2i; + a[21] = x1i + x2r; + a[22] = x1r + x2i; + a[23] = x1i - x2r; + x1r = y12r - y14i; + x1i = y12i + y14r; + x0r = y13r + y15i; + x0i = y13i - y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[24] = x1r + x2r; + a[25] = x1i + x2i; + a[26] = x1r - x2r; + a[27] = x1i - x2i; + x1r = y12r + y14i; + x1i = y12i - y14r; + x0r = y13r - y15i; + x0i = y13i + y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[28] = x1r - x2i; + a[29] = x1i + x2r; + a[30] = x1r + x2i; + a[31] = x1i - x2r; +} + + +void cftf081(double *a, double *w) +{ + double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + x0r = a[0] + a[8]; + x0i = a[1] + a[9]; + x1r = a[0] - a[8]; + x1i = a[1] - a[9]; + x2r = a[4] + a[12]; + x2i = a[5] + a[13]; + x3r = a[4] - a[12]; + x3i = a[5] - a[13]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y2r = x0r - x2r; + y2i = x0i - x2i; + y1r = x1r - x3i; + y1i = x1i + x3r; + y3r = x1r + x3i; + y3i = x1i - x3r; + x0r = a[2] + a[10]; + x0i = a[3] + a[11]; + x1r = a[2] - a[10]; + x1i = a[3] - a[11]; + x2r = a[6] + a[14]; + x2i = a[7] + a[15]; + x3r = a[6] - a[14]; + x3i = a[7] - a[15]; + y4r = x0r + x2r; + y4i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + x2r = x1r + x3i; + x2i = x1i - x3r; + y5r = wn4r * (x0r - x0i); + y5i = wn4r * (x0r + x0i); + y7r = wn4r * (x2r - x2i); + y7i = wn4r * (x2r + x2i); + a[8] = y1r + y5r; + a[9] = y1i + y5i; + a[10] = y1r - y5r; + a[11] = y1i - y5i; + a[12] = y3r - y7i; + a[13] = y3i + y7r; + a[14] = y3r + y7i; + a[15] = y3i - y7r; + a[0] = y0r + y4r; + a[1] = y0i + y4i; + a[2] = y0r - y4r; + a[3] = y0i - y4i; + a[4] = y2r - y6i; + a[5] = y2i + y6r; + a[6] = y2r + y6i; + a[7] = y2i - y6r; +} + + +void cftf082(double *a, double *w) +{ + double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + y0r = a[0] - a[9]; + y0i = a[1] + a[8]; + y1r = a[0] + a[9]; + y1i = a[1] - a[8]; + x0r = a[4] - a[13]; + x0i = a[5] + a[12]; + y2r = wn4r * (x0r - x0i); + y2i = wn4r * (x0i + x0r); + x0r = a[4] + a[13]; + x0i = a[5] - a[12]; + y3r = wn4r * (x0r - x0i); + y3i = wn4r * (x0i + x0r); + x0r = a[2] - a[11]; + x0i = a[3] + a[10]; + y4r = wk1r * x0r - wk1i * x0i; + y4i = wk1r * x0i + wk1i * x0r; + x0r = a[2] + a[11]; + x0i = a[3] - a[10]; + y5r = wk1i * x0r - wk1r * x0i; + y5i = wk1i * x0i + wk1r * x0r; + x0r = a[6] - a[15]; + x0i = a[7] + a[14]; + y6r = wk1i * x0r - wk1r * x0i; + y6i = wk1i * x0i + wk1r * x0r; + x0r = a[6] + a[15]; + x0i = a[7] - a[14]; + y7r = wk1r * x0r - wk1i * x0i; + y7i = wk1r * x0i + wk1i * x0r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y4r + y6r; + x1i = y4i + y6i; + a[0] = x0r + x1r; + a[1] = x0i + x1i; + a[2] = x0r - x1r; + a[3] = x0i - x1i; + x0r = y0r - y2r; + x0i = y0i - y2i; + x1r = y4r - y6r; + x1i = y4i - y6i; + a[4] = x0r - x1i; + a[5] = x0i + x1r; + a[6] = x0r + x1i; + a[7] = x0i - x1r; + x0r = y1r - y3i; + x0i = y1i + y3r; + x1r = y5r - y7r; + x1i = y5i - y7i; + a[8] = x0r + x1r; + a[9] = x0i + x1i; + a[10] = x0r - x1r; + a[11] = x0i - x1i; + x0r = y1r + y3i; + x0i = y1i - y3r; + x1r = y5r + y7r; + x1i = y5i + y7i; + a[12] = x0r - x1i; + a[13] = x0i + x1r; + a[14] = x0r + x1i; + a[15] = x0i - x1r; +} + + +void cftf040(double *a) +{ + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r - x3i; + a[3] = x1i + x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + + +void cftb040(double *a) +{ + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r + x3i; + a[3] = x1i - x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r - x3i; + a[7] = x1i + x3r; +} + + +void cftx020(double *a) +{ + double x0r, x0i; + + x0r = a[0] - a[2]; + x0i = a[1] - a[3]; + a[0] += a[2]; + a[1] += a[3]; + a[2] = x0r; + a[3] = x0i; +} + + +void rftfsub(int n, double *a, int nc, double *c) +{ + int j, k, kk, ks, m; + double wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + + +void rftbsub(int n, double *a, int nc, double *c) +{ + int j, k, kk, ks, m; + double wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + + +void dctsub(int n, double *a, int nc, double *c) +{ + int j, k, kk, ks, m; + double wkr, wki, xr; + + m = n >> 1; + ks = nc / n; + kk = 0; + for (j = 1; j < m; j++) { + k = n - j; + kk += ks; + wkr = c[kk] - c[nc - kk]; + wki = c[kk] + c[nc - kk]; + xr = wki * a[j] - wkr * a[k]; + a[j] = wkr * a[j] + wki * a[k]; + a[k] = xr; + } + a[m] *= c[0]; +} + + +void dstsub(int n, double *a, int nc, double *c) +{ + int j, k, kk, ks, m; + double wkr, wki, xr; + + m = n >> 1; + ks = nc / n; + kk = 0; + for (j = 1; j < m; j++) { + k = n - j; + kk += ks; + wkr = c[kk] - c[nc - kk]; + wki = c[kk] + c[nc - kk]; + xr = wki * a[k] - wkr * a[j]; + a[k] = wkr * a[k] + wki * a[j]; + a[j] = xr; + } + a[m] *= c[0]; +} + diff --git a/plugins/supereq/nsfft-1.00/ooura/pi_fft.c b/plugins/supereq/nsfft-1.00/ooura/pi_fft.c new file mode 100644 index 00000000..c9a76bf8 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/ooura/pi_fft.c @@ -0,0 +1,1616 @@ +/* +---- calculation of PI(= 3.14159...) using FFT ---- + by T.Ooura, ver. LG1.1.2-MP1.5a Sep. 2001. + +This is a test program to estimate the performance of +the FFT routines: fft*g.c. + +Example compilation: + GNU : gcc -O6 -ffast-math pi_fft.c fftsg.c -lm -o pi_fftsg + SUN : cc -fast -xO5 pi_fft.c fft8g.c -lm -o pi_fft8g + Microsoft: cl /O2 /G6 pi_fft.c fft4g.c /Fepi_fft4g.exe + ... + etc. +*/ + +/* Please check the following macros before compiling */ +#ifndef DBL_ERROR_MARGIN +#define DBL_ERROR_MARGIN 0.3 /* must be < 0.5 */ +#endif + + +#include <math.h> +#include <limits.h> +#include <float.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + + +void mp_load_0(int n, int radix, int out[]); +void mp_load_1(int n, int radix, int out[]); +void mp_copy(int n, int radix, int in[], int out[]); +void mp_round(int n, int radix, int m, int inout[]); +int mp_cmp(int n, int radix, int in1[], int in2[]); +void mp_add(int n, int radix, int in1[], int in2[], int out[]); +void mp_sub(int n, int radix, int in1[], int in2[], int out[]); +void mp_imul(int n, int radix, int in1[], int in2, int out[]); +int mp_idiv(int n, int radix, int in1[], int in2, int out[]); +void mp_idiv_2(int n, int radix, int in[], int out[]); +double mp_mul_radix_test(int n, int radix, int nfft, + double tmpfft[], int ip[], double w[]); +void mp_mul(int n, int radix, int in1[], int in2[], int out[], + int tmp[], int nfft, double tmp1fft[], double tmp2fft[], + double tmp3fft[], int ip[], double w[]); +void mp_squ(int n, int radix, int in[], int out[], int tmp[], + int nfft, double tmp1fft[], double tmp2fft[], + int ip[], double w[]); +void mp_mulh(int n, int radix, int in1[], int in2[], int out[], + int nfft, double in1fft[], double outfft[], + int ip[], double w[]); +void mp_squh(int n, int radix, int in[], int out[], + int nfft, double inoutfft[], int ip[], double w[]); +int mp_inv(int n, int radix, int in[], int out[], + int tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]); +int mp_sqrt(int n, int radix, int in[], int out[], + int tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]); +void mp_sprintf(int n, int log10_radix, int in[], char out[]); +void mp_sscanf(int n, int log10_radix, char in[], int out[]); +void mp_fprintf(int n, int log10_radix, int in[], FILE *fout); + + +int main() +{ + int nfft, log2_nfft, radix, log10_radix, n, npow, nprc; + double err, d_time, n_op; + int *a, *b, *c, *e, *i1, *i2, *ip; + double *d1, *d2, *d3, *w; + time_t t_1, t_2; + FILE *f_log, *f_out; + + f_log = fopen("pi.log", "w"); + printf("PI calculation to estimate the FFT benchmarks\n"); + fprintf(f_log, "PI calculation to estimate the FFT benchmarks\n"); + printf("length of FFT =?\n"); + scanf("%d", &nfft); + + printf("initializing...\n"); + for (log2_nfft = 1; (1 << log2_nfft) < nfft; log2_nfft++); + nfft = 1 << log2_nfft; + n = nfft + 2; + ip = (int *) malloc((3 + (int) sqrt(0.5 * nfft)) * sizeof(int)); + w = (double *) malloc(nfft / 2 * sizeof(double)); + a = (int *) malloc((n + 2) * sizeof(int)); + b = (int *) malloc((n + 2) * sizeof(int)); + c = (int *) malloc((n + 2) * sizeof(int)); + e = (int *) malloc((n + 2) * sizeof(int)); + i1 = (int *) malloc((n + 2) * sizeof(int)); + i2 = (int *) malloc((n + 2) * sizeof(int)); + d1 = (double *) malloc((nfft + 2) * sizeof(double)); + d2 = (double *) malloc((nfft + 2) * sizeof(double)); + d3 = (double *) malloc((nfft + 2) * sizeof(double)); + if (d3 == NULL) { + printf("Allocation Failure!\n"); + exit(1); + } + ip[0] = 0; + /* ---- radix test ---- */ + log10_radix = 1; + radix = 10; + err = mp_mul_radix_test(n, radix, nfft, d1, ip, w); + err += DBL_EPSILON * (n * radix * radix / 4); + while (100 * err < DBL_ERROR_MARGIN && radix <= INT_MAX / 20) { + err *= 100; + log10_radix++; + radix *= 10; + } + printf("nfft= %d\nradix= %d\nerror_margin= %g\n", nfft, radix, err); + fprintf(f_log, "nfft= %d\nradix= %d\nerror_margin= %g\n", nfft, radix, err); + printf("calculating %d digits of PI...\n", log10_radix * (n - 2)); + fprintf(f_log, "calculating %d digits of PI...\n", log10_radix * (n - 2)); + /* ---- time check ---- */ + time(&t_1); + /* + * ---- a formula based on the AGM (Arithmetic-Geometric Mean) ---- + * c = sqrt(0.125); + * a = 1 + 3 * c; + * b = sqrt(a); + * e = b - 0.625; + * b = 2 * b; + * c = e - c; + * a = a + e; + * npow = 4; + * do { + * npow = 2 * npow; + * e = (a + b) / 2; + * b = sqrt(a * b); + * e = e - b; + * b = 2 * b; + * c = c - e; + * a = e + b; + * } while (e > SQRT_SQRT_EPSILON); + * e = e * e / 4; + * a = a + b; + * pi = (a * a - e - e / 2) / (a * c - e) / npow; + * ---- modification ---- + * This is a modified version of Gauss-Legendre formula + * (by T.Ooura). It is faster than original version. + * ---- reference ---- + * 1. E.Salamin, + * Computation of PI Using Arithmetic-Geometric Mean, + * Mathematics of Computation, Vol.30 1976. + * 2. R.P.Brent, + * Fast Multiple-Precision Evaluation of Elementary Functions, + * J. ACM 23 1976. + * 3. D.Takahasi, Y.Kanada, + * Calculation of PI to 51.5 Billion Decimal Digits on + * Distributed Memoriy Parallel Processors, + * Transactions of Information Processing Society of Japan, + * Vol.39 No.7 1998. + * 4. T.Ooura, + * Improvement of the PI Calculation Algorithm and + * Implementation of Fast Multiple-Precision Computation, + * Information Processing Society of Japan SIG Notes, + * 98-HPC-74, 1998. + */ + /* ---- c = sqrt(0.125) ---- */ + mp_sscanf(n, log10_radix, "0.125", a); + mp_sqrt(n, radix, a, c, i1, i2, nfft, d1, d2, ip, w); + /* ---- a = 1 + 3 * c ---- */ + mp_imul(n, radix, c, 3, e); + mp_sscanf(n, log10_radix, "1", a); + mp_add(n, radix, a, e, a); + /* ---- b = sqrt(a) ---- */ + mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2, ip, w); + /* ---- e = b - 0.625 ---- */ + mp_sscanf(n, log10_radix, "0.625", e); + mp_sub(n, radix, b, e, e); + /* ---- b = 2 * b ---- */ + mp_add(n, radix, b, b, b); + /* ---- c = e - c ---- */ + mp_sub(n, radix, e, c, c); + /* ---- a = a + e ---- */ + mp_add(n, radix, a, e, a); + printf("AGM iteration\n"); + fprintf(f_log, "AGM iteration\n"); + npow = 4; + do { + npow *= 2; + /* ---- e = (a + b) / 2 ---- */ + mp_add(n, radix, a, b, e); + mp_idiv_2(n, radix, e, e); + /* ---- b = sqrt(a * b) ---- */ + mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w); + mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2, ip, w); + /* ---- e = e - b ---- */ + mp_sub(n, radix, e, b, e); + /* ---- b = 2 * b ---- */ + mp_add(n, radix, b, b, b); + /* ---- c = c - e ---- */ + mp_sub(n, radix, c, e, c); + /* ---- a = e + b ---- */ + mp_add(n, radix, e, b, a); + /* ---- convergence check ---- */ + nprc = -e[1]; + if (e[0] == 0) { + nprc = n; + } + printf("precision= %d\n", 4 * nprc * log10_radix); + fprintf(f_log, "precision= %d\n", 4 * nprc * log10_radix); + } while (4 * nprc <= n); + /* ---- e = e * e / 4 (half precision) ---- */ + mp_idiv_2(n, radix, e, e); + mp_squh(n, radix, e, e, nfft, d1, ip, w); + /* ---- a = a + b ---- */ + mp_add(n, radix, a, b, a); + /* ---- a = (a * a - e - e / 2) / (a * c - e) / npow ---- */ + mp_mul(n, radix, a, c, c, i1, nfft, d1, d2, d3, ip, w); + mp_sub(n, radix, c, e, c); + mp_inv(n, radix, c, b, i1, i2, nfft, d1, d2, ip, w); + mp_squ(n, radix, a, a, i1, nfft, d1, d2, ip, w); + mp_sub(n, radix, a, e, a); + mp_idiv_2(n, radix, e, e); + mp_sub(n, radix, a, e, a); + mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w); + mp_idiv(n, radix, a, npow, a); + /* ---- time check ---- */ + time(&t_2); + /* ---- output ---- */ + f_out = fopen("pi.dat", "w"); + printf("writing pi.dat...\n"); + mp_fprintf(n - 1, log10_radix, a, f_out); + fclose(f_out); + free(d3); + free(d2); + free(d1); + free(i2); + free(i1); + free(e); + free(c); + free(b); + free(a); + free(w); + free(ip); + /* ---- benchmark ---- */ + n_op = 50.0 * nfft * log2_nfft * log2_nfft; + printf("floating point operation: %g op.\n", n_op); + fprintf(f_log, "floating point operation: %g op.\n", n_op); + /* ---- difftime ---- */ + d_time = difftime(t_2, t_1); + printf("execution time: %g sec. (real time)\n", d_time); + fprintf(f_log, "execution time: %g sec. (real time)\n", d_time); + fclose(f_log); + return 0; +} + + +/* -------- multiple precision routines -------- */ + + +#include <math.h> +#include <float.h> +#include <stdio.h> + +/* ---- floating point format ---- + data := data[0] * pow(radix, data[1]) * + (data[2] + data[3]/radix + data[4]/radix/radix + ...), + data[0] : sign (1;data>0, -1;data<0, 0;data==0) + data[1] : exponent (0;data==0) + data[2...n+1] : digits + ---- function prototypes ---- + void mp_load_0(int n, int radix, int out[]); + void mp_load_1(int n, int radix, int out[]); + void mp_copy(int n, int radix, int in[], int out[]); + void mp_round(int n, int radix, int m, int inout[]); + int mp_cmp(int n, int radix, int in1[], int in2[]); + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_sub(int n, int radix, int in1[], int in2[], int out[]); + void mp_imul(int n, int radix, int in1[], int in2, int out[]); + int mp_idiv(int n, int radix, int in1[], int in2, int out[]); + void mp_idiv_2(int n, int radix, int in[], int out[]); + double mp_mul_radix_test(int n, int radix, int nfft, + double tmpfft[], int ip[], double w[]); + void mp_mul(int n, int radix, int in1[], int in2[], int out[], + int tmp[], int nfft, double tmp1fft[], double tmp2fft[], + double tmp3fft[], int ip[], double w[]); + void mp_squ(int n, int radix, int in[], int out[], int tmp[], + int nfft, double tmp1fft[], double tmp2fft[], + int ip[], double w[]); + void mp_mulh(int n, int radix, int in1[], int in2[], int out[], + int nfft, double in1fft[], double outfft[], + int ip[], double w[]); + void mp_squh(int n, int radix, int in[], int out[], + int nfft, double inoutfft[], int ip[], double w[]); + int mp_inv(int n, int radix, int in[], int out[], + int tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]); + int mp_sqrt(int n, int radix, int in[], int out[], + int tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]); + void mp_sprintf(int n, int log10_radix, int in[], char out[]); + void mp_sscanf(int n, int log10_radix, char in[], int out[]); + void mp_fprintf(int n, int log10_radix, int in[], FILE *fout); + ---- +*/ + + +/* -------- mp_load routines -------- */ + + +void mp_load_0(int n, int radix, int out[]) +{ + int j; + + for (j = 0; j <= n + 1; j++) { + out[j] = 0; + } +} + + +void mp_load_1(int n, int radix, int out[]) +{ + int j; + + out[0] = 1; + out[1] = 0; + out[2] = 1; + for (j = 3; j <= n + 1; j++) { + out[j] = 0; + } +} + + +void mp_copy(int n, int radix, int in[], int out[]) +{ + int j; + + for (j = 0; j <= n + 1; j++) { + out[j] = in[j]; + } +} + + +void mp_round(int n, int radix, int m, int inout[]) +{ + int j, x; + + if (m < n) { + for (j = n + 1; j > m + 2; j--) { + inout[j] = 0; + } + x = 2 * inout[m + 2]; + inout[m + 2] = 0; + if (x >= radix) { + for (j = m + 1; j >= 2; j--) { + x = inout[j] + 1; + if (x < radix) { + inout[j] = x; + break; + } + inout[j] = 0; + } + if (x >= radix) { + inout[2] = 1; + inout[1]++; + } + } + } +} + + +/* -------- mp_add routines -------- */ + + +int mp_cmp(int n, int radix, int in1[], int in2[]) +{ + int mp_unsgn_cmp(int n, int in1[], int in2[]); + + if (in1[0] > in2[0]) { + return 1; + } else if (in1[0] < in2[0]) { + return -1; + } + return in1[0] * mp_unsgn_cmp(n, &in1[1], &in2[1]); +} + + +void mp_add(int n, int radix, int in1[], int in2[], int out[]) +{ + int mp_unsgn_cmp(int n, int in1[], int in2[]); + int mp_unexp_add(int n, int radix, int expdif, + int in1[], int in2[], int out[]); + int mp_unexp_sub(int n, int radix, int expdif, + int in1[], int in2[], int out[]); + int outsgn, outexp, expdif; + + expdif = in1[1] - in2[1]; + outexp = in1[1]; + if (expdif < 0) { + outexp = in2[1]; + } + outsgn = in1[0] * in2[0]; + if (outsgn >= 0) { + if (outsgn > 0) { + outsgn = in1[0]; + } else { + outsgn = in1[0] + in2[0]; + outexp = in1[1] + in2[1]; + expdif = 0; + } + if (expdif >= 0) { + outexp += mp_unexp_add(n, radix, expdif, + &in1[2], &in2[2], &out[2]); + } else { + outexp += mp_unexp_add(n, radix, -expdif, + &in2[2], &in1[2], &out[2]); + } + } else { + outsgn = mp_unsgn_cmp(n, &in1[1], &in2[1]); + if (outsgn >= 0) { + expdif = mp_unexp_sub(n, radix, expdif, + &in1[2], &in2[2], &out[2]); + } else { + expdif = mp_unexp_sub(n, radix, -expdif, + &in2[2], &in1[2], &out[2]); + } + outexp -= expdif; + outsgn *= in1[0]; + if (expdif == n) { + outsgn = 0; + } + } + if (outsgn == 0) { + outexp = 0; + } + out[0] = outsgn; + out[1] = outexp; +} + + +void mp_sub(int n, int radix, int in1[], int in2[], int out[]) +{ + int mp_unsgn_cmp(int n, int in1[], int in2[]); + int mp_unexp_add(int n, int radix, int expdif, + int in1[], int in2[], int out[]); + int mp_unexp_sub(int n, int radix, int expdif, + int in1[], int in2[], int out[]); + int outsgn, outexp, expdif; + + expdif = in1[1] - in2[1]; + outexp = in1[1]; + if (expdif < 0) { + outexp = in2[1]; + } + outsgn = in1[0] * in2[0]; + if (outsgn <= 0) { + if (outsgn < 0) { + outsgn = in1[0]; + } else { + outsgn = in1[0] - in2[0]; + outexp = in1[1] + in2[1]; + expdif = 0; + } + if (expdif >= 0) { + outexp += mp_unexp_add(n, radix, expdif, + &in1[2], &in2[2], &out[2]); + } else { + outexp += mp_unexp_add(n, radix, -expdif, + &in2[2], &in1[2], &out[2]); + } + } else { + outsgn = mp_unsgn_cmp(n, &in1[1], &in2[1]); + if (outsgn >= 0) { + expdif = mp_unexp_sub(n, radix, expdif, + &in1[2], &in2[2], &out[2]); + } else { + expdif = mp_unexp_sub(n, radix, -expdif, + &in2[2], &in1[2], &out[2]); + } + outexp -= expdif; + outsgn *= in1[0]; + if (expdif == n) { + outsgn = 0; + } + } + if (outsgn == 0) { + outexp = 0; + } + out[0] = outsgn; + out[1] = outexp; +} + + +/* -------- mp_add child routines -------- */ + + +int mp_unsgn_cmp(int n, int in1[], int in2[]) +{ + int j, cmp; + + cmp = 0; + for (j = 0; j <= n && cmp == 0; j++) { + cmp = in1[j] - in2[j]; + } + if (cmp > 0) { + cmp = 1; + } else if (cmp < 0) { + cmp = -1; + } + return cmp; +} + + +int mp_unexp_add(int n, int radix, int expdif, + int in1[], int in2[], int out[]) +{ + int j, x, carry; + + carry = 0; + if (expdif == 0 && in1[0] + in2[0] >= radix) { + x = in1[n - 1] + in2[n - 1]; + carry = x >= radix ? -1 : 0; + for (j = n - 1; j > 0; j--) { + x = in1[j - 1] + in2[j - 1] - carry; + carry = x >= radix ? -1 : 0; + out[j] = x - (radix & carry); + } + out[0] = -carry; + } else { + if (expdif > n) { + expdif = n; + } + for (j = n - 1; j >= expdif; j--) { + x = in1[j] + in2[j - expdif] - carry; + carry = x >= radix ? -1 : 0; + out[j] = x - (radix & carry); + } + for (j = expdif - 1; j >= 0; j--) { + x = in1[j] - carry; + carry = x >= radix ? -1 : 0; + out[j] = x - (radix & carry); + } + if (carry != 0) { + for (j = n - 1; j > 0; j--) { + out[j] = out[j - 1]; + } + out[0] = -carry; + } + } + return -carry; +} + + +int mp_unexp_sub(int n, int radix, int expdif, + int in1[], int in2[], int out[]) +{ + int j, x, borrow, ncancel; + + if (expdif > n) { + expdif = n; + } + borrow = 0; + for (j = n - 1; j >= expdif; j--) { + x = in1[j] - in2[j - expdif] + borrow; + borrow = x < 0 ? -1 : 0; + out[j] = x + (radix & borrow); + } + for (j = expdif - 1; j >= 0; j--) { + x = in1[j] + borrow; + borrow = x < 0 ? -1 : 0; + out[j] = x + (radix & borrow); + } + ncancel = 0; + for (j = 0; j < n && out[j] == 0; j++) { + ncancel = j + 1; + } + if (ncancel > 0 && ncancel < n) { + for (j = 0; j < n - ncancel; j++) { + out[j] = out[j + ncancel]; + } + for (j = n - ncancel; j < n; j++) { + out[j] = 0; + } + } + return ncancel; +} + + +/* -------- mp_imul routines -------- */ + + +void mp_imul(int n, int radix, int in1[], int in2, int out[]) +{ + void mp_unsgn_imul(int n, double dradix, int in1[], double din2, + int out[]); + + if (in2 > 0) { + out[0] = in1[0]; + } else if (in2 < 0) { + out[0] = -in1[0]; + in2 = -in2; + } else { + out[0] = 0; + } + mp_unsgn_imul(n, radix, &in1[1], in2, &out[1]); + if (out[0] == 0) { + out[1] = 0; + } +} + + +int mp_idiv(int n, int radix, int in1[], int in2, int out[]) +{ + void mp_load_0(int n, int radix, int out[]); + void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, + int out[]); + + if (in2 == 0) { + return -1; + } + if (in2 > 0) { + out[0] = in1[0]; + } else { + out[0] = -in1[0]; + in2 = -in2; + } + if (in1[0] == 0) { + mp_load_0(n, radix, out); + return 0; + } + mp_unsgn_idiv(n, radix, &in1[1], in2, &out[1]); + return 0; +} + + +void mp_idiv_2(int n, int radix, int in[], int out[]) +{ + int j, ix, carry, shift; + + out[0] = in[0]; + shift = 0; + if (in[2] == 1) { + shift = 1; + } + out[1] = in[1] - shift; + carry = -shift; + for (j = 2; j <= n + 1 - shift; j++) { + ix = in[j + shift] + (radix & carry); + carry = -(ix & 1); + out[j] = ix >> 1; + } + if (shift > 0) { + out[n + 1] = (radix & carry) >> 1; + } +} + + +/* -------- mp_imul child routines -------- */ + + +void mp_unsgn_imul(int n, double dradix, int in1[], double din2, + int out[]) +{ + int j, carry, shift; + double x, d1_radix; + + d1_radix = 1.0 / dradix; + carry = 0; + for (j = n; j >= 1; j--) { + x = din2 * in1[j] + carry + 0.5; + carry = (int) (d1_radix * x); + out[j] = (int) (x - dradix * carry); + } + shift = 0; + x = carry + 0.5; + while (x > 1) { + x *= d1_radix; + shift++; + } + out[0] = in1[0] + shift; + if (shift > 0) { + while (shift > n) { + carry = (int) (d1_radix * carry + 0.5); + shift--; + } + for (j = n; j >= shift + 1; j--) { + out[j] = out[j - shift]; + } + for (j = shift; j >= 1; j--) { + x = carry + 0.5; + carry = (int) (d1_radix * x); + out[j] = (int) (x - dradix * carry); + } + } +} + + +void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, + int out[]) +{ + int j, ix, carry, shift; + double x, d1_in2; + + d1_in2 = 1.0 / din2; + shift = 0; + x = 0; + do { + shift++; + x *= dradix; + if (shift <= n) { + x += in1[shift]; + } + } while (x < din2 - 0.5); + x += 0.5; + ix = (int) (d1_in2 * x); + carry = (int) (x - din2 * ix); + out[1] = ix; + shift--; + out[0] = in1[0] - shift; + if (shift >= n) { + shift = n - 1; + } + for (j = 2; j <= n - shift; j++) { + x = in1[j + shift] + dradix * carry + 0.5; + ix = (int) (d1_in2 * x); + carry = (int) (x - din2 * ix); + out[j] = ix; + } + for (j = n - shift + 1; j <= n; j++) { + x = dradix * carry + 0.5; + ix = (int) (d1_in2 * x); + carry = (int) (x - din2 * ix); + out[j] = ix; + } +} + + +/* -------- mp_mul routines -------- */ + + +double mp_mul_radix_test(int n, int radix, int nfft, + double tmpfft[], int ip[], double w[]) +{ + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_csqu(int nfft, double dinout[]); + double mp_mul_d2i_test(int radix, int nfft, double din[]); + int j, ndata, radix_2; + + ndata = (nfft >> 1) + 1; + if (ndata > n) { + ndata = n; + } + tmpfft[nfft + 1] = radix - 1; + for (j = nfft; j > ndata; j--) { + tmpfft[j] = 0; + } + radix_2 = (radix + 1) / 2; + for (j = ndata; j > 2; j--) { + tmpfft[j] = radix_2; + } + tmpfft[2] = radix; + tmpfft[1] = radix - 1; + tmpfft[0] = 0; + rdft(nfft, 1, &tmpfft[1], ip, w); + mp_mul_csqu(nfft, tmpfft); + rdft(nfft, -1, &tmpfft[1], ip, w); + return 2 * mp_mul_d2i_test(radix, nfft, tmpfft); +} + + +void mp_mul(int n, int radix, int in1[], int in2[], int out[], + int tmp[], int nfft, double tmp1fft[], double tmp2fft[], + double tmp3fft[], int ip[], double w[]) +{ + void mp_copy(int n, int radix, int in[], int out[]); + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_i2d(int n, int radix, int nfft, int shift, + int in[], double dout[]); + void mp_mul_cmul(int nfft, double din[], double dinout[]); + void mp_mul_cmuladd(int nfft, double din1[], double din2[], + double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + int n_h, shift; + + shift = (nfft >> 1) + 1; + while (n > shift) { + if (in1[shift + 2] + in2[shift + 2] != 0) { + break; + } + shift++; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp3fft = (upper) in1 * (lower) in2 ---- */ + mp_mul_i2d(n, radix, nfft, 0, in1, tmp1fft); + rdft(nfft, 1, &tmp1fft[1], ip, w); + mp_mul_i2d(n, radix, nfft, shift, in2, tmp3fft); + rdft(nfft, 1, &tmp3fft[1], ip, w); + mp_mul_cmul(nfft, tmp1fft, tmp3fft); + /* ---- tmp = (upper) in1 * (upper) in2 ---- */ + mp_mul_i2d(n, radix, nfft, 0, in2, tmp2fft); + rdft(nfft, 1, &tmp2fft[1], ip, w); + mp_mul_cmul(nfft, tmp2fft, tmp1fft); + rdft(nfft, -1, &tmp1fft[1], ip, w); + mp_mul_d2i(n, radix, nfft, tmp1fft, tmp); + /* ---- tmp3fft += (upper) in2 * (lower) in1 ---- */ + mp_mul_i2d(n, radix, nfft, shift, in1, tmp1fft); + rdft(nfft, 1, &tmp1fft[1], ip, w); + mp_mul_cmuladd(nfft, tmp1fft, tmp2fft, tmp3fft); + /* ---- out = tmp + tmp3fft ---- */ + rdft(nfft, -1, &tmp3fft[1], ip, w); + mp_mul_d2i(n_h, radix, nfft, tmp3fft, out); + if (out[0] != 0) { + mp_add(n, radix, out, tmp, out); + } else { + mp_copy(n, radix, tmp, out); + } +} + + +void mp_squ(int n, int radix, int in[], int out[], int tmp[], + int nfft, double tmp1fft[], double tmp2fft[], + int ip[], double w[]) +{ + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_i2d(int n, int radix, int nfft, int shift, + int in[], double dout[]); + void mp_mul_cmul(int nfft, double din[], double dinout[]); + void mp_mul_csqu(int nfft, double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + int n_h, shift; + + shift = (nfft >> 1) + 1; + while (n > shift) { + if (in[shift + 2] != 0) { + break; + } + shift++; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp = (upper) in * (lower) in ---- */ + mp_mul_i2d(n, radix, nfft, 0, in, tmp1fft); + rdft(nfft, 1, &tmp1fft[1], ip, w); + mp_mul_i2d(n, radix, nfft, shift, in, tmp2fft); + rdft(nfft, 1, &tmp2fft[1], ip, w); + mp_mul_cmul(nfft, tmp1fft, tmp2fft); + rdft(nfft, -1, &tmp2fft[1], ip, w); + mp_mul_d2i(n_h, radix, nfft, tmp2fft, tmp); + /* ---- out = 2 * tmp + ((upper) in)^2 ---- */ + mp_mul_csqu(nfft, tmp1fft); + rdft(nfft, -1, &tmp1fft[1], ip, w); + mp_mul_d2i(n, radix, nfft, tmp1fft, out); + if (tmp[0] != 0) { + mp_add(n_h, radix, tmp, tmp, tmp); + mp_add(n, radix, out, tmp, out); + } +} + + +void mp_mulh(int n, int radix, int in1[], int in2[], int out[], + int nfft, double in1fft[], double outfft[], int ip[], double w[]) +{ + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_i2d(int n, int radix, int nfft, int shift, + int in[], double dout[]); + void mp_mul_cmul(int nfft, double din[], double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + + mp_mul_i2d(n, radix, nfft, 0, in1, in1fft); + rdft(nfft, 1, &in1fft[1], ip, w); + mp_mul_i2d(n, radix, nfft, 0, in2, outfft); + rdft(nfft, 1, &outfft[1], ip, w); + mp_mul_cmul(nfft, in1fft, outfft); + rdft(nfft, -1, &outfft[1], ip, w); + mp_mul_d2i(n, radix, nfft, outfft, out); +} + + +void mp_mulh_use_in1fft(int n, int radix, double in1fft[], + int shift, int in2[], int out[], int nfft, double outfft[], + int ip[], double w[]) +{ + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_i2d(int n, int radix, int nfft, int shift, + int in[], double dout[]); + void mp_mul_cmul(int nfft, double din[], double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + int n_h; + + while (n > shift) { + if (in2[shift + 2] != 0) { + break; + } + shift++; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + mp_mul_i2d(n, radix, nfft, shift, in2, outfft); + rdft(nfft, 1, &outfft[1], ip, w); + mp_mul_cmul(nfft, in1fft, outfft); + rdft(nfft, -1, &outfft[1], ip, w); + mp_mul_d2i(n_h, radix, nfft, outfft, out); +} + + +void mp_squh(int n, int radix, int in[], int out[], + int nfft, double inoutfft[], int ip[], double w[]) +{ + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_i2d(int n, int radix, int nfft, int shift, + int in[], double dout[]); + void mp_mul_csqu(int nfft, double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + + mp_mul_i2d(n, radix, nfft, 0, in, inoutfft); + rdft(nfft, 1, &inoutfft[1], ip, w); + mp_mul_csqu(nfft, inoutfft); + rdft(nfft, -1, &inoutfft[1], ip, w); + mp_mul_d2i(n, radix, nfft, inoutfft, out); +} + + +void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], + int nfft, int ip[], double w[]) +{ + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_csqu(int nfft, double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + + mp_mul_csqu(nfft, inoutfft); + rdft(nfft, -1, &inoutfft[1], ip, w); + mp_mul_d2i(n, radix, nfft, inoutfft, out); +} + + +/* -------- mp_mul child routines -------- */ + + +void mp_mul_i2d(int n, int radix, int nfft, int shift, + int in[], double dout[]) +{ + int j, x, carry, ndata, radix_2, topdgt; + + ndata = 0; + topdgt = 0; + if (n > shift) { + topdgt = in[shift + 2]; + ndata = (nfft >> 1) + 1; + if (ndata > n - shift) { + ndata = n - shift; + } + } + dout[nfft + 1] = in[0] * topdgt; + for (j = nfft; j > ndata; j--) { + dout[j] = 0; + } + /* ---- abs(dout[j]) <= radix/2 (to keep FFT precision) ---- */ + if (ndata > 1) { + radix_2 = radix / 2; + carry = 0; + for (j = ndata + 1; j > 3; j--) { + x = in[j + shift] - carry; + carry = x >= radix_2 ? -1 : 0; + dout[j - 1] = x - (radix & carry); + } + dout[2] = in[shift + 3] - carry; + } + dout[1] = topdgt; + dout[0] = in[1] - shift; +} + + +void mp_mul_cmul(int nfft, double din[], double dinout[]) +{ + int j; + double xr, xi, yr, yi; + + dinout[0] += din[0]; + dinout[1] *= din[1]; + dinout[2] *= din[2]; + for (j = 3; j < nfft; j += 2) { + xr = din[j]; + xi = din[j + 1]; + yr = dinout[j]; + yi = dinout[j + 1]; + dinout[j] = xr * yr - xi * yi; + dinout[j + 1] = xr * yi + xi * yr; + } + dinout[nfft + 1] *= din[nfft + 1]; +} + + +void mp_mul_cmuladd(int nfft, double din1[], double din2[], + double dinout[]) +{ + int j; + double xr, xi, yr, yi; + + dinout[1] += din1[1] * din2[1]; + dinout[2] += din1[2] * din2[2]; + for (j = 3; j < nfft; j += 2) { + xr = din1[j]; + xi = din1[j + 1]; + yr = din2[j]; + yi = din2[j + 1]; + dinout[j] += xr * yr - xi * yi; + dinout[j + 1] += xr * yi + xi * yr; + } + dinout[nfft + 1] += din1[nfft + 1] * din2[nfft + 1]; +} + + +void mp_mul_csqu(int nfft, double dinout[]) +{ + int j; + double xr, xi; + + dinout[0] *= 2; + dinout[1] *= dinout[1]; + dinout[2] *= dinout[2]; + for (j = 3; j < nfft; j += 2) { + xr = dinout[j]; + xi = dinout[j + 1]; + dinout[j] = xr * xr - xi * xi; + dinout[j + 1] = 2 * xr * xi; + } + dinout[nfft + 1] *= dinout[nfft + 1]; +} + + +void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]) +{ + int j, carry, carry1, carry2, shift, ndata; + double x, scale, d1_radix, d1_radix2, pow_radix, topdgt; + + scale = 2.0 / nfft; + d1_radix = 1.0 / radix; + d1_radix2 = d1_radix * d1_radix; + topdgt = din[nfft + 1]; + x = topdgt < 0 ? -topdgt : topdgt; + shift = x + 0.5 >= radix ? 1 : 0; + /* ---- correction of cyclic convolution of din[1] ---- */ + x *= nfft * 0.5; + din[nfft + 1] = din[1] - x; + din[1] = x; + /* ---- output of digits ---- */ + ndata = n; + if (n > nfft + 1 + shift) { + ndata = nfft + 1 + shift; + for (j = n + 1; j > ndata + 1; j--) { + out[j] = 0; + } + } + x = 0; + pow_radix = 1; + for (j = ndata + 1 - shift; j <= nfft + 1; j++) { + x += pow_radix * din[j]; + pow_radix *= d1_radix; + if (pow_radix < DBL_EPSILON) { + break; + } + } + x = d1_radix2 * (scale * x + 0.5); + carry2 = ((int) x) - 1; + carry = (int) (radix * (x - carry2) + 0.5); + for (j = ndata; j > 1; j--) { + x = d1_radix2 * (scale * din[j - shift] + carry + 0.5); + carry = carry2; + carry2 = ((int) x) - 1; + x = radix * (x - carry2); + carry1 = (int) x; + out[j + 1] = (int) (radix * (x - carry1)); + carry += carry1; + } + x = carry + ((double) radix) * carry2 + 0.5; + if (shift == 0) { + x += scale * din[1]; + } + carry = (int) (d1_radix * x); + out[2] = (int) (x - ((double) radix) * carry); + if (carry > 0) { + for (j = n + 1; j > 2; j--) { + out[j] = out[j - 1]; + } + out[2] = carry; + shift++; + } + /* ---- output of exp, sgn ---- */ + x = din[0] + shift + 0.5; + shift = ((int) x) - 1; + out[1] = shift + ((int) (x - shift)); + out[0] = topdgt > 0.5 ? 1 : -1; + if (out[2] == 0) { + out[0] = 0; + out[1] = 0; + } +} + + +double mp_mul_d2i_test(int radix, int nfft, double din[]) +{ + int j, carry, carry1, carry2; + double x, scale, d1_radix, d1_radix2, err; + + scale = 2.0 / nfft; + d1_radix = 1.0 / radix; + d1_radix2 = d1_radix * d1_radix; + /* ---- correction of cyclic convolution of din[1] ---- */ + x = din[nfft + 1] * nfft * 0.5; + if (x < 0) { + x = -x; + } + din[nfft + 1] = din[1] - x; + /* ---- check of digits ---- */ + err = 0; + carry = 0; + carry2 = 0; + for (j = nfft + 1; j > 1; j--) { + x = d1_radix2 * (scale * din[j] + carry + 0.5); + carry = carry2; + carry2 = ((int) x) - 1; + x = radix * (x - carry2); + carry1 = (int) x; + x = radix * (x - carry1); + carry += carry1; + x = x - 0.5 - ((int) x); + if (x > err) { + err = x; + } else if (-x > err) { + err = -x; + } + } + return err; +} + + +/* -------- mp_inv routines -------- */ + + +int mp_inv(int n, int radix, int in[], int out[], + int tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]) +{ + int mp_get_nfft_init(int radix, int nfft_max); + void mp_inv_init(int n, int radix, int in[], int out[]); + int mp_inv_newton(int n, int radix, int in[], int inout[], + int tmp1[], int tmp2[], int nfft, double tmp1fft[], + double tmp2fft[], int ip[], double w[]); + int n_nwt, nfft_nwt, thr, prc; + + if (in[0] == 0) { + return -1; + } + nfft_nwt = mp_get_nfft_init(radix, nfft); + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + mp_inv_init(n_nwt, radix, in, out); + thr = 8; + do { + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + prc = mp_inv_newton(n_nwt, radix, in, out, + tmp1, tmp2, nfft_nwt, tmp1fft, tmp2fft, ip, w); + if (thr * nfft_nwt >= nfft) { + thr = 0; + if (2 * prc <= n_nwt - 2) { + nfft_nwt >>= 1; + } + } else { + if (3 * prc < n_nwt - 2) { + nfft_nwt >>= 1; + } + } + nfft_nwt <<= 1; + } while (nfft_nwt <= nfft); + return 0; +} + + +int mp_sqrt(int n, int radix, int in[], int out[], + int tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]) +{ + void mp_load_0(int n, int radix, int out[]); + int mp_get_nfft_init(int radix, int nfft_max); + void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[]); + int mp_sqrt_newton(int n, int radix, int in[], int inout[], + int inout_rev[], int tmp[], int nfft, double tmp1fft[], + double tmp2fft[], int ip[], double w[], int *n_tmp1fft); + int n_nwt, nfft_nwt, thr, prc, n_tmp1fft; + + if (in[0] < 0) { + return -1; + } else if (in[0] == 0) { + mp_load_0(n, radix, out); + return 0; + } + nfft_nwt = mp_get_nfft_init(radix, nfft); + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + mp_sqrt_init(n_nwt, radix, in, out, tmp1); + n_tmp1fft = 0; + thr = 8; + do { + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + prc = mp_sqrt_newton(n_nwt, radix, in, out, + tmp1, tmp2, nfft_nwt, tmp1fft, tmp2fft, + ip, w, &n_tmp1fft); + if (thr * nfft_nwt >= nfft) { + thr = 0; + if (2 * prc <= n_nwt - 2) { + nfft_nwt >>= 1; + } + } else { + if (3 * prc < n_nwt - 2) { + nfft_nwt >>= 1; + } + } + nfft_nwt <<= 1; + } while (nfft_nwt <= nfft); + return 0; +} + + +/* -------- mp_inv child routines -------- */ + + +int mp_get_nfft_init(int radix, int nfft_max) +{ + int nfft_init; + double r; + + r = radix; + nfft_init = 1; + do { + r *= r; + nfft_init <<= 1; + } while (DBL_EPSILON * r < 1 && nfft_init < nfft_max); + return nfft_init; +} + + +void mp_inv_init(int n, int radix, int in[], int out[]) +{ + void mp_unexp_d2mp(int n, int radix, double din, int out[]); + double mp_unexp_mp2d(int n, int radix, int in[]); + int outexp; + double din; + + out[0] = in[0]; + outexp = -in[1]; + din = 1.0 / mp_unexp_mp2d(n, radix, &in[2]); + while (din < 1) { + din *= radix; + outexp--; + } + out[1] = outexp; + mp_unexp_d2mp(n, radix, din, &out[2]); +} + + +void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[]) +{ + void mp_unexp_d2mp(int n, int radix, double din, int out[]); + double mp_unexp_mp2d(int n, int radix, int in[]); + int outexp; + double din; + + out[0] = 1; + out_rev[0] = 1; + outexp = in[1]; + din = mp_unexp_mp2d(n, radix, &in[2]); + if (outexp % 2 != 0) { + din *= radix; + outexp--; + } + outexp /= 2; + din = sqrt(din); + if (din < 1) { + din *= radix; + outexp--; + } + out[1] = outexp; + mp_unexp_d2mp(n, radix, din, &out[2]); + outexp = -outexp; + din = 1.0 / din; + while (din < 1) { + din *= radix; + outexp--; + } + out_rev[1] = outexp; + mp_unexp_d2mp(n, radix, din, &out_rev[2]); +} + + +void mp_unexp_d2mp(int n, int radix, double din, int out[]) +{ + int j, x; + + for (j = 0; j < n; j++) { + x = (int) din; + if (x >= radix) { + x = radix - 1; + din = radix; + } + din = radix * (din - x); + out[j] = x; + } +} + + +double mp_unexp_mp2d(int n, int radix, int in[]) +{ + int j; + double d1_radix, dout; + + d1_radix = 1.0 / radix; + dout = 0; + for (j = n - 1; j >= 0; j--) { + dout = d1_radix * dout + in[j]; + } + return dout; +} + + +int mp_inv_newton(int n, int radix, int in[], int inout[], + int tmp1[], int tmp2[], int nfft, double tmp1fft[], + double tmp2fft[], int ip[], double w[]) +{ + void mp_load_1(int n, int radix, int out[]); + void mp_round(int n, int radix, int m, int inout[]); + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_sub(int n, int radix, int in1[], int in2[], int out[]); + void mp_mulh(int n, int radix, int in1[], int in2[], int out[], + int nfft, double in1fft[], double outfft[], + int ip[], double w[]); + void mp_mulh_use_in1fft(int n, int radix, double in1fft[], + int shift, int in2[], int out[], int nfft, double outfft[], + int ip[], double w[]); + int n_h, shift, prc; + + shift = (nfft >> 1) + 1; + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp1 = inout * (upper) in (half to normal precision) ---- */ + mp_round(n, radix, shift, inout); + mp_mulh(n, radix, inout, in, tmp1, + nfft, tmp1fft, tmp2fft, ip, w); + /* ---- tmp2 = 1 - tmp1 ---- */ + mp_load_1(n, radix, tmp2); + mp_sub(n, radix, tmp2, tmp1, tmp2); + /* ---- tmp2 -= inout * (lower) in (half precision) ---- */ + mp_mulh_use_in1fft(n, radix, tmp1fft, shift, in, tmp1, + nfft, tmp2fft, ip, w); + mp_sub(n_h, radix, tmp2, tmp1, tmp2); + /* ---- get precision ---- */ + prc = -tmp2[1]; + if (tmp2[0] == 0) { + prc = nfft + 1; + } + /* ---- tmp2 *= inout (half precision) ---- */ + mp_mulh_use_in1fft(n_h, radix, tmp1fft, 0, tmp2, tmp2, + nfft, tmp2fft, ip, w); + /* ---- inout += tmp2 ---- */ + if (tmp2[0] != 0) { + mp_add(n, radix, inout, tmp2, inout); + } + return prc; +} + + +int mp_sqrt_newton(int n, int radix, int in[], int inout[], + int inout_rev[], int tmp[], int nfft, double tmp1fft[], + double tmp2fft[], int ip[], double w[], int *n_tmp1fft) +{ + void mp_round(int n, int radix, int m, int inout[]); + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_sub(int n, int radix, int in1[], int in2[], int out[]); + void mp_idiv_2(int n, int radix, int in[], int out[]); + void mp_mulh(int n, int radix, int in1[], int in2[], int out[], + int nfft, double in1fft[], double outfft[], + int ip[], double w[]); + void mp_squh(int n, int radix, int in[], int out[], + int nfft, double inoutfft[], int ip[], double w[]); + void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], + int nfft, int ip[], double w[]); + int n_h, nfft_h, shift, prc; + + nfft_h = nfft >> 1; + shift = nfft_h + 1; + if (nfft_h < 2) { + nfft_h = 2; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp = inout_rev^2 (1/4 to half precision) ---- */ + mp_round(n_h, radix, (nfft_h >> 1) + 1, inout_rev); + if (*n_tmp1fft != nfft_h) { + mp_squh(n_h, radix, inout_rev, tmp, + nfft_h, tmp1fft, ip, w); + } else { + mp_squh_use_in1fft(n_h, radix, tmp1fft, tmp, + nfft_h, ip, w); + } + /* ---- tmp = inout_rev - inout * tmp (half precision) ---- */ + mp_round(n, radix, shift, inout); + mp_mulh(n_h, radix, inout, tmp, tmp, + nfft, tmp1fft, tmp2fft, ip, w); + mp_sub(n_h, radix, inout_rev, tmp, tmp); + /* ---- inout_rev += tmp ---- */ + mp_add(n_h, radix, inout_rev, tmp, inout_rev); + /* ---- tmp = in - inout^2 (half to normal precision) ---- */ + mp_squh_use_in1fft(n, radix, tmp1fft, tmp, + nfft, ip, w); + mp_sub(n, radix, in, tmp, tmp); + /* ---- get precision ---- */ + prc = in[1] - tmp[1]; + if (in[2] > tmp[2]) { + prc++; + } + if (tmp[0] == 0) { + prc = nfft + 1; + } + /* ---- tmp = tmp * inout_rev / 2 (half precision) ---- */ + mp_round(n_h, radix, shift, inout_rev); + mp_mulh(n_h, radix, inout_rev, tmp, tmp, + nfft, tmp1fft, tmp2fft, ip, w); + *n_tmp1fft = nfft; + mp_idiv_2(n_h, radix, tmp, tmp); + /* ---- inout += tmp ---- */ + if (tmp[0] != 0) { + mp_add(n, radix, inout, tmp, inout); + } + return prc; +} + + +/* -------- mp_io routines -------- */ + + +void mp_sprintf(int n, int log10_radix, int in[], char out[]) +{ + int j, k, x, y, outexp, shift; + + if (in[0] < 0) { + *out++ = '-'; + } + x = in[2]; + shift = log10_radix; + for (k = log10_radix; k > 0; k--) { + y = x % 10; + x /= 10; + out[k] = '0' + y; + if (y != 0) { + shift = k; + } + } + out[0] = out[shift]; + out[1] = '.'; + for (k = 1; k <= log10_radix - shift; k++) { + out[k + 1] = out[k + shift]; + } + outexp = log10_radix - shift; + out += outexp + 2; + for (j = 3; j <= n + 1; j++) { + x = in[j]; + for (k = log10_radix - 1; k >= 0; k--) { + y = x % 10; + x /= 10; + out[k] = '0' + y; + } + out += log10_radix; + } + *out++ = 'e'; + outexp += log10_radix * in[1]; + sprintf(out, "%d", outexp); +} + + +void mp_sscanf(int n, int log10_radix, char in[], int out[]) +{ + char *s; + int j, x, outexp, outexp_mod; + + while (*in == ' ') { + in++; + } + out[0] = 1; + if (*in == '-') { + out[0] = -1; + in++; + } else if (*in == '+') { + in++; + } + while (*in == ' ' || *in == '0') { + in++; + } + outexp = 0; + for (s = in; *s != '\0'; s++) { + if (*s == 'e' || *s == 'E' || *s == 'd' || *s == 'D') { + if (sscanf(++s, "%d", &outexp) != 1) { + outexp = 0; + } + break; + } + } + if (*in == '.') { + do { + outexp--; + while (*++in == ' '); + } while (*in == '0' && *in != '\0'); + } else if (*in != '\0') { + s = in; + while (*++s == ' '); + while (*s >= '0' && *s <= '9' && *s != '\0') { + outexp++; + while (*++s == ' '); + } + } + x = outexp / log10_radix; + outexp_mod = outexp - log10_radix * x; + if (outexp_mod < 0) { + x--; + outexp_mod += log10_radix; + } + out[1] = x; + x = 0; + j = 2; + for (s = in; *s != '\0'; s++) { + if (*s == '.' || *s == ' ') { + continue; + } + if (*s < '0' || *s > '9') { + break; + } + x = 10 * x + (*s - '0'); + if (--outexp_mod < 0) { + if (j > n + 1) { + break; + } + out[j++] = x; + x = 0; + outexp_mod = log10_radix - 1; + } + } + while (outexp_mod-- >= 0) { + x *= 10; + } + while (j <= n + 1) { + out[j++] = x; + x = 0; + } + if (out[2] == 0) { + out[0] = 0; + out[1] = 0; + } +} + + +void mp_fprintf(int n, int log10_radix, int in[], FILE *fout) +{ + int j, k, x, y, outexp, shift; + char out[256]; + + if (in[0] < 0) { + putc('-', fout); + } + x = in[2]; + shift = log10_radix; + for (k = log10_radix; k > 0; k--) { + y = x % 10; + x /= 10; + out[k] = '0' + y; + if (y != 0) { + shift = k; + } + } + putc(out[shift], fout); + putc('.', fout); + for (k = 1; k <= log10_radix - shift; k++) { + putc(out[k + shift], fout); + } + outexp = log10_radix - shift; + for (j = 3; j <= n + 1; j++) { + x = in[j]; + for (k = log10_radix - 1; k >= 0; k--) { + y = x % 10; + x /= 10; + out[k] = '0' + y; + } + for (k = 0; k < log10_radix; k++) { + putc(out[k], fout); + } + } + putc('e', fout); + outexp += log10_radix * in[1]; + sprintf(out, "%d", outexp); + for (k = 0; out[k] != '\0'; k++) { + putc(out[k], fout); + } +} + + diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile b/plugins/supereq/nsfft-1.00/simd/Makefile index 5d253498..fc484116 120000 --- a/plugins/supereq/nsfft-1.00/simd/Makefile +++ b/plugins/supereq/nsfft-1.00/simd/Makefile @@ -1 +1 @@ -Makefile.x86avx
\ No newline at end of file +Makefile.x86
\ No newline at end of file diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBase.h b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h index 5382b4d1..10cdeb81 100644 --- a/plugins/supereq/nsfft-1.00/simd/SIMDBase.h +++ b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h @@ -1,6 +1,8 @@ #ifndef _SIMDBase_H_ #define _SIMDBase_H_ +#include <stdint.h> + #define SIMDBase_TYPE_FLOAT ( 1 | ( 1 << 24 )) #define SIMDBase_TYPE_DOUBLE ( 2 | ( 1 << 24 )) #define SIMDBase_TYPE_LONGDOUBLE ( 3 | ( 1 << 24 )) diff --git a/plugins/supereq/shibatch_rdft.c b/plugins/supereq/shibatch_rdft.c index 2a352f7e..db453eb8 100644 --- a/plugins/supereq/shibatch_rdft.c +++ b/plugins/supereq/shibatch_rdft.c @@ -6,34 +6,66 @@ #include "SIMDBase.h" #include "DFT.h" -typedef float REAL; #define TYPE SIMDBase_TYPE_FLOAT -void rfft(int n,int isign,REAL *x) { +void rfft(int n,int isign,float *x) { static DFT *p = NULL; + static float *buf = NULL; static int ipsize = 0; static int mode = 0; + static int veclen = 0; int newipsize; if (n == 0) { + if (buf) { + SIMDBase_alignedFree (buf); + buf = NULL; + } if (p) { DFT_dispose(p, mode); p = NULL; } return; } - n = 1 << n; - newipsize = 2+sqrt(n/2); - if (newipsize > ipsize) { + int nn = n; + n = 1<<n; + newipsize = n; + if (newipsize != ipsize) { ipsize = newipsize; + if (buf) { + SIMDBase_alignedFree (buf); + buf = NULL; + } + if (p) { DFT_dispose(p, mode); p = NULL; } + buf = SIMDBase_alignedMalloc (n * sizeof (float)); + mode = SIMDBase_chooseBestMode(TYPE); - p = DFT_init(mode, n, 0); + veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + printf ("n: %d, veclen: %d, sizeOfVect: %d\n", n, veclen, sizeOfVect); + p = DFT_init(mode, n/veclen, DFT_FLAG_REAL); + } + + // store in simd order + int asize = n / veclen; + int i, j; + for(j=0;j<veclen;j++) { + for (i = 0; i < asize; i++) { + buf[i * veclen + j] = x[j * asize + i]; + } } - DFT_execute(p, mode, x, 1); + DFT_execute(p, mode, buf, isign); + +#define THRES 1e-3 + for(j=0;j<veclen;j++) { + for (i = 0; i < asize; i++) { + x[j * asize + i] = buf[i * veclen + j]; + } + } } diff --git a/plugins/supereq/supereq.c b/plugins/supereq/supereq.c index 34fc6d41..e4f33015 100644 --- a/plugins/supereq/supereq.c +++ b/plugins/supereq/supereq.c @@ -97,7 +97,7 @@ supereq_process (ddb_dsp_context_t *ctx, float *samples, int frames, int maxfram deadbeef->mutex_lock (supereq->mutex); supereq->last_srate = fmt->samplerate; supereq->last_nch = fmt->channels; - equ_init (&supereq->state, 14, fmt->channels); + equ_init (&supereq->state, 10, fmt->channels); recalc_table (supereq); equ_clearbuf(&supereq->state); deadbeef->mutex_unlock (supereq->mutex); @@ -222,7 +222,7 @@ supereq_open (void) { ddb_supereq_ctx_t *supereq = malloc (sizeof (ddb_supereq_ctx_t)); DDB_INIT_DSP_CONTEXT (supereq,ddb_supereq_ctx_t,&plugin); - equ_init (&supereq->state, 14, 2); + equ_init (&supereq->state, 10, 2); supereq->paramsroot = paramlist_alloc (); supereq->last_srate = 44100; supereq->last_nch = 2; |