summaryrefslogtreecommitdiff
path: root/plugins/supereq/nsfft-1.00
diff options
context:
space:
mode:
Diffstat (limited to 'plugins/supereq/nsfft-1.00')
-rw-r--r--plugins/supereq/nsfft-1.00/README15
-rw-r--r--plugins/supereq/nsfft-1.00/dft/DFT.c327
-rw-r--r--plugins/supereq/nsfft-1.00/dft/DFT.h56
-rw-r--r--plugins/supereq/nsfft-1.00/dft/DFTUndiff.c1807
-rw-r--r--plugins/supereq/nsfft-1.00/dft/DFTUndiff.h114
l---------plugins/supereq/nsfft-1.00/dft/Makefile1
-rw-r--r--plugins/supereq/nsfft-1.00/dft/Makefile.altivec26
-rw-r--r--plugins/supereq/nsfft-1.00/dft/Makefile.neon26
-rw-r--r--plugins/supereq/nsfft-1.00/dft/Makefile.purec35
-rw-r--r--plugins/supereq/nsfft-1.00/dft/Makefile.x8629
-rw-r--r--plugins/supereq/nsfft-1.00/dft/Makefile.x86avx35
-rw-r--r--plugins/supereq/nsfft-1.00/dfttest/DFTExample.c88
-rw-r--r--plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c317
-rw-r--r--plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c419
-rw-r--r--plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c260
-rw-r--r--plugins/supereq/nsfft-1.00/dfttest/Makefile35
-rw-r--r--plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch131
-rw-r--r--plugins/supereq/nsfft-1.00/doc/default.css34
-rw-r--r--plugins/supereq/nsfft-1.00/doc/index.xhtml2016
-rw-r--r--plugins/supereq/nsfft-1.00/doc/nsfft.pdfbin0 -> 78973 bytes
-rw-r--r--plugins/supereq/nsfft-1.00/ooura/Makefile11
-rw-r--r--plugins/supereq/nsfft-1.00/ooura/README2
l---------plugins/supereq/nsfft-1.00/simd/Makefile1
-rw-r--r--plugins/supereq/nsfft-1.00/simd/Makefile.altivec26
-rw-r--r--plugins/supereq/nsfft-1.00/simd/Makefile.neon26
-rw-r--r--plugins/supereq/nsfft-1.00/simd/Makefile.purec35
-rw-r--r--plugins/supereq/nsfft-1.00/simd/Makefile.x8635
-rw-r--r--plugins/supereq/nsfft-1.00/simd/Makefile.x86avx35
-rw-r--r--plugins/supereq/nsfft-1.00/simd/SIMDBase.c454
-rw-r--r--plugins/supereq/nsfft-1.00/simd/SIMDBase.h51
-rw-r--r--plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c38
-rw-r--r--plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h231
32 files changed, 6716 insertions, 0 deletions
diff --git a/plugins/supereq/nsfft-1.00/README b/plugins/supereq/nsfft-1.00/README
new file mode 100644
index 00000000..1ca873b1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/README
@@ -0,0 +1,15 @@
+
+NSFFT (Nonrestrictive SIMD FFT) is yet another FFT library for
performing 1-dimensional fast Fourier transforms. NSFFT is a simple,
+small and portable library, and it is efficient since it can utilize
+SIMD instruction sets in modern processors. It performs multiple
+transforms simultaneously, and thus it is especially suitable for
+digital signal processing. It does not need so much computation to
make a good execution plan. This library is in the public domain, so that
+you can incorporate this library into your product without any
+obligation.
+
+Visit http://shibatch.sourceforge.net/ to get the latest version of
+this library.
+
+Contact : Naoki Shibata shibatch@users.sourceforge.net
diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.c b/plugins/supereq/nsfft-1.00/dft/DFT.c
new file mode 100644
index 00000000..d59e6ab8
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFT.c
@@ -0,0 +1,327 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <stdint.h>
+#include <sys/time.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+#include "DFTUndiff.h"
+
+int32_t getModeParamInt_purec_float(int32_t paramId); // Backend entry points, one per (instruction set, precision) pair; none of them is defined in DFT.c.
+int32_t getModeParamInt_purec_double(int32_t paramId);
+int32_t getModeParamInt_purec_longdouble(int32_t paramId);
+int32_t getModeParamInt_sse_float(int32_t paramId);
+int32_t getModeParamInt_sse2_double(int32_t paramId);
+int32_t getModeParamInt_neon_float(int32_t paramId);
+int32_t getModeParamInt_avx_float(int32_t paramId);
+int32_t getModeParamInt_avx_double(int32_t paramId);
+int32_t getModeParamInt_altivec_float(int32_t paramId);
+
+char * getModeParamString_purec_float(int32_t paramId); // The getModeParamInt_*/getModeParamString_* families are declared here but never called in DFT.c.
+char * getModeParamString_purec_double(int32_t paramId);
+char * getModeParamString_purec_longdouble(int32_t paramId);
+char * getModeParamString_sse_float(int32_t paramId);
+char * getModeParamString_sse2_double(int32_t paramId);
+char * getModeParamString_neon_float(int32_t paramId);
+char * getModeParamString_avx_float(int32_t paramId);
+char * getModeParamString_avx_double(int32_t paramId);
+char * getModeParamString_altivec_float(int32_t paramId);
+
+void *makePlan_purec_float(uint64_t n, uint64_t flags); // Called by DFT_init(); its case labels fix the mode numbering (1 = purec_float ... 9 = altivec_float).
+void *makePlan_purec_double(uint64_t n, uint64_t flags);
+void *makePlan_purec_longdouble(uint64_t n, uint64_t flags);
+void *makePlan_sse_float(uint64_t n, uint64_t flags);
+void *makePlan_sse2_double(uint64_t n, uint64_t flags);
+void *makePlan_neon_float(uint64_t n, uint64_t flags);
+void *makePlan_avx_float(uint64_t n, uint64_t flags);
+void *makePlan_avx_double(uint64_t n, uint64_t flags);
+void *makePlan_altivec_float(uint64_t n, uint64_t flags);
+
+void *makePlanSub_purec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); // Called by DFT_fread() with the values parsed from a saved plan file.
+void *makePlanSub_purec_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_purec_longdouble(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_sse_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_sse2_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_neon_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_avx_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_avx_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_altivec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+
+void destroyPlan_purec_float(void *p); // Called by DFT_dispose().
+void destroyPlan_purec_double(void *p);
+void destroyPlan_purec_longdouble(void *p);
+void destroyPlan_sse_float(void *p);
+void destroyPlan_sse2_double(void *p);
+void destroyPlan_neon_float(void *p);
+void destroyPlan_avx_float(void *p);
+void destroyPlan_avx_double(void *p);
+void destroyPlan_altivec_float(void *p);
+
+void execute_purec_float(void *p, void *s, int32_t dir); // Called by DFT_execute().
+void execute_purec_double(void *p, void *s, int32_t dir);
+void execute_purec_longdouble(void *p, void *s, int32_t dir);
+void execute_sse_float(void *p, void *s, int32_t dir);
+void execute_sse2_double(void *p, void *s, int32_t dir);
+void execute_neon_float(void *p, void *s, int32_t dir);
+void execute_avx_float(void *p, void *s, int32_t dir);
+void execute_avx_double(void *p, void *s, int32_t dir);
+void execute_altivec_float(void *p, void *s, int32_t dir);
+
+void *DFT_init(int32_t mode, uint64_t n, uint64_t flags) { // Create a plan by forwarding n and flags to the makePlan_* backend selected by mode.
+  void *plan = NULL; // stays NULL when mode is unknown or its backend is not compiled in
+  switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+  case 1: plan = makePlan_purec_float(n, flags); break;
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case 2: plan = makePlan_purec_double(n, flags); break;
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case 3: plan = makePlan_purec_longdouble(n, flags); break;
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case 4: plan = makePlan_sse_float(n, flags); break;
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case 5: plan = makePlan_sse2_double(n, flags); break;
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case 6: plan = makePlan_neon_float(n, flags); break;
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case 7: plan = makePlan_avx_float(n, flags); break;
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case 8: plan = makePlan_avx_double(n, flags); break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case 9: plan = makePlan_altivec_float(n, flags); break;
+#endif
+  default: break;
+  }
+  return plan;
+}
+
+void DFT_dispose(void *p, int32_t mode) { // Hand plan p to the destroyPlan_* backend selected by mode.
+  // Each compiled-in backend is tested in turn and the function returns right after the matching call.
+  // mode uses the same numbering as DFT_init(): 1 = purec_float ... 9 = altivec_float.
+#if defined(ENABLE_PUREC_FLOAT)
+  if (mode == 1) { destroyPlan_purec_float(p); return; }
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  if (mode == 2) { destroyPlan_purec_double(p); return; }
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  if (mode == 3) { destroyPlan_purec_longdouble(p); return; }
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  if (mode == 4) { destroyPlan_sse_float(p); return; }
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  if (mode == 5) { destroyPlan_sse2_double(p); return; }
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  if (mode == 6) { destroyPlan_neon_float(p); return; }
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  if (mode == 7) { destroyPlan_avx_float(p); return; }
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  if (mode == 8) { destroyPlan_avx_double(p); return; }
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  if (mode == 9) { destroyPlan_altivec_float(p); return; }
+#endif
+  // Unknown mode, or backend not compiled in: p is left untouched.
+}
+
+void DFT_execute(void *p, int32_t mode, void *s, int32_t dir) { // Run plan p through the execute_* backend selected by mode; p, s and dir are forwarded unchanged.
+  switch(mode) { // the backends return void, so they are called as plain statements: ISO C forbids `return expr;` in a void function
+#if defined(ENABLE_PUREC_FLOAT)
+  case 1: execute_purec_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case 2: execute_purec_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case 3: execute_purec_longdouble(p, s, dir); break;
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case 4: execute_sse_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case 5: execute_sse2_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case 6: execute_neon_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case 7: execute_avx_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case 8: execute_avx_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case 9: execute_altivec_float(p, s, dir); break;
+#endif
+  default: break; // unknown mode, or backend not compiled in: nothing is executed
+  }
+}
+
+#define FILE_FORMAT_VERSION 0
+
+int32_t DFT_fwrite(void *p2, FILE *fp) { // Write plan p2 to fp as "key : value" text lines; returns 1 on success, 0 on the first failed write.
+  DFTUndiff *p = (DFTUndiff *)p2;
+  if (p->magic != MAGIC_DFT) abort(); // magic mismatch: p2 is not accepted as a plan and the process aborts
+
+  // Plans flagged REAL or ALT_REAL are written with twice p->length, matching DFT_getPlanParamInt(DFT_PARAMID_FFT_LENGTH).
+  // The && chain stops at the first fprintf that reports <= 0, so later lines are not attempted after a failure.
+  return fprintf(fp, "nsfft file format : %d\n", FILE_FORMAT_VERSION) > 0
+      && fprintf(fp, "arch : %s\n", SIMDBase_getProcessorNameString()) > 0
+      && fprintf(fp, "computation mode : %d\n", p->mode) > 0
+      && fprintf(fp, "length : %d\n", (p->flags & (DFT_FLAG_REAL | DFT_FLAG_ALT_REAL)) != 0 ? p->length * 2 : p->length) > 0
+      && fprintf(fp, "radix2 threshold : %d\n", p->radix2thres) > 0
+      && fprintf(fp, "transpose : %d\n", p->flagTrans) > 0
+      && fprintf(fp, "bit reversal : %d\n", p->useCobra) > 0
+      && fprintf(fp, "flags : %llx\n", (unsigned long long int)p->flags) > 0
+      && fprintf(fp, "%s\n", "end :") > 0;
+}
+
+static char *startsWith(char *str1, char *str2) { // Prefix test used by DFT_fread() to recognise "key :" lines.
+  const size_t prefixLen = strlen(str2);
+
+  // On a match return the first character of str1 after the prefix; otherwise NULL.
+  // An empty str2 matches everything and yields str1 itself.
+  return (strncmp(str1, str2, prefixLen) == 0) ? str1 + prefixLen : NULL;
+}
+
+DFT *DFT_fread(FILE *fp, int32_t *errcode) { // Rebuild a plan from the "key : value" text produced by DFT_fwrite(); returns NULL on any failure.
+  int length = -1, radix2thres = -1, flagTrans = -1, useCobra = -1; // -1 = key not seen; flagTrans is parsed but not passed on to makePlanSub_*
+  int mode = -1, formatver = -1;
+  unsigned long long int flags = (1ULL << 63); // value that remains when the file has no "flags :" line
+
+  if (errcode != NULL) *errcode = DFT_ERROR_NOERROR; // errcode is optional; every write to it is NULL-guarded
+
+  for(;;) { // one line per pass until "end :"; unrecognised lines such as "arch :" are skipped
+    char buf[256], *q;
+    if (fgets(buf, 255, fp) == NULL) { if (errcode != NULL) *errcode = DFT_ERROR_UNEXPECTED_EOF; return NULL; }
+
+    if ((q = startsWith(buf, "nsfft file format :")) != NULL) {
+      if (1 != sscanf(q, "%d", &formatver)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "computation mode :")) != NULL) {
+      if (1 != sscanf(q, "%d", &mode)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "length :")) != NULL) {
+      if (1 != sscanf(q, "%d", &length)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "radix2 threshold :")) != NULL) {
+      if (1 != sscanf(q, "%d", &radix2thres)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "transpose :")) != NULL) {
+      if (1 != sscanf(q, "%d", &flagTrans)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "bit reversal :")) != NULL) {
+      if (1 != sscanf(q, "%d", &useCobra)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "flags :")) != NULL) {
+      if (1 != sscanf(q, "%llx", &flags)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "end :")) != NULL) {
+      break;
+    }
+  }
+  if (length < 0) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } // a missing (-1) or negative length would reach makePlanSub_* as a huge uint64_t n
+  if (formatver > FILE_FORMAT_VERSION) { // only files newer than this reader are rejected; a missing version (-1) passes
+    if (errcode != NULL) *errcode = DFT_ERROR_FILE_VERSION;
+    return NULL;
+  }
+
+  switch(SIMDBase_detect(mode)) { // any result other than 1, 0 or -1 falls through to the mode switch below
+  case 1:
+    break;
+  case 0:
+    if (errcode != NULL) *errcode = DFT_ERROR_MODE_NOT_AVAILABLE;
+    return NULL;
+  case -1:
+    if (errcode != NULL) *errcode = DFT_ERROR_MODE_NOT_COMPILED_IN;
+    return NULL;
+  }
+
+  switch(mode) { // same mode numbering as DFT_init(); length is converted to the uint64_t parameter n
+#if defined(ENABLE_PUREC_FLOAT)
+  case 1: return makePlanSub_purec_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case 2: return makePlanSub_purec_double(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case 3: return makePlanSub_purec_longdouble(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case 4: return makePlanSub_sse_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case 5: return makePlanSub_sse2_double(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case 6: return makePlanSub_neon_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case 7: return makePlanSub_avx_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case 8: return makePlanSub_avx_double(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case 9: return makePlanSub_altivec_float(length, radix2thres, useCobra, flags);
+#endif
+  }
+
+  if (errcode != NULL) *errcode = DFT_ERROR_UNKNOWN_MODE; // reached only when no case above matched mode
+
+  return NULL;
+}
+
+int32_t DFT_getPlanParamInt(int32_t paramId, void *p2) { // Read one integer property of plan p2; returns -1 for a paramId not handled here.
+  DFTUndiff *p = (DFTUndiff *)p2;
+  if (p->magic != MAGIC_DFT) abort(); // magic mismatch: p2 is not accepted as a plan and the process aborts
+
+  switch(paramId) {
+  case DFT_PARAMID_MODE: return p->mode;
+  case DFT_PARAMID_FFT_LENGTH:
+    // REAL and ALT_REAL plans report twice the stored length, the same doubling DFT_fwrite() applies.
+    if ((p->flags & (DFT_FLAG_REAL | DFT_FLAG_ALT_REAL)) != 0) return p->length * 2;
+    return p->length;
+  case DFT_PARAMID_IS_REAL_TRANSFORM: return (p->flags & DFT_FLAG_REAL) != 0;
+  case DFT_PARAMID_IS_ALT_REAL_TRANSFORM: return (p->flags & DFT_FLAG_ALT_REAL) != 0;
+  case DFT_PARAMID_NO_BIT_REVERSAL: return (p->flags & DFT_FLAG_NO_BITREVERSAL) != 0;
+  case DFT_PARAMID_TEST_RUN: return p->flags & 3; // the two lowest flag bits hold the test-run level
+  default: break;
+  }
+  return -1;
+}
+
+#if 0 // Compiled out: placeholder for a string counterpart of DFT_getPlanParamInt().
+char *DFT_getPlanParamString(int32_t paramId, void *p2) {
+ dft_t *p = (dft_t *)p2; // NOTE(review): uses dft_t / MAGIC_NSDFT, whereas the live code in this file uses DFTUndiff / MAGIC_DFT
+ if (p->magic != MAGIC_NSDFT) abort();
+
+ return NULL; // no parameter is handled; always NULL
+}
+#endif
+
+uint32_t DFT_ilog2(uint32_t q) { // floor(log2(q)) for q >= 1; q == 0 wraps around to UINT32_MAX
+  // bitLen[v]: number of significant bits of the 4-bit value v (0 for v == 0)
+  static const uint32_t bitLen[16] = {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4};
+  const uint32_t base = (q & 0xffff0000) ? 16 : 0; // 16 when the top set bit lies in the upper half-word
+  const uint32_t half = q >> base;                 // the 16 bits that contain the top set bit
+  uint32_t smear, shift;
+
+  smear = half | (half >> 1);
+  smear |= smear >> 2; // bit k is now set iff any of bits k..k+3 of half is set
+  // Bits 4, 8 and 12 of smear flag a non-empty nibble 1, 2 and 3; packed into 3 bits they index bitLen to give the top nibble.
+  shift = 4 * bitLen[((smear >> 4) & 1) | ((smear >> 7) & 2) | ((smear >> 10) & 4)];
+  return base + shift + bitLen[half >> shift] - 1;
+}
+
+double DFT_timeofday(void) { // Current gettimeofday() reading as seconds in a double, with the microseconds as the fractional part.
+  struct timeval now;
+  gettimeofday(&now, NULL); // the return value is not checked
+  return (double)now.tv_sec + (1e-6) * now.tv_usec;
+}
diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.h b/plugins/supereq/nsfft-1.00/dft/DFT.h
new file mode 100644
index 00000000..facb701a
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFT.h
@@ -0,0 +1,56 @@
+#ifndef __DFT_H__
+#define __DFT_H__
+
+#include <stdio.h>
+#include <stdint.h>
+
+typedef void DFT; // Opaque plan handle: callers only ever hold a DFT * (a void *).
+
+int32_t DFT_getParamInt(int32_t paramId); // NOTE(review): the next four functions have no definition in DFT.c - presumably defined elsewhere; confirm.
+char *DFT_getParamString(int32_t paramId);
+
+int32_t DFT_getModeParamInt(int32_t paramId, int32_t mode);
+char *DFT_getModeParamString(int32_t paramId, int32_t mode);
+
+DFT *DFT_init(int32_t mode, uint64_t n, uint64_t flags); // Returns NULL when mode is unknown or its backend is not compiled in.
+void DFT_dispose(DFT *p, int32_t mode); // mode selects the backend destructor; presumably it must be the mode p was created with - confirm.
+
+int32_t DFT_fwrite(DFT *p, FILE *fp); // Returns 1 on success, 0 as soon as one fprintf fails.
+DFT *DFT_fread(FILE *fp, int32_t *errcode); // errcode may be NULL; otherwise it receives one of the DFT_ERROR_* codes below.
+
+int32_t DFT_getPlanParamInt(int32_t paramId, void *p); // paramId is one of DFT_PARAMID_*; returns -1 for ids it does not handle.
+
+void DFT_execute(DFT *p, int32_t mode, void *s, int32_t dir); // Does nothing when mode is unknown or its backend is not compiled in.
+
+uint32_t DFT_ilog2(uint32_t q); // floor(log2(q)) for q >= 1.
+double DFT_timeofday(void); // gettimeofday() reading in seconds.
+
+#define DFT_FLAG_NO_TEST_RUN ( 0ULL << 0) // Bits 0-1 of flags: test-run level (DFT_getPlanParamInt reports flags & 3 for DFT_PARAMID_TEST_RUN).
+#define DFT_FLAG_LIGHT_TEST_RUN ( 1ULL << 0)
+#define DFT_FLAG_HEAVY_TEST_RUN ( 2ULL << 0)
+#define DFT_FLAG_EXHAUSTIVE_TEST_RUN ( 3ULL << 0)
+
+#define DFT_FLAG_REAL (1ULL << 2) // Single-bit options, OR-ed into the flags argument of DFT_init().
+#define DFT_FLAG_ALT_REAL (1ULL << 3)
+#define DFT_FLAG_VERBOSE (1ULL << 4)
+#define DFT_FLAG_NO_BITREVERSAL (1ULL << 5)
+#define DFT_FLAG_FORCE_RECURSIVE (1ULL << 6)
+#define DFT_FLAG_FORCE_COBRA (1ULL << 7)
+
+#define DFT_PARAMID_TYPE ( 1 | ( 3 << 24 )) // Parameter ids: a small index OR-ed with 3 << 24; DFT_PARAMID_TYPE is not handled by DFT_getPlanParamInt() in DFT.c.
+#define DFT_PARAMID_MODE ( 2 | ( 3 << 24 ))
+#define DFT_PARAMID_FFT_LENGTH ( 3 | ( 3 << 24 ))
+#define DFT_PARAMID_IS_REAL_TRANSFORM ( 4 | ( 3 << 24 ))
+#define DFT_PARAMID_IS_ALT_REAL_TRANSFORM ( 5 | ( 3 << 24 ))
+#define DFT_PARAMID_NO_BIT_REVERSAL ( 6 | ( 3 << 24 ))
+#define DFT_PARAMID_TEST_RUN ( 7 | ( 3 << 24 ))
+
+#define DFT_ERROR_NOERROR 0 // Values stored through the errcode argument of DFT_fread().
+#define DFT_ERROR_FILE_VERSION 1 // file format version newer than the reader supports
+#define DFT_ERROR_FILE_IO 2 // a recognised key whose value could not be parsed
+#define DFT_ERROR_UNEXPECTED_EOF 3 // input ended before the "end :" line
+#define DFT_ERROR_MODE_NOT_COMPILED_IN 4 // SIMDBase_detect(mode) returned -1
+#define DFT_ERROR_MODE_NOT_AVAILABLE 5 // SIMDBase_detect(mode) returned 0
+#define DFT_ERROR_UNKNOWN_MODE 6 // no compiled-in backend matched the mode number
+
+#endif
diff --git a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c
new file mode 100644
index 00000000..4985da33
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c
@@ -0,0 +1,1807 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+
+#include "SIMDBase.h"
+#include "SIMDBaseUndiff.h"
+#include "DFT.h"
+#include "DFTUndiff.h"
+
+//
+
+#define SIN(x) sin(x) // SIN/COS forward to the double-precision <math.h> functions; they are not used in the visible part of this file.
+#define COS(x) cos(x)
+
+#define SQRT2_2 .7071067811865475244008443621048490392848359376884740365883398689953L // sqrt(2)/2 as a long double literal; the rotation factor in the 8-point butterflies below
+
+#ifndef M_PIl
+#define M_PIl 3.141592653589793238462643383279502884197169399375105820974944592307L // fallback long double pi, only when M_PIl is not already defined
+#endif
+
+//
+
+static inline void srBut2(DFTUndiff *p) { // In-place sum/difference butterfly on the four vectors s[o..o+3], o = p->offset1.
+ SIMDBase_VECT *s = p->s;
+ int32_t o = p->offset1;
+ SIMDBase_VECT t0, t1;
+
+ t0 = SIMDBase_ADDm(&s[o ], &s[o+2]); t1 = SIMDBase_SUBm(&s[o ], &s[o+2]); // both results are computed before either slot is overwritten
+ SIMDBase_STOR(&s[o ], t0); SIMDBase_STOR(&s[o+2], t1); // s[o] <- sum, s[o+2] <- difference
+ t0 = SIMDBase_ADDm(&s[o+1], &s[o+3]); t1 = SIMDBase_SUBm(&s[o+1], &s[o+3]); // same for the odd pair (presumably the imaginary parts - confirm)
+ SIMDBase_STOR(&s[o+1], t0); SIMDBase_STOR(&s[o+3], t1);
+}
+
+static inline void srButForward4(DFTUndiff *p) { // In-place butterfly on the eight vectors s[o..o+7], o = p->offset1 (presumably 4 complex points stored re,im - confirm).
+ SIMDBase_VECT *s = p->s;
+ int32_t o = p->offset1;
+ SIMDBase_VECT t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i;
+
+ t0r = SIMDBase_ADDm(&s[o+0], &s[o+4]); t2r = SIMDBase_SUBm(&s[o+0], &s[o+4]); // first stage: all eight inputs are consumed here, before the first STOR
+ t0i = SIMDBase_ADDm(&s[o+1], &s[o+5]); t2i = SIMDBase_SUBm(&s[o+1], &s[o+5]);
+ t1r = SIMDBase_ADDm(&s[o+2], &s[o+6]); t3i = SIMDBase_SUBm(&s[o+2], &s[o+6]); // the difference of the o+2/o+6 pair lands in t3i, not t3r
+ t1i = SIMDBase_ADDm(&s[o+7], &s[o+3]); t3r = SIMDBase_SUBm(&s[o+7], &s[o+3]); // operand order is reversed: t3r = s[o+7] - s[o+3]
+
+ SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(t0r, t1r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(t0i, t1i)); // second stage writes the results back in place
+ SIMDBase_STOR(&s[o+2], SIMDBase_SUBi(t0r, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_SUBi(t0i, t1i));
+ SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(t2r, t3r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(t2i, t3i));
+ SIMDBase_STOR(&s[o+6], SIMDBase_ADDi(t2r, t3r)); SIMDBase_STOR(&s[o+7], SIMDBase_ADDi(t2i, t3i));
+}
+
+static inline void srButBackward4(DFTUndiff *p) { // In-place butterfly on the eight vectors s[o..o+7], o = p->offset1; presumably the backward counterpart of srButForward4 - confirm.
+ SIMDBase_VECT *s = p->s;
+ int32_t o = p->offset1;
+
+ SIMDBase_VECT t0r, t0i, t1r, t1i;
+ SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+0]), s1 = SIMDBase_LOAD(&s[o+1]), s2 = SIMDBase_LOAD(&s[o+2]), s3 = SIMDBase_LOAD(&s[o+3]); // lower half is held in locals
+
+ t0r = SIMDBase_ADDi(s0, s2); t0i = SIMDBase_SUBi(s0, s2); s0 = t0r; s2 = t0i; // t0r/t0i serve as scratch here: (s0, s2) <- (s0 + s2, s0 - s2)
+ t0r = SIMDBase_ADDi(s1, s3); t0i = SIMDBase_SUBi(s1, s3); s1 = t0r; s3 = t0i; // (s1, s3) <- (s1 + s3, s1 - s3)
+ t0r = SIMDBase_ADDm(&s[o+4], &s[o+6]); t1i = SIMDBase_SUBm(&s[o+4], &s[o+6]); // upper half is combined straight from memory
+ t0i = SIMDBase_ADDm(&s[o+7], &s[o+5]); t1r = SIMDBase_SUBm(&s[o+7], &s[o+5]); // operand order is reversed: t1r = s[o+7] - s[o+5]
+
+ SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(s1, t0i)); // stores start only after every input has been read
+ SIMDBase_STOR(&s[o+6], SIMDBase_SUBi(s2, t1r)); SIMDBase_STOR(&s[o+7], SIMDBase_SUBi(s3, t1i));
+ SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(s0, t0r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(s1, t0i));
+ SIMDBase_STOR(&s[o+2], SIMDBase_ADDi(s2, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_ADDi(s3, t1i));
+}
+
+static inline void srButForward8(DFTUndiff *p) { // In-place butterfly on the sixteen vectors s[o..o+15], o = p->offset1 (presumably 8 complex points stored re,im - confirm).
+ SIMDBase_VECT *s = p->s;
+ int32_t o = p->offset1;
+ SIMDBase_VECT t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i;
+
+ SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]); // all sixteen inputs are loaded into locals before any store
+ SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]);
+ SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]);
+ SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]);
+
+ t2r = SIMDBase_SUBi(s0, s8); t2i = SIMDBase_SUBi(s1, s9); // differences for slots 0,1,4,5 against 8,9,c,d; they must be taken before s0/s1/s4/s5 are overwritten below
+ t3r = SIMDBase_SUBi(sd, s5); t3i = SIMDBase_SUBi(s4, sc);
+
+ s0 = SIMDBase_ADDi(s0, s8); s1 = SIMDBase_ADDi(s1, s9); // sums stay in s0,s1,s4,s5 for the final stage
+ s4 = SIMDBase_ADDi(s4, sc); s5 = SIMDBase_ADDi(s5, sd);
+
+ s8 = SIMDBase_SUBi(t2r, t3r); s9 = SIMDBase_SUBi(t2i, t3i);
+ sc = SIMDBase_ADDi(t2r, t3r); sd = SIMDBase_ADDi(t2i, t3i);
+
+ t2r = SIMDBase_SUBi(s2, sa); t2i = SIMDBase_SUBi(s3, sb); // same pattern for slots 2,3,6,7 against a,b,e,f
+ t3r = SIMDBase_SUBi(sf, s7); t3i = SIMDBase_SUBi(s6, se);
+
+ s2 = SIMDBase_ADDi(s2, sa); s3 = SIMDBase_ADDi(s3, sb);
+ s6 = SIMDBase_ADDi(s6, se); s7 = SIMDBase_ADDi(s7, sf);
+
+ t0r = SIMDBase_SUBi(t2r, t3r); t1r = SIMDBase_ADDi(t2r, t3r);
+ t0i = SIMDBase_SUBi(t2i, t3i); t1i = SIMDBase_ADDi(t2i, t3i);
+
+ sa = SIMDBase_MULi(SIMDBase_ADDi(t0r, t0i), SIMDBase_SET1( SQRT2_2)); // scale by +/- sqrt(2)/2; note the negative constant on sf only
+ sb = SIMDBase_MULi(SIMDBase_SUBi(t0i, t0r), SIMDBase_SET1( SQRT2_2));
+ se = SIMDBase_MULi(SIMDBase_SUBi(t1i, t1r), SIMDBase_SET1( SQRT2_2));
+ sf = SIMDBase_MULi(SIMDBase_ADDi(t1r, t1i), SIMDBase_SET1(-SQRT2_2));
+
+ SIMDBase_STOR(&s[o+ 8], SIMDBase_ADDi(s8, sa)); SIMDBase_STOR(&s[o+ 9], SIMDBase_ADDi(s9, sb)); // upper eight outputs
+ SIMDBase_STOR(&s[o+10], SIMDBase_SUBi(s8, sa)); SIMDBase_STOR(&s[o+11], SIMDBase_SUBi(s9, sb));
+
+ SIMDBase_STOR(&s[o+12], SIMDBase_ADDi(sc, se)); SIMDBase_STOR(&s[o+13], SIMDBase_ADDi(sd, sf));
+ SIMDBase_STOR(&s[o+14], SIMDBase_SUBi(sc, se)); SIMDBase_STOR(&s[o+15], SIMDBase_SUBi(sd, sf));
+
+ t0r = SIMDBase_ADDi(s0, s4); t2r = SIMDBase_SUBi(s0, s4); // lower eight: same operations as srButForward4, applied to the sums kept in s0..s7
+ t0i = SIMDBase_ADDi(s1, s5); t2i = SIMDBase_SUBi(s1, s5);
+
+ t1r = SIMDBase_ADDi(s2, s6); t3i = SIMDBase_SUBi(s2, s6);
+ t1i = SIMDBase_ADDi(s3, s7); t3r = SIMDBase_SUBi(s7, s3);
+
+ SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(t0r, t1r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(t0i, t1i));
+ SIMDBase_STOR(&s[o+2], SIMDBase_SUBi(t0r, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_SUBi(t0i, t1i));
+ SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(t2r, t3r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(t2i, t3i));
+ SIMDBase_STOR(&s[o+6], SIMDBase_ADDi(t2r, t3r)); SIMDBase_STOR(&s[o+7], SIMDBase_ADDi(t2i, t3i));
+}
+
+static void srButBackward8(DFTUndiff *p) { // In-place butterfly on the sixteen vectors s[o..o+15], o = p->offset1; unlike its siblings it is not declared inline.
+ SIMDBase_VECT *s = p->s;
+ int32_t o = p->offset1;
+ SIMDBase_VECT t0r, t0i, t1r, t1i;
+
+ SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]); // all sixteen inputs are loaded first; memory is written only by the STORs at the end
+ SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]);
+ SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]);
+ SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]);
+
+ t0r = SIMDBase_ADDi(s8, sa); t0i = SIMDBase_SUBi(s8, sa); s8 = t0r; sa = t0i; // six sum/difference pairs; t0r/t0i are only scratch in these lines
+ t0r = SIMDBase_ADDi(s9, sb); t0i = SIMDBase_SUBi(s9, sb); s9 = t0r; sb = t0i;
+ t0r = SIMDBase_ADDi(sc, se); t0i = SIMDBase_SUBi(sc, se); sc = t0r; se = t0i;
+ t0r = SIMDBase_ADDi(sd, sf); t0i = SIMDBase_SUBi(sd, sf); sd = t0r; sf = t0i;
+ t0r = SIMDBase_ADDi(s0, s2); t0i = SIMDBase_SUBi(s0, s2); s0 = t0r; s2 = t0i;
+ t0r = SIMDBase_ADDi(s1, s3); t0i = SIMDBase_SUBi(s1, s3); s1 = t0r; s3 = t0i;
+
+ t0r = SIMDBase_ADDi(s4, s6); t0i = SIMDBase_ADDi(s7, s5); // combine s4..s7 into s0..s7 (same operations as srButBackward4 on the lower eight)
+ t1r = SIMDBase_SUBi(s7, s5); t1i = SIMDBase_SUBi(s4, s6);
+
+ s4 = SIMDBase_SUBi(s0, t0r); s5 = SIMDBase_SUBi(s1, t0i); // the SUB results must be formed before s0..s3 are updated by the ADDs
+ s6 = SIMDBase_SUBi(s2, t1r); s7 = SIMDBase_SUBi(s3, t1i);
+ s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i);
+ s2 = SIMDBase_ADDi(s2, t1r); s3 = SIMDBase_ADDi(s3, t1i);
+
+ t0r = SIMDBase_ADDi(s8, sc); t0i = SIMDBase_ADDi(s9, sd); // fold the unscaled upper terms (s8,s9,sc,sd) into slots 0,1,4,5
+ t1r = SIMDBase_SUBi(sd, s9); t1i = SIMDBase_SUBi(s8, sc);
+
+ s8 = SIMDBase_SUBi(s0, t0r); s9 = SIMDBase_SUBi(s1, t0i);
+ sc = SIMDBase_SUBi(s4, t1r); sd = SIMDBase_SUBi(s5, t1i);
+ s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i);
+ s4 = SIMDBase_ADDi(s4, t1r); s5 = SIMDBase_ADDi(s5, t1i);
+
+ t0r = SIMDBase_MULi(SIMDBase_SUBi(sa, sb), SIMDBase_SET1( SQRT2_2)); // scale by +/- sqrt(2)/2; note the negative constant on t1r only
+ t0i = SIMDBase_MULi(SIMDBase_ADDi(sa, sb), SIMDBase_SET1( SQRT2_2));
+ t1r = SIMDBase_MULi(SIMDBase_ADDi(se, sf), SIMDBase_SET1(-SQRT2_2));
+ t1i = SIMDBase_MULi(SIMDBase_SUBi(se, sf), SIMDBase_SET1( SQRT2_2));
+
+ sa = t0r; sb = t0i; se = t1r; sf = t1i;
+
+ t0r = SIMDBase_ADDi(sa, se); t0i = SIMDBase_ADDi(sb, sf); // fold the scaled terms into slots 2,3,6,7
+ t1r = SIMDBase_SUBi(sf, sb); t1i = SIMDBase_SUBi(sa, se);
+
+ sa = SIMDBase_SUBi(s2, t0r); sb = SIMDBase_SUBi(s3, t0i);
+ se = SIMDBase_SUBi(s6, t1r); sf = SIMDBase_SUBi(s7, t1i);
+ s2 = SIMDBase_ADDi(s2, t0r); s3 = SIMDBase_ADDi(s3, t0i);
+ s6 = SIMDBase_ADDi(s6, t1r); s7 = SIMDBase_ADDi(s7, t1i);
+
+ SIMDBase_STOR(&s[o+ 0], s0); SIMDBase_STOR(&s[o+ 1], s1); SIMDBase_STOR(&s[o+ 2], s2); SIMDBase_STOR(&s[o+ 3], s3); // write all sixteen results back in slot order
+ SIMDBase_STOR(&s[o+ 4], s4); SIMDBase_STOR(&s[o+ 5], s5); SIMDBase_STOR(&s[o+ 6], s6); SIMDBase_STOR(&s[o+ 7], s7);
+ SIMDBase_STOR(&s[o+ 8], s8); SIMDBase_STOR(&s[o+ 9], s9); SIMDBase_STOR(&s[o+10], sa); SIMDBase_STOR(&s[o+11], sb);
+ SIMDBase_STOR(&s[o+12], sc); SIMDBase_STOR(&s[o+13], sd); SIMDBase_STOR(&s[o+14], se); SIMDBase_STOR(&s[o+15], sf);
+}
+
+#if 0 // Compiled out: non-unrolled form of the butterfly that srButForwardSubUnrolled (below) repeats per loop pass.
+static inline void srButForwardSub(DFTUndiff *p) {
+ SIMDBase_VECT *s = p->s;
+ SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; // coefficient table selected by p->log2butlen
+
+ int32_t i0 = p->offset1; // four read/write cursors, p->stride apart
+ int32_t i1 = i0 + p->stride;
+ int32_t i2 = i1 + p->stride;
+ int32_t i3 = i2 + p->stride;
+ int32_t im = i1; // loop bound: i0 sweeps the first stride-sized span
+
+ int32_t p0 = p->offset2 & (p->butlen*4-1); // table index; the mask equals a modulo when p->butlen*4 is a power of two (presumably - confirm)
+
+ while(i0 < im) {
+ SIMDBase_VECT t0r, t0i, t1r, t1i;
+ SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+ SIMDBase_VECT a0, a1, a2, a3;
+
+ s00 = SIMDBase_LOAD(&s[i0+0]), s01 = SIMDBase_LOAD(&s[i0+1]); // comma operator: two assignments per statement
+ s10 = SIMDBase_LOAD(&s[i1+0]), s11 = SIMDBase_LOAD(&s[i1+1]);
+ s20 = SIMDBase_LOAD(&s[i2+0]), s21 = SIMDBase_LOAD(&s[i2+1]);
+ s30 = SIMDBase_LOAD(&s[i3+0]), s31 = SIMDBase_LOAD(&s[i3+1]);
+
+ t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+ t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+ t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+ t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+ a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]); // four table entries per pass
+ a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]);
+
+ SIMDBase_STOR(&s[i0 ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21)); // plain sums go to the i0/i1 slots
+ SIMDBase_STOR(&s[i1 ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+ SIMDBase_STOR(&s[i2 ], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); // (t0r, t0i) combined with (a0, a1) in complex-multiply form
+ SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+ SIMDBase_STOR(&s[i3 ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+ SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+ SIMDBase_STOR(&s[i2 ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); // fused variant; presumably equivalent to the branch above - FMSUBi/FMADDi operand convention not visible here
+ SIMDBase_STOR(&s[i2+1], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+ SIMDBase_STOR(&s[i3 ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+ SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+ i0 += 2; i1 += 2; i2 += 2; i3 += 2; // advance two vectors and four table entries per pass
+ p0 += 4;
+ }
+}
+#endif
+
+#if 0 // Compiled out: backward butterfly loop plus a stub named srButBackwardSubUnrolled that only forwards to it.
+static inline void srButBackwardSub(DFTUndiff *p) {
+ SIMDBase_VECT *s = p->s;
+ SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; // coefficient table selected by p->log2butlen
+
+ int32_t i0 = p->offset1; // four read/write cursors, p->stride apart
+ int32_t i1 = i0 + p->stride;
+ int32_t i2 = i1 + p->stride;
+ int32_t i3 = i2 + p->stride;
+ int32_t im = i1; // loop bound: i0 sweeps the first stride-sized span
+
+ int32_t p0 = p->offset2 & (p->butlen*4-1); // table index; the mask equals a modulo when p->butlen*4 is a power of two (presumably - confirm)
+
+ while(i0 < im) {
+ SIMDBase_VECT t0r, t0i, t1r, t1i, u, v;
+ SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+ SIMDBase_VECT a0, a1, a2, a3;
+
+ s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]);
+ a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]);
+ u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); // s20*a0 + s21*a1
+
+ s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]);
+ a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]);
+ v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); // s30*a2 + s31*a3
+
+ t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+ u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); // u and v are reused for the second pair of products
+ v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+ t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+ s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]);
+ s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]);
+
+ SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s00, t0r)); // differences go to the i2/i3 slots, sums to i0/i1
+ SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, t0i));
+ SIMDBase_STOR(&s[i3+0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+0], SIMDBase_ADDi(s10, t1r));
+ SIMDBase_STOR(&s[i3+1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, t1i));
+
+ i0 += 2; i1 += 2; i2 += 2; i3 += 2; // advance two vectors and four table entries per pass
+ p0 += 4;
+ }
+}
+
+static void srButBackwardSubUnrolled(DFTUndiff *p) { // no unrolling here: simply calls the loop above
+ srButBackwardSub(p);
+}
+#endif
+
+static inline void srButForwardSubUnrolled(DFTUndiff *p) {
+ SIMDBase_VECT *s = p->s;
+ SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+
+ int32_t i0 = p->offset1;
+ int32_t i1 = i0 + p->stride;
+ int32_t i2 = i1 + p->stride;
+ int32_t i3 = i2 + p->stride;
+ int32_t im = i1;
+
+ int32_t p0 = p->offset2 & (p->butlen*4-1);
+
+ while(i0 < im) {
+ SIMDBase_VECT t0r, t0i, t1r, t1i;
+ SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+ SIMDBase_VECT a0, a1, a2, a3;
+
+ //
+
+ s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]);
+ s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]);
+ s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]);
+ s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]);
+
+ t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+ t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+ t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+ t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+ a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]);
+ a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]);
+
+ SIMDBase_STOR(&s[i0 ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21));
+ SIMDBase_STOR(&s[i1 ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+ SIMDBase_STOR(&s[i2 ], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+ SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+ SIMDBase_STOR(&s[i3 ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+ SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+ SIMDBase_STOR(&s[i2 ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+ SIMDBase_STOR(&s[i2+1], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+ SIMDBase_STOR(&s[i3 ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+ SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+ //
+
+ s00 = SIMDBase_LOAD(&s[i0+2]); s01 = SIMDBase_LOAD(&s[i0+3]);
+ s10 = SIMDBase_LOAD(&s[i1+2]); s11 = SIMDBase_LOAD(&s[i1+3]);
+ s20 = SIMDBase_LOAD(&s[i2+2]); s21 = SIMDBase_LOAD(&s[i2+3]);
+ s30 = SIMDBase_LOAD(&s[i3+2]); s31 = SIMDBase_LOAD(&s[i3+3]);
+
+ t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+ t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+ t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+ t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+ a0 = SIMDBase_LOAD1(&tbl[p0+4]); a1 = SIMDBase_LOAD1(&tbl[p0+5]);
+ a2 = SIMDBase_LOAD1(&tbl[p0+6]); a3 = SIMDBase_LOAD1(&tbl[p0+7]);
+
+ SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s01, s21));
+ SIMDBase_STOR(&s[i1+2], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+3], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+ SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+ SIMDBase_STOR(&s[i2+3], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+ SIMDBase_STOR(&s[i3+2], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+ SIMDBase_STOR(&s[i3+3], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+ SIMDBase_STOR(&s[i2+2], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+ SIMDBase_STOR(&s[i2+3], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+ SIMDBase_STOR(&s[i3+2], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+ SIMDBase_STOR(&s[i3+3], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+ //
+
+ s00 = SIMDBase_LOAD(&s[i0+4]); s01 = SIMDBase_LOAD(&s[i0+5]);
+ s10 = SIMDBase_LOAD(&s[i1+4]); s11 = SIMDBase_LOAD(&s[i1+5]);
+ s20 = SIMDBase_LOAD(&s[i2+4]); s21 = SIMDBase_LOAD(&s[i2+5]);
+ s30 = SIMDBase_LOAD(&s[i3+4]); s31 = SIMDBase_LOAD(&s[i3+5]);
+
+ t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+ t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+ t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+ t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+ a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]);
+ a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]);
+
+ SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s01, s21));
+ SIMDBase_STOR(&s[i1+4], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+5], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+ SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+ SIMDBase_STOR(&s[i2+5], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+ SIMDBase_STOR(&s[i3+4], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+ SIMDBase_STOR(&s[i3+5], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+ SIMDBase_STOR(&s[i2+4], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+ SIMDBase_STOR(&s[i2+5], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+ SIMDBase_STOR(&s[i3+4], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+ SIMDBase_STOR(&s[i3+5], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+ //
+
+ s00 = SIMDBase_LOAD(&s[i0+6]); s01 = SIMDBase_LOAD(&s[i0+7]);
+ s10 = SIMDBase_LOAD(&s[i1+6]); s11 = SIMDBase_LOAD(&s[i1+7]);
+ s20 = SIMDBase_LOAD(&s[i2+6]); s21 = SIMDBase_LOAD(&s[i2+7]);
+ s30 = SIMDBase_LOAD(&s[i3+6]); s31 = SIMDBase_LOAD(&s[i3+7]);
+
+ t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+ t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+ t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+ t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+ a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]);
+ a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]);
+
+ SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s01, s21));
+ SIMDBase_STOR(&s[i1+6], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+7], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+ SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+ SIMDBase_STOR(&s[i2+7], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+ SIMDBase_STOR(&s[i3+6], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+ SIMDBase_STOR(&s[i3+7], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+ SIMDBase_STOR(&s[i2+6], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+ SIMDBase_STOR(&s[i2+7], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+ SIMDBase_STOR(&s[i3+6], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+ SIMDBase_STOR(&s[i3+7], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+ //
+
+ i0 += 8; i1 += 8; i2 += 8; i3 += 8;
+ p0 += 16;
+ }
+}
+
+#if 1
+// Backward-direction butterfly pass over four equally spaced runs of p->s,
+// hand-unrolled to four butterflies per loop iteration (the "sr" prefix
+// presumably stands for split-radix -- confirm against the library docs).
+//
+// Fields read from p: s (work array, updated in place), ptTable[log2butlen]
+// (coefficient table), offset1 (first slot of run 0), stride (distance between
+// consecutive runs), offset2 and butlen (starting table index is
+// offset2 & (butlen*4-1)).
+//
+// One butterfly touches two consecutive vector slots in each run (the locals are
+// named ...r / ...i, so presumably real and imaginary parts) and four consecutive
+// table entries a0..a3. The run-2 pair is combined with (a0,a1) and the run-3
+// pair with (a2,a3) to form t0r/t0i/t1r/t1i; runs 0 and 1 are then overwritten
+// with s + t and runs 2 and 3 with s - t.
+//
+// The SIMDBase_FMADD_AVAILABLE branches mirror the plain multiply/add branches;
+// NOTE(review): this relies on FMADDi(x,y,z) = x*y + z and FMSUBi(x,y,z) = z - x*y,
+// which is inferred from the paired branches here -- confirm in SIMDBase.h.
+//
+// The loop advances 8 slots and 16 table entries per iteration and ends when the
+// run-0 cursor reaches the original start of run 1; stride is therefore assumed
+// to be a multiple of 8 -- TODO confirm with callers.
+static void srButBackwardSubUnrolled(DFTUndiff *p) {
+ SIMDBase_VECT *s = p->s;
+ SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+
+ int32_t i0 = p->offset1;
+ int32_t i1 = i0 + p->stride;
+ int32_t i2 = i1 + p->stride;
+ int32_t i3 = i2 + p->stride;
+ int32_t im = i1; // loop bound: initial start of run 1
+
+ int32_t p0 = p->offset2 & (p->butlen*4-1);
+
+ while(i0 < im) {
+ SIMDBase_VECT t0r, t0i, t1r, t1i, u, v;
+ SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+ SIMDBase_VECT a0, a1, a2, a3;
+
+ // butterfly 0: slots +0/+1 of each run, table entries p0+0 .. p0+3
+
+ s20 = SIMDBase_LOAD(&s[i2+ 0]); s21 = SIMDBase_LOAD(&s[i2+ 1]);
+ a0 = SIMDBase_LOAD1(&tbl[p0+ 0]); a1 = SIMDBase_LOAD1(&tbl[p0+ 1]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+ u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+ u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+
+ s30 = SIMDBase_LOAD(&s[i3+ 0]); s31 = SIMDBase_LOAD(&s[i3+ 1]);
+ a2 = SIMDBase_LOAD1(&tbl[p0+ 2]); a3 = SIMDBase_LOAD1(&tbl[p0+ 3]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+ v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+ v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+ t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+ u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+ v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+ u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+ v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+ t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+ s00 = SIMDBase_LOAD(&s[i0+ 0]); s01 = SIMDBase_LOAD(&s[i0+ 1]);
+ s10 = SIMDBase_LOAD(&s[i1+ 0]); s11 = SIMDBase_LOAD(&s[i1+ 1]);
+
+ SIMDBase_STOR(&s[i2+ 0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 0], SIMDBase_ADDi(s00, t0r));
+ SIMDBase_STOR(&s[i2+ 1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 1], SIMDBase_ADDi(s01, t0i));
+ SIMDBase_STOR(&s[i3+ 0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 0], SIMDBase_ADDi(s10, t1r));
+ SIMDBase_STOR(&s[i3+ 1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 1], SIMDBase_ADDi(s11, t1i));
+
+ // butterfly 1: slots +2/+3 of each run, table entries p0+4 .. p0+7
+
+ s20 = SIMDBase_LOAD(&s[i2+ 2]); s21 = SIMDBase_LOAD(&s[i2+ 3]);
+ a0 = SIMDBase_LOAD1(&tbl[p0+ 4]); a1 = SIMDBase_LOAD1(&tbl[p0+ 5]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+ u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+ u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+
+ s30 = SIMDBase_LOAD(&s[i3+ 2]); s31 = SIMDBase_LOAD(&s[i3+ 3]);
+ a2 = SIMDBase_LOAD1(&tbl[p0+ 6]); a3 = SIMDBase_LOAD1(&tbl[p0+ 7]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+ v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+ v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+ t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+ u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+ v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+ u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+ v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+ t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+ s00 = SIMDBase_LOAD(&s[i0+ 2]); s01 = SIMDBase_LOAD(&s[i0+ 3]);
+ s10 = SIMDBase_LOAD(&s[i1+ 2]); s11 = SIMDBase_LOAD(&s[i1+ 3]);
+
+ SIMDBase_STOR(&s[i2+ 2], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 2], SIMDBase_ADDi(s00, t0r));
+ SIMDBase_STOR(&s[i2+ 3], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 3], SIMDBase_ADDi(s01, t0i));
+ SIMDBase_STOR(&s[i3+ 2], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 2], SIMDBase_ADDi(s10, t1r));
+ SIMDBase_STOR(&s[i3+ 3], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 3], SIMDBase_ADDi(s11, t1i));
+
+ // butterfly 2: slots +4/+5 of each run, table entries p0+8 .. p0+11
+
+ s20 = SIMDBase_LOAD(&s[i2+ 4]); s21 = SIMDBase_LOAD(&s[i2+ 5]);
+ a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+ u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+ u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+
+ s30 = SIMDBase_LOAD(&s[i3+ 4]); s31 = SIMDBase_LOAD(&s[i3+ 5]);
+ a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+ v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+ v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+ t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+ u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+ v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+ u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+ v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+ t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+ s00 = SIMDBase_LOAD(&s[i0+ 4]); s01 = SIMDBase_LOAD(&s[i0+ 5]);
+ s10 = SIMDBase_LOAD(&s[i1+ 4]); s11 = SIMDBase_LOAD(&s[i1+ 5]);
+
+ SIMDBase_STOR(&s[i2+ 4], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 4], SIMDBase_ADDi(s00, t0r));
+ SIMDBase_STOR(&s[i2+ 5], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 5], SIMDBase_ADDi(s01, t0i));
+ SIMDBase_STOR(&s[i3+ 4], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 4], SIMDBase_ADDi(s10, t1r));
+ SIMDBase_STOR(&s[i3+ 5], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 5], SIMDBase_ADDi(s11, t1i));
+
+ // butterfly 3: slots +6/+7 of each run, table entries p0+12 .. p0+15
+
+ s20 = SIMDBase_LOAD(&s[i2+ 6]); s21 = SIMDBase_LOAD(&s[i2+ 7]);
+ a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+ u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+ u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+
+ s30 = SIMDBase_LOAD(&s[i3+ 6]); s31 = SIMDBase_LOAD(&s[i3+ 7]);
+ a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+ v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+ v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+ t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+ u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+ v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+ u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+ v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+ t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+ s00 = SIMDBase_LOAD(&s[i0+ 6]); s01 = SIMDBase_LOAD(&s[i0+ 7]);
+ s10 = SIMDBase_LOAD(&s[i1+ 6]); s11 = SIMDBase_LOAD(&s[i1+ 7]);
+
+ SIMDBase_STOR(&s[i2+ 6], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 6], SIMDBase_ADDi(s00, t0r));
+ SIMDBase_STOR(&s[i2+ 7], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 7], SIMDBase_ADDi(s01, t0i));
+ SIMDBase_STOR(&s[i3+ 6], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 6], SIMDBase_ADDi(s10, t1r));
+ SIMDBase_STOR(&s[i3+ 7], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 7], SIMDBase_ADDi(s11, t1i));
+
+ // step all four run cursors and the table index past the four butterflies just done
+
+ i0 += 8; i1 += 8; i2 += 8; i3 += 8;
+ p0 += 16;
+ }
+}
+#endif
+
+// Forward-direction two-input butterfly pass between two runs of p->s that lie
+// stride*2 slots apart (the "r2" prefix presumably means radix-2).
+//
+// Fields read from p: s (work array, updated in place), ptTable[log2butlen]
+// (coefficient table), offset1 (first slot of the upper run), stride, butlen.
+//
+// Each butterfly uses two consecutive slots per run (locals named ...r / ...i,
+// so presumably real and imaginary parts) and two table entries t0 = tbl[cp..],
+// t1 = tbl[sp..]; the names suggest cosine/sine positions in a quarter-wave
+// table, but that layout is not visible here -- TODO confirm.
+// The upper run receives the sum, the lower run the difference multiplied by
+// the coefficient pair.
+//
+// The table is traversed in two sweeps, four butterflies (8 slots) per iteration:
+// cp rising from 0 while sp falls from butlen/4, then the reverse with a
+// different sign pattern. Both loops are do-while, so each body runs at least once.
+static void r2ButForwardSub(DFTUndiff *p) {
+ SIMDBase_VECT *s = p->s;
+
+ SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+
+ int32_t i0 = p->offset1;
+ int32_t i2 = i0 + p->stride*2;
+ int32_t cp = 0, sp = p->butlen/4;
+
+ // First sweep: lower run <- (t0r*t0 + t0i*t1, t0i*t0 - t0r*t1).
+ do {
+ SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+
+ s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+ s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+ t0 = SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]);
+ t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1));
+ t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3));
+ SIMDBase_STOR(&s[i2+0], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+ SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+ s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+ s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+ t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]);
+ t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1));
+ t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3));
+ SIMDBase_STOR(&s[i2+2], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+ SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+ s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+ s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+ t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]);
+ t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1));
+ t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3));
+ SIMDBase_STOR(&s[i2+4], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+ SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+ s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+ s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+ t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = SIMDBase_LOAD1(&tbl[sp-3]);
+ t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1));
+ t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3));
+ SIMDBase_STOR(&s[i2+6], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+ SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+ // next group of four butterflies: cp moves up the table, sp moves down
+
+ i0 += 8; i2 += 8; cp += 4; sp -= 4;
+ } while(sp > 0);
+
+ // Second sweep: lower run <- (t0i*t1 - t0r*t0, -(t0i*t0 + t0r*t1)).
+ do {
+ SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+
+ s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+ s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+ t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]);
+ t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1));
+ t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3));
+ SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+ SIMDBase_STOR(&s[i2+1], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+ s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+ s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+ t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = SIMDBase_LOAD1(&tbl[sp+1]);
+ t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1));
+ t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3));
+ SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+ SIMDBase_STOR(&s[i2+3], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+ s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+ s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+ t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]);
+ t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1));
+ t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3));
+ SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+ SIMDBase_STOR(&s[i2+5], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+ s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+ s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+ t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]);
+ t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1));
+ t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3));
+ SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+ SIMDBase_STOR(&s[i2+7], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+ // next group of four butterflies: cp moves back down the table, sp moves up
+
+ i0 += 8; i2 += 8; cp -= 4; sp += 4;
+ } while(cp > 0);
+}
+
+// Backward-direction two-input butterfly pass between two runs of p->s that lie
+// stride*2 slots apart (the "r2" prefix presumably means radix-2).
+//
+// Fields read from p: s (work array, updated in place), ptTable[log2butlen]
+// (coefficient table), offset1 (first slot of the upper run), stride, butlen.
+//
+// Each butterfly first multiplies the lower-run pair (s1, s3) by the coefficient
+// pair t0 = tbl[cp..], t1 = tbl[sp..] to form (t0r, t0i), then writes
+// upper + t to the upper run and upper - t to the lower run. The names cp/sp
+// suggest cosine/sine positions in a quarter-wave table, but that layout is not
+// visible here -- TODO confirm.
+//
+// The table is traversed in two sweeps, four butterflies (8 slots) per iteration:
+// cp rising from 0 while sp falls from butlen/4, then the reverse with a
+// different sign pattern. Both loops are do-while, so each body runs at least once.
+static void r2ButBackwardSub(DFTUndiff *p) {
+ SIMDBase_VECT *s = p->s;
+
+ SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+
+ int i0 = p->offset1;
+ int i2 = i0 + p->stride*2;
+
+ int cp = 0, sp = p->butlen/4;
+
+ // First sweep: t = (s1*t0 - s3*t1, s1*t1 + s3*t0).
+ do {
+ SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+
+ s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+ s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+ t0 = SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]);
+ t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+ t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+ SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r));
+ SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i));
+
+ s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+ s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+ t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]);
+ t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+ t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+ SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r));
+ SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i));
+
+ s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+ s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+ t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]);
+ t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+ t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+ SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r));
+ SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i));
+
+ s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+ s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+ t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = SIMDBase_LOAD1(&tbl[sp-3]);
+ t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+ t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+ SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r));
+ SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i));
+
+ i0 += 8; i2 += 8; cp += 4; sp -= 4; // cp moves up the table, sp moves down
+ } while(sp > 0);
+
+ // Second sweep: t = (-(s1*t0 + s3*t1), s1*t1 - s3*t0).
+ do {
+ SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+
+ s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+ s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+ t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]);
+ t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+ t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+ SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r));
+ SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i));
+
+ s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+ s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+ t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = SIMDBase_LOAD1(&tbl[sp+1]);
+ t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+ t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+ SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r));
+ SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i));
+
+ s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+ s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+ t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]);
+ t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+ t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+ SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r));
+ SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i));
+
+ s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+ s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+ t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]);
+ t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+ t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+ SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r));
+ SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i));
+
+ i0 += 8; i2 += 8; cp -= 4; sp += 4; // cp moves back down the table, sp moves up
+ } while(cp > 0);
+}
+
+// Forward butterfly of length 16 starting at p->offset1.
+// Runs the unrolled length-16 pass first, then the two length-4 butterflies at
+// offsets +24 and +16, and finally the length-8 butterfly at the original offset.
+// Leaves p->offset1 equal to its entry value before the final call; butlen,
+// log2butlen and stride are overwritten.
+static void srButForward16(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+
+  p->butlen = 16;
+  p->log2butlen = 4;
+  p->stride = p->butlen/2;
+  srButForwardSubUnrolled(p);
+
+  p->offset1 = base + 24;   // 16*6/4
+  srButForward4(p);
+
+  p->offset1 = base + 16;   // 16*4/4
+  srButForward4(p);
+
+  p->offset1 = base;
+  srButForward8(p);
+}
+
+// Backward butterfly of length 16 starting at p->offset1.
+// Mirror image of the forward order: the two length-4 butterflies at offsets
+// +24 and +16 and the length-8 butterfly at the original offset run first, and
+// the unrolled length-16 pass runs last.
+// butlen, log2butlen and stride are overwritten just before that last pass.
+static void srButBackward16(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+
+  p->offset1 = base + 24;   // 16*6/4
+  srButBackward4(p);
+
+  p->offset1 = base + 16;   // 16*4/4
+  srButBackward4(p);
+
+  p->offset1 = base;
+  srButBackward8(p);
+
+  p->butlen = 16;
+  p->log2butlen = 4;
+  p->stride = p->butlen/2;
+  srButBackwardSubUnrolled(p);
+}
+
+// Forward butterfly of length 32 starting at p->offset1.
+// Runs the unrolled length-32 pass first, then the two length-8 butterflies at
+// offsets +48 and +32, and finally the length-16 butterfly at the original offset.
+// butlen, log2butlen and stride are overwritten.
+static void srButForward32(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+
+  p->butlen = 32;
+  p->log2butlen = 5;
+  p->stride = p->butlen/2;
+  srButForwardSubUnrolled(p);
+
+  p->offset1 = base + 48;   // 32*6/4
+  srButForward8(p);
+
+  p->offset1 = base + 32;   // 32*4/4
+  srButForward8(p);
+
+  p->offset1 = base;
+  srButForward16(p);
+}
+
+// Backward butterfly of length 32 starting at p->offset1.
+// The two length-8 butterflies at offsets +48 and +32 and the length-16
+// butterfly at the original offset run first; the unrolled length-32 pass
+// runs last, after butlen, log2butlen and stride have been set for it.
+static void srButBackward32(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+
+  p->offset1 = base + 48;   // 32*6/4
+  srButBackward8(p);
+
+  p->offset1 = base + 32;   // 32*4/4
+  srButBackward8(p);
+
+  p->offset1 = base;
+  srButBackward16(p);
+
+  p->butlen = 32;
+  p->log2butlen = 5;
+  p->stride = p->butlen/2;
+  srButBackwardSubUnrolled(p);
+}
+
+//
+
+#if 1
+// Exchange the two-slot element at p (p[0], p[1]) with the two-slot element at
+// q (q[0], q[1]). All four loads happen before any store.
+static inline void bitReversalUnit(SIMDBase_VECT *p, SIMDBase_VECT *q) {
+  SIMDBase_VECT pLo, pHi, qLo, qHi;
+
+  pLo = SIMDBase_LOAD(p);
+  pHi = SIMDBase_LOAD(p+1);
+  qLo = SIMDBase_LOAD(q);
+  qHi = SIMDBase_LOAD(q+1);
+
+  SIMDBase_STOR(q, pLo);
+  SIMDBase_STOR(q+1, pHi);
+  SIMDBase_STOR(p, qLo);
+  SIMDBase_STOR(p+1, qHi);
+}
+#else
+// Macro alternative to the inline bitReversalUnit; it sits in the #else branch of
+// an "#if 1", so it is not compiled as the file stands. Same exchange of the
+// two-slot elements at p0 and q0. The arguments are evaluated once each into px/qx.
+#define bitReversalUnit(p0, q0) { \
+ SIMDBase_VECT *px = (p0), *qx = (q0); \
+ SIMDBase_VECT wx, xx, yx, zx; \
+ \
+ wx = SIMDBase_LOAD(px); xx = SIMDBase_LOAD(px+1); \
+ yx = SIMDBase_LOAD(qx); zx = SIMDBase_LOAD(qx+1); \
+ \
+ SIMDBase_STOR(qx, wx); SIMDBase_STOR(qx+1, xx); \
+ SIMDBase_STOR(px, yx); SIMDBase_STOR(px+1, zx); \
+}
+#endif
+
+// 4-element case: exchange element 1 of the block at o1 with element 2 of the
+// block at o2. Elements are sc apart and each occupies two vector slots,
+// hence the factor of 2 in every offset.
+static inline void bitReversal4s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  SIMDBase_VECT *const src = &s[o1*2];
+  SIMDBase_VECT *const dst = &s[o2*2];
+  const int32_t unit = sc*2;
+
+  bitReversalUnit(src + 1*unit, dst + 2*unit);
+}
+
+// 8-element case used when both offsets name the same block: performs the two
+// exchanges 1<->4 and 3<->6 (3-bit reversed index pairs). Elements are sc
+// apart and each occupies two vector slots.
+static inline void bitReversal8s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  SIMDBase_VECT *const src = &s[o1*2];
+  SIMDBase_VECT *const dst = &s[o2*2];
+  const int32_t unit = sc*2;
+
+  bitReversalUnit(src + 1*unit, dst + 4*unit);
+  bitReversalUnit(src + 3*unit, dst + 6*unit);
+}
+
+// 8-element case for two different blocks: every element k of the block at o1
+// is exchanged with element bitrev3(k) of the block at o2, in the same visiting
+// order as the pointer walk it replaces. Elements are sc apart and each
+// occupies two vector slots.
+static inline void bitReversal8d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  SIMDBase_VECT *const src = &s[o1*2];
+  SIMDBase_VECT *const dst = &s[o2*2];
+  const int32_t unit = sc*2;
+
+  bitReversalUnit(src + 0*unit, dst + 0*unit);
+  bitReversalUnit(src + 1*unit, dst + 4*unit);
+  bitReversalUnit(src + 3*unit, dst + 6*unit);
+  bitReversalUnit(src + 2*unit, dst + 2*unit);
+  bitReversalUnit(src + 6*unit, dst + 3*unit);
+  bitReversalUnit(src + 7*unit, dst + 7*unit);
+  bitReversalUnit(src + 5*unit, dst + 5*unit);
+  bitReversalUnit(src + 4*unit, dst + 1*unit);
+}
+
+// 16-element case used when both offsets name the same block: performs the six
+// exchanges k <-> bitrev4(k) for k = 1, 3, 2, 7, 5, 11 (the pairs that are not
+// fixed points), in that order. Elements are sc apart and each occupies two
+// vector slots.
+static inline void bitReversal16s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  SIMDBase_VECT *const src = &s[o1*2];
+  SIMDBase_VECT *const dst = &s[o2*2];
+  const int32_t unit = sc*2;
+
+  bitReversalUnit(src +  1*unit, dst +  8*unit);
+  bitReversalUnit(src +  3*unit, dst + 12*unit);
+  bitReversalUnit(src +  2*unit, dst +  4*unit);
+  bitReversalUnit(src +  7*unit, dst + 14*unit);
+  bitReversalUnit(src +  5*unit, dst + 10*unit);
+  bitReversalUnit(src + 11*unit, dst + 13*unit);
+}
+
+// 16-element case for two different blocks: every element k of the block at o1
+// is exchanged with element bitrev4(k) of the block at o2, in the same visiting
+// order as the pointer walk it replaces. Elements are sc apart and each
+// occupies two vector slots.
+static inline void bitReversal16d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  SIMDBase_VECT *const src = &s[o1*2];
+  SIMDBase_VECT *const dst = &s[o2*2];
+  const int32_t unit = sc*2;
+
+  bitReversalUnit(src +  0*unit, dst +  0*unit);
+  bitReversalUnit(src +  1*unit, dst +  8*unit);
+  bitReversalUnit(src +  3*unit, dst + 12*unit);
+  bitReversalUnit(src +  2*unit, dst +  4*unit);
+  bitReversalUnit(src +  6*unit, dst +  6*unit);
+  bitReversalUnit(src +  7*unit, dst + 14*unit);
+  bitReversalUnit(src +  5*unit, dst + 10*unit);
+  bitReversalUnit(src +  4*unit, dst +  2*unit);
+  bitReversalUnit(src + 12*unit, dst +  3*unit);
+  bitReversalUnit(src + 13*unit, dst + 11*unit);
+  bitReversalUnit(src + 15*unit, dst + 15*unit);
+  bitReversalUnit(src + 14*unit, dst +  7*unit);
+  bitReversalUnit(src + 10*unit, dst +  5*unit);
+  bitReversalUnit(src + 11*unit, dst + 13*unit);
+  bitReversalUnit(src +  9*unit, dst +  9*unit);
+  bitReversalUnit(src +  8*unit, dst +  1*unit);
+}
+
+// 32-element case used when both offsets name the same block: performs the
+// twelve exchanges k <-> bitrev5(k) that are not fixed points, in the same
+// visiting order as the pointer walk it replaces. Elements are sc apart and
+// each occupies two vector slots.
+static inline void bitReversal32s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  SIMDBase_VECT *const src = &s[o1*2];
+  SIMDBase_VECT *const dst = &s[o2*2];
+  const int32_t unit = sc*2;
+
+  bitReversalUnit(src +  1*unit, dst + 16*unit);
+  bitReversalUnit(src +  3*unit, dst + 24*unit);
+  bitReversalUnit(src +  2*unit, dst +  8*unit);
+  bitReversalUnit(src +  6*unit, dst + 12*unit);
+  bitReversalUnit(src +  7*unit, dst + 28*unit);
+  bitReversalUnit(src +  5*unit, dst + 20*unit);
+  bitReversalUnit(src + 13*unit, dst + 22*unit);
+  bitReversalUnit(src + 15*unit, dst + 30*unit);
+  bitReversalUnit(src + 11*unit, dst + 26*unit);
+  bitReversalUnit(src +  9*unit, dst + 18*unit);
+  bitReversalUnit(src + 23*unit, dst + 29*unit);
+  bitReversalUnit(src + 19*unit, dst + 25*unit);
+}
+
+// 32-element case for two different blocks, expressed as four 8-element
+// different-block passes at doubled spacing (2*sc). The four passes pair the
+// sub-blocks (16,1), (0,0), (1,16) and (17,17), given as (offset into o1 block,
+// offset into o2 block) in units of sc.
+static void bitReversal32d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  const int32_t half = 32/2;
+  const int32_t wide = 2*sc;
+  const int32_t near = sc*1;
+  const int32_t far = sc*half;
+  const int32_t both = sc*(half+1);
+
+  bitReversal8d(s, wide, far  + o1, near + o2);
+  bitReversal8d(s, wide,        o1,        o2);
+  bitReversal8d(s, wide, near + o1, far  + o2);
+  bitReversal8d(s, wide, both + o1, both + o2);
+}
+
+// Bit-reversal permutation of n elements by recursive quartering.
+//
+// For n < 64 the work is handed to a fixed-size kernel: the "s" variant when
+// o1 == o2, the "d" variant otherwise. Sizes with no matching kernel (including
+// n == 4 with o1 != o2) do nothing.
+// For n >= 64 the problem splits into sub-problems of size n/4 at spacing 2*sc;
+// the (n/2, 1) sub-problem is skipped when o1 == o2.
+static void bitReversalRecursive(SIMDBase_VECT *s, int32_t n, int32_t sc, int32_t o1, int32_t o2) {
+  if (n < 64) {
+    const int sameBlock = (o1 == o2);
+
+    if (n == 32) {
+      if (sameBlock) bitReversal32s(s, sc, o1, o2);
+      else bitReversal32d(s, sc, o1, o2);
+    } else if (n == 16) {
+      if (sameBlock) bitReversal16s(s, sc, o1, o2);
+      else bitReversal16d(s, sc, o1, o2);
+    } else if (n == 8) {
+      if (sameBlock) bitReversal8s(s, sc, o1, o2);
+      else bitReversal8d(s, sc, o1, o2);
+    } else if (n == 4 && sameBlock) {
+      bitReversal4s(s, sc, o1, o2);
+    }
+    return;
+  }
+
+  {
+    const int32_t sub = n/4;
+    const int32_t wide = 2*sc;
+    const int32_t far = sc*(n/2);
+    const int32_t both = sc*(n/2+1);
+
+    if (o1 != o2) bitReversalRecursive(s, sub, wide, far + o1, sc + o2);
+
+    bitReversalRecursive(s, sub, wide,        o1,        o2);
+    bitReversalRecursive(s, sub, wide, sc   + o1, far  + o2);
+    bitReversalRecursive(s, sub, wide, both + o1, both + o2);
+  }
+}
+
+//
+
+// Reverse the low logN bits of a: bit i of a becomes bit (logN-1-i) of the
+// result. Bits of a at or above position logN are ignored.
+//
+// Returns 0 when logN <= 0. The shifts are computed only inside loop iterations,
+// so no shift by a negative count is ever evaluated; the earlier form computed
+// 1 << (logN-1) in the loop initialiser, which is undefined for logN == 0 (a
+// value bitReversalCobraInplace passes if log2len == 2*cobraQ).
+static int bitR(int a, int logN) {
+  int ret = 0;
+  int i;
+
+  for(i = 0;i < logN;i++) {
+    if ((a & (1 << i)) != 0) ret |= 1 << (logN - 1 - i);
+  }
+  return ret;
+}
+
+static void bitReversalCobraInplace(DFTUndiff *p) {
+ SIMDBase_VECT *s = p->s;
+ int cobraQ = p->cobraQ;
+ SIMDBase_VECT *cobraT = p->cobraT;
+ int *cobraR = p->cobraR;
+ int logN = p->log2len;
+
+ int b;
+
+ for(b=0;b<(1 << (logN-2*cobraQ));b++) {
+ int a,c;
+ int b2 = bitR(b, logN-2*cobraQ);
+
+ if (b2 < b) continue;
+
+ if (b2 == b) {
+ for(a=0;a<(1 << cobraQ);a++) {
+ int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1);
+
+ int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2;
+
+ while(a2c < a2cm) {
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ }
+ }
+
+ for(c=0;c<(1 << cobraQ);c++) {
+ int c2 = cobraR[c];
+ int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1);
+
+ int a2c = c << 1;
+ int a2ci = 1 << (cobraQ+1);
+ int c2b2a2m = c2b2a2 + (1 << cobraQ)*2;
+
+ while(c2b2a2 < c2b2a2m) {
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+ }
+ }
+ } else {
+ for(a=0;a<(1 << cobraQ);a++) {
+ int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2;
+ int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1);
+
+ while(a2c < a2cm) {
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+ }
+ }
+
+ for(c=0;c<(1 << cobraQ);c++) {
+ int c2 = cobraR[c];
+ int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1);
+
+ int a2c = c << 1;
+ int a2ci = 1 << (cobraQ+1);
+ int c2b2a2m = c2b2a2 + (1 << cobraQ)*2;
+
+ while(c2b2a2 < c2b2a2m) {
+ SIMDBase_VECT t0, t1, t2, t3, t4, t5, t6, t7;
+
+ t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+ t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+ t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+ t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+
+ t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+ t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+ t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+ t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+
+ t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+ t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+ t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+ t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+
+ t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+ t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+ t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+ t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6);
+ SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+ }
+ }
+
+ for(a=0;a<(1 << cobraQ);a++) {
+ int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2;
+ int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1);
+
+ while(a2c < a2cm) {
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+ }
+ }
+ }
+ }
+}
+
+//
+
+static void srForwardMain2(DFTUndiff *p);
+
+// Aims the plan at the sub-block (offset, len, log2len) and runs the forward
+// recursion on it.
+static void srForwardDescend(DFTUndiff *p, int32_t offset, int32_t len, int32_t log2len) {
+  p->offset1 = offset;
+  p->butlen = len;
+  p->log2butlen = log2len;
+  srForwardMain2(p);
+}
+
+// Forward pass over the block described by p->offset1 / p->butlen /
+// p->log2butlen: one butterfly pass over the whole block, followed by the
+// sub-blocks it is split into.
+//
+//  - butlen >= p->radix2thres : r2ButForwardSub, then two half-length blocks
+//  - otherwise                : srButForwardSubUnrolled, then two
+//                               quarter-length blocks and one half-length block
+//
+// Blocks of 128 and 64 finish with the fixed-size srButForward32 /
+// srButForward16 kernels instead of recursing all the way down.
+// The plan fields are left modified on return.
+static void srForwardMain2(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+  const int32_t len = p->butlen;
+  const int32_t log2len = p->log2butlen;
+
+  // Every path starts with a butterfly pass spanning the whole block.
+  p->stride = p->butlen/2;
+
+  if (len >= p->radix2thres) {
+    r2ButForwardSub(p);
+
+    srForwardDescend(p, base + len*4/4, len/2, log2len-1);
+    srForwardDescend(p, base,           len/2, log2len-1);
+    return;
+  }
+
+  srButForwardSubUnrolled(p);
+
+  if (len >= 256) {
+    srForwardDescend(p, base + len*6/4, len/4, log2len-2);
+    srForwardDescend(p, base + len*4/4, len/4, log2len-2);
+    srForwardDescend(p, base,           len/2, log2len-1);
+  } else if (len == 128) {
+    // The quarter blocks are 32 long: use the fixed-size kernel directly.
+    p->offset1 = base + len*6/4;
+    srButForward32(p);
+
+    p->offset1 = base + len*4/4;
+    srButForward32(p);
+
+    srForwardDescend(p, base, len/2, log2len-1);
+  } else {
+    // butlen == 64
+    p->offset1 = base + len*6/4;
+    srButForward16(p);
+
+    p->offset1 = base + len*4/4;
+    srButForward16(p);
+
+    p->offset1 = base;
+    srButForward32(p);
+  }
+}
+
+static void srBackwardMain2(DFTUndiff *p);
+
+// Aims the plan at the sub-block (offset, len, log2len) and runs the backward
+// recursion on it.
+static void srBackwardDescend(DFTUndiff *p, int32_t offset, int32_t len, int32_t log2len) {
+  p->offset1 = offset;
+  p->butlen = len;
+  p->log2butlen = log2len;
+  srBackwardMain2(p);
+}
+
+// Backward pass over the block described by p->offset1 / p->butlen /
+// p->log2butlen. Mirror image of srForwardMain2: the sub-blocks are processed
+// first, then the plan is pointed back at the whole block and one combining
+// butterfly pass is run over it.
+//
+//  - butlen >= p->radix2thres : two half-length blocks, then r2ButBackwardSub
+//  - otherwise                : two quarter-length blocks and one half-length
+//                               block, then srButBackwardSubUnrolled
+//
+// Blocks of 128 and 64 use the fixed-size srButBackward32 / srButBackward16
+// kernels for their sub-blocks.
+static void srBackwardMain2(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+  const int32_t len = p->butlen;
+  const int32_t log2len = p->log2butlen;
+  const int useRadix2 = len >= p->radix2thres;
+
+  if (useRadix2) {
+    srBackwardDescend(p, base + len*4/4, len/2, log2len-1);
+    srBackwardDescend(p, base,           len/2, log2len-1);
+  } else if (len >= 256) {
+    srBackwardDescend(p, base + len*6/4, len/4, log2len-2);
+    srBackwardDescend(p, base + len*4/4, len/4, log2len-2);
+    srBackwardDescend(p, base,           len/2, log2len-1);
+  } else if (len == 128) {
+    p->offset1 = base + len*6/4;
+    srButBackward32(p);
+
+    p->offset1 = base + len*4/4;
+    srButBackward32(p);
+
+    srBackwardDescend(p, base, len/2, log2len-1);
+  } else {
+    // butlen == 64
+    p->offset1 = base + len*6/4;
+    srButBackward16(p);
+
+    p->offset1 = base + len*4/4;
+    srButBackward16(p);
+
+    p->offset1 = base;
+    srButBackward32(p);
+  }
+
+  // The sub-blocks overwrote the block description: restore it before the
+  // combining pass over the whole block.
+  p->butlen = len;
+  p->stride = p->butlen/2;
+  p->log2butlen = log2len;
+
+  if (useRadix2) {
+    r2ButBackwardSub(p);
+  } else {
+    srButBackwardSubUnrolled(p);
+  }
+}
+
+// Entry point of the forward transform for a plan of p->length points.
+// Lengths below 64 go straight to the matching fixed-size kernel (any other
+// short length does nothing); longer ones start the srForwardMain2 recursion
+// on the whole buffer.
+static void srForwardMain(DFTUndiff *p) {
+  if (p->length < 64) {
+    switch(p->length) {
+    case 2:  srBut2(p);         break;
+    case 4:  srButForward4(p);  break;
+    case 8:  srButForward8(p);  break;
+    case 16: srButForward16(p); break;
+    case 32: srButForward32(p); break;
+    default: break;
+    }
+    return;
+  }
+
+  // Describe the whole buffer as the initial block.
+  p->offset1 = p->offset2 = 0;
+  p->butlen = p->length;
+  p->log2butlen = p->log2len;
+
+  srForwardMain2(p);
+}
+
+// Entry point of the backward transform for a plan of p->length points.
+// Lengths below 64 go straight to the matching fixed-size kernel (any other
+// short length does nothing); longer ones start the srBackwardMain2 recursion
+// on the whole buffer.
+static void srBackwardMain(DFTUndiff *p) {
+  if (p->length < 64) {
+    switch(p->length) {
+    case 2:  srBut2(p);          break;
+    case 4:  srButBackward4(p);  break;
+    case 8:  srButBackward8(p);  break;
+    case 16: srButBackward16(p); break;
+    case 32: srButBackward32(p); break;
+    default: break;
+    }
+    return;
+  }
+
+  // Describe the whole buffer as the initial block.
+  p->offset1 = p->offset2 = 0;
+  p->butlen = p->length;
+  p->log2butlen = p->log2len;
+
+  srBackwardMain2(p);
+}
+
+// Real-transform pass over the interleaved (re, im) vector pairs in s, using
+// coefficient table p->rtTable[ts].
+//
+// For k = 1 .. n/4-1 (n = 2 * p->length) the pair at index k and its mirror at
+// n/2-k are combined in place; the pair at index 0 is then replaced by
+// (re + im, re - im). DFTUndiff_EXECUTE runs this after the forward transform
+// of DFT_FLAG_REAL plans (ts = 0) and after the backward transform of
+// DFT_FLAG_ALT_REAL plans (ts = 1).
+static void realSub0(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) {
+  int32_t n = p->length*2;
+  int32_t k;
+
+  for(k=1;k<n/4;k++) {
+    const int32_t lo = k*2, hi = (n/2-k)*2;
+
+    SIMDBase_VECT loRe = SIMDBase_LOAD(&s[lo+0]), loIm = SIMDBase_LOAD(&s[lo+1]);
+    SIMDBase_VECT hiRe = SIMDBase_LOAD(&s[hi+0]), hiIm = SIMDBase_LOAD(&s[hi+1]);
+
+    SIMDBase_VECT dRe = SIMDBase_SUBi(loRe, hiRe);
+    SIMDBase_VECT dIm = SIMDBase_ADDi(loIm, hiIm);
+
+    // One scalar coefficient pair per k, broadcast to every lane.
+    SIMDBase_VECT wRe = SIMDBase_LOAD1(&(p->rtTable[ts][lo+0]));
+    SIMDBase_VECT wIm = SIMDBase_LOAD1(&(p->rtTable[ts][lo+1]));
+
+    // (mRe, mIm) follows the complex-product pattern (dRe, dIm) * (wRe, wIm).
+    SIMDBase_VECT mRe = SIMDBase_SUBi(SIMDBase_MULi(dRe, wRe), SIMDBase_MULi(dIm, wIm));
+    SIMDBase_VECT mIm = SIMDBase_ADDi(SIMDBase_MULi(dRe, wIm), SIMDBase_MULi(dIm, wRe));
+
+    SIMDBase_STOR(&s[lo+0], SIMDBase_SUBi(loRe, mRe));
+    SIMDBase_STOR(&s[lo+1], SIMDBase_SUBi(loIm, mIm));
+    SIMDBase_STOR(&s[hi+0], SIMDBase_ADDi(hiRe, mRe));
+    SIMDBase_STOR(&s[hi+1], SIMDBase_SUBi(hiIm, mIm));
+  }
+
+  {
+    SIMDBase_VECT re0 = SIMDBase_LOAD(&s[0]), im0 = SIMDBase_LOAD(&s[1]);
+
+    SIMDBase_STOR(&s[0], SIMDBase_ADDi(re0, im0));
+    SIMDBase_STOR(&s[1], SIMDBase_SUBi(re0, im0));
+  }
+}
+
+// Counterpart of realSub0, run before the complex transform: the pair at
+// index 0 becomes (0.5 * (re + im), 0.5 * (re - im)), then for
+// k = 1 .. n/4-1 (n = 2 * p->length) the pair at index k and its mirror at
+// n/2-k are combined in place and written back swapped (the value derived
+// from the low pair lands in the high slot and vice versa).
+// DFTUndiff_EXECUTE runs this before the forward transform of
+// DFT_FLAG_ALT_REAL plans (ts = 0) and before the backward transform of
+// DFT_FLAG_REAL plans (ts = 1).
+static void realSub1(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) {
+  int32_t n = p->length*2;
+  int32_t k;
+
+  {
+    SIMDBase_VECT re0 = SIMDBase_LOAD(&s[0]), im0 = SIMDBase_LOAD(&s[1]);
+
+    SIMDBase_STOR(&s[0], SIMDBase_MULi(SIMDBase_ADDi(re0, im0), SIMDBase_SET1(0.5)));
+    SIMDBase_STOR(&s[1], SIMDBase_MULi(SIMDBase_SUBi(re0, im0), SIMDBase_SET1(0.5)));
+  }
+
+  for(k=1;k<n/4;k++) {
+    const int32_t lo = k*2, hi = (n/2-k)*2;
+
+    SIMDBase_VECT loRe = SIMDBase_LOAD(&s[lo+0]), loIm = SIMDBase_LOAD(&s[lo+1]);
+    SIMDBase_VECT hiRe = SIMDBase_LOAD(&s[hi+0]), hiIm = SIMDBase_LOAD(&s[hi+1]);
+
+    SIMDBase_VECT dRe = SIMDBase_SUBi(loRe, hiRe);
+    SIMDBase_VECT dIm = SIMDBase_ADDi(loIm, hiIm);
+
+    // One scalar coefficient pair per k, broadcast to every lane.
+    SIMDBase_VECT wRe = SIMDBase_LOAD1(&(p->rtTable[ts][lo+0]));
+    SIMDBase_VECT wIm = SIMDBase_LOAD1(&(p->rtTable[ts][lo+1]));
+
+    // (mRe, mIm) follows the complex-product pattern (dRe, dIm) * (wRe, wIm).
+    SIMDBase_VECT mRe = SIMDBase_SUBi(SIMDBase_MULi(dRe, wRe), SIMDBase_MULi(dIm, wIm));
+    SIMDBase_VECT mIm = SIMDBase_ADDi(SIMDBase_MULi(dRe, wIm), SIMDBase_MULi(dIm, wRe));
+
+    // Values destined for the high slot, computed before the low slot is
+    // overwritten.
+    SIMDBase_VECT outHiRe = SIMDBase_SUBi(loRe, mRe);
+    SIMDBase_VECT outHiIm = SIMDBase_SUBi(mIm, loIm);
+
+    SIMDBase_STOR(&s[lo+0], SIMDBase_ADDi(mRe, hiRe));
+    SIMDBase_STOR(&s[lo+1], SIMDBase_SUBi(mIm, hiIm));
+    SIMDBase_STOR(&s[hi+0], outHiRe);
+    SIMDBase_STOR(&s[hi+1], outHiIm);
+  }
+}
+
+// Runs the transform described by plan p2 in place on buffer s2.
+//
+//   p2  : plan created by DFTUndiff_MAKEPLAN / DFTUndiff_MAKEPLANSUB;
+//         the process aborts if its magic number does not match
+//   s2  : data buffer, treated as an array of SIMDBase_VECT
+//   dir : -1 selects the forward transform, any other value the backward one
+//
+// Forward order : realSub1 (DFT_FLAG_ALT_REAL), transform, bit reversal,
+//                 realSub0 + sign flip of s[length+1] (DFT_FLAG_REAL).
+// Backward order: sign flip of s[length+1] + realSub1 (DFT_FLAG_REAL),
+//                 bit reversal, transform, realSub0 (DFT_FLAG_ALT_REAL).
+// Bit reversal is skipped when DFT_FLAG_NO_BITREVERSAL is set.
+void DFTUndiff_EXECUTE(void *p2, void *s2, int32_t dir) {
+  DFTUndiff *p = (DFTUndiff *)p2;
+  SIMDBase_VECT *s = (SIMDBase_VECT *)s2;
+
+  if (p->magic != MAGIC_DFT) abort();
+
+  p->s = s;
+
+  if (dir != -1) {
+    // Backward transform
+    if (p->flags & DFT_FLAG_REAL) {
+      s[p->length+1] = SIMDBase_NEGi(s[p->length+1]);
+      realSub1(p, s, 1);
+    }
+
+    if (!(p->flags & DFT_FLAG_NO_BITREVERSAL)) {
+      if (p->useCobra) {
+        bitReversalCobraInplace(p);
+      } else {
+        bitReversalRecursive(p->s, p->length, 1, 0, 0);
+      }
+    }
+
+    srBackwardMain(p);
+
+    if (p->flags & DFT_FLAG_ALT_REAL) {
+      realSub0(p, s, 1);
+    }
+
+    return;
+  }
+
+  // Forward transform
+  if (p->flags & DFT_FLAG_ALT_REAL) {
+    realSub1(p, s, 0);
+  }
+
+  srForwardMain(p);
+
+  if (!(p->flags & DFT_FLAG_NO_BITREVERSAL)) {
+    if (p->useCobra) {
+      bitReversalCobraInplace(p);
+    } else {
+      bitReversalRecursive(p->s, p->length, 1, 0, 0);
+    }
+  }
+
+  if (p->flags & DFT_FLAG_REAL) {
+    realSub0(p, s, 0);
+    s[p->length+1] = SIMDBase_NEGi(s[p->length+1]);
+  }
+}
+
+// Releases a plan and every table it owns. Aborts if p2 does not carry the
+// plan magic number; the magic is cleared before the plan itself is freed so
+// that a stale pointer fails that check if it happens to be inspected later.
+void DFTUndiff_DESTROYPLAN(void *p2) {
+  DFTUndiff *d = (DFTUndiff *)p2;
+
+  if (d->magic != MAGIC_DFT) abort();
+
+  // ptTable[0] is the start of the single trig table allocation; the other
+  // entries point into it.
+  free(d->ptTable[0]);
+  free(d->ptTable);
+
+  // Both are NULL when the plan was built without Cobra support.
+  free(d->cobraT);
+  free(d->cobraR);
+
+  // Only plans made with a real-transform flag carry these tables.
+  if (d->rtTable != NULL) {
+    free(d->rtTable[0]);
+    free(d->rtTable[1]);
+    free(d->rtTable);
+  }
+
+  d->magic = 0;
+  free(d);
+}
+
+// Builds a plan with explicitly chosen tuning parameters (no calibration).
+//
+//   n           : transform length; halved internally when DFT_FLAG_REAL or
+//                 DFT_FLAG_ALT_REAL is set
+//   radix2thres : block lengths >= this value get the radix-2 trig table
+//                 layout (and the r2But* butterflies at execution time)
+//   useCobra    : request the Cobra bit reversal; silently cleared when the
+//                 cache geometry or the transform length does not allow it
+//   flags       : DFT_FLAG_* bits stored in the plan
+//
+// Returns a plan to be released with DFTUndiff_DESTROYPLAN. Every allocation
+// is checked; the process aborts on failure (same policy as the magic-number
+// checks) instead of dereferencing a NULL pointer.
+DFTUndiff *DFTUndiff_MAKEPLANSUB(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags) {
+  int32_t i, j, k;
+
+  uint32_t linesize = SIMDBase_sizeOfCachelineInByte();
+  uint32_t cachesize = SIMDBase_sizeOfDataCacheInByte();
+
+  //
+
+  if ((flags & DFT_FLAG_REAL) != 0 || (flags & DFT_FLAG_ALT_REAL) != 0) n /= 2;
+
+  // calloc leaves every pointer member NULL, which DFTUndiff_DESTROYPLAN and
+  // DFTUndiff_MAKEPLAN rely on for the optional tables.
+  DFTUndiff *d = calloc(1, sizeof(DFTUndiff));
+  if (d == NULL) abort();
+
+  d->magic = MAGIC_DFT;
+  d->mode = SIMDBase_MODE;
+  d->flags = flags;
+
+  d->radix2thres = radix2thres;
+  d->useCobra = useCobra;
+
+  d->length = (uint32_t) n;
+  d->log2len = DFT_ilog2((uint32_t) n);
+
+  // Trig tables: one contiguous allocation, with ptTable[j] pointing at the
+  // coefficients for block length 1 << j.
+
+  SIMDBase_REAL *trigTable = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*n*2);
+  if (trigTable == NULL) abort();
+
+  d->ptTable = malloc(sizeof(SIMDBase_REAL *) * (d->log2len+1));
+  if (d->ptTable == NULL) abort();
+
+  SIMDBase_REAL *p = trigTable, **pp = d->ptTable;
+
+  for(j=0;j<(int32_t)d->log2len+1;j++) {
+    *pp++ = p;
+
+    if ((1 << j) >= d->radix2thres) {
+      // Radix-2 levels: a quarter period of cosines (plus one entry), then
+      // padding so that the next level starts on a cache line multiple.
+      for(i=0;i<(1 << j)/4+1;i++) {
+        *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j));
+      }
+      const int32_t step = linesize / sizeof(SIMDBase_REAL);
+      p += (step - (p - trigTable) % step) % step;
+    } else {
+      // Other levels: cos/sin at angle -2*pi*i/N followed by cos/sin at three
+      // times that angle, four values per i.
+      for(i=0;i<(1 << j)/4;i++) {
+        *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j));
+        *p++ = (SIMDBase_REAL)SIN(-2*M_PIl*i/(1 << j));
+        *p++ = (SIMDBase_REAL)COS(-6*M_PIl*i/(1 << j));
+        *p++ = (SIMDBase_REAL)SIN(-6*M_PIl*i/(1 << j));
+      }
+    }
+  }
+
+  // Cobra bit reversal: start from the number of vector pairs per cache line
+  // and grow cobraQ while 1 << (2*cobraQ) vector pairs still fit in half of
+  // the data cache.
+
+  int32_t cobraQ;
+
+  cobraQ = linesize / (sizeof(SIMDBase_VECT) * 2);
+
+  for(;;) {
+    if (1 << (cobraQ*2) >
+        (cachesize / (sizeof(SIMDBase_VECT) * 2)/2))
+      break;
+
+    cobraQ++;
+  }
+  cobraQ--;
+
+  d->cobraQ = cobraQ;
+
+  if (cobraQ >= 4 && d->log2len >= 2*cobraQ) {
+    SIMDBase_VECT *cobraT;
+    int32_t *cobraR;
+
+    if (d->log2len <= 2*cobraQ) cobraQ = d->log2len / 2;
+
+    cobraT = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*2 * (1 << (cobraQ*2)));
+    cobraR = (int32_t *)SIMDBase_alignedMalloc(sizeof(int32_t) * (1 << cobraQ));
+    if (cobraT == NULL || cobraR == NULL) abort();
+
+    for(i=0;i<(1 << cobraQ);i++) cobraR[i] = bitR(i, cobraQ);
+
+    d->cobraT = cobraT; d->cobraR = cobraR;
+  } else {
+    // No scratch tables: the recursive bit reversal is the only option.
+    d->useCobra = 0;
+  }
+
+  // Coefficient tables for the real-transform passes (realSub0 / realSub1):
+  // table 0 and table 1 differ only in the sign of the angle.
+
+  if ((d->flags & DFT_FLAG_REAL) != 0 || (d->flags & DFT_FLAG_ALT_REAL) != 0) {
+    int32_t m = n*2;
+
+    d->rtTable = malloc(sizeof(SIMDBase_REAL *)*2);
+    if (d->rtTable == NULL) abort();
+
+    d->rtTable[0] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2);
+    d->rtTable[1] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2);
+    if (d->rtTable[0] == NULL || d->rtTable[1] == NULL) abort();
+
+    for(k=0;k<m/4;k++) {
+      d->rtTable[0][k*2+0] = 0.5-0.5*SIN(-2*M_PIl*k/m);
+      d->rtTable[0][k*2+1] = 0.5*COS(-2*M_PIl*k/m);
+      d->rtTable[1][k*2+0] = 0.5-0.5*SIN( 2*M_PIl*k/m);
+      d->rtTable[1][k*2+1] = 0.5*COS( 2*M_PIl*k/m);
+    }
+  }
+
+  //
+
+  return d;
+}
+
+// Creates a plan for a transform of length n, optionally calibrating it.
+//
+//   n     : transform length
+//   flags : DFT_FLAG_* bits; DFT_FLAG_VERBOSE prints progress to stdout
+//
+// When n <= 256 or neither of the two low flag bits is set (presumably the
+// "measure" flags -- confirm against DFT.h), the plan is built directly with
+// the radix-2 path disabled (threshold n*2). Otherwise the function
+// benchmarks on a zero-filled scratch buffer:
+//   1. Cobra versus recursive bit reversal, when the Cobra tables exist
+//      (DFT_FLAG_FORCE_RECURSIVE / DFT_FLAG_FORCE_COBRA override the result);
+//   2. every radix-2 threshold 1024, 2048, ... up to n*2,
+// and returns a plan built with the fastest combination.
+//
+// Returns a plan to be released with DFTUndiff_DESTROYPLAN.
+void *DFTUndiff_MAKEPLAN(uint64_t n, uint64_t flags) {
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\n--------------------------------\n");
+    printf("Making plan, mode = %s, dft length = %d\n", SIMDBase_NAME, (int)n);
+    printf("Processor : %s\n", SIMDBase_getProcessorNameString());
+    printf("Cache size (L2 + L3) : %d kbytes / thread\n", SIMDBase_sizeOfDataCacheInByte() / 1024);
+    printf("Cache Line Size : %d bytes\n", SIMDBase_sizeOfCachelineInByte());
+  }
+
+  if (n <= 256 || (flags & 3) == 0) {
+    return DFTUndiff_MAKEPLANSUB(n, n*2, (flags & DFT_FLAG_FORCE_COBRA) != 0, flags);
+  }
+
+  // Scratch buffer the calibration runs operate on.
+  SIMDBase_REAL *s1 = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*n*2);
+  if (s1 == NULL) abort();
+
+  int32_t i, j, ts, tsbest, useCobra = 0;
+  double tick, tickmin;
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\nWarming up before calibration ...");
+    fflush(stdout);
+  }
+
+  // warming up: busy-wait for half a second
+  tick = DFT_timeofday();
+  while(DFT_timeofday() - tick < 0.5)
+    ;
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf(" done\n");
+  }
+
+  // Repetition count, scaled down as n * log2(n) grows.
+  int32_t ntimes = 20000000.0 / n / DFT_ilog2(n);
+  if (ntimes == 0) ntimes = 1;
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("nTimes = %d\n", ntimes);
+  }
+
+  //
+
+  DFTUndiff *plan = DFTUndiff_MAKEPLANSUB(n, n*2, 0, flags);
+
+  for(i=0;i<n*2*SIMDBase_VECTLEN;i++) {
+    s1[i] = 0;
+  }
+
+  plan->s = (SIMDBase_VECT *)s1;
+
+  if (plan->cobraT != NULL) {
+    double tcobra = 0, trecur = 0;
+
+    if (flags & DFT_FLAG_VERBOSE) {
+      printf("\nChecking which bit-reversal method is faster\n");
+    }
+
+    // Each method is timed twice, interleaved with the other one; every timed
+    // loop is preceded by one untimed call.
+
+    bitReversalCobraInplace(plan);
+
+    tick = DFT_timeofday();
+
+    for(j=0;j<ntimes*4;j++) {
+      bitReversalCobraInplace(plan);
+    }
+
+    tcobra += DFT_timeofday() - tick;
+
+    //
+
+    bitReversalRecursive(plan->s, plan->length, 1, 0, 0);
+
+    tick = DFT_timeofday();
+
+    for(j=0;j<ntimes*4;j++) {
+      bitReversalRecursive(plan->s, plan->length, 1, 0, 0);
+    }
+
+    trecur += DFT_timeofday() - tick;
+
+    //
+
+    bitReversalCobraInplace(plan);
+
+    tick = DFT_timeofday();
+
+    for(j=0;j<ntimes*4;j++) {
+      bitReversalCobraInplace(plan);
+    }
+
+    tcobra += DFT_timeofday() - tick;
+
+    //
+
+    bitReversalRecursive(plan->s, plan->length, 1, 0, 0);
+
+    tick = DFT_timeofday();
+
+    for(j=0;j<ntimes*4;j++) {
+      bitReversalRecursive(plan->s, plan->length, 1, 0, 0);
+    }
+
+    trecur += DFT_timeofday() - tick;
+
+    //
+
+    useCobra = tcobra < trecur;
+
+    if ((flags & DFT_FLAG_FORCE_RECURSIVE) != 0) useCobra = 0;
+    if ((flags & DFT_FLAG_FORCE_COBRA) != 0) useCobra = 1;
+
+    if (flags & DFT_FLAG_VERBOSE) {
+      printf("cobra : %g\n", tcobra);
+      printf("recur : %g\n", trecur);
+      if (useCobra) {
+        printf("will use Cobra\n");
+      } else {
+        printf("will use the recursive reverser\n");
+      }
+    }
+  }
+
+  DFTUndiff_DESTROYPLAN(plan);
+
+  //
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\nDetermining radix 2 threshold\n");
+  }
+
+  // Untimed round trips before the measurements start.
+  plan = DFTUndiff_MAKEPLANSUB(n, n*2, useCobra, flags);
+
+  for(j=0;j<ntimes;j++) {
+    DFTUndiff_EXECUTE(plan, s1, -1);
+    DFTUndiff_EXECUTE(plan, s1, 1);
+  }
+
+  DFTUndiff_DESTROYPLAN(plan);
+
+  tsbest = -1;
+  tickmin = 0;
+
+  for(ts = 1024;ts <= n*2;ts *= 2) {
+    plan = DFTUndiff_MAKEPLANSUB(n, ts, useCobra, flags);
+
+    tick = DFT_timeofday();
+
+    for(j=0;j<ntimes;j++) {
+      DFTUndiff_EXECUTE(plan, s1, -1);
+      DFTUndiff_EXECUTE(plan, s1, 1);
+    }
+
+    tick = DFT_timeofday() - tick;
+
+    DFTUndiff_DESTROYPLAN(plan);
+
+    if (flags & DFT_FLAG_VERBOSE) {
+      printf("%d : %g\n",ts, (double)tick);
+    }
+
+    // The first candidate is the baseline and must be selectable itself:
+    // seeding only tickmin with it would make the function fall back to n*2
+    // whenever the first threshold is the fastest one.
+    if (tsbest == -1 || tick < tickmin) {
+      tickmin = tick;
+      tsbest = ts;
+    }
+  }
+
+  // Only reachable when the loop above did not run at all.
+  if (tsbest == -1) tsbest = n*2;
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    //printf("forcing tsbest = 1024\n");
+    //tsbest = 1024;
+    printf("radix 2 threshold : %d\n\n", tsbest);
+
+    double t = tickmin / ntimes / 2;
+    double nf = 5 * n * log(n) / log(2) / (t * 1000000);
+
+    printf("nFlops = %d x %g\n", SIMDBase_VECTLEN, nf);
+  }
+
+  plan = DFTUndiff_MAKEPLANSUB(n, tsbest, useCobra, flags);
+
+  // The scratch buffer is not referenced by the returned plan (its s member
+  // is only set by DFTUndiff_EXECUTE), so release it here instead of leaking
+  // it. free() is what DFTUndiff_DESTROYPLAN uses for the other
+  // SIMDBase_alignedMalloc buffers.
+  free(s1);
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\nDone making plan\n--------------------------------\n");
+  }
+
+  return plan;
+}
diff --git a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h
new file mode 100644
index 00000000..d26b0d9b
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h
@@ -0,0 +1,114 @@
+#ifndef __DFTIMPL_H__
+#define __DFTIMPL_H__
+
+#include "SIMDBaseUndiff.h"
+
+#define MAGIC_DFT 0x18839f6d82bb02b6ULL
+
+typedef struct { // Plan plus per-call working state used by DFTUndiff.c
+ uint64_t magic; // MAGIC_DFT while the plan is live; checked by execute/destroy and zeroed on destroy
+
+ SIMDBase_VECT *s; // data buffer of the current DFTUndiff_EXECUTE call
+ uint32_t offset1, offset2; // offset1: start of the sub-block being processed; both are reset to 0 when a transform of length >= 64 starts
+ uint32_t butlen, log2butlen; // length of the block being processed and its base-2 logarithm
+ uint32_t stride; // set to butlen/2 before each butterfly pass
+
+ SIMDBase_REAL **ptTable; // ptTable[j]: trig coefficients for block length 1 << j; ptTable[0] is the start of the one backing allocation
+ uint32_t length, log2len; // transform length (already halved for the real-transform flags) and its base-2 logarithm
+
+ int32_t radix2thres, flagTrans, useCobra; // radix2thres: blocks at least this long take the r2But* path; useCobra: nonzero selects bitReversalCobraInplace
+
+ int32_t cobraQ; // block-size exponent for the Cobra bit reversal, derived from the cache line and data cache sizes
+ SIMDBase_VECT *cobraT; // Cobra scratch buffer of 2 * (1 << (2*cobraQ)) vectors; NULL when Cobra is unavailable
+ int32_t *cobraR; // bitR(i, cobraQ) for i in [0, 1 << cobraQ); NULL when Cobra is unavailable
+
+ SIMDBase_REAL **rtTable; // two coefficient tables for realSub0/realSub1; NULL unless DFT_FLAG_REAL or DFT_FLAG_ALT_REAL is set
+
+ uint64_t flags; // DFT_FLAG_* bits given at plan creation
+ int32_t mode; // SIMDBase_MODE of the translation unit that built the plan
+} DFTUndiff;
+
+#if defined(ENABLE_PUREC_FLOAT) // Per-mode symbol names: the Makefiles compile DFTUndiff.c once per ENABLE_* macro, so each mode maps the generic DFTUndiff_* names to its own suffixed symbols and all objects can be linked into one libDFT.a
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_float
+#define DFTUndiff_EXECUTE execute_purec_float
+#define DFTUndiff_MAKEPLAN makePlan_purec_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_purec_float
+
+#elif defined(ENABLE_PUREC_DOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_double
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_double
+#define DFTUndiff_EXECUTE execute_purec_double
+#define DFTUndiff_MAKEPLAN makePlan_purec_double
+#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_double
+#define DFTUndiff_DESTROYPLAN destroyPlan_purec_double
+
+#elif defined(ENABLE_PUREC_LONGDOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_longdouble
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_longdouble
+#define DFTUndiff_EXECUTE execute_purec_longdouble
+#define DFTUndiff_MAKEPLAN makePlan_purec_longdouble
+#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_longdouble
+#define DFTUndiff_DESTROYPLAN destroyPlan_purec_longdouble
+
+#elif defined(ENABLE_SSE_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse_float
+#define DFTUndiff_EXECUTE execute_sse_float
+#define DFTUndiff_MAKEPLAN makePlan_sse_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_sse_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_sse_float
+
+#elif defined(ENABLE_SSE2_DOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse2_double
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse2_double
+#define DFTUndiff_EXECUTE execute_sse2_double
+#define DFTUndiff_MAKEPLAN makePlan_sse2_double
+#define DFTUndiff_MAKEPLANSUB makePlanSub_sse2_double
+#define DFTUndiff_DESTROYPLAN destroyPlan_sse2_double
+
+#elif defined(ENABLE_NEON_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_neon_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_neon_float
+#define DFTUndiff_EXECUTE execute_neon_float
+#define DFTUndiff_MAKEPLAN makePlan_neon_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_neon_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_neon_float
+
+#elif defined(ENABLE_AVX_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_float
+#define DFTUndiff_EXECUTE execute_avx_float
+#define DFTUndiff_MAKEPLAN makePlan_avx_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_avx_float
+
+#elif defined(ENABLE_AVX_DOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_double
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_double
+#define DFTUndiff_EXECUTE execute_avx_double
+#define DFTUndiff_MAKEPLAN makePlan_avx_double
+#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_double
+#define DFTUndiff_DESTROYPLAN destroyPlan_avx_double
+
+#elif defined(ENABLE_ALTIVEC_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_altivec_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_altivec_float
+#define DFTUndiff_EXECUTE execute_altivec_float
+#define DFTUndiff_MAKEPLAN makePlan_altivec_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_altivec_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_altivec_float
+
+#endif ////////////////////////////////////////////////////////////////////
+
+#endif
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile b/plugins/supereq/nsfft-1.00/dft/Makefile
new file mode 120000
index 00000000..5d253498
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile
@@ -0,0 +1 @@
+Makefile.x86avx \ No newline at end of file
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.altivec b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec
new file mode 100644
index 00000000..fe7fc993
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec
@@ -0,0 +1,26 @@
+CC=gcc
+BASEOPT=-Wall -I ../simd -maltivec -mabi=altivec
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a # default goal: the static library
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h # DFTUndiff.c is compiled once per ENABLE_* mode; each mode gets its own object
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTaltivecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_ALTIVEC_FLOAT DFTUndiff.c -c -o DFTaltivecfloat.o
+
+DFT.o : DFT.c DFT.h # built with the ENABLE_* macro of every mode object archived below
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_ALTIVEC_FLOAT DFT.c -c -o DFT.o
+
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o # the old archive is removed first because ar's q modifier appends without replacing members
+ rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o
+
+clean : # removes editor backups, objects, assembly listings and archives
+ rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.neon b/plugins/supereq/nsfft-1.00/dft/Makefile.neon
new file mode 100644
index 00000000..111a04ae
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.neon
@@ -0,0 +1,26 @@
+CC=gcc
+BASEOPT=-Wall -I ../simd -mfloat-abi=softfp
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h # SIMDBaseUndiff.h is included by DFTUndiff.h, so it is a prerequisite of every object built from DFTUndiff.c
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTneonfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h # only this object is built with -mfpu=neon
+	$(CC) $(OPT) -mfpu=neon -DENABLE_NEON_FLOAT DFTUndiff.c -c -o DFTneonfloat.o
+
+DFT.o : DFT.c DFT.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_NEON_FLOAT DFT.c -c -o DFT.o
+
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.purec b/plugins/supereq/nsfft-1.00/dft/Makefile.purec
new file mode 100644
index 00000000..2c8b04f1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.purec
@@ -0,0 +1,35 @@
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a # default goal: the static library
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h SIMDBase.h # DFTUndiff.c is compiled once per ENABLE_* mode; each mode gets its own object
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h SIMDBase.h
+ $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h SIMDBase.h
+ $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h # NOTE(review): this commit places the SIMDBase* sources under ../simd and no -I ../simd is passed here -- presumably this Makefile expects all sources in one directory; confirm
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c DFT.h SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+SIMDBase.o : SIMDBase.c SIMDBase.h # compiled with -O instead of the -O3 used for the other objects
+ $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE SIMDBase.c -c -o SIMDBase.o
+
+DFT.o : DFT.c DFT.h # built with the ENABLE_* macro of every mode object archived below
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE DFT.c -c -o DFT.o
+
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o # the old archive is removed first because ar's q modifier appends without replacing members
+ rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o
+
+clean : # removes editor backups, objects, assembly listings and archives
+ rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86 b/plugins/supereq/nsfft-1.00/dft/Makefile.x86
new file mode 100644
index 00000000..6ecbacec
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86
@@ -0,0 +1,29 @@
+# Builds libDFT.a for x86 with pure-C, SSE (float) and SSE2 (double) variants.
+# DFTUndiff.c is compiled once per ENABLE_* macro; the -msse / -msse2 flags are
+# passed only to the object that uses that instruction set.
+# Headers from ../simd are found through the "-I ../simd" in BASEOPT.
+CC=gcc
+BASEOPT=-Wall -I ../simd
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o
+
+DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE DFTUndiff.c -c -o DFTsse2double.o
+
+# DFT.c is built with every ENABLE_* macro of this Makefile defined, but
+# without any -msse* flag.
+DFT.o : DFT.c DFT.h
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE DFT.c -c -o DFT.o
+
+# The old archive is deleted before "ar" runs, so libDFT.a is always rebuilt
+# from scratch from the objects listed.
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o
+ rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o
+
+clean :
+ rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx
new file mode 100644
index 00000000..b38909cb
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx
@@ -0,0 +1,35 @@
+# Builds libDFT.a for x86 with pure-C, SSE, SSE2 and AVX (float and double)
+# variants. DFTUndiff.c is compiled once per ENABLE_* macro; the -msse, -msse2
+# and -mavx flags are passed only to the object that uses that instruction set.
+# Headers from ../simd are found through the "-I ../simd" in BASEOPT.
+CC=gcc
+BASEOPT=-Wall -I ../simd
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o
+
+DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE DFTUndiff.c -c -o DFTsse2double.o
+
+DFTavxfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT DFTUndiff.c -c -o DFTavxfloat.o
+
+DFTavxdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+ $(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE DFTUndiff.c -c -o DFTavxdouble.o
+
+# DFT.c is built with every ENABLE_* macro of this Makefile defined, but
+# without any -msse* / -mavx flag.
+DFT.o : DFT.c DFT.h
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE -DENABLE_AVX_FLOAT -DENABLE_AVX_DOUBLE DFT.c -c -o DFT.o
+
+# The old archive is deleted before "ar" runs, so libDFT.a is always rebuilt
+# from scratch from the objects listed.
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o
+ rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o
+
+clean :
+ rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c b/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c
new file mode 100644
index 00000000..78ff14dc
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c
@@ -0,0 +1,88 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <complex.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+
+#define THRES 1e-3
+
+// Returns exp(-2*pi*i*kn/n), the DFT twiddle factor for index product kn and
+// transform length n. Passing a negative n yields the conjugate factor.
+double complex omega(double n, double kn) {
+  const double complex unit_step = -2 * M_PI * _Complex_I / n;
+
+  return cexp(unit_step * kn);
+}
+
+// Reference O(len^2) forward DFT used to validate the library result:
+// fs[k] = sum over n of ts[n] * omega(len, n*k), for k = 0 .. len-1.
+// ts is the input sequence and fs receives the spectrum; both hold len elements.
+void forward(double complex *ts, double complex *fs, int len) {
+  for (int bin = 0; bin < len; bin++) {
+    double complex *acc = &fs[bin];
+
+    *acc = 0;
+
+    for (int t = 0; t < len; t++) {
+      *acc += ts[t] * omega(len, t*bin);
+    }
+  }
+}
+
+// Example program: runs one 256-point complex forward transform through the
+// library for every SIMD lane and checks it against the naive forward() DFT.
+// Prints the selected mode and then "OK" or "NG"; always exits with status 0.
+int main(int argc, char **argv) {
+  const int n = 256;
+
+  // Let the library choose the mode for REAL and report which one it picked.
+  int mode = SIMDBase_chooseBestMode(TYPE);
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  // Plan plus the interleaved work buffer handed to DFT_execute().
+  DFT *p = DFT_init(mode, n, 0);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  // One independent random input (ts) and reference output (fs) per lane.
+  double complex ts[veclen][n], fs[veclen][n];
+
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n; k++) {
+      ts[lane][k] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+
+      // Element k of lane `lane`: real part in row 2k, imaginary part in row 2k+1.
+      sx[(k*2+0)*veclen+lane] = creal(ts[lane][k]);
+      sx[(k*2+1)*veclen+lane] = cimag(ts[lane][k]);
+    }
+  }
+
+  // Library transform (in place) followed by the reference transforms.
+  DFT_execute(p, mode, sx, -1);
+
+  for (int lane = 0; lane < veclen; lane++) {
+    forward(ts[lane], fs[lane], n);
+  }
+
+  // Any component differing by more than THRES fails the whole run.
+  int success = 1;
+
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n; k++) {
+      double err_re = fabs(sx[(k*2+0)*veclen+lane] - creal(fs[lane][k]));
+      double err_im = fabs(sx[(k*2+1)*veclen+lane] - cimag(fs[lane][k]));
+
+      if (err_re > THRES || err_im > THRES) {
+        success = 0;
+      }
+    }
+  }
+
+  printf("%s\n", success ? "OK" : "NG");
+
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  exit(0);
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c
new file mode 100644
index 00000000..42825ed9
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c
@@ -0,0 +1,317 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <time.h>
+#include <complex.h>
+
+#include <fftw3.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+#if 1
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+#else
+typedef double REAL;
+#define TYPE SIMDBase_TYPE_DOUBLE
+#endif
+
+#define THRES 1e-3
+
+// complex forward
+//
+// Checks an n-point complex transform run with DFT_execute(..., -1) against
+// FFTW's FFTW_FORWARD plan, using one FFTW transform per SIMD lane.
+//   n          : transform length
+//   mode       : SIMDBase mode the plan is created and executed with
+//   veclen     : number of lanes interleaved in sx
+//   sizeOfVect : value of SIMDBase_PARAMID_SIZE_OF_VECT, used to size sx
+//                (presumably bytes per vector of veclen REALs -- confirm)
+// Returns 1 when every real and imaginary part agrees within THRES, else 0.
+int check_cf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+
+  // Per-lane FFTW state. These arrays are only indexed by j < veclen, so they
+  // are sized by veclen: the former w[n] was overrun whenever n < veclen and
+  // became an arbitrarily large stack array for big n.
+  fftw_plan w[veclen];
+  fftw_complex *in[veclen], *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    w[j] = fftw_plan_dft_1d(n, in[j], out[j], FFTW_FORWARD, FFTW_ESTIMATE);
+
+    for(i=0;i<n;i++) {
+      double re = random() / (double)RAND_MAX;
+      double im = random() / (double)RAND_MAX;
+      // Element i of lane j: real part in row 2i, imaginary part in row 2i+1.
+      sx[(i*2+0)*veclen+j] = re;
+      sx[(i*2+1)*veclen+j] = im;
+      in[j][i] = re + im * _Complex_I;
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0;
+      if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0;
+    }
+  }
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// complex backward
+//
+// Checks an n-point complex transform run with DFT_execute(..., 1) against
+// FFTW's FFTW_BACKWARD plan, using one FFTW transform per SIMD lane.
+//   n          : transform length
+//   mode       : SIMDBase mode the plan is created and executed with
+//   veclen     : number of lanes interleaved in sx
+//   sizeOfVect : value of SIMDBase_PARAMID_SIZE_OF_VECT, used to size sx
+//                (presumably bytes per vector of veclen REALs -- confirm)
+// Returns 1 when every real and imaginary part agrees within THRES, else 0.
+int check_cb(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+
+  // Per-lane FFTW state. These arrays are only indexed by j < veclen, so they
+  // are sized by veclen: the former w[n] was overrun whenever n < veclen and
+  // became an arbitrarily large stack array for big n.
+  fftw_plan w[veclen];
+  fftw_complex *in[veclen], *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    w[j] = fftw_plan_dft_1d(n, in[j], out[j], FFTW_BACKWARD, FFTW_ESTIMATE);
+
+    for(i=0;i<n;i++) {
+      double re = random() / (double)RAND_MAX;
+      double im = random() / (double)RAND_MAX;
+      // Element i of lane j: real part in row 2i, imaginary part in row 2i+1.
+      sx[(i*2+0)*veclen+j] = re;
+      sx[(i*2+1)*veclen+j] = im;
+      in[j][i] = re + im * _Complex_I;
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0;
+      if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0;
+    }
+  }
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// real forward
+//
+// Checks an n-point real-input transform (plan created with DFT_FLAG_REAL, run
+// with direction -1) against FFTW's r2c plan, one FFTW transform per SIMD lane.
+// The comparison expects the library output packed as: row 0 = Re X[0],
+// row 1 = Re X[n/2], then rows 2i / 2i+1 = Re / Im of X[i] for 0 < i < n/2.
+//   n          : transform length
+//   mode       : SIMDBase mode the plan is created and executed with
+//   veclen     : number of lanes interleaved in sx
+//   sizeOfVect : value of SIMDBase_PARAMID_SIZE_OF_VECT, used to size sx
+//                (presumably bytes per vector of veclen REALs -- confirm)
+// Returns 1 when every compared value agrees within THRES, else 0.
+int check_rf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+
+  // Per-lane FFTW state. These arrays are only indexed by j < veclen, so they
+  // are sized by veclen: the former w[n] was overrun whenever n < veclen and
+  // became an arbitrarily large stack array for big n.
+  fftw_plan w[veclen];
+
+  double *in[veclen];
+  fftw_complex *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (double *) fftw_malloc(sizeof(double) * n);
+    out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
+    w[j] = fftw_plan_dft_r2c_1d(n, in[j], out[j], FFTW_ESTIMATE);
+
+    for(i=0;i<n;i++) {
+      double re = random() / (double)RAND_MAX;
+      sx[i*veclen+j] = re;
+      in[j][i] = re;
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        // Bins 0 and n/2 are purely real and share the first two rows.
+        if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][0])) > THRES) success = 0;
+        if (fabs(sx[(i*2+1)*veclen+j] - creal(out[j][n/2])) > THRES) success = 0;
+      } else {
+        if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0;
+        if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0;
+      }
+    }
+  }
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// real backward
+//
+// Checks an n-point real-output transform (plan created with DFT_FLAG_REAL, run
+// with direction 1) against FFTW's c2r plan, one FFTW transform per SIMD lane.
+// The library input is packed as: row 0 = Re X[0], row 1 = Re X[n/2], then
+// rows 2i / 2i+1 = Re / Im of X[i] for 0 < i < n/2. The comparison doubles the
+// library output before matching it against FFTW's.
+//   n          : transform length
+//   mode       : SIMDBase mode the plan is created and executed with
+//   veclen     : number of lanes interleaved in sx
+//   sizeOfVect : value of SIMDBase_PARAMID_SIZE_OF_VECT, used to size sx
+//                (presumably bytes per vector of veclen REALs -- confirm)
+// Returns 1 when all n output samples of every lane agree within THRES, else 0.
+int check_rb(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+
+  // Per-lane FFTW state. These arrays are only indexed by j < veclen, so they
+  // are sized by veclen: the former w[n] was overrun whenever n < veclen and
+  // became an arbitrarily large stack array for big n.
+  fftw_plan w[veclen];
+
+  fftw_complex *in[veclen];
+  double *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
+    out[j] = (double *) fftw_malloc(sizeof(double) * n);
+    w[j] = fftw_plan_dft_c2r_1d(n, in[j], out[j], FFTW_ESTIMATE);
+
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        // Bins 0 and n/2 must be real for the output to be real.
+        in[j][0  ] = (random() / (double)RAND_MAX);
+        in[j][n/2] = (random() / (double)RAND_MAX);
+      } else {
+        in[j][i  ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+      }
+    }
+
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        sx[(2*0+0) * veclen + j] = creal(in[j][0  ]);
+        sx[(2*0+1) * veclen + j] = creal(in[j][n/2]);
+      } else {
+        sx[(2*i+0) * veclen + j] = creal(in[j][i]);
+        sx[(2*i+1) * veclen + j] = cimag(in[j][i]);
+      }
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    // Verify every one of the n real output samples; the loop used to stop at
+    // n/2 and left the second half of the result unchecked.
+    for(i=0;i<n;i++) {
+      if ((fabs(sx[i * veclen + j]*2 - out[j][i]) > THRES)) {
+        success = 0;
+      }
+    }
+  }
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// Usage: <program> <log2n>
+// Runs the four FFTW comparison checks for transform length n = 2^log2n in the
+// mode chosen by SIMDBase_chooseBestMode(TYPE) and prints OK/NG for each.
+// Exits with -1 on a usage error, otherwise with 0 regardless of the results.
+int main(int argc, char **argv) {
+  if (argc != 2) {
+    fprintf(stderr, "%s <log2n>\n", argv[0]);
+    exit(-1);
+  }
+
+  // Parse log2n strictly: atoi() silently accepted garbage, and shifting an
+  // int by a negative count or by 31 or more bits is undefined behavior.
+  char *end;
+  long log2n = strtol(argv[1], &end, 10);
+
+  if (end == argv[1] || *end != '\0' || log2n < 0 || log2n > 30) {
+    fprintf(stderr, "%s : <log2n> must be an integer between 0 and 30\n", argv[0]);
+    exit(-1);
+  }
+
+  const int n = 1 << (int)log2n;
+
+  srandom(time(NULL));
+
+  //
+
+  int mode = SIMDBase_chooseBestMode(TYPE);
+
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  printf("complex forward : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("complex backward : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real forward : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real backward : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+
+  exit(0);
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c
new file mode 100644
index 00000000..9d4bdaae
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c
@@ -0,0 +1,419 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <time.h>
+#include <complex.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+#if 1
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+#else
+typedef double REAL;
+#define TYPE SIMDBase_TYPE_DOUBLE
+#endif
+
+#define THRES 1e-3
+
+// Returns exp(-2*pi*i*kn/n), the DFT twiddle factor for index product kn and
+// transform length n. backward() passes a negative n to get the conjugate.
+double complex omega(double n, double kn) {
+  const double complex unit_step = -2 * M_PI * _Complex_I / n;
+
+  return cexp(unit_step * kn);
+}
+
+// Reference O(len^2) forward DFT:
+// fs[k] = sum over n of ts[n] * omega(len, n*k), for k = 0 .. len-1.
+// ts is the input sequence and fs receives the spectrum; both hold len elements.
+void forward(double complex *ts, double complex *fs, int len) {
+  int k, n;
+
+  for(k=0;k<len;k++) {
+    fs[k] = 0;
+
+    for(n=0;n<len;n++) {
+      // Form the index product in double: as an int, n*k overflows (undefined
+      // behavior) once len exceeds 46340.
+      fs[k] += ts[n] * omega(len, (double)n * k);
+    }
+  }
+}
+
+// Reference O(len^2) unnormalized backward DFT:
+// ts[k] = sum over n of fs[n] * omega(-len, n*k), for k = 0 .. len-1.
+// fs is the input spectrum and ts receives the result; both hold len elements.
+// No 1/len scaling is applied.
+void backward(double complex *fs, double complex *ts, int len) {
+  int k, n;
+
+  for(k=0;k<len;k++) {
+    ts[k] = 0;
+
+    for(n=0;n<len;n++) {
+      // Form the index product in double: as an int, n*k overflows (undefined
+      // behavior) once len exceeds 46340.
+      ts[k] += fs[n] * omega(-len, (double)n * k);
+    }
+  }
+}
+
+// complex forward
+//
+// Checks an n-point complex transform run with DFT_execute(..., -1) against the
+// naive forward() DFT, once per SIMD lane.
+//   n          : transform length
+//   mode       : SIMDBase mode the plan is created and executed with
+//   veclen     : number of lanes interleaved in sx
+//   sizeOfVect : value of SIMDBase_PARAMID_SIZE_OF_VECT, used to size sx
+//                (presumably bytes per vector of veclen REALs -- confirm)
+// Returns 1 when every real and imaginary part agrees within THRES, else 0.
+// Exits with -1 if the plan or any buffer cannot be created.
+int check_cf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  //
+
+  // Reference buffers are heap-allocated: as stack VLAs they took
+  // 2*veclen*n*sizeof(double complex) bytes and overflowed the stack for large n.
+  double complex (*ts)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+  double complex (*fs)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+
+  if (p == NULL || sx == NULL || ts == NULL || fs == NULL) {
+    fprintf(stderr, "check_cf : could not create plan or buffers\n");
+    exit(-1);
+  }
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+      // Element i of lane j: real part in row 2i, imaginary part in row 2i+1.
+      sx[(i*2+0)*veclen+j] = creal(ts[j][i]);
+      sx[(i*2+1)*veclen+j] = cimag(ts[j][i]);
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    forward(ts[j], fs[j], n);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if ((fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) > THRES) ||
+          (fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) > THRES)) {
+        success = 0;
+      }
+    }
+  }
+
+  //
+
+  free(fs);
+  free(ts);
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// complex backward
+//
+// Checks an n-point complex transform run with DFT_execute(..., 1) against the
+// naive backward() DFT, once per SIMD lane.
+//   n          : transform length
+//   mode       : SIMDBase mode the plan is created and executed with
+//   veclen     : number of lanes interleaved in sx
+//   sizeOfVect : value of SIMDBase_PARAMID_SIZE_OF_VECT, used to size sx
+//                (presumably bytes per vector of veclen REALs -- confirm)
+// Returns 1 when every real and imaginary part agrees within THRES, else 0.
+// Exits with -1 if the plan or any buffer cannot be created.
+int check_cb(int n, int mode, int veclen, int sizeOfVect) {
+  int i,j;
+
+  DFT *p = DFT_init(mode, n, 0);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  //
+
+  // Reference buffers are heap-allocated: as stack VLAs they took
+  // 2*veclen*n*sizeof(double complex) bytes and overflowed the stack for large n.
+  double complex (*fs)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+  double complex (*ts)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+
+  if (p == NULL || sx == NULL || fs == NULL || ts == NULL) {
+    fprintf(stderr, "check_cb : could not create plan or buffers\n");
+    exit(-1);
+  }
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      fs[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+
+      // Element i of lane j: real part in row 2i, imaginary part in row 2i+1.
+      sx[(i*2+0)*veclen+j] = creal(fs[j][i]);
+      sx[(i*2+1)*veclen+j] = cimag(fs[j][i]);
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    backward(fs[j], ts[j], n);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if ((fabs(sx[(i*2+0)*veclen+j] - creal(ts[j][i])) > THRES) ||
+          (fabs(sx[(i*2+1)*veclen+j] - cimag(ts[j][i])) > THRES)) {
+        success = 0;
+      }
+    }
+  }
+
+  //
+
+  free(ts);
+  free(fs);
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// real forward
+//
+// Checks an n-point real-input transform (plan created with DFT_FLAG_REAL, run
+// with direction -1) against the naive forward() DFT, once per SIMD lane.
+// The comparison expects the library output packed as: row 0 = Re X[0],
+// row 1 = Re X[n/2], then rows 2i / 2i+1 = Re / Im of X[i] for 0 < i < n/2.
+//   n          : transform length
+//   mode       : SIMDBase mode the plan is created and executed with
+//   veclen     : number of lanes interleaved in sx
+//   sizeOfVect : value of SIMDBase_PARAMID_SIZE_OF_VECT, used to size sx
+//                (presumably bytes per vector of veclen REALs -- confirm)
+// Returns 1 when every compared value agrees within THRES, else 0.
+// Exits with -1 if the plan or any buffer cannot be created.
+int check_rf(int n, int mode, int veclen, int sizeOfVect) {
+  int i,j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  //
+
+  // Reference buffers are heap-allocated: as stack VLAs they took
+  // 2*veclen*n*sizeof(double complex) bytes and overflowed the stack for large n.
+  double complex (*ts)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+  double complex (*fs)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+
+  if (p == NULL || sx == NULL || ts == NULL || fs == NULL) {
+    fprintf(stderr, "check_rf : could not create plan or buffers\n");
+    exit(-1);
+  }
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      ts[j][i] = (random() / (double)RAND_MAX);
+      sx[i*veclen+j] = creal(ts[j][i]);
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    forward(ts[j], fs[j], n);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        // Bins 0 and n/2 are purely real and share the first two rows.
+        if (fabs(sx[(2*0+0) * veclen + j] - creal(fs[j][0  ])) > THRES) success = 0;
+        if (fabs(sx[(2*0+1) * veclen + j] - creal(fs[j][n/2])) > THRES) success = 0;
+      } else {
+        if (fabs(sx[(2*i+0) * veclen + j] - creal(fs[j][i])) > THRES) success = 0;
+        if (fabs(sx[(2*i+1) * veclen + j] - cimag(fs[j][i])) > THRES) success = 0;
+      }
+    }
+  }
+
+  //
+
+  free(fs);
+  free(ts);
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// real backward
+//
+// Checks an n-point real-output transform (plan created with DFT_FLAG_REAL, run
+// with direction 1) against the naive backward() DFT of a conjugate-symmetric
+// spectrum, once per SIMD lane. The library input is packed as: row 0 =
+// Re X[0], row 1 = Re X[n/2], then rows 2i / 2i+1 = Re / Im of X[i] for
+// 0 < i < n/2. The comparison doubles the library output before matching it.
+//   n          : transform length
+//   mode       : SIMDBase mode the plan is created and executed with
+//   veclen     : number of lanes interleaved in sx
+//   sizeOfVect : value of SIMDBase_PARAMID_SIZE_OF_VECT, used to size sx
+//                (presumably bytes per vector of veclen REALs -- confirm)
+// Returns 1 when the reference is real and all n samples agree within THRES,
+// else 0. Exits with -1 if the plan or any buffer cannot be created.
+int check_rb(int n, int mode, int veclen, int sizeOfVect) {
+  int i,j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  //
+
+  // Reference buffers are heap-allocated: as stack VLAs they took
+  // 2*veclen*n*sizeof(double complex) bytes and overflowed the stack for large n.
+  double complex (*fs)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+  double complex (*ts)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+
+  if (p == NULL || sx == NULL || fs == NULL || ts == NULL) {
+    fprintf(stderr, "check_rb : could not create plan or buffers\n");
+    exit(-1);
+  }
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        fs[j][0  ] = (random() / (double)RAND_MAX);
+        fs[j][n/2] = (random() / (double)RAND_MAX);
+      } else {
+        // Mirror each bin as its conjugate so the backward transform is real.
+        fs[j][i  ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+        fs[j][n-i] = conj(fs[j][i]);
+      }
+    }
+  }
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        sx[(2*0+0) * veclen + j] = creal(fs[j][0  ]);
+        sx[(2*0+1) * veclen + j] = creal(fs[j][n/2]);
+      } else {
+        sx[(2*i+0) * veclen + j] = creal(fs[j][i]);
+        sx[(2*i+1) * veclen + j] = cimag(fs[j][i]);
+      }
+    }
+  }
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    backward(fs[j], ts[j], n);
+  }
+
+  DFT_execute(p, mode, sx, 1);
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (fabs(cimag(ts[j][i])) > THRES) {
+        success = 0;
+      }
+
+      if ((fabs(sx[i * veclen + j]*2 - creal(ts[j][i])) > THRES)) {
+        success = 0;
+      }
+    }
+  }
+
+  //
+
+  free(ts);
+  free(fs);
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// alt real forward
+//
+// Checks an n-point real-input transform created with DFT_FLAG_ALT_REAL and run
+// with direction 1 against the naive backward() DFT (i.e. the opposite sign
+// convention to check_rf), once per SIMD lane. The comparison expects the
+// output packed as: row 0 = Re X[0], row 1 = Re X[n/2], then rows 2i / 2i+1 =
+// Re / Im of X[i] for 0 < i < n/2.
+//   n          : transform length
+//   mode       : SIMDBase mode the plan is created and executed with
+//   veclen     : number of lanes interleaved in sx
+//   sizeOfVect : value of SIMDBase_PARAMID_SIZE_OF_VECT, used to size sx
+//                (presumably bytes per vector of veclen REALs -- confirm)
+// Returns 1 when every compared value agrees within THRES, else 0.
+// Exits with -1 if the plan or any buffer cannot be created.
+int check_arf(int n, int mode, int veclen, int sizeOfVect) {
+  int i,j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  //
+
+  // Reference buffers are heap-allocated: as stack VLAs they took
+  // 2*veclen*n*sizeof(double complex) bytes and overflowed the stack for large n.
+  double complex (*ts)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+  double complex (*fs)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+
+  if (p == NULL || sx == NULL || ts == NULL || fs == NULL) {
+    fprintf(stderr, "check_arf : could not create plan or buffers\n");
+    exit(-1);
+  }
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      ts[j][i] = (random() / (double)RAND_MAX);
+      sx[i*veclen+j] = creal(ts[j][i]);
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    backward(ts[j], fs[j], n);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        // Bins 0 and n/2 are purely real and share the first two rows.
+        if (fabs(sx[(2*0+0) * veclen + j] - creal(fs[j][0  ])) > THRES) success = 0;
+        if (fabs(sx[(2*0+1) * veclen + j] - creal(fs[j][n/2])) > THRES) success = 0;
+      } else {
+        if (fabs(sx[(2*i+0) * veclen + j] - creal(fs[j][i])) > THRES) success = 0;
+        if (fabs(sx[(2*i+1) * veclen + j] - cimag(fs[j][i])) > THRES) success = 0;
+      }
+    }
+  }
+
+  //
+
+  free(fs);
+  free(ts);
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// alt real backward
+//
+// Checks an n-point real-output transform created with DFT_FLAG_ALT_REAL and
+// run with direction -1 against the naive forward() DFT of a
+// conjugate-symmetric spectrum (i.e. the opposite sign convention to
+// check_rb), once per SIMD lane. The library input is packed as: row 0 =
+// Re X[0], row 1 = Re X[n/2], then rows 2i / 2i+1 = Re / Im of X[i] for
+// 0 < i < n/2. The comparison doubles the library output before matching it.
+//   n          : transform length
+//   mode       : SIMDBase mode the plan is created and executed with
+//   veclen     : number of lanes interleaved in sx
+//   sizeOfVect : value of SIMDBase_PARAMID_SIZE_OF_VECT, used to size sx
+//                (presumably bytes per vector of veclen REALs -- confirm)
+// Returns 1 when the reference is real and all n samples agree within THRES,
+// else 0. Exits with -1 if the plan or any buffer cannot be created.
+int check_arb(int n, int mode, int veclen, int sizeOfVect) {
+  int i,j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  //
+
+  // Reference buffers are heap-allocated: as stack VLAs they took
+  // 2*veclen*n*sizeof(double complex) bytes and overflowed the stack for large n.
+  double complex (*fs)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+  double complex (*ts)[n] = malloc((size_t)veclen * sizeof(double complex[n]));
+
+  if (p == NULL || sx == NULL || fs == NULL || ts == NULL) {
+    fprintf(stderr, "check_arb : could not create plan or buffers\n");
+    exit(-1);
+  }
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        fs[j][0  ] = (random() / (double)RAND_MAX);
+        fs[j][n/2] = (random() / (double)RAND_MAX);
+      } else {
+        // Mirror each bin as its conjugate so the transform result is real.
+        fs[j][i  ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+        fs[j][n-i] = conj(fs[j][i]);
+      }
+    }
+  }
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        sx[(2*0+0) * veclen + j] = creal(fs[j][0  ]);
+        sx[(2*0+1) * veclen + j] = creal(fs[j][n/2]);
+      } else {
+        sx[(2*i+0) * veclen + j] = creal(fs[j][i]);
+        sx[(2*i+1) * veclen + j] = cimag(fs[j][i]);
+      }
+    }
+  }
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    forward(fs[j], ts[j], n);
+  }
+
+  DFT_execute(p, mode, sx, -1);
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (fabs(cimag(ts[j][i])) > THRES) {
+        success = 0;
+      }
+
+      if ((fabs(sx[i * veclen + j]*2 - creal(ts[j][i])) > THRES)) {
+        success = 0;
+      }
+    }
+  }
+
+  //
+
+  free(ts);
+  free(fs);
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// Usage: <program> <log2n>
+// Runs the six naive-DFT comparison checks for transform length n = 2^log2n in
+// the mode chosen by SIMDBase_chooseBestMode(TYPE) and prints OK/NG for each.
+// Exits with -1 on a usage error, otherwise with 0 regardless of the results.
+int main(int argc, char **argv) {
+  if (argc != 2) {
+    fprintf(stderr, "%s <log2n>\n", argv[0]);
+    exit(-1);
+  }
+
+  // Parse log2n strictly: atoi() silently accepted garbage, and shifting an
+  // int by a negative count or by 31 or more bits is undefined behavior.
+  char *end;
+  long log2n = strtol(argv[1], &end, 10);
+
+  if (end == argv[1] || *end != '\0' || log2n < 0 || log2n > 30) {
+    fprintf(stderr, "%s : <log2n> must be an integer between 0 and 30\n", argv[0]);
+    exit(-1);
+  }
+
+  const int n = 1 << (int)log2n;
+
+  srandom(time(NULL));
+
+  //
+
+  int mode = SIMDBase_chooseBestMode(TYPE);
+
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  printf("complex forward : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("complex backward : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real forward : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real backward : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("alt real forward : %s\n", check_arf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("alt real backward : %s\n", check_arb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+
+  exit(0);
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c
new file mode 100644
index 00000000..08c8315f
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c
@@ -0,0 +1,260 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <time.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+void cdft(int, int, double *, int *, double *);
+void rdft(int, int, double *, int *, double *);
+
+#if 1
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+#else
+typedef double REAL;
+#define TYPE SIMDBase_TYPE_DOUBLE
+#endif
+
+#define THRES 1e-3
+
+// complex forward
+//
+// Runs the library's n-point complex transform with direction -1 and the
+// externally defined cdft(2n, -1, ...) on the same random data, then compares
+// all 2n values of every lane. sx holds the lanes interleaved (value k of lane
+// j at sx[k*veclen+j]); sy holds them back to back, 2n doubles per lane.
+// Returns 1 when every value agrees within THRES, else 0.
+int check_cf(int n, int mode, int veclen, int sizeOfVect) {
+  const int len = n*2;  // real values per transform (re/im pairs)
+
+  DFT *plan = DFT_init(mode, n, 0);
+
+  // Work areas handed to cdft(); ip starts zeroed.
+  int *ip = calloc(n, sizeof(int));
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n*2);
+
+  // Fill both buffers with the same data; sy takes the value as stored in sx.
+  for (int lane = 0; lane < veclen; lane++) {
+    double *ref = &sy[lane*len];
+
+    for (int k = 0; k < len; k++) {
+      sx[k*veclen + lane] = random() / (double)RAND_MAX;
+      ref[k] = sx[k*veclen + lane];
+    }
+  }
+
+  // Library transform first, then one reference transform per lane.
+  DFT_execute(plan, mode, sx, -1);
+
+  for (int lane = 0; lane < veclen; lane++) {
+    cdft(len, -1, &sy[lane*len], ip, trigTable);
+  }
+
+  int success = 1;
+
+  for (int lane = 0; lane < veclen; lane++) {
+    const double *ref = &sy[lane*len];
+
+    for (int k = 0; k < len; k++) {
+      if (fabs(sx[k*veclen + lane] - ref[k]) > THRES) {
+        success = 0;
+      }
+    }
+  }
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(plan, mode);
+
+  return success;
+}
+
+// complex backward
+//
+// Runs the library's n-point complex transform with direction 1 and the
+// externally defined cdft(2n, 1, ...) on the same random data, then compares
+// all 2n values of every lane. sx holds the lanes interleaved (value k of lane
+// j at sx[k*veclen+j]); sy holds them back to back, 2n doubles per lane.
+// Returns 1 when every value agrees within THRES, else 0.
+int check_cb(int n, int mode, int veclen, int sizeOfVect) {
+  const int len = n*2;  // real values per transform (re/im pairs)
+
+  DFT *plan = DFT_init(mode, n, 0);
+
+  // Work areas handed to cdft(); ip starts zeroed.
+  int *ip = calloc(n, sizeof(int));
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n*2);
+
+  // Fill both buffers with the same data; sy takes the value as stored in sx.
+  for (int lane = 0; lane < veclen; lane++) {
+    double *ref = &sy[lane*len];
+
+    for (int k = 0; k < len; k++) {
+      sx[k*veclen + lane] = random() / (double)RAND_MAX;
+      ref[k] = sx[k*veclen + lane];
+    }
+  }
+
+  // Library transform first, then one reference transform per lane.
+  DFT_execute(plan, mode, sx, 1);
+
+  for (int lane = 0; lane < veclen; lane++) {
+    cdft(len, 1, &sy[lane*len], ip, trigTable);
+  }
+
+  int success = 1;
+
+  for (int lane = 0; lane < veclen; lane++) {
+    const double *ref = &sy[lane*len];
+
+    for (int k = 0; k < len; k++) {
+      if (fabs(sx[k*veclen + lane] - ref[k]) > THRES) {
+        success = 0;
+      }
+    }
+  }
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(plan, mode);
+
+  return success;
+}
+
+// real forward
+//
+// Runs the library's n-point transform created with DFT_FLAG_ALT_REAL with
+// direction -1 and the externally defined rdft(n, -1, ...) on the same random
+// data, then compares all n values of every lane. sx holds the lanes
+// interleaved (value k of lane j at sx[k*veclen+j]); sy holds them back to
+// back, n doubles per lane.
+// Returns 1 when every value agrees within THRES, else 0.
+int check_rf(int n, int mode, int veclen, int sizeOfVect) {
+  DFT *plan = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+
+  // Work areas handed to rdft(); ip starts zeroed.
+  int *ip = calloc(n, sizeof(int));
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n);
+
+  // Fill both buffers with the same data; sy takes the value as stored in sx.
+  for (int lane = 0; lane < veclen; lane++) {
+    double *ref = &sy[lane*n];
+
+    for (int k = 0; k < n; k++) {
+      sx[k*veclen + lane] = random() / (double)RAND_MAX;
+      ref[k] = sx[k*veclen + lane];
+    }
+  }
+
+  // Library transform first, then one reference transform per lane.
+  DFT_execute(plan, mode, sx, -1);
+
+  for (int lane = 0; lane < veclen; lane++) {
+    rdft(n, -1, &sy[lane*n], ip, trigTable);
+  }
+
+  int success = 1;
+
+  for (int lane = 0; lane < veclen; lane++) {
+    const double *ref = &sy[lane*n];
+
+    for (int k = 0; k < n; k++) {
+      if (fabs(sx[k*veclen + lane] - ref[k]) > THRES) {
+        success = 0;
+      }
+    }
+  }
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(plan, mode);
+
+  return success;
+}
+
+// real backward
+//
+// Runs the library's n-point transform created with DFT_FLAG_ALT_REAL with
+// direction 1 and the externally defined rdft(n, 1, ...) on the same random
+// data, then compares all n values of every lane. sx holds the lanes
+// interleaved (value k of lane j at sx[k*veclen+j]); sy holds them back to
+// back, n doubles per lane.
+// Returns 1 when every value agrees within THRES, else 0.
+int check_rb(int n, int mode, int veclen, int sizeOfVect) {
+  DFT *plan = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+
+  // Work areas handed to rdft(); ip starts zeroed.
+  int *ip = calloc(n, sizeof(int));
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n);
+
+  // Fill both buffers with the same data; sy takes the value as stored in sx.
+  for (int lane = 0; lane < veclen; lane++) {
+    double *ref = &sy[lane*n];
+
+    for (int k = 0; k < n; k++) {
+      sx[k*veclen + lane] = random() / (double)RAND_MAX;
+      ref[k] = sx[k*veclen + lane];
+    }
+  }
+
+  // Library transform first, then one reference transform per lane.
+  DFT_execute(plan, mode, sx, 1);
+
+  for (int lane = 0; lane < veclen; lane++) {
+    rdft(n, 1, &sy[lane*n], ip, trigTable);
+  }
+
+  int success = 1;
+
+  for (int lane = 0; lane < veclen; lane++) {
+    const double *ref = &sy[lane*n];
+
+    for (int k = 0; k < n; k++) {
+      if (fabs(sx[k*veclen + lane] - ref[k]) > THRES) {
+        success = 0;
+      }
+    }
+  }
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(plan, mode);
+
+  return success;
+}
+
+// Usage: <program> <log2n>
+// Runs the four cdft()/rdft() comparison checks for transform length
+// n = 2^log2n in the mode chosen by SIMDBase_chooseBestMode(TYPE) and prints
+// OK/NG for each.
+// Exits with -1 on a usage error, otherwise with 0 regardless of the results.
+int main(int argc, char **argv) {
+  if (argc != 2) {
+    fprintf(stderr, "%s <log2n>\n", argv[0]);
+    exit(-1);
+  }
+
+  // Parse log2n strictly: atoi() silently accepted garbage, and shifting an
+  // int by a negative count or by 31 or more bits is undefined behavior.
+  char *end;
+  long log2n = strtol(argv[1], &end, 10);
+
+  if (end == argv[1] || *end != '\0' || log2n < 0 || log2n > 30) {
+    fprintf(stderr, "%s : <log2n> must be an integer between 0 and 30\n", argv[0]);
+    exit(-1);
+  }
+
+  const int n = 1 << (int)log2n;
+
+  srandom(time(NULL));
+
+  //
+
+  int mode = SIMDBase_chooseBestMode(TYPE);
+
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  printf("complex forward : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("complex backward : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real forward : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real backward : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+
+  exit(0);
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/Makefile b/plugins/supereq/nsfft-1.00/dfttest/Makefile
new file mode 100644
index 00000000..924b8656
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/Makefile
@@ -0,0 +1,35 @@
+# Builds the test programs against ../simd/libSIMD.a and ../dft/libDFT.a.
+# "make" alone builds only DFTExample and DFTTestNaive; DFTTestOoura,
+# DFTTestFFTW and pi_fft_mod must be requested by name.
+# BASEOPT carries both the -I include paths and the -L library paths.
+CC=gcc
+BASEOPT=-Wall -g -I ../simd -I ../dft -L../simd -L../dft
+OPT=$(BASEOPT) -O
+
+all : DFTExample DFTTestNaive
+
+clean :
+ rm -f *~ *.o nsfftplan.*.txt *.log *.dat a.out DFTExample DFTTestNaive DFTTestOoura DFTTestFFTW pi_fft_mod pi_fft_mod.c
+
+# The three rules below have no prerequisites, so each sub-make runs only when
+# its target file does not exist yet.
+# NOTE(review): "cd dir; make" still runs make (in this directory) if the cd
+# fails; consider "$(MAKE) -C dir" instead.
+../simd/libSIMD.a :
+ @cd ../simd; make
+
+../dft/libDFT.a :
+ @cd ../dft; make
+
+../ooura/fftsg.o :
+ @cd ../ooura; make
+
+DFTExample : DFTExample.c ../simd/libSIMD.a ../dft/libDFT.a
+ $(CC) $(OPT) DFTExample.c -lDFT -lSIMD -lm -o DFTExample
+
+DFTTestNaive : DFTTestNaive.c ../simd/libSIMD.a ../dft/libDFT.a
+ $(CC) $(OPT) DFTTestNaive.c -lDFT -lSIMD -lm -o DFTTestNaive
+
+# Links ../ooura/fftsg.o in addition to the two libraries.
+DFTTestOoura : DFTTestOoura.c ../ooura/fftsg.o ../simd/libSIMD.a ../dft/libDFT.a
+ $(CC) $(OPT) DFTTestOoura.c ../ooura/fftsg.o -lDFT -lSIMD -lm -o DFTTestOoura
+
+# Needs an installed FFTW 3 (-lfftw3).
+DFTTestFFTW : DFTTestFFTW.c ../simd/libSIMD.a ../dft/libDFT.a
+ $(CC) $(OPT) DFTTestFFTW.c -lDFT -lSIMD -lfftw3 -lm -o DFTTestFFTW
+
+# pi_fft_mod.c is generated by applying pi_fft.c.patch to ../ooura/pi_fft.c.
+pi_fft_mod.c : ../ooura/pi_fft.c pi_fft.c.patch
+ patch -o pi_fft_mod.c ../ooura/pi_fft.c pi_fft.c.patch
+
+pi_fft_mod : ../simd/libSIMD.a ../dft/libDFT.a pi_fft_mod.c
+ $(CC) $(OPT) pi_fft_mod.c -I ../dft -I ../simd -L../dft -L../simd -lm -lDFT -lSIMD -o pi_fft_mod
diff --git a/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch b/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch
new file mode 100644
index 00000000..c50133cc
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch
@@ -0,0 +1,131 @@
+--- pi_fft.c 2010-07-30 13:04:25.000000000 +0900
++++ pi_fft_mod.c 2010-07-31 20:50:11.000000000 +0900
+@@ -25,7 +25,75 @@
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <time.h>
++#include <sys/time.h>
++#include <unistd.h>
+
++/****/
++
++#include <stdint.h>
++#include "SIMDBase.h"
++#include "DFT.h"
++
++DFT* dft[64];
++
++void initdft(int n) { /* Fill dft[1..log2(n)] with alt-real PUREC_DOUBLE plans: reuse plans cached in nsfftplan.<host>.txt, create and cache any that are missing. Exits on DFT_init failure. */
++  int i, logn = 31 - __builtin_clz(n), writeflag = 0;
++  char buf[20] = "", fn[256]; /* zero-filled: gethostname() need not NUL-terminate a truncated name */
++  gethostname(buf, 19); /* at most 19 bytes written, so buf[19] stays '\0' */
++  sprintf(fn, "nsfftplan.%s.txt", buf);
++  FILE *fp = fopen(fn, "r");
++  if (fp != NULL) {
++    for(i=1;i<=logn;i++) {
++      int err;
++      dft[i] = DFT_fread(fp, &err);
++      if (err != DFT_ERROR_NOERROR) {
++        printf("error when reading plan %d : %d\n", i, err);
++        dft[i] = NULL; break; /* never keep the result of a failed read; the slot is re-planned below */
++      }
++      if (DFT_getPlanParamInt(DFT_PARAMID_MODE, dft[i]) != SIMDBase_MODE_PUREC_DOUBLE ||
++          DFT_getPlanParamInt(DFT_PARAMID_FFT_LENGTH, dft[i]) != (1 << i) ||
++          DFT_getPlanParamInt(DFT_PARAMID_IS_ALT_REAL_TRANSFORM, dft[i]) != 1) {
++        fprintf(stderr, "plan not compatible : %d\n", i);
++        DFT_dispose(dft[i], DFT_getPlanParamInt(DFT_PARAMID_MODE, dft[i])); dft[i] = NULL; break; /* drop it so rdft() never runs a mismatched plan */
++      }
++    }
++  }
++  if (fp != NULL) fclose(fp);
++
++  for(i=1;i<=logn;i++) { /* create every plan that was not loaded from the cache */
++    if (dft[i] == NULL) {
++      dft[i] = DFT_init(SIMDBase_MODE_PUREC_DOUBLE, 1 << i, DFT_FLAG_ALT_REAL | DFT_FLAG_LIGHT_TEST_RUN | DFT_FLAG_VERBOSE);
++      if (dft[i] == NULL) {
++        printf("dft[%d] == NULL\n", i);
++        exit(-1);
++      }
++      writeflag = 1;
++    }
++  }
++
++  if (writeflag) { /* at least one plan is new: rewrite the whole cache file */
++    fp = fopen(fn, "w");
++    if (fp != NULL) {
++      for(i=1;i<=logn;i++) {
++        DFT_fwrite(dft[i], fp);
++      }
++      fclose(fp);
++    }
++  }
++}
++
++void rdft(int n, int isgn, double *a, int *ip, double *w) { /* Transforms a in place with the plan stored for length n; ip and w are accepted but unused. */
++ int logn = 31 - __builtin_clz(n); /* index of the highest set bit, i.e. log2(n) when n is a power of two */
++ DFT_execute(dft[logn], SIMDBase_MODE_PUREC_DOUBLE, a, isgn); /* dft[] must have been filled by initdft() first */
++}
++
++double timeofday(void) { /* Returns the gettimeofday() time as seconds in a double. */
++ struct timeval tp;
++ gettimeofday(&tp, NULL);
++ return (double)tp.tv_sec+(1e-6)*tp.tv_usec; /* whole seconds plus microseconds scaled to seconds */
++}
++
++/****/
+
+ void mp_load_0(int n, int radix, int out[]);
+ void mp_load_1(int n, int radix, int out[]);
+@@ -67,7 +135,7 @@
+ double err, d_time, n_op;
+ int *a, *b, *c, *e, *i1, *i2, *ip;
+ double *d1, *d2, *d3, *w;
+- time_t t_1, t_2;
++ double t_1, t_2;
+ FILE *f_log, *f_out;
+
+ f_log = fopen("pi.log", "w");
+@@ -96,6 +164,8 @@
+ exit(1);
+ }
+ ip[0] = 0;
++
++ initdft(nfft);
+ /* ---- radix test ---- */
+ log10_radix = 1;
+ radix = 10;
+@@ -111,7 +181,7 @@
+ printf("calculating %d digits of PI...\n", log10_radix * (n - 2));
+ fprintf(f_log, "calculating %d digits of PI...\n", log10_radix * (n - 2));
+ /* ---- time check ---- */
+- time(&t_1);
++ t_1 = timeofday();
+ /*
+ * ---- a formula based on the AGM (Arithmetic-Geometric Mean) ----
+ * c = sqrt(0.125);
+@@ -216,10 +286,10 @@
+ mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w);
+ mp_idiv(n, radix, a, npow, a);
+ /* ---- time check ---- */
+- time(&t_2);
++ t_2 = timeofday();
+ /* ---- output ---- */
+ f_out = fopen("pi_mod.dat", "w");
+- printf("writing pi.dat...\n");
++ printf("writing pi_mod.dat...\n");
+ mp_fprintf(n - 1, log10_radix, a, f_out);
+ fclose(f_out);
+ free(d3);
+@@ -238,9 +308,9 @@
+ printf("floating point operation: %g op.\n", n_op);
+ fprintf(f_log, "floating point operation: %g op.\n", n_op);
+ /* ---- difftime ---- */
+- d_time = difftime(t_2, t_1);
+- printf("execution time: %g sec. (real time)\n", d_time);
+- fprintf(f_log, "execution time: %g sec. (real time)\n", d_time);
++ d_time = t_2 - t_1;
++ printf("execution time: %.5g sec. (real time)\n", d_time);
++ fprintf(f_log, "execution time: %.5g sec. (real time)\n", d_time);
+ fclose(f_log);
+ return 0;
+ }
diff --git a/plugins/supereq/nsfft-1.00/doc/default.css b/plugins/supereq/nsfft-1.00/doc/default.css
new file mode 100644
index 00000000..09721163
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/doc/default.css
@@ -0,0 +1,34 @@
+body {margin-left: 1.5cm; padding-left: 0.1cm; margin-right: 1.5cm; padding-right: 0.1cm; margin-top: 2.5cm; padding-top: 0.5cm; margin-bottom: 1cm; padding-bottom: 1.0cm; border-top-style:solid; border-bottom-style:solid; }
+h1 {font-family: arial, sansserif; font-weight: bold; font-style: italic; margin-top: 0.8cm; }
+h2 {font-family: arial, sansserif; font-weight: bold; font-style: italic; margin-top: 0.8cm; }
+h3 {font-family: arial, sansserif; font-weight: bold; margin-top: 1.2cm; margin-bottom: 0.8cm; }
+h4 {font-family: arial, sansserif; font-weight: bold; margin-top: 1.2cm; margin-bottom: 0.8cm; }
+p {font-family: Georgia, "Times New Roman", times, serif; margin-top: 0.3cm; margin-left: 0.5cm; margin-bottom: 0.3cm;}
+p.dir {font-family: arial, sansserif; margin-top: 0cm; margin-bottom: 0cm;}
+dl { margin-left: 0.5cm; }
+dt { font-weight: bold; }
+a:link {color: black;}
+a:visited {color: black;}
+ul.disc {list-style-type: disc; font-family: times, serif;}
+ul.circle {list-style-type: circle; font-family: times, serif;}
+ul.square {list-style-type: square; font-family: times, serif;}
+ul.none {list-style-type: none; font-family: times, serif;}
+pre.code { margin-top: 1.0cm; margin-bottom: 1.0cm; margin-left: 1.0cm; margin-right: 1.0cm; border:3px solid #c0c0c0; padding: 0.5cm; font-family: tahoma, sansserif; font-weight: normal; background-color:#f8f8f8; }
+pre.command { margin-top: 1.0cm; margin-bottom: 1.0cm; margin-left: 1.5cm; margin-right: 0.0cm; border:0px; padding:0.0cm; font-family: tahoma, sansserif; font-weight: bold; background-color:#f8fffc; }
+ol.level1 { font-family: arial, sansserif; font-weight: bold; font-style: italic; font-size:1.5em; }
+ol.level2 { font-family: "Times New Roman", serif; font-weight: normal; font-style: normal; font-size:0.85em; margin-top: 0.2cm; margin-bottom: 0.5cm; }
+table.figure { margin-left:auto; margin-right:auto; margin-top:1.0cm; margin-bottom:1.0cm; }
+
+td.caption { font-family: arial, sansserif; font-size: 75%; color: black; }
+td { font-family: times, serif; }
+
+table.lt { border-collapse: collapse; border-style: none; }
+td.lt- { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-width: 1px; border-style: none; padding-left=0.2cm; padding-right=0.2cm; }
+td.lt-r { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-right-style: solid; border-width: 1px; border-color: black; }
+td.lt-l { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-left-style: solid; border-width: 1px; border-color: black; }
+td.lt-lr { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-right-style: solid; border-left-style: solid; border-width: 1px; border-color: black; }
+td.lt-b { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-width: 1px; border-color: black; }
+td.lt-hl { margin: 0px; border-style: none; border-bottom-style: solid; border-width: 1px; border-color: black; height: 2px; }
+td.lt-bl { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-left-style: solid; border-width: 1px; border-color: black; }
+td.lt-br { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-right-style: solid; border-width: 1px; border-color: black; }
+td.lt-blr { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-left-style: solid; border-right-style: solid; border-width: 1px; border-color: black; }
diff --git a/plugins/supereq/nsfft-1.00/doc/index.xhtml b/plugins/supereq/nsfft-1.00/doc/index.xhtml
new file mode 100644
index 00000000..8b7e2c97
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/doc/index.xhtml
@@ -0,0 +1,2016 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+<link rel="stylesheet" type="text/css" href="default.css"/>
+<title>NSFFT Reference Manual</title>
+</head>
+<body>
+<h1>NSFFT Reference Manual</h1>
+
+<h3>Introduction</h3>
+
+<p>
+This is a library for performing 1-dimensional discrete Fourier
+transforms. NSFFT is a simple, small and portable library, and it is
+efficient since it can utilize SIMD instruction sets in modern
+processors. It performs multiple transforms simultaneously, and thus
+it is especially suitable for digital signal processing. It does not
+need so much computation to make a good execution plan. This library
+is in the public domain, so you can incorporate this library into
+your product without any obligation.
+</p>
+
+<h3>API Reference</h3>
+
+<p>
+In this section, the API functions are explained.
+</p>
+
+<h4>Include files</h4>
+
+<p>
+You have to include two include files in dft directory.
+</p>
+
+<pre class="code">
+#include &lt;stdint.h&gt;
+#include "SIMDBase.h"
+#include "DFT.h"
+</pre>
+
+<h4>Data types</h4>
+
+<p>
+First, you have to choose a data type to represent an element in the
+input and output sequence of numbers. You can choose from the
+following three types.
+</p>
+
+<table class="figure">
+ <tr align="center">
+ <td>
+ <table class="lt">
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="center">Symbol</td>
+ <td class="lt-b" align="center">Data Type</td>
+ </tr>
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_TYPE_FLOAT</td>
+ <td class="lt-" align="left">float type in C language</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_TYPE_DOUBLE</td>
+ <td class="lt-" align="left">double type in C language</td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="left">SIMDBase_TYPE_LONGDOUBLE</td>
+ <td class="lt-b" align="left">long double type in C language</td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr align="center">
+ <td class="caption">Table 1 Data types</td>
+ </tr>
+</table>
+
+
+<h4>Computation modes</h4>
+
+<p>
+Next, a computation mode has to be chosen. You can choose from the
+following modes.
+</p>
+
+<table class="figure">
+ <tr align="center">
+ <td>
+ <table class="lt">
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="center">Symbol</td>
+ <td class="lt-br" align="center">Type</td>
+ <td class="lt-br" align="center">Vector Length</td>
+ <td class="lt-b" align="center">Computation Mode</td>
+ </tr>
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_MODE_PUREC_FLOAT</td>
+ <td class="lt-r" align="center">float</td>
+ <td class="lt-r" align="center">1</td>
+ <td class="lt-" align="center">Scalar float</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_MODE_PUREC_DOUBLE</td>
+ <td class="lt-r" align="center">double</td>
+ <td class="lt-r" align="center">1</td>
+ <td class="lt-" align="center">Scalar double</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_MODE_PUREC_LONGDOUBLE</td>
+ <td class="lt-r" align="center">long double</td>
+ <td class="lt-r" align="center">1</td>
+ <td class="lt-" align="center">Scalar long double</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_MODE_SSE_FLOAT</td>
+ <td class="lt-r" align="center">float</td>
+ <td class="lt-r" align="center">4</td>
+ <td class="lt-" align="center">x86 SSE</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_MODE_SSE2_DOUBLE</td>
+ <td class="lt-r" align="center">double</td>
+ <td class="lt-r" align="center">2</td>
+ <td class="lt-" align="center">x86 SSE2</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_MODE_NEON_FLOAT</td>
+ <td class="lt-r" align="center">float</td>
+ <td class="lt-r" align="center">4</td>
+ <td class="lt-" align="center">ARM NEON</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_MODE_AVX_FLOAT</td>
+ <td class="lt-r" align="center">float</td>
+ <td class="lt-r" align="center">8</td>
+ <td class="lt-" align="center">x86 AVX (float)</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_MODE_AVX_DOUBLE</td>
+ <td class="lt-r" align="center">double</td>
+ <td class="lt-r" align="center">4</td>
+ <td class="lt-" align="center">x86 AVX (double)</td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="left">SIMDBase_MODE_ALTIVEC_FLOAT</td>
+ <td class="lt-br" align="center">float</td>
+ <td class="lt-br" align="center">4</td>
+ <td class="lt-b" align="center">PowerPC Altivec</td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr align="center">
+ <td class="caption">Table 2 Computation modes</td>
+ </tr>
+</table>
+
+<p>
+The following function automatically checks the availability of each
+instruction set on your computer, and chooses the best computation
+mode.
+</p>
+
+<pre class="code">
+int32_t SIMDBase_chooseBestMode(int32_t type);
+</pre>
+
+<p>
+The return value is the best mode chosen by this routine.
+<i>type</i> is the data type you chose.
+</p>
+
+
+<h4>Retrieving parameters</h4>
+
+<p>
+You can make queries for any mode using the following function.
+</p>
+
+<pre class="code">
+int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode);
+</pre>
+
+<p>
+<i>mode</i> is the computation mode you chose. <i>paramId</i> is one
+of the following.
+</p>
+
+<table class="figure">
+ <tr align="center">
+ <td>
+ <table class="lt">
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="center">Symbol</td>
+ <td class="lt-b" align="center">Meaning</td>
+ </tr>
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_PARAMID_SIZE_OF_REAL</td>
+ <td class="lt-" align="left">Size of an element in a vector in bytes</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_PARAMID_SIZE_OF_VECT</td>
+ <td class="lt-" align="left">Size of the vector in bytes</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">SIMDBase_PARAMID_VECTOR_LEN</td>
+ <td class="lt-" align="left">Number of elements in a vector</td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="left">SIMDBase_PARAMID_MODE_AVAILABILITY</td>
+ <td class="lt-b" align="left">Whether the given mode is available or not</td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr align="center">
+ <td class="caption">Table 3 Querying parameter for computation mode</td>
+ </tr>
+</table>
+
+<p>
+Here, a vector is a set of multiple primitive data elements (single or
+double precision FP number) which can be stored in one SIMD register,
+and can be processed by one SIMD instruction at the same time.
+</p>
+
+<p>
+You can get the mode name in string data type. In this
+case, <i>paramId</i> must be SIMDBase_PARAMID_MODE_NAME.
+</p>
+
+<pre class="code">
+char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode);
+</pre>
+
+<p>
+You should not modify the data returned by the above function.
+</p>
+
+
+<h4>Making and destroying execution plan</h4>
+
+<p>
+An execution plan can be made by the following function.
+</p>
+
+<pre class="code">
+DFT *DFT_init(int32_t mode, int32_t n, int32_t flags);
+</pre>
+
+<p>
+The return value is a pointer to a newly made plan.
+<i>mode</i> is the mode you chose above. <i>n</i> is the length of a
+transform. You can specify a bitwise OR of the following symbols
+as <i>flags</i>. You should not specify more than one flag regarding
+the test run. You should not specify DFT_FLAG_FORCE_RECURSIVE and
+DFT_FLAG_FORCE_COBRA at the same time. If neither DFT_FLAG_REAL nor
+DFT_FLAG_ALT_REAL is specified, an execution plan for complex
+transforms is made.
+</p>
+
+<table class="figure">
+ <tr align="center">
+ <td>
+ <table class="lt">
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="center">Symbol</td>
+ <td class="lt-b" align="center">Meaning</td>
+ </tr>
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_FLAG_NO_TEST_RUN</td>
+ <td class="lt-" align="left">Make execution plan without performing a test run</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_FLAG_LIGHT_TEST_RUN</td>
+ <td class="lt-" align="left">Perform small amount of test run to make an execution plan</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_FLAG_HEAVY_TEST_RUN</td>
+ <td class="lt-" align="left">Perform large amount of test run to make an execution plan</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_FLAG_EXHAUSTIVE_TEST_RUN</td>
+ <td class="lt-" align="left">Perform exhaustive search of parameters and find the optimal execution plan</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_FLAG_REAL</td>
+ <td class="lt-" align="left">Make an execution plan for a real transform</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_FLAG_ALT_REAL</td>
+ <td class="lt-" align="left">Make an execution plan for an alternative real transform</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_FLAG_VERBOSE</td>
+ <td class="lt-" align="left">Make some noise during making an execution plan</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_FLAG_NOBITREVERSAL</td>
+ <td class="lt-" align="left">Do not perform the bit-reversal operation during a transform</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_FLAG_FORCE_RECURSIVE</td>
+ <td class="lt-" align="left">Force using the recursive bit-reversal routine. This routine is suited for small transforms.</td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="left">DFT_FLAG_FORCE_COBRA</td>
+ <td class="lt-b" align="left">Force using the Cobra bit-reversal routine. This routine is suited for large transforms.</td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr align="center">
+ <td class="caption">Table 4 Options for making execution plan</td>
+ </tr>
+</table>
+
+<p>
+You can destroy the plan you made by the following function.
+</p>
+
+<pre class="code">
+void DFT_dispose(DFT *p, int32_t mode);
+</pre>
+
+<p>
+<i>p</i> is a pointer to the execution plan. <i>mode</i> is the
+corresponding execution mode.
+</p>
+
+<p>
+You can retrieve parameters of a plan using the following function.
+</p>
+
+<pre class="code">
+int32_t DFT_getPlanParamInt(int32_t paramId, void *p);
+</pre>
+
+<p>
+<i>p</i> is a pointer to an execution plan. <i>paramId</i> is one
+of the following.
+</p>
+
+<table class="figure">
+ <tr align="center">
+ <td>
+ <table class="lt">
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="center">Symbol</td>
+ <td class="lt-b" align="center">Meaning</td>
+ </tr>
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_PARAMID_TYPE</td>
+ <td class="lt-" align="left">Data type</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_PARAMID_MODE</td>
+ <td class="lt-" align="left">Computation mode</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_PARAMID_FFT_LENGTH</td>
+ <td class="lt-" align="left">Length of the transform</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_PARAMID_IS_REAL_TRANSFORM</td>
+ <td class="lt-" align="left">Whether the plan is for real transforms</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_PARAMID_NO_BIT_REVERSAL</td>
+ <td class="lt-" align="left">Whether the plan does not perform bit reversal operation</td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="left">DFT_PARAMID_TEST_RUN</td>
+ <td class="lt-b" align="left">How much test run is performed when making this plan</td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr align="center">
+ <td class="caption">Table 5 Querying parameter for execution plan</td>
+ </tr>
+</table>
+
+<h4>Writing and reading execution plan to/from file</h4>
+
+<p>
+You can write or read an execution plan to/from a file using the following functions.
+</p>
+
+<pre class="code">
+int32_t DFT_fwrite(DFT *p, FILE *fp);
+DFT *DFT_fread(FILE *fp, int32_t *errcode);
+</pre>
+
+<p>
+<i>p</i> is a pointer to a plan. <i>fp</i> is a file
+pointer. DFT_fwrite returns 1 if the plan is successfully written, and
+0 if an error occurs. DFT_fread returns the pointer to the read plan
+if the plan is successfully read, and NULL if an error occurs. If an
+error occurs, an error code is returned to a variable whose pointer is
+specified by <i>errcode</i>. The interpretation of error codes is
+given below.
+</p>
+
+<table class="figure">
+ <tr align="center">
+ <td>
+ <table class="lt">
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="center">Symbol</td>
+ <td class="lt-b" align="center">Meaning</td>
+ </tr>
+ <tr>
+ <td class="lt-hl"></td>
+ <td class="lt-hl"></td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_ERROR_NOERROR</td>
+ <td class="lt-" align="left">No error</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_ERROR_FILE_VERSION</td>
+ <td class="lt-" align="left">File format version mismatch</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_ERROR_FILE_IO</td>
+ <td class="lt-" align="left">I/O error</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_ERROR_UNEXPECTED_EOF</td>
+ <td class="lt-" align="left">Unexpected EOF</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_ERROR_MODE_NOT_COMPILED_IN</td>
+ <td class="lt-" align="left">Tried to read a plan with mode that is not compiled in</td>
+ </tr>
+ <tr>
+ <td class="lt-r" align="left">DFT_ERROR_MODE_NOT_AVAILABLE</td>
+ <td class="lt-" align="left">Tried to read a plan with mode that is not supported by hardware</td>
+ </tr>
+ <tr>
+ <td class="lt-br" align="left">DFT_ERROR_UNKNOWN_MODE</td>
+ <td class="lt-b" align="left">Tried to read a plan with mode that is unknown by library</td>
+ </tr>
+ </table>
+ </td>
+ </tr>
+ <tr align="center">
+ <td class="caption">Table 6 Errors that may happen during file I/O</td>
+ </tr>
+</table>
+
+
+<h4>Allocating and freeing buffers for transforms</h4>
+
+<p>
+In order to allocate word-aligned buffers for storing data which is
+fed to the FFT routine, you have to use the following function.
+</p>
+
+<pre class="code">
+void *DFT_alignedMalloc(uint64_t size);
+</pre>
+
+<p>
+This function allocates <i>size</i> bytes of word-aligned memory and
+returns the pointer. In order to free this memory, you have to use the
+following function.
+</p>
+
+<pre class="code">
+void DFT_alignedFree(void *ptr);
+</pre>
+
+<p>
+<i>ptr</i> is the pointer returned from DFT_alignedMalloc function.
+</p>
+
+<h4>Executing transform</h4>
+
+<p>
+By the following function, the planned transform can be executed.
+</p>
+
+<pre class="code">
+void DFT_execute(DFT *p, int32_t mode, void *s, int32_t dir);
+</pre>
+
+<p>
+<i>p</i> is a pointer to the plan. <i>mode</i> is the computation
+mode. <i>s</i> is the pointer to the buffer in which the sequence of
+input values is stored. This pointer must be a pointer returned from
+DFT_alignedMalloc function.
+<i>dir</i> specifies the direction of transform.
+</p>
+
+<p>
+The forward and backward discrete Fourier transforms are defined by
+the following formula (1) and (2), respectively.
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <msub><mi>X</mi><mi>k</mi></msub>
+ <mo>=</mo>
+ <munderover>
+ <mo style="font-size:140%;">&Sum;</mo>
+ <mrow><mi>n</mi><mo>=</mo><mn>0</mn></mrow>
+ <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+ </munderover>
+ <msub><mi>x</mi><mi>n</mi></msub>
+ <msup>
+ <mi>e</mi>
+ <mrow>
+ <mo>-</mo>
+ <mfrac>
+ <mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+ <mi>N</mi>
+ </mfrac>
+ <mi>k</mi><mi>n</mi>
+ </mrow>
+ </msup>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mi>k</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(1)</p>
+ </td>
+ </tr>
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <msub><mi>x</mi><mi>n</mi></msub>
+ <mo>=</mo>
+ <mfrac>
+ <mn>1</mn>
+ <mi>N</mi>
+ </mfrac>
+ <munderover>
+ <mo style="font-size:140%;">&Sum;</mo>
+ <mrow><mi>k</mi><mo>=</mo><mn>0</mn></mrow>
+ <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+ </munderover>
+ <msub><mi>X</mi><mi>k</mi></msub>
+ <msup>
+ <mi>e</mi>
+ <mrow>
+ <mfrac>
+ <mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+ <mi>N</mi>
+ </mfrac>
+ <mi>k</mi><mi>n</mi>
+ </mrow>
+ </msup>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mi>n</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(2)</p>
+ </td>
+ </tr>
+</table>
+
+<p>
+The complex forward and backward transforms perform the transforms
+defined by the following formula (3) and (4), respectively. <i>V</i>
+is the vector length mentioned above. Again, calling DFT_execute once
+performs <i>V</i> forward or backward transforms at a time. Please
+note that (4) gives values multiplied by <i>N</i> compared to
+(2). Specifying -1 as the direction of transform performs the
+transform defined by (3). In this case, the input should be given as
+in (5), and the output is given as in (6). Specifying 1 as the
+direction of transform performs the transform defined by (4), and in
+this case, the input should be given as in (6), and the output is
+given as in (5).
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>=</mo>
+ <munderover>
+ <mo style="font-size:140%;">&Sum;</mo>
+ <mrow><mi>n</mi><mo>=</mo><mn>0</mn></mrow>
+ <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+ </munderover>
+ <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <msup>
+ <mi>e</mi>
+ <mrow>
+ <mo>-</mo>
+ <mfrac>
+ <mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+ <mi>N</mi>
+ </mfrac>
+ <mi>k</mi><mi>n</mi>
+ </mrow>
+ </msup>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>k</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+
+ <mo>&nbsp;&nbsp;</mo>
+ <mo>,</mo>
+ <mo>&nbsp;&nbsp;</mo>
+
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(3)</p>
+ </td>
+ </tr>
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>=</mo>
+ <munderover>
+ <mo style="font-size:140%;">&Sum;</mo>
+ <mrow><mi>k</mi><mo>=</mo><mn>0</mn></mrow>
+ <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+ </munderover>
+ <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <msup>
+ <mi>e</mi>
+ <mrow>
+ <mfrac>
+ <mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+ <mi>N</mi>
+ </mfrac>
+ <mi>k</mi><mi>n</mi>
+ </mrow>
+ </msup>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>n</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+
+ <mo>&nbsp;&nbsp;</mo>
+ <mo>,</mo>
+ <mo>&nbsp;&nbsp;</mo>
+
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(4)</p>
+ </td>
+ </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mfenced open="{" close="">
+ <mtable>
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mo>(</mo>
+ <mn>2</mn>
+ <mi>n</mi>
+ <mo>+</mo>
+ <mn>0</mn>
+ <mo>)</mo>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+
+ <mo>=</mo>
+
+ <mi>Re</mi>
+ <mo>(</mo>
+ <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mo>(</mo>
+ <mn>2</mn>
+ <mi>n</mi>
+ <mo>+</mo>
+ <mn>1</mn>
+ <mo>)</mo>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+
+ <mo>=</mo>
+
+ <mi>Im</mi>
+ <mo>(</mo>
+ <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ </mtable>
+ </mfenced>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>n</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+
+ <mo>&nbsp;&nbsp;</mo>
+ <mo>,</mo>
+ <mo>&nbsp;&nbsp;</mo>
+
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(5)</p>
+ </td>
+ </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mfenced open="{" close="">
+ <mtable>
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mo>(</mo>
+ <mn>2</mn>
+ <mi>k</mi>
+ <mo>+</mo>
+ <mn>0</mn>
+ <mo>)</mo>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+
+ <mo>=</mo>
+
+ <mi>Re</mi>
+ <mo>(</mo>
+ <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mo>(</mo>
+ <mn>2</mn>
+ <mi>k</mi>
+ <mo>+</mo>
+ <mn>1</mn>
+ <mo>)</mo>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+
+ <mo>=</mo>
+
+ <mi>Im</mi>
+ <mo>(</mo>
+ <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ </mtable>
+ </mfenced>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>k</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+
+ <mo>&nbsp;&nbsp;</mo>
+ <mo>,</mo>
+ <mo>&nbsp;&nbsp;</mo>
+
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(6)</p>
+ </td>
+ </tr>
+</table>
+
+<p>
+The real forward transform performs the transform defined by (3) when
+the condition (7) is satisfied. In this case, the output satisfies
+(8). You should specify -1 as the direction of transform, and the
+input should be given as in (9), and the output is given as in (10).
+The real backward transform is the opposite of the real forward
+transform. The input should satisfy (8) and the output satisfies (7).
+You should specify 1 as the direction of transform, and the input
+should be given as in (10), and the output is given as in (11).
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mi>Im</mi>
+ <mo>(</mo>
+ <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ <mo>=</mo>
+ <mn>0</mn>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>n</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+
+ <mo>&nbsp;&nbsp;</mo>
+ <mo>,</mo>
+ <mo>&nbsp;&nbsp;</mo>
+
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(7)</p>
+ </td>
+ </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mfenced open="{" close="">
+ <mtable>
+ <mtr>
+ <mtd>
+ <mrow>
+ <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>=</mo>
+ <msubsup>
+ <mi>X</mi>
+ <mrow><mi>N</mi><mo>-</mo><mi>k</mi><mo>,</mo><mi>v</mi></mrow>
+ <mo>*</mo>
+ </msubsup>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+ </mtd>
+
+ <mtd>
+ <mrow style="font-size:100%;">
+ <mi>k</mi>
+ <mo>=</mo>
+ <mn>1</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mfrac>
+ <mi>N</mi>
+ <mn>2</mn>
+ </mfrac>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>Im</mi>
+ <mo>(</mo>
+ <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ <mo>=</mo>
+ <mn>0</mn>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+ </mtd>
+
+ <mtd>
+ <mrow style="font-size:100%;">
+ <mi>k</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mfrac>
+ <mi>N</mi>
+ <mn>2</mn>
+ </mfrac>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ </mtable>
+ </mfenced>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(8)</p>
+ </td>
+ </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mi>n</mi>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+
+ <mo>=</mo>
+
+ <mi>Re</mi>
+ <mo>(</mo>
+ <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>n</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+
+ <mo>&nbsp;&nbsp;</mo>
+ <mo>,</mo>
+ <mo>&nbsp;&nbsp;</mo>
+
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(9)</p>
+ </td>
+ </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mfenced open="{" close="">
+ <mtable>
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mo>(</mo>
+ <mn>2</mn>
+ <mi>k</mi>
+ <mo>+</mo>
+ <mn>0</mn>
+ <mo>)</mo>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mo>=</mo>
+ </mtd>
+
+ <mtd>
+ <mrow>
+ <mi>Re</mi>
+ <mo>(</mo>
+ <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mrow style="font-size:100%;">
+ <mi>k</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mfrac>
+ <mi>N</mi>
+ <mn>2</mn>
+ </mfrac>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mo>=</mo>
+ </mtd>
+
+ <mtd>
+ <mrow>
+ <mi>Re</mi>
+ <mo>(</mo>
+ <msub><mi>X</mi><mrow><mi>N</mi><mo>/</mo><mn>2</mn><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ </mtd>
+ </mtr>
+
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mo>(</mo>
+ <mn>2</mn>
+ <mi>k</mi>
+ <mo>+</mo>
+ <mn>1</mn>
+ <mo>)</mo>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mo>=</mo>
+ </mtd>
+
+ <mtd>
+ <mrow>
+ <mi>Im</mi>
+ <mo>(</mo>
+ <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mrow style="font-size:100%;">
+ <mi>k</mi>
+ <mo>=</mo>
+ <mn>1</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mfrac>
+ <mi>N</mi>
+ <mn>2</mn>
+ </mfrac>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ </mtable>
+ </mfenced>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(10)</p>
+ </td>
+ </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mrow>
+ <mn>2</mn>
+ <mo> &nbsp; </mo>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mi>n</mi>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+
+ <mo>=</mo>
+
+ <mi>Re</mi>
+ <mo>(</mo>
+ <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>n</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+
+ <mo>&nbsp;&nbsp;</mo>
+ <mo>,</mo>
+ <mo>&nbsp;&nbsp;</mo>
+
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(11)</p>
+ </td>
+ </tr>
+</table>
+
+<p>
+The alternative real transforms are defined by (12) to (16), similarly
+to the real transforms. The alternative transforms are handy if you
+are migrating from the FFT library made by Prof. Takuya Ooura. You
+should specify 1 as the direction in order to perform a forward
+transform, and -1 when you perform a backward transform.
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mi>Im</mi>
+ <mo>(</mo>
+ <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ <mo>=</mo>
+ <mn>0</mn>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>k</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+
+ <mo>&nbsp;&nbsp;</mo>
+ <mo>,</mo>
+ <mo>&nbsp;&nbsp;</mo>
+
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(12)</p>
+ </td>
+ </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mfenced open="{" close="">
+ <mtable>
+ <mtr>
+ <mtd>
+ <mrow>
+ <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>=</mo>
+ <msubsup>
+ <mi>x</mi>
+ <mrow><mi>N</mi><mo>-</mo><mi>n</mi><mo>,</mo><mi>v</mi></mrow>
+ <mo>*</mo>
+ </msubsup>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+ </mtd>
+
+ <mtd>
+ <mrow style="font-size:100%;">
+ <mi>n</mi>
+ <mo>=</mo>
+ <mn>1</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mfrac>
+ <mi>N</mi>
+ <mn>2</mn>
+ </mfrac>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>Im</mi>
+ <mo>(</mo>
+ <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ <mo>=</mo>
+ <mn>0</mn>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+ </mtd>
+
+ <mtd>
+ <mrow style="font-size:100%;">
+ <mi>n</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mfrac>
+ <mi>N</mi>
+ <mn>2</mn>
+ </mfrac>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ </mtable>
+ </mfenced>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(13)</p>
+ </td>
+ </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mi>k</mi>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+
+ <mo>=</mo>
+
+ <mi>Re</mi>
+ <mo>(</mo>
+ <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>k</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+
+ <mo>&nbsp;&nbsp;</mo>
+ <mo>,</mo>
+ <mo>&nbsp;&nbsp;</mo>
+
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(14)</p>
+ </td>
+ </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mfenced open="{" close="">
+ <mtable>
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mo>(</mo>
+ <mn>2</mn>
+ <mi>n</mi>
+ <mo>+</mo>
+ <mn>0</mn>
+ <mo>)</mo>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mo>=</mo>
+ </mtd>
+
+ <mtd>
+ <mrow>
+ <mi>Re</mi>
+ <mo>(</mo>
+ <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mrow style="font-size:100%;">
+ <mi>n</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mfrac>
+ <mi>N</mi>
+ <mn>2</mn>
+ </mfrac>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mo>=</mo>
+ </mtd>
+
+ <mtd>
+ <mrow>
+ <mi>Re</mi>
+ <mo>(</mo>
+ <msub><mi>x</mi><mrow><mi>N</mi><mo>/</mo><mn>2</mn><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ </mtd>
+ </mtr>
+
+ <mtr>
+ <mtd>
+ <mrow>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mo>(</mo>
+ <mn>2</mn>
+ <mi>n</mi>
+ <mo>+</mo>
+ <mn>1</mn>
+ <mo>)</mo>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mo>=</mo>
+ </mtd>
+
+ <mtd>
+ <mrow>
+ <mi>Im</mi>
+ <mo>(</mo>
+ <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+ </mtd>
+
+ <mtd>
+ <mrow style="font-size:100%;">
+ <mi>n</mi>
+ <mo>=</mo>
+ <mn>1</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mfrac>
+ <mi>N</mi>
+ <mn>2</mn>
+ </mfrac>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mtd>
+ </mtr>
+
+ </mtable>
+ </mfenced>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(15)</p>
+ </td>
+ </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+ <tr>
+ <td align="center" style="width:100%;">
+ <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+ <mrow>
+ <mrow>
+ <mn>2</mn>
+ <mo> &nbsp; </mo>
+ <mi>s</mi>
+ <mo>[</mo>
+ <mi>k</mi>
+ <mi>V</mi>
+ <mo>+</mo>
+ <mi>v</mi>
+ <mo>]</mo>
+
+ <mo>=</mo>
+
+ <mi>Re</mi>
+ <mo>(</mo>
+ <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+ <mo>)</mo>
+ </mrow>
+
+ <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+ <mrow style="font-size:100%;">
+ <mi>k</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>N</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+
+ <mo>&nbsp;&nbsp;</mo>
+ <mo>,</mo>
+ <mo>&nbsp;&nbsp;</mo>
+
+ <mi>v</mi>
+ <mo>=</mo>
+ <mn>0</mn>
+ <mo>,</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>&middot;</mo>
+ <mo>,</mo>
+ <mi>V</mi>
+ <mo>-</mo>
+ <mn>1</mn>
+ </mrow>
+ </mrow>
+ </math>
+ </td>
+ <td>
+ <p>(16)</p>
+ </td>
+ </tr>
+</table>
+
+
+<h3>Examples</h3>
+
+<p>
+Below is example code that uses the nsfft library.
+</p>
+
+<pre class="code">
+#include &lt;stdio.h&gt;
+#include &lt;stdlib.h&gt;
+#include &lt;math.h&gt;
+#include &lt;stdint.h&gt;
+#include &lt;complex.h&gt;
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+
+#define THRES 1e-3
+
+double complex omega(double n, double kn) {
+ return cexp((-2 * M_PI * _Complex_I / n) * kn);
+}
+
+void forward(double complex *ts, double complex *fs, int len) {
+ int k, n;
+
+ for(k=0;k&lt;len;k++) {
+ fs[k] = 0;
+
+ for(n=0;n&lt;len;n++) {
+ fs[k] += ts[n] * omega(len, n*k);
+ }
+ }
+}
+
+int main(int argc, char **argv) {
+ const int n = 256;
+
+ int mode = SIMDBase_chooseBestMode(TYPE);
+ printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+ int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+ int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+ //
+
+ int i, j;
+
+ DFT *p = DFT_init(mode, n, 0);
+ REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+ //
+
+ double complex ts[veclen][n], fs[veclen][n];
+
+ for(j=0;j&lt;veclen;j++) {
+ for(i=0;i&lt;n;i++) {
+ ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+ sx[(i*2+0)*veclen+j] = creal(ts[j][i]);
+ sx[(i*2+1)*veclen+j] = cimag(ts[j][i]);
+ }
+ }
+
+ //
+
+ DFT_execute(p, mode, sx, -1);
+
+ for(j=0;j&lt;veclen;j++) {
+ forward(ts[j], fs[j], n);
+ }
+
+ //
+
+ int success = 1;
+
+ for(j=0;j&lt;veclen;j++) {
+ for(i=0;i&lt;n;i++) {
+ if ((fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) &gt; THRES) ||
+ (fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) &gt; THRES)) {
+ success = 0;
+ }
+ }
+ }
+
+ printf("%s\n", success ? "OK" : "NG");
+
+ //
+
+ SIMDBase_alignedFree(sx);
+ DFT_dispose(p, mode);
+
+ exit(0);
+}
+</pre>
+
+<p>
+Put this code in a subdirectory directly under the root directory of
+the library (so that the relative paths ../simd and ../dft resolve);
+you can then compile it with the following command.
+</p>
+
+<pre class="code">
+gcc -Wall -g -I ../simd -I ../dft -L../simd -L../dft -O DFTExample.c -lDFT -lSIMD -lm -o DFTExample
+</pre>
+
+<h3>Compilation</h3>
+
+<p>
+The nsfft source package includes a few makefiles for various
+architectures. In each of the <i>dft</i> and <i>simd</i> directories,
+create a symbolic link named <i>Makefile</i> that points to the makefile suited to your computer.
+</p>
+
+</body>
+</html>
diff --git a/plugins/supereq/nsfft-1.00/doc/nsfft.pdf b/plugins/supereq/nsfft-1.00/doc/nsfft.pdf
new file mode 100644
index 00000000..ed4ad5db
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/doc/nsfft.pdf
Binary files differ
diff --git a/plugins/supereq/nsfft-1.00/ooura/Makefile b/plugins/supereq/nsfft-1.00/ooura/Makefile
new file mode 100644
index 00000000..bad1679e
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/ooura/Makefile
@@ -0,0 +1,11 @@
+CC=gcc
+BASEOPT=-Wall -g
+OPT=$(BASEOPT) -O3
+
+all : fftsg.o
+
+clean :
+ rm -f *~ *.o a.out
+
+fftsg.o : fftsg.c
+ $(CC) $(OPT) -c fftsg.c
diff --git a/plugins/supereq/nsfft-1.00/ooura/README b/plugins/supereq/nsfft-1.00/ooura/README
new file mode 100644
index 00000000..d7ddefc2
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/ooura/README
@@ -0,0 +1,2 @@
+Please put fftsg.c and pi_fft.c, which are included in Prof. Takuya
+Ooura's FFT package, in this directory.
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile b/plugins/supereq/nsfft-1.00/simd/Makefile
new file mode 120000
index 00000000..5d253498
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile
@@ -0,0 +1 @@
+Makefile.x86avx \ No newline at end of file
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.altivec b/plugins/supereq/nsfft-1.00/simd/Makefile.altivec
new file mode 100644
index 00000000..eeaed6a1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.altivec
@@ -0,0 +1,26 @@
+CC=gcc
+BASEOPT=-Wall -maltivec -mabi=altivec
+OPT=$(BASEOPT) -O3
+
+all : libSIMD.a
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+SIMDBaseUndiff_altivecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_ALTIVEC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_altivecfloat.o
+
+SIMDBase.o : SIMDBase.c SIMDBase.h
+ $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_ALTIVEC_FLOAT SIMDBase.c -c -o SIMDBase.o
+
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_altivecfloat.o
+ rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_altivecfloat.o
+
+clean :
+ rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.neon b/plugins/supereq/nsfft-1.00/simd/Makefile.neon
new file mode 100644
index 00000000..ace704f1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.neon
@@ -0,0 +1,26 @@
+CC=gcc
+BASEOPT=-Wall -mfloat-abi=softfp
+OPT=$(BASEOPT) -O3
+
+all : libSIMD.a
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+SIMDBaseUndiff_neonfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -mfpu=neon -DENABLE_NEON_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_neonfloat.o
+
+SIMDBase.o : SIMDBase.c SIMDBase.h
+ $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_NEON_FLOAT SIMDBase.c -c -o SIMDBase.o
+
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_neonfloat.o
+ rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_neonfloat.o
+
+clean :
+ rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.purec b/plugins/supereq/nsfft-1.00/simd/Makefile.purec
new file mode 100644
index 00000000..2c8b04f1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.purec
@@ -0,0 +1,35 @@
+# Pure-C (no SIMD instruction set) build of the SIMD support library.
+# Produces libSIMD.a from the sources that live in this directory, in the
+# same way as the other Makefile.* variants next to it.
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+
+all : libSIMD.a
+
+# SIMDBaseUndiff.c is compiled once per enabled mode; the -DENABLE_* macro
+# selects which mode a given object file implements.
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+# SIMDBase.c must see the ENABLE_* macro of every mode linked into the library.
+SIMDBase.o : SIMDBase.c SIMDBase.h
+ $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE SIMDBase.c -c -o SIMDBase.o
+
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o
+ rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o
+
+clean :
+ rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.x86 b/plugins/supereq/nsfft-1.00/simd/Makefile.x86
new file mode 100644
index 00000000..02f49610
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.x86
@@ -0,0 +1,35 @@
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+
+all : libSIMD.a
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBase_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_purecdouble.o
+
+SIMDBase_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBase_pureclongdouble.o
+
+SIMDBase_ssefloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT SIMDBaseUndiff.c -c -o SIMDBase_ssefloat.o
+
+SIMDBase_sse2double.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_sse2double.o
+
+SIMDBase_avxfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT SIMDBaseUndiff.c -c -o SIMDBase_avxfloat.o
+
+SIMDBase_avxdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_avxdouble.o
+
+SIMDBase.o : SIMDBase.c SIMDBase.h
+ $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE SIMDBase.c -c -o SIMDBase.o
+
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBase_purecdouble.o SIMDBase_pureclongdouble.o SIMDBase_ssefloat.o SIMDBase_sse2double.o
+ rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBase_purecdouble.o SIMDBase_pureclongdouble.o SIMDBase_ssefloat.o SIMDBase_sse2double.o
+
+clean :
+ rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx b/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx
new file mode 100644
index 00000000..d9d27a2e
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx
@@ -0,0 +1,35 @@
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+
+all : libSIMD.a
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+SIMDBaseUndiff_ssefloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_ssefloat.o
+
+SIMDBaseUndiff_sse2double.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_sse2double.o
+
+SIMDBaseUndiff_avxfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_avxfloat.o
+
+SIMDBaseUndiff_avxdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+ $(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_avxdouble.o
+
+SIMDBase.o : SIMDBase.c SIMDBase.h
+ $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE -DENABLE_AVX_FLOAT -DENABLE_AVX_DOUBLE SIMDBase.c -c -o SIMDBase.o
+
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_ssefloat.o SIMDBaseUndiff_sse2double.o SIMDBaseUndiff_avxfloat.o SIMDBaseUndiff_avxdouble.o
+ rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_ssefloat.o SIMDBaseUndiff_sse2double.o SIMDBaseUndiff_avxfloat.o SIMDBaseUndiff_avxdouble.o
+
+clean :
+ rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBase.c b/plugins/supereq/nsfft-1.00/simd/SIMDBase.c
new file mode 100644
index 00000000..eb51ee10
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBase.c
@@ -0,0 +1,454 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <string.h>
+
+#include "SIMDBase.h"
+
+void detect_purec_float(void);
+void detect_purec_double(void);
+void detect_purec_longdouble(void);
+void detect_sse_float(void);
+void detect_sse2_double(void);
+void detect_neon_float(void);
+void detect_avx_float(void);
+void detect_avx_double(void);
+void detect_altivec_float(void);
+
+int32_t getModeParamInt_purec_float(int32_t paramId);
+int32_t getModeParamInt_purec_double(int32_t paramId);
+int32_t getModeParamInt_purec_longdouble(int32_t paramId);
+int32_t getModeParamInt_sse_float(int32_t paramId);
+int32_t getModeParamInt_sse2_double(int32_t paramId);
+int32_t getModeParamInt_neon_float(int32_t paramId);
+int32_t getModeParamInt_avx_float(int32_t paramId);
+int32_t getModeParamInt_avx_double(int32_t paramId);
+int32_t getModeParamInt_altivec_float(int32_t paramId);
+
+char * getModeParamString_purec_float(int32_t paramId);
+char * getModeParamString_purec_double(int32_t paramId);
+char * getModeParamString_purec_longdouble(int32_t paramId);
+char * getModeParamString_sse_float(int32_t paramId);
+char * getModeParamString_sse2_double(int32_t paramId);
+char * getModeParamString_neon_float(int32_t paramId);
+char * getModeParamString_avx_float(int32_t paramId);
+char * getModeParamString_avx_double(int32_t paramId);
+char * getModeParamString_altivec_float(int32_t paramId);
+
+uint8_t detectBuffer[256];
+char SIMDBase_processorNameString[256];
+/*
+ * Prefix test.
+ *
+ * If the first strlen(str2) characters of str1 compare equal to str2,
+ * returns a pointer to the character of str1 that follows that prefix;
+ * otherwise returns NULL. Neither string is modified.
+ */
+static char *startsWith(char *str1, char *str2) {
+ if (strncmp(str1, str2, strlen(str2)) == 0) {
+ return str1 + strlen(str2);
+ }
+
+ return NULL;
+}
+
+#if defined(__linux__)
+/*
+ * Looks up one field of /proc/cpuinfo.
+ *
+ * Reads up to the first 100 lines of /proc/cpuinfo, one at a time, into the
+ * global SIMDBase_processorNameString buffer and stops at the first line
+ * that begins with `entry`. Newlines in that line are replaced by spaces
+ * and the text following ": " is returned.
+ *
+ * entry    Field name expected at the very start of a line.
+ * returns  Pointer into SIMDBase_processorNameString just past ": ", or
+ *          NULL when the file cannot be opened, no line matches, or the
+ *          matching line has no ": " separator within its first 200 bytes.
+ *
+ * The returned pointer aliases the global buffer, so it is only valid until
+ * the buffer is next overwritten.
+ */
+static char *tryReadingProcCpuinfo(char *entry) {
+  int i;
+
+  FILE *fp = fopen("/proc/cpuinfo", "r");
+  if (fp == NULL) return NULL;
+
+  for(i=0;i<100;i++) {
+    char *q;
+
+    /* Clear the whole buffer so whatever fgets() stores is NUL-terminated
+       and no stale text from a previous line survives. memset() replaces
+       the legacy bzero(), whose header <strings.h> is not included here. */
+    memset(SIMDBase_processorNameString, 0, sizeof(SIMDBase_processorNameString));
+    if (fgets(SIMDBase_processorNameString, 255, fp) == NULL) break;
+
+    if ((q = startsWith(SIMDBase_processorNameString, entry)) != NULL) {
+      int j;
+      fclose(fp);
+
+      for(j=0;j<(int)sizeof(SIMDBase_processorNameString);j++) {
+        if (SIMDBase_processorNameString[j] == '\n') SIMDBase_processorNameString[j] = ' ';
+      }
+
+      /* Advance to the ':' separator, giving up after 200 bytes. */
+      while(*q != '\0' && *q != ':' && q - SIMDBase_processorNameString < 200) q++;
+      if (q - SIMDBase_processorNameString >= 200) return NULL;
+      if (*q == ':' && *(q+1) == ' ') return q + 2;
+      return NULL;
+    }
+  }
+
+  fclose(fp);
+  return NULL;
+}
+#else
+/* No /proc/cpuinfo on this platform: the lookup always fails. */
+static char *tryReadingProcCpuinfo(char *entry) { return NULL; }
+#endif
+/*
+ * 32-bit x86 variant of the CPUID wrapper.
+ *
+ * Executes CPUID with EAX = eax and ECX = ecx and stores the resulting
+ * EAX, EBX, ECX and EDX values in out[0], out[1], out[2] and out[3].
+ *
+ * The asm pushes all four registers before CPUID and pops them afterwards,
+ * copying the results to the memory operands a, b, c, d in between, so the
+ * only clobber it declares is "cc".
+ *
+ * NOTE(review): the outputs are "=m" operands that are written while four
+ * extra words are on the stack; if the compiler addresses these locals
+ * relative to %esp the stores could land at the wrong offset. Confirm
+ * against the generated code for the build flags in use.
+ */
+#if defined(__i386__)
+static void SIMDBase_x86cpuid(uint32_t out[4], uint32_t eax, uint32_t ecx) {
+ uint32_t a, b, c, d;
+ __asm__ __volatile__("pushl %%eax; \n\t"
+ "pushl %%ebx; \n\t"
+ "pushl %%ecx; \n\t"
+ "pushl %%edx; \n\t"
+ "cpuid; \n\t"
+ "movl %%eax, %0; \n\t"
+ "movl %%ebx, %1; \n\t"
+ "movl %%ecx, %2; \n\t"
+ "movl %%edx, %3; \n\t"
+ "popl %%edx; \n\t"
+ "popl %%ecx; \n\t"
+ "popl %%ebx; \n\t"
+ "popl %%eax; \n\t"
+ : "=m"(a), "=m"(b), "=m"(c), "=m"(d)
+ : "a"(eax), "c"(ecx)
+ : "cc");
+ out[0] = a; out[1] = b; out[2] = c; out[3] = d;
+}
+#endif
+/*
+ * x86-64 variant of the CPUID wrapper.
+ *
+ * Executes CPUID with EAX = eax and ECX = ecx and stores the resulting
+ * EAX, EBX, ECX and EDX values in out[0], out[1], out[2] and out[3].
+ * Here the four registers are bound directly as asm outputs.
+ */
+#if defined(__x86_64__)
+static void SIMDBase_x86cpuid(uint32_t out[4], uint32_t eax, uint32_t ecx) {
+ uint32_t a, b, c, d;
+ __asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
+ out[0] = a; out[1] = b; out[2] = c; out[3] = d;
+}
+#endif
+/*
+ * Fills *p with cache line size and per-level cache size/associativity
+ * obtained from CPUID (x86 and x86-64 only).
+ *
+ * All eight size[] and assoc[] slots are zeroed first. Two sources are used:
+ *
+ *  - If CPUID leaf 4, sub-leaf 0 reports a non-zero type field (low four
+ *    bits of EAX), linesize is taken from that sub-leaf and up to eight
+ *    sub-leaves are walked. For sub-leaves whose type is 1 or 3 the
+ *    associativity and the size
+ *    (assoc * part * lsize * nsets / nthre) are stored at index level-1.
+ *    NOTE(review): types 1 and 3 are presumably "data" and "unified" caches
+ *    and nthre the number of threads sharing the cache - confirm against the
+ *    CPUID documentation.
+ *  - Otherwise extended leaves 0x80000008, 0x80000005 and 0x80000006 are
+ *    decoded into slots 0, 1 and 2, each size divided by ncores.
+ *
+ * Finally, slot 0 defaults to 16 KiB / 4-way and slot 1 to 256 KiB / 4-way
+ * when the corresponding size is still zero.
+ *
+ * p  Output structure; every field that is read later is written here
+ *    except linesize, which is only assigned inside the two branches.
+ */
+#if defined(__i386__) || defined(__x86_64__)
+static void getCacheParam(CacheParam *p) {
+ static int l2assoc[] = {0,1,2,0,4,0,8,0,16,0,32,48,64,96,128,-1}; /* lookup for the 4-bit associativity codes of leaf 0x80000006 */
+ int32_t i;
+ uint32_t out[4];
+
+ for(i=0;i<8;i++) {
+ p->size[i] = p->assoc[i] = 0;
+ }
+
+ SIMDBase_x86cpuid(out, 4, 0);
+
+ if ((out[0] & 0xf) != 0) {
+ p->linesize = ((out[1] >> 0) & 2047)+1; /* taken from sub-leaf 0 only */
+ for(i=0;i<8;i++) {
+ SIMDBase_x86cpuid(out, 4, i);
+ if ((out[0] & 0xf) == 0) break; /* type 0 ends the enumeration */
+ int level = (out[0] >> 5) & 0x7;
+ int type = (out[0] >> 0) & 0xf;
+ int assoc = ((out[1] >> 22) & 1023)+1;
+ int part = ((out[1] >> 12) & 1023)+1;
+ int lsize = ((out[1] >> 0) & 2047)+1;
+ int nsets = ((out[2] >> 0))+1;
+ int nthre = ((out[0] >> 14) & 1023)+1;
+
+ if (type != 1 && type != 3) continue; /* other cache types are not recorded */
+ p->assoc[level-1] = assoc; /* NOTE(review): assumes level >= 1 for these types */
+ p->size[level-1] = (uint64_t)assoc * part * lsize * nsets / nthre;
+ }
+ } else {
+ SIMDBase_x86cpuid(out, 0x80000008U, 0);
+ int ncores = (out[2] & 0xff) + 1;
+
+ SIMDBase_x86cpuid(out, 0x80000005U, 0);
+ p->linesize = out[2] & 255;
+ p->size[0] = (out[2] >> 24) * 1024 / ncores;
+ p->assoc[0] = (out[2] >> 16) & 0xff;
+
+ SIMDBase_x86cpuid(out, 0x80000006U, 0);
+ p->size[1] = (out[2] >> 16) * 1024 / ncores;
+ p->assoc[1] = l2assoc[(out[2] >> 12) & 0xf];
+ p->size[2] = (out[3] >> 18) * 512 * 1024 / ncores;
+ p->assoc[2] = l2assoc[(out[3] >> 12) & 0xf];
+ }
+
+ if (p->size[0] == 0) {
+ p->size[0] = 16 * 1024;
+ p->assoc[0] = 4;
+ }
+
+ if (p->size[1] == 0) {
+ p->size[1] = 256 * 1024;
+ p->assoc[1] = 4;
+ }
+}
+/*
+ * x86 / x86-64: builds a processor description in the global
+ * SIMDBase_processorNameString buffer and returns that buffer.
+ *
+ * Layout written, in order:
+ *   - 12 bytes from CPUID leaf 0, taken from result bytes 4-7, 12-15 and
+ *     8-11 (i.e. the EBX, EDX, ECX words in that order),
+ *   - one space,
+ *   - 48 bytes from CPUID leaves 0x80000002 .. 0x80000004 (16 bytes each),
+ *   - one '\n'.
+ *
+ * NOTE(review): no '\0' is stored explicitly. Termination relies on the
+ * remainder of the file-scope buffer still being zero, or on NUL bytes
+ * contained in the CPUID data - confirm this is intended.
+ *
+ * The returned pointer refers to shared static storage.
+ */
+char *SIMDBase_getProcessorNameString() {
+ union {
+ uint32_t info[4];
+ uint8_t str[16];
+ } u;
+ int i,j;
+ char *p;
+
+ p = SIMDBase_processorNameString;
+
+ SIMDBase_x86cpuid(u.info, 0, 0);
+
+ for(i=0;i<4;i++) *p++ = u.str[i+4];
+ for(i=0;i<4;i++) *p++ = u.str[i+12];
+ for(i=0;i<4;i++) *p++ = u.str[i+8];
+
+ *p++ = ' ';
+
+ for(i=0;i<3;i++) {
+ SIMDBase_x86cpuid(u.info, i + 0x80000002, 0);
+
+ for(j=0;j<16;j++) {
+ *p++ = u.str[j];
+ }
+ }
+
+ *p++ = '\n';
+
+ return SIMDBase_processorNameString;
+}
+#else /*
+ * Non-x86 variant of SIMDBase_getProcessorNameString().
+ *
+ * Returns "Unknown" unless one of the following applies:
+ *   - __powerpc__: the "cpu" entry of /proc/cpuinfo, or "PowerPC" when
+ *     tryReadingProcCpuinfo() returns NULL;
+ *   - __arm__: the "Processor" entry of /proc/cpuinfo, or "ARM" when
+ *     tryReadingProcCpuinfo() returns NULL.
+ *
+ * The result is either a string literal or a pointer obtained from
+ * tryReadingProcCpuinfo(); callers must not modify or free it.
+ */
+char *SIMDBase_getProcessorNameString() {
+ char *p = "Unknown";
+#if defined(__powerpc__)
+ if ((p = tryReadingProcCpuinfo("cpu")) == NULL) p = "PowerPC";
+#elif defined(__arm__)
+ if ((p = tryReadingProcCpuinfo("Processor")) == NULL) p = "ARM";
+#endif
+
+ return p;
+}
+#endif
+/*
+ * Returns the cache line size in bytes.
+ *
+ * On x86 / x86-64 this is the linesize field filled in by getCacheParam()
+ * (CPUID is queried on every call); on every other architecture the
+ * constant 64 is returned.
+ */
+int32_t SIMDBase_sizeOfCachelineInByte() {
+#if defined(__i386__) || defined(__x86_64__)
+ CacheParam p;
+ getCacheParam(&p);
+ return p.linesize;
+#else
+ return 64;
+#endif
+}
+/*
+ * Returns a data cache size in bytes.
+ *
+ * On x86 / x86-64 this is size[1] + size[2] as reported by getCacheParam()
+ * (CPUID is queried on every call); on every other architecture the
+ * constant 256 KiB is returned.
+ */
+int32_t SIMDBase_sizeOfDataCacheInByte() {
+#if defined(__i386__) || defined(__x86_64__)
+ CacheParam p;
+ getCacheParam(&p);
+ return p.size[1] + p.size[2]; // L2 + L3
+#else
+ return 256 * 1024;
+#endif
+}
+
+/* Jump context used to unwind out of the SIGILL handler while probing. */
+static sigjmp_buf sigjmp;
+
+/*
+ * SIGILL handler installed by SIMDBase_detect() for the duration of a probe.
+ * It abandons the faulting probe and resumes at the sigsetjmp() call with a
+ * non-zero return value.
+ */
+static void sighandler(int signum) {
+  (void)signum;
+  /* siglongjmp() (paired with sigsetjmp(..., 1)) restores the signal mask
+     that was saved at the sigsetjmp() call. A plain longjmp() is not
+     required to do so and may leave SIGILL blocked after the first failed
+     probe. */
+  siglongjmp(sigjmp, 1);
+}
+
+/*
+ * Reports whether the SIMD mode `paramId` (one of SIMDBase_MODE_*) can be
+ * used by this build on this processor.
+ *
+ * returns   1  the mode is compiled in and usable,
+ *           0  the mode is compiled in but the processor does not support it,
+ *          -1  the mode is not compiled in (its ENABLE_* macro is undefined)
+ *              or paramId is not a known mode.
+ *
+ * Pure-C modes only depend on the ENABLE_* macros. SSE, SSE2 and AVX are
+ * decided from the CPUID leaf 1 feature bits. NEON and AltiVec are probed by
+ * calling the mode's detect_*() function with a temporary SIGILL handler
+ * installed; the previous SIGILL disposition is not preserved (it is reset
+ * to SIG_DFL afterwards).
+ *
+ * NOTE(review): for AVX only the CPUID feature bit is tested here; whether
+ * the operating system has enabled AVX state is not checked.
+ */
+int32_t SIMDBase_detect(int32_t paramId) {
+#if defined(__i386__) || defined(__x86_64__)
+  uint32_t reg[4];
+#endif
+
+  switch(paramId) {
+  case SIMDBase_MODE_PUREC_FLOAT:
+#if defined(ENABLE_PUREC_FLOAT)
+    return 1;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_PUREC_DOUBLE:
+#if defined(ENABLE_PUREC_DOUBLE)
+    return 1;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_PUREC_LONGDOUBLE:
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+    return 1;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_SSE_FLOAT:
+#if defined(ENABLE_SSE_FLOAT)
+    SIMDBase_x86cpuid(reg, 1, 0);
+    return (reg[3] & (1 << 25)) != 0;   /* EDX bit 25 */
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_SSE2_DOUBLE:
+#if defined(ENABLE_SSE2_DOUBLE)
+    SIMDBase_x86cpuid(reg, 1, 0);
+    return (reg[3] & (1 << 26)) != 0;   /* EDX bit 26 */
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_AVX_FLOAT:
+#if defined(ENABLE_AVX_FLOAT)
+    SIMDBase_x86cpuid(reg, 1, 0);
+    return (reg[2] & (1 << 28)) != 0;   /* ECX bit 28 */
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_AVX_DOUBLE:
+#if defined(ENABLE_AVX_DOUBLE)
+    SIMDBase_x86cpuid(reg, 1, 0);
+    return (reg[2] & (1 << 28)) != 0;   /* ECX bit 28 */
+#else
+    return -1;
+#endif
+  default:
+    break;
+  }
+
+  /* Remaining modes are probed by executing an instruction of the target
+     instruction set and catching the SIGILL it raises when unsupported. */
+  signal(SIGILL, sighandler);
+
+  if (sigsetjmp(sigjmp, 1) == 0) {
+    switch(paramId) {
+#if defined(ENABLE_NEON_FLOAT)
+    case SIMDBase_MODE_NEON_FLOAT:
+      detect_neon_float();
+      break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+    case SIMDBase_MODE_ALTIVEC_FLOAT:
+      detect_altivec_float();
+      break;
+#endif
+    default:
+      signal(SIGILL, SIG_DFL);
+      return -1;
+    }
+    signal(SIGILL, SIG_DFL);
+    return 1;
+  } else {
+    /* Reached through siglongjmp() from sighandler(): the probe faulted. */
+    signal(SIGILL, SIG_DFL);
+    return 0;
+  }
+}
+/*
+ * Picks a SIMD mode for the element type `typeId` (one of SIMDBase_TYPE_*).
+ *
+ * For each type the candidate modes are tried in the order listed below and
+ * the first one for which SIMDBase_detect() returns 1 is returned:
+ *   FLOAT      : AVX, SSE, NEON, AltiVec, pure C
+ *   DOUBLE     : AVX, SSE2, pure C
+ *   LONGDOUBLE : pure C
+ * HALF, EXTENDED and QUAD have no candidates.
+ *
+ * returns  The selected SIMDBase_MODE_* value, or SIMDBase_MODE_NONE when no
+ *          candidate is usable or typeId is not handled.
+ */
+int32_t SIMDBase_chooseBestMode(int32_t typeId) {
+ switch(typeId) {
+ case SIMDBase_TYPE_HALF:
+ break;
+ case SIMDBase_TYPE_FLOAT:
+ if (SIMDBase_detect(SIMDBase_MODE_AVX_FLOAT) == 1) return SIMDBase_MODE_AVX_FLOAT;
+ if (SIMDBase_detect(SIMDBase_MODE_SSE_FLOAT) == 1) return SIMDBase_MODE_SSE_FLOAT;
+ if (SIMDBase_detect(SIMDBase_MODE_NEON_FLOAT) == 1) return SIMDBase_MODE_NEON_FLOAT;
+ if (SIMDBase_detect(SIMDBase_MODE_ALTIVEC_FLOAT) == 1) return SIMDBase_MODE_ALTIVEC_FLOAT;
+ if (SIMDBase_detect(SIMDBase_MODE_PUREC_FLOAT) == 1) return SIMDBase_MODE_PUREC_FLOAT;
+ break;
+
+ case SIMDBase_TYPE_DOUBLE:
+ if (SIMDBase_detect(SIMDBase_MODE_AVX_DOUBLE) == 1) return SIMDBase_MODE_AVX_DOUBLE;
+ if (SIMDBase_detect(SIMDBase_MODE_SSE2_DOUBLE) == 1) return SIMDBase_MODE_SSE2_DOUBLE;
+ if (SIMDBase_detect(SIMDBase_MODE_PUREC_DOUBLE) == 1) return SIMDBase_MODE_PUREC_DOUBLE;
+ break;
+
+ case SIMDBase_TYPE_LONGDOUBLE:
+ if (SIMDBase_detect(SIMDBase_MODE_PUREC_LONGDOUBLE) == 1) return SIMDBase_MODE_PUREC_LONGDOUBLE;
+ break;
+
+ case SIMDBase_TYPE_EXTENDED:
+ break;
+
+ case SIMDBase_TYPE_QUAD:
+ break;
+ }
+
+ return SIMDBase_MODE_NONE;
+}
+/*
+ * Returns the integer parameter `paramId` of SIMD mode `mode`.
+ *
+ * The call is forwarded to the getModeParamInt_<mode>() function of the
+ * selected mode. The numeric case labels are the SIMDBase_MODE_* values
+ * from SIMDBase.h (1 = pure C float ... 9 = AltiVec float), and each case
+ * exists only when the matching ENABLE_* macro is defined.
+ *
+ * returns  The value reported by the mode, or -1 when `mode` is unknown or
+ *          was not compiled in.
+ */
+int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode) {
+ switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+ case 1: return getModeParamInt_purec_float(paramId); break;
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+ case 2: return getModeParamInt_purec_double(paramId); break;
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+ case 3: return getModeParamInt_purec_longdouble(paramId); break;
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+ case 4: return getModeParamInt_sse_float(paramId); break;
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+ case 5: return getModeParamInt_sse2_double(paramId); break;
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+ case 6: return getModeParamInt_neon_float(paramId); break;
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+ case 7: return getModeParamInt_avx_float(paramId); break;
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+ case 8: return getModeParamInt_avx_double(paramId); break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+ case 9: return getModeParamInt_altivec_float(paramId); break;
+#endif
+ }
+
+ return -1;
+}
+/*
+ * Returns the string parameter `paramId` of SIMD mode `mode`.
+ *
+ * The call is forwarded to the getModeParamString_<mode>() function of the
+ * selected mode. The numeric case labels are the SIMDBase_MODE_* values
+ * from SIMDBase.h (1 = pure C float ... 9 = AltiVec float), and each case
+ * exists only when the matching ENABLE_* macro is defined.
+ *
+ * returns  The string reported by the mode, or NULL when `mode` is unknown
+ *          or was not compiled in.
+ */
+char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode) {
+ switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+ case 1: return getModeParamString_purec_float(paramId); break;
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+ case 2: return getModeParamString_purec_double(paramId); break;
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+ case 3: return getModeParamString_purec_longdouble(paramId); break;
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+ case 4: return getModeParamString_sse_float(paramId); break;
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+ case 5: return getModeParamString_sse2_double(paramId); break;
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+ case 6: return getModeParamString_neon_float(paramId); break;
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+ case 7: return getModeParamString_avx_float(paramId); break;
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+ case 8: return getModeParamString_avx_double(paramId); break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+ case 9: return getModeParamString_altivec_float(paramId); break;
+#endif
+ }
+
+ return NULL;
+}
+/*
+ * Stand-in for posix_memalign() compiled only when ANDROID is defined.
+ *
+ * Stores the result of malloc(size) in *memptr and returns 0 on success or
+ * -1 when malloc() returned NULL.
+ *
+ * NOTE(review): the `alignment` argument is ignored, so the block only has
+ * whatever alignment malloc() provides - confirm that is sufficient for the
+ * SIMD modes used on this platform.
+ */
+#ifdef ANDROID
+int posix_memalign (void **memptr, size_t alignment, size_t size) {
+ *memptr = malloc (size);
+ return *memptr ? 0 : -1;
+}
+#endif
+/*
+ * Allocates `size` bytes through posix_memalign(), requesting an alignment
+ * of SIMDBase_sizeOfCachelineInByte() bytes.
+ *
+ * returns  The allocated block. The function never returns on failure: it
+ *          calls abort() when posix_memalign() reports an error.
+ *
+ * Release the block with SIMDBase_alignedFree().
+ */
+void *SIMDBase_alignedMalloc(uint64_t size) {
+ void *p;
+ if (posix_memalign(&p, SIMDBase_sizeOfCachelineInByte(), size) != 0) abort();
+ return p;
+}
+/*
+ * Releases a block obtained from SIMDBase_alignedMalloc() by passing it to
+ * free().
+ */
+void SIMDBase_alignedFree(void *ptr) {
+ free(ptr);
+}
+/*
+ * Returns a library-wide integer parameter.
+ *
+ * The only parameter handled is SIMDBase_PARAMID_MODE_MAX, for which
+ * SIMDBase_LAST_MODE + 1 is returned; every other paramId yields -1.
+ */
+int32_t SIMDBase_getParamInt(int32_t paramId) {
+ switch(paramId) {
+ case SIMDBase_PARAMID_MODE_MAX:
+ return SIMDBase_LAST_MODE + 1;
+ }
+
+ return -1;
+}
+/*
+ * Placeholder for per-type integer parameters.
+ *
+ * The switch on typeId has no cases, so the function currently returns -1
+ * for every combination of paramId and typeId.
+ */
+int32_t SIMDBase_getTypeParamInt(int32_t paramId, int32_t typeId) {
+ switch(typeId) {
+ }
+
+ return -1;
+}
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBase.h b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h
new file mode 100644
index 00000000..5382b4d1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h
@@ -0,0 +1,51 @@
+#ifndef _SIMDBase_H_
+#define _SIMDBase_H_
+
+#define SIMDBase_TYPE_FLOAT ( 1 | ( 1 << 24 )) // element-type ids: small index OR-ed with the tag value 1 shifted to bit 24
+#define SIMDBase_TYPE_DOUBLE ( 2 | ( 1 << 24 ))
+#define SIMDBase_TYPE_LONGDOUBLE ( 3 | ( 1 << 24 ))
+#define SIMDBase_TYPE_EXTENDED ( 4 | ( 1 << 24 ))
+#define SIMDBase_TYPE_QUAD ( 5 | ( 1 << 24 ))
+#define SIMDBase_TYPE_HALF ( 6 | ( 1 << 24 ))
+
+#define SIMDBase_MODE_NONE 0 // mode ids; 1..9 match the case labels used by SIMDBase_getModeParamString()
+#define SIMDBase_MODE_PUREC_FLOAT 1
+#define SIMDBase_MODE_PUREC_DOUBLE 2
+#define SIMDBase_MODE_PUREC_LONGDOUBLE 3
+#define SIMDBase_MODE_SSE_FLOAT 4
+#define SIMDBase_MODE_SSE2_DOUBLE 5
+#define SIMDBase_MODE_NEON_FLOAT 6
+#define SIMDBase_MODE_AVX_FLOAT 7
+#define SIMDBase_MODE_AVX_DOUBLE 8
+#define SIMDBase_MODE_ALTIVEC_FLOAT 9
+
+#define SIMDBase_LAST_MODE SIMDBase_MODE_ALTIVEC_FLOAT // highest mode id; SIMDBase_getParamInt() reports this value plus one
+
+#define SIMDBase_PARAMID_MODE_MAX ( 1 | ( 2 << 24 )) // parameter ids: small index OR-ed with the tag value 2 shifted to bit 24
+#define SIMDBase_PARAMID_TYPE_AVAILABILITY ( 2 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_SIZE_OF_REAL ( 3 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_SIZE_OF_VECT ( 4 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_VECTOR_LEN ( 5 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_MODE_AVAILABILITY ( 6 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_MODE_NAME ( 7 | ( 2 << 24 ))
+
+// NOTE(review): the declarations below use uint32_t/int32_t/uint64_t but this header does not include <stdint.h>; include it before this header.
+
+typedef struct {
+ uint32_t linesize; // presumably the cache line size in bytes -- confirm against the code that fills this struct
+ uint32_t size[8], assoc[8]; // presumably per-cache size and associativity -- confirm against the code that fills this struct
+} CacheParam;
+
+void *SIMDBase_alignedMalloc(uint64_t size); // aborts the process on allocation failure instead of returning NULL
+void SIMDBase_alignedFree(void *ptr); // frees a block returned by SIMDBase_alignedMalloc()
+int32_t SIMDBase_sizeOfCachelineInByte(); // used as the alignment argument by SIMDBase_alignedMalloc()
+int32_t SIMDBase_sizeOfDataCacheInByte();
+int32_t SIMDBase_chooseBestMode(int32_t typeId);
+char *SIMDBase_getProcessorNameString();
+int32_t SIMDBase_detect(int32_t paramId);
+int32_t SIMDBase_getParamInt(int32_t paramId); // handles SIMDBase_PARAMID_MODE_MAX; returns -1 for any other id
+int32_t SIMDBase_getTypeParamInt(int32_t paramId, int32_t typeId); // currently returns -1 for every input
+int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode);
+char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode); // returns NULL when `mode` is unknown or not compiled in
+
+#endif
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c
new file mode 100644
index 00000000..257a5ff0
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "SIMDBase.h"
+#include "SIMDBaseUndiff.h"
+
+void SIMDBaseUndiff_DETECT() { // SIMDBaseUndiff.h renames this per backend (e.g. detect_sse_float); NOTE(review): presumably run as an instruction-support probe -- confirm in SIMDBase_detect()
+ extern uint8_t detectBuffer[256]; // defined outside this file
+ SIMDBase_VECT a = SIMDBase_LOAD((SIMDBase_VECT *)&detectBuffer[0]); // first operand: one vector read at byte offset 0
+ SIMDBase_VECT b = SIMDBase_LOAD((SIMDBase_VECT *)&detectBuffer[64]); // second operand: one vector read at byte offset 64
+ SIMDBase_VECT c = SIMDBase_ADDi(a, b);
+ SIMDBase_STOR((SIMDBase_VECT *)&detectBuffer[128], c); // the sum is written back at byte offset 128
+}
+
+int32_t SIMDBaseUndiff_GETMODEPARAMINT(int32_t paramId) { // SIMDBaseUndiff.h renames this per backend (e.g. getModeParamInt_sse_float)
+ switch(paramId) {
+ case SIMDBase_PARAMID_SIZE_OF_REAL:
+ return sizeof(SIMDBase_REAL); // bytes in one scalar element of this backend
+ case SIMDBase_PARAMID_SIZE_OF_VECT:
+ return sizeof(SIMDBase_VECT); // bytes in one vector of this backend
+ case SIMDBase_PARAMID_VECTOR_LEN:
+ return SIMDBase_VECTLEN; // value of the backend's SIMDBase_VECTLEN macro
+ case SIMDBase_PARAMID_MODE_AVAILABILITY:
+ return SIMDBase_detect(paramId); // delegated; note the argument passed is the parameter id itself
+ }
+
+ return -1; // paramId not handled by this function
+}
+
+char * SIMDBaseUndiff_GETMODEPARAMSTRING(int32_t paramId) { // SIMDBaseUndiff.h renames this per backend (e.g. getModeParamString_sse_float)
+ switch(paramId) {
+ case SIMDBase_PARAMID_MODE_NAME:
+ return SIMDBase_NAME; // string literal from the backend's SIMDBase_NAME macro; callers must not modify or free it
+ }
+
+ return NULL; // paramId not handled by this function
+}
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h
new file mode 100644
index 00000000..1af849a8
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h
@@ -0,0 +1,231 @@
+#ifndef _SIMDBaseUndiff_H_
+#define _SIMDBaseUndiff_H_
+
+#if defined(ENABLE_PUREC_FLOAT) ////////////////////////////////////////////
+
+typedef float SIMDBase_REAL; // scalar element type of this backend
+typedef float SIMDBase_VECT; // scalar backend: a "vector" is a single float
+
+#define SIMDBase_MODE 1 // same value as SIMDBase_MODE_PUREC_FLOAT in SIMDBase.h
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 1 // elements per SIMDBase_VECT
+#define SIMDBase_NAME "Pure C float"
+#define SIMDBaseUndiff_DETECT detect_purec_float // these three macros give the generic functions in SIMDBaseUndiff.c backend-specific names
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_float
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; } // read one vector from memory
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; } // write one vector to memory
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; } // vector with every element equal to f
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; } // vector with every element equal to *p
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; } // u + v
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; } // u - v
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; } // u * v
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; } // -u
+
+#elif defined(ENABLE_PUREC_DOUBLE) ////////////////////////////////////////////
+
+typedef double SIMDBase_REAL; // scalar element type of this backend
+typedef double SIMDBase_VECT; // scalar backend: a "vector" is a single double
+
+#define SIMDBase_MODE 2 // same value as SIMDBase_MODE_PUREC_DOUBLE in SIMDBase.h
+#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE
+#define SIMDBase_VECTLEN 1 // elements per SIMDBase_VECT
+#define SIMDBase_NAME "Pure C double"
+#define SIMDBaseUndiff_DETECT detect_purec_double // these three macros give the generic functions in SIMDBaseUndiff.c backend-specific names
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_double
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_double
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; } // read one vector from memory
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; } // write one vector to memory
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; } // vector with every element equal to f
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; } // vector with every element equal to *p
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; } // u + v
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; } // u - v
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; } // u * v
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; } // -u
+
+#elif defined(ENABLE_PUREC_LONGDOUBLE) ////////////////////////////////////////////
+
+typedef long double SIMDBase_REAL; // scalar element type of this backend
+typedef long double SIMDBase_VECT; // scalar backend: a "vector" is a single long double
+
+#define SIMDBase_MODE 3 // same value as SIMDBase_MODE_PUREC_LONGDOUBLE in SIMDBase.h
+#define SIMDBase_TYPE SIMDBase_TYPE_LONGDOUBLE
+#define SIMDBase_VECTLEN 1 // elements per SIMDBase_VECT
+#define SIMDBase_NAME "Pure C long double"
+#define SIMDBaseUndiff_DETECT detect_purec_longdouble // these three macros give the generic functions in SIMDBaseUndiff.c backend-specific names
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_longdouble
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_longdouble
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; } // read one vector from memory
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; } // write one vector to memory
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; } // vector with every element equal to f
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; } // vector with every element equal to *p
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; } // u + v
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; } // u - v
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; } // u * v
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; } // -u
+
+#elif defined(ENABLE_SSE_FLOAT) ////////////////////////////////////////////
+
+#include <xmmintrin.h>
+
+typedef float SIMDBase_REAL; // scalar element type of this backend
+typedef __m128 SIMDBase_VECT; // one SSE register holding SIMDBase_VECTLEN floats
+
+#define SIMDBase_MODE 4 // same value as SIMDBase_MODE_SSE_FLOAT in SIMDBase.h
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 4 // elements per SIMDBase_VECT
+#define SIMDBase_NAME "x86 SSE float"
+#define SIMDBaseUndiff_DETECT detect_sse_float // these three macros give the generic functions in SIMDBaseUndiff.c backend-specific names
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_sse_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_sse_float
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm_load_ps((float *)p); } // aligned load: p must be 16-byte aligned
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm_store_ps((float *)p, u); } // aligned store: p must be 16-byte aligned
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm_set1_ps(f); } // broadcast f to all lanes
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm_load1_ps(p); } // broadcast *p to all lanes
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_add_ps(u, v); } // lane-wise u + v
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_sub_ps(u, v); } // lane-wise u - v
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_mul_ps(u, v); } // lane-wise u * v
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm_xor_ps(u, _mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f)); } // XOR with -0.0f flips only the sign bit of each lane
+
+#elif defined(ENABLE_SSE2_DOUBLE) ////////////////////////////////////////////
+
+#include <emmintrin.h>
+
+typedef double SIMDBase_REAL; // scalar element type of this backend
+typedef __m128d SIMDBase_VECT; // one SSE2 register holding SIMDBase_VECTLEN doubles
+
+#define SIMDBase_MODE 5 // same value as SIMDBase_MODE_SSE2_DOUBLE in SIMDBase.h
+#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE
+#define SIMDBase_VECTLEN 2 // elements per SIMDBase_VECT
+#define SIMDBase_NAME "x86 SSE2 double"
+#define SIMDBaseUndiff_DETECT detect_sse2_double // these three macros give the generic functions in SIMDBaseUndiff.c backend-specific names
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_sse2_double
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_sse2_double
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm_load_pd((double *)p); } // aligned load: p must be 16-byte aligned
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm_store_pd((double *)p, u); } // aligned store: p must be 16-byte aligned
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm_set1_pd(f); } // broadcast f to all lanes
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm_load1_pd(p); } // broadcast *p to all lanes
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_add_pd(u, v); } // lane-wise u + v
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_sub_pd(u, v); } // lane-wise u - v
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_mul_pd(u, v); } // lane-wise u * v
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm_xor_pd(u, _mm_set_pd(-0.0, -0.0)); } // XOR with -0.0 flips only the sign bit of each lane
+
+#elif defined(ENABLE_NEON_FLOAT) ////////////////////////////////////////////
+
+#include <arm_neon.h>
+
+typedef float32_t SIMDBase_REAL; // scalar element type of this backend
+typedef float32x4_t SIMDBase_VECT; // one NEON quad register holding SIMDBase_VECTLEN floats
+
+#define SIMDBase_MODE 6 // same value as SIMDBase_MODE_NEON_FLOAT in SIMDBase.h
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 4 // elements per SIMDBase_VECT
+#define SIMDBase_NAME "ARM NEON float"
+#define SIMDBaseUndiff_DETECT detect_neon_float // these three macros give the generic functions in SIMDBaseUndiff.c backend-specific names
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_neon_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_neon_float
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return vld1q_f32((float32_t *)p); } // read one vector from memory
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { vst1q_f32((float32_t *)p, u); } // write one vector to memory
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return vdupq_n_f32(f); } // broadcast f to all lanes
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return vdupq_n_f32(*p); } // broadcast *p to all lanes
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return vaddq_f32(u, v); } // lane-wise u + v
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return vsubq_f32(u, v); } // lane-wise u - v
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return vmulq_f32(u, v); } // lane-wise u * v
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { // negate by flipping the sign bit (0x80000000) of each lane's bit pattern
+ return vreinterpretq_f32_u32( veorq_u32(vreinterpretq_u32_f32(u), vdupq_n_u32(0x80000000U)));
+}
+
+#define SIMDBase_FMADD_AVAILABLE // marks that this backend defines SIMDBase_FMADDi/SIMDBase_FMSUBi
+
+static inline SIMDBase_VECT SIMDBase_FMADDi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vmlaq_f32(w, u, v); } // w + u * v
+static inline SIMDBase_VECT SIMDBase_FMSUBi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vmlsq_f32(w, u, v); } // w - u * v
+
+#elif defined(ENABLE_AVX_FLOAT) ////////////////////////////////////////////
+
+#include <immintrin.h>
+
+typedef float SIMDBase_REAL; // scalar element type of this backend
+typedef __m256 SIMDBase_VECT; // one AVX register holding SIMDBase_VECTLEN floats
+
+#define SIMDBase_MODE 7 // same value as SIMDBase_MODE_AVX_FLOAT in SIMDBase.h
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 8 // elements per SIMDBase_VECT
+#define SIMDBase_NAME "x86 AVX float"
+#define SIMDBaseUndiff_DETECT detect_avx_float // these three macros give the generic functions in SIMDBaseUndiff.c backend-specific names
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_avx_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_avx_float
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm256_load_ps((float *)p); } // aligned load: p must be 32-byte aligned
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm256_store_ps((float *)p, u); } // aligned store: p must be 32-byte aligned
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm256_set1_ps(f); } // broadcast f to all lanes
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm256_set1_ps(*p); } // broadcast *p to all lanes
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_add_ps(u, v); } // lane-wise u + v
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_sub_ps(u, v); } // lane-wise u - v
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_mul_ps(u, v); } // lane-wise u * v
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm256_xor_ps(u, _mm256_set1_ps(-0.0f)); } // XOR with -0.0f flips only the sign bit of each lane
+
+#elif defined(ENABLE_AVX_DOUBLE) ////////////////////////////////////////////
+
+#include <immintrin.h>
+
+typedef double SIMDBase_REAL; // scalar element type of this backend
+typedef __m256d SIMDBase_VECT; // one AVX register holding SIMDBase_VECTLEN doubles
+
+#define SIMDBase_MODE 8 // same value as SIMDBase_MODE_AVX_DOUBLE in SIMDBase.h
+#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE
+#define SIMDBase_VECTLEN 4 // elements per SIMDBase_VECT
+#define SIMDBase_NAME "x86 AVX double"
+#define SIMDBaseUndiff_DETECT detect_avx_double // these three macros give the generic functions in SIMDBaseUndiff.c backend-specific names
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_avx_double
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_avx_double
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm256_load_pd((double *)p); } // aligned load: p must be 32-byte aligned
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm256_store_pd((double *)p, u); } // aligned store: p must be 32-byte aligned
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm256_set1_pd(f); } // broadcast f to all lanes
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm256_set1_pd(*p); } // broadcast *p to all lanes
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_add_pd(u, v); } // lane-wise u + v
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_sub_pd(u, v); } // lane-wise u - v
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_mul_pd(u, v); } // lane-wise u * v
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm256_xor_pd(u, _mm256_set1_pd(-0.0)); } // XOR with -0.0 flips only the sign bit of each lane
+
+#elif defined(ENABLE_ALTIVEC_FLOAT) ////////////////////////////////////////////
+
+#include <altivec.h>
+
+typedef float SIMDBase_REAL; // scalar element type of this backend
+typedef vector float SIMDBase_VECT; // one AltiVec register holding SIMDBase_VECTLEN floats
+
+#define SIMDBase_MODE 9 // same value as SIMDBase_MODE_ALTIVEC_FLOAT in SIMDBase.h
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 4 // elements per SIMDBase_VECT
+#define SIMDBase_NAME "PowerPC AltiVec float"
+#define SIMDBaseUndiff_DETECT detect_altivec_float // these three macros give the generic functions in SIMDBaseUndiff.c backend-specific names
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_altivec_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_altivec_float
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return vec_ld(0, p); } // read one vector at byte offset 0 from p
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { vec_st(u, 0, p); } // write one vector at byte offset 0 from p
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return (vector float){f, f, f, f}; } // broadcast f to all lanes
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return (vector float){*p, *p, *p, *p}; } // broadcast *p to all lanes
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_add(u, v); } // lane-wise u + v
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_sub(u, v); } // lane-wise u - v
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_madd(u, v, (vector float){0, 0, 0, 0}); } // u * v written as a multiply-add with a zero addend
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return vec_xor(u, (vector float){-0.0f, -0.0f, -0.0f, -0.0f}); } // XOR with -0.0f flips only the sign bit of each lane
+
+#define SIMDBase_FMADD_AVAILABLE // marks that this backend defines SIMDBase_FMADDi/SIMDBase_FMSUBi
+
+static inline SIMDBase_VECT SIMDBase_FMADDi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vec_madd(u, v, w); } // w + u * v
+static inline SIMDBase_VECT SIMDBase_FMSUBi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vec_nmsub(u, v, w); } // w - u * v
+
+#endif ////////////////////////////////////////////////////////////////////
+
+static inline SIMDBase_VECT SIMDBase_ADDm(SIMDBase_VECT *p, SIMDBase_VECT *q) { return SIMDBase_ADDi(SIMDBase_LOAD(p), SIMDBase_LOAD(q)); } // memory-operand add: loads *p and *q, returns their sum
+static inline SIMDBase_VECT SIMDBase_SUBm(SIMDBase_VECT *p, SIMDBase_VECT *q) { return SIMDBase_SUBi(SIMDBase_LOAD(p), SIMDBase_LOAD(q)); } // memory-operand subtract: loads *p and *q, returns *p - *q
+
+#endif