32 files changed, 6716 insertions, 0 deletions
diff --git a/plugins/supereq/nsfft-1.00/README b/plugins/supereq/nsfft-1.00/README
new file mode 100644
index 00000000..1ca873b1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/README
@@ -0,0 +1,15 @@
+
+NSFFT (Nonrestrictive SIMD FFT) is yet another FFT library for
+performing 1-dimensional fast Fourier transforms. NSFFT is a simple,
+small and portable library, and it is efficient since it can utilize
+SIMD instruction sets in modern processors. It performs multiple
+transforms simultaneously, and thus it is especially suitable for
+digital signal processing. It does not need so much computation to
+make a good execution plan. This library is in the public domain, so that
+you can incorporate this library into your product without any
+obligation.
+
+Visit http://shibatch.sourceforge.net/ to get the latest version of
+this library.
+
+Contact : Naoki Shibata shibatch@users.sourceforge.net
diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.c b/plugins/supereq/nsfft-1.00/dft/DFT.c
new file mode 100644
index 00000000..d59e6ab8
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFT.c
@@ -0,0 +1,327 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <stdint.h>
+#include <sys/time.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+#include "DFTUndiff.h"
+
+int32_t getModeParamInt_purec_float(int32_t paramId); /* per-backend integer parameter queries */
+int32_t getModeParamInt_purec_double(int32_t paramId);
+int32_t getModeParamInt_purec_longdouble(int32_t paramId);
+int32_t getModeParamInt_sse_float(int32_t paramId);
+int32_t getModeParamInt_sse2_double(int32_t paramId);
+int32_t getModeParamInt_neon_float(int32_t paramId);
+int32_t getModeParamInt_avx_float(int32_t paramId);
+int32_t getModeParamInt_avx_double(int32_t paramId);
+int32_t getModeParamInt_altivec_float(int32_t paramId);
+/* Per-backend string parameter queries. */
+char * getModeParamString_purec_float(int32_t paramId);
+char * getModeParamString_purec_double(int32_t paramId);
+char * getModeParamString_purec_longdouble(int32_t paramId);
+char * getModeParamString_sse_float(int32_t paramId);
+char * getModeParamString_sse2_double(int32_t paramId);
+char * getModeParamString_neon_float(int32_t paramId);
+char * getModeParamString_avx_float(int32_t paramId);
+char * getModeParamString_avx_double(int32_t paramId);
+char * getModeParamString_altivec_float(int32_t paramId);
+/* Per-backend plan constructors; DFT_init() picks one by mode (1..9). */
+void *makePlan_purec_float(uint64_t n, uint64_t flags);
+void *makePlan_purec_double(uint64_t n, uint64_t flags);
+void *makePlan_purec_longdouble(uint64_t n, uint64_t flags);
+void *makePlan_sse_float(uint64_t n, uint64_t flags);
+void *makePlan_sse2_double(uint64_t n, uint64_t flags);
+void *makePlan_neon_float(uint64_t n, uint64_t flags);
+void *makePlan_avx_float(uint64_t n, uint64_t flags);
+void *makePlan_avx_double(uint64_t n, uint64_t flags);
+void *makePlan_altivec_float(uint64_t n, uint64_t flags);
+/* Per-backend plan constructors taking explicit radix2thres/useCobra values; DFT_fread() picks one by mode. */
+void *makePlanSub_purec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_purec_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_purec_longdouble(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_sse_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_sse2_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_neon_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_avx_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_avx_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_altivec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+/* Per-backend plan destructors; DFT_dispose() picks one by mode. */
+void destroyPlan_purec_float(void *p);
+void destroyPlan_purec_double(void *p);
+void destroyPlan_purec_longdouble(void *p);
+void destroyPlan_sse_float(void *p);
+void destroyPlan_sse2_double(void *p);
+void destroyPlan_neon_float(void *p);
+void destroyPlan_avx_float(void *p);
+void destroyPlan_avx_double(void *p);
+void destroyPlan_altivec_float(void *p);
+/* Per-backend transform entry points; DFT_execute() picks one by mode. */
+void execute_purec_float(void *p, void *s, int32_t dir);
+void execute_purec_double(void *p, void *s, int32_t dir);
+void execute_purec_longdouble(void *p, void *s, int32_t dir);
+void execute_sse_float(void *p, void *s, int32_t dir);
+void execute_sse2_double(void *p, void *s, int32_t dir);
+void execute_neon_float(void *p, void *s, int32_t dir);
+void execute_avx_float(void *p, void *s, int32_t dir);
+void execute_avx_double(void *p, void *s, int32_t dir);
+void execute_altivec_float(void *p, void *s, int32_t dir);
+
+void *DFT_init(int32_t mode, uint64_t n, uint64_t flags) {
+  void *plan = NULL; /* stays NULL when the mode is unknown or its backend is not compiled in */
+  switch(mode) { /* each case forwards n and flags to the plan constructor of one backend */
+#if defined(ENABLE_PUREC_FLOAT)
+  case 1: plan = makePlan_purec_float(n, flags); break;
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case 2: plan = makePlan_purec_double(n, flags); break;
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case 3: plan = makePlan_purec_longdouble(n, flags); break;
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case 4: plan = makePlan_sse_float(n, flags); break;
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case 5: plan = makePlan_sse2_double(n, flags); break;
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case 6: plan = makePlan_neon_float(n, flags); break;
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case 7: plan = makePlan_avx_float(n, flags); break;
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case 8: plan = makePlan_avx_double(n, flags); break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case 9: plan = makePlan_altivec_float(n, flags); break;
+#endif
+  default: break;
+  }
+  return plan;
+}
+
+void DFT_dispose(void *p, int32_t mode) {
+  switch(mode) { /* hand the plan to the destructor of the backend selected by mode */
+#if defined(ENABLE_PUREC_FLOAT)
+  case 1: destroyPlan_purec_float(p); return;
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case 2: destroyPlan_purec_double(p); return;
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case 3: destroyPlan_purec_longdouble(p); return;
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case 4: destroyPlan_sse_float(p); return;
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case 5: destroyPlan_sse2_double(p); return;
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case 6: destroyPlan_neon_float(p); return;
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case 7: destroyPlan_avx_float(p); return;
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case 8: destroyPlan_avx_double(p); return;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case 9: destroyPlan_altivec_float(p); return;
+#endif
+  default: return; /* unknown mode, or backend not compiled in: no destructor is called */
+  }
+}
+
+void DFT_execute(void *p, int32_t mode, void *s, int32_t dir) {
+  switch(mode) { /* run the backend selected by mode; p, s and dir are passed through unchanged */
+#if defined(ENABLE_PUREC_FLOAT)
+  case 1: execute_purec_float(p, s, dir); break; /* plain call: C forbids `return expr;` in a void function */
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case 2: execute_purec_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case 3: execute_purec_longdouble(p, s, dir); break;
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case 4: execute_sse_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case 5: execute_sse2_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case 6: execute_neon_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case 7: execute_avx_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case 8: execute_avx_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case 9: execute_altivec_float(p, s, dir); break;
+#endif
+  default: break; /* unknown mode, or backend not compiled in: nothing is executed */
+  }
+}
+
+#define FILE_FORMAT_VERSION 0
+/* Writes the plan's parameters to fp as "key : value" text lines, finishing with "end :". */
+int32_t DFT_fwrite(void *p2, FILE *fp) {
+  DFTUndiff *plan = (DFTUndiff *)p2;
+  if (plan->magic != MAGIC_DFT) abort(); /* p2 does not carry the plan magic number */
+
+  /* The && chain stops at the first fprintf() that returns <= 0, */
+  /* so the result is 1 only when every line was written and 0 otherwise. */
+  return fprintf(fp, "nsfft file format : %d\n", FILE_FORMAT_VERSION) > 0
+      && fprintf(fp, "arch : %s\n", SIMDBase_getProcessorNameString()) > 0
+      && fprintf(fp, "computation mode : %d\n", plan->mode) > 0
+      && fprintf(fp, "length : %d\n", ((plan->flags & DFT_FLAG_REAL) != 0 || (plan->flags & DFT_FLAG_ALT_REAL) != 0)? plan->length * 2 : plan->length) > 0
+      && fprintf(fp, "radix2 threshold : %d\n", plan->radix2thres) > 0
+      && fprintf(fp, "transpose : %d\n", plan->flagTrans) > 0
+      && fprintf(fp, "bit reversal : %d\n", plan->useCobra) > 0
+      && fprintf(fp, "flags : %llx\n", (unsigned long long int)plan->flags) > 0
+      && fprintf(fp, "%s\n", "end :") > 0;
+}
+
+static char *startsWith(char *str1, char *str2) {
+  /* Returns the part of str1 that follows the prefix str2, */
+  /* or NULL when str1 does not begin with str2. */
+  size_t prefixLen = strlen(str2);
+
+  return (strncmp(str1, str2, prefixLen) == 0) ? str1 + prefixLen : NULL;
+}
+
+DFT *DFT_fread(FILE *fp, int32_t *errcode) {
+  /* Rebuilds a plan from the "key : value" text that DFT_fwrite() produces. On failure returns */
+  /* NULL and, when errcode is non-NULL, stores one of the DFT_ERROR_* codes through it. */
+  int length = -1, radix2thres = -1, flagTrans = -1, useCobra = -1;
+  int mode = -1, formatver = -1;
+  unsigned long long int flags = (1ULL << 63);
+
+  if (errcode != NULL) *errcode = DFT_ERROR_NOERROR;
+
+  for(;;) { /* one line per pass until the "end :" marker; unrecognised lines are skipped */
+    char buf[256], *q;
+    if (fgets(buf, 255, fp) == NULL) { if (errcode != NULL) *errcode = DFT_ERROR_UNEXPECTED_EOF; return NULL; }
+
+    if ((q = startsWith(buf, "nsfft file format :")) != NULL) {
+      if (1 != sscanf(q, "%d", &formatver)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "computation mode :")) != NULL) {
+      if (1 != sscanf(q, "%d", &mode)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "length :")) != NULL) {
+      if (1 != sscanf(q, "%d", &length)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "radix2 threshold :")) != NULL) {
+      if (1 != sscanf(q, "%d", &radix2thres)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "transpose :")) != NULL) {
+      if (1 != sscanf(q, "%d", &flagTrans)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "bit reversal :")) != NULL) {
+      if (1 != sscanf(q, "%d", &useCobra)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "flags :")) != NULL) {
+      if (1 != sscanf(q, "%llx", &flags)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+    } else if ((q = startsWith(buf, "end :")) != NULL) {
+      break;
+    }
+  }
+
+  if (formatver > FILE_FORMAT_VERSION) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_VERSION; return NULL; }
+  /* length is widened to uint64_t below, so a missing (-1), zero or negative value is rejected here. */
+  if (length <= 0) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; }
+
+  switch(SIMDBase_detect(mode)) {
+  case 1:
+    break;
+  case 0:
+    if (errcode != NULL) *errcode = DFT_ERROR_MODE_NOT_AVAILABLE;
+    return NULL;
+  case -1:
+    if (errcode != NULL) *errcode = DFT_ERROR_MODE_NOT_COMPILED_IN;
+    return NULL;
+  }
+
+  switch(mode) { /* note: the parsed "transpose" value (flagTrans) is not passed on to any constructor */
+#if defined(ENABLE_PUREC_FLOAT)
+  case 1: return makePlanSub_purec_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case 2: return makePlanSub_purec_double(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case 3: return makePlanSub_purec_longdouble(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case 4: return makePlanSub_sse_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case 5: return makePlanSub_sse2_double(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case 6: return makePlanSub_neon_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case 7: return makePlanSub_avx_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case 8: return makePlanSub_avx_double(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case 9: return makePlanSub_altivec_float(length, radix2thres, useCobra, flags);
+#endif
+  }
+
+  if (errcode != NULL) *errcode = DFT_ERROR_UNKNOWN_MODE;
+  return NULL;
+}
+
+int32_t DFT_getPlanParamInt(int32_t paramId, void *p2) {
+  DFTUndiff *plan = (DFTUndiff *)p2;
+  if (plan->magic != MAGIC_DFT) abort(); /* p2 does not carry the plan magic number */
+
+  switch(paramId) {
+  case DFT_PARAMID_MODE: return plan->mode;
+  case DFT_PARAMID_FFT_LENGTH:
+    /* with either real-transform flag set, the stored length is doubled before it is reported */
+    if ((plan->flags & (DFT_FLAG_REAL | DFT_FLAG_ALT_REAL)) != 0) return plan->length * 2;
+    return plan->length;
+  case DFT_PARAMID_IS_REAL_TRANSFORM: return (plan->flags & DFT_FLAG_REAL) != 0;
+  case DFT_PARAMID_IS_ALT_REAL_TRANSFORM: return (plan->flags & DFT_FLAG_ALT_REAL) != 0;
+  case DFT_PARAMID_NO_BIT_REVERSAL: return (plan->flags & DFT_FLAG_NO_BITREVERSAL) != 0;
+  case DFT_PARAMID_TEST_RUN: return plan->flags & 3; /* the two lowest flag bits */
+  }
+
+  return -1; /* paramId has no case above */
+}
+
+#if 0 /* compiled out; still uses dft_t / MAGIC_NSDFT, not the DFTUndiff / MAGIC_DFT names of the live code above */
+char *DFT_getPlanParamString(int32_t paramId, void *p2) {
+  dft_t *p = (dft_t *)p2;
+  if (p->magic != MAGIC_NSDFT) abort();
+
+  return NULL;
+}
+#endif
+
+uint32_t DFT_ilog2(uint32_t q) {
+  /* tab[v] = number of significant bits of v (index of the highest set bit plus one), for v in 0..15 */
+  static const uint32_t tab[] = {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4};
+  const uint32_t base = (q & 0xffff0000) ? 16 : 0; /* is the highest set bit in the upper half-word? */
+  uint32_t smear, nib;
+  q >>= base;
+  smear = q | (q >> 1);
+  smear |= (smear >> 2); /* every set bit now also covers the three positions below it */
+  /* bits 4, 8 and 12 of smear flag a non-empty nibble 1, 2, 3; pack them into a 3-bit table index */
+  nib = tab[((smear & 0x10) >> 4) | ((smear & 0x100) >> 7) | ((smear & 0x1000) >> 10)] * 4;
+
+  return base + nib + tab[q >> nib] - 1;
+}
+
+double DFT_timeofday(void) {
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  return (1e-6)*now.tv_usec + (double)now.tv_sec; /* seconds, with the microsecond field as the fraction */
+}
diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.h b/plugins/supereq/nsfft-1.00/dft/DFT.h
new file mode 100644
index 00000000..facb701a
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFT.h
@@ -0,0 +1,56 @@
+#ifndef __DFT_H__
+#define __DFT_H__
+
+#include <stdio.h>
+#include <stdint.h>
+
+typedef void DFT; /* opaque plan handle: a DFT * is a plain void * */
+
+int32_t DFT_getParamInt(int32_t paramId);
+char *DFT_getParamString(int32_t paramId);
+
+int32_t DFT_getModeParamInt(int32_t paramId, int32_t mode);
+char *DFT_getModeParamString(int32_t paramId, int32_t mode);
+/* Create a plan for the backend selected by mode, and release it again with the same mode. */
+DFT *DFT_init(int32_t mode, uint64_t n, uint64_t flags);
+void DFT_dispose(DFT *p, int32_t mode);
+/* Save a plan's parameters as text / rebuild a plan from that text (errcode receives a DFT_ERROR_* value). */
+int32_t DFT_fwrite(DFT *p, FILE *fp);
+DFT *DFT_fread(FILE *fp, int32_t *errcode);
+
+int32_t DFT_getPlanParamInt(int32_t paramId, void *p);
+
+void DFT_execute(DFT *p, int32_t mode, void *s, int32_t dir);
+
+uint32_t DFT_ilog2(uint32_t q);
+double DFT_timeofday(void);
+/* Test-run level: a value 0..3 held in the two lowest bits of the flags word. */
+#define DFT_FLAG_NO_TEST_RUN ( 0ULL << 0)
+#define DFT_FLAG_LIGHT_TEST_RUN ( 1ULL << 0)
+#define DFT_FLAG_HEAVY_TEST_RUN ( 2ULL << 0)
+#define DFT_FLAG_EXHAUSTIVE_TEST_RUN ( 3ULL << 0)
+/* Single-bit options in the same flags word. */
+#define DFT_FLAG_REAL (1ULL << 2)
+#define DFT_FLAG_ALT_REAL (1ULL << 3)
+#define DFT_FLAG_VERBOSE (1ULL << 4)
+#define DFT_FLAG_NO_BITREVERSAL (1ULL << 5)
+#define DFT_FLAG_FORCE_RECURSIVE (1ULL << 6)
+#define DFT_FLAG_FORCE_COBRA (1ULL << 7)
+/* Parameter ids for DFT_getPlanParamInt(); all share the (3 << 24) tag in the upper bits. */
+#define DFT_PARAMID_TYPE ( 1 | ( 3 << 24 ))
+#define DFT_PARAMID_MODE ( 2 | ( 3 << 24 ))
+#define DFT_PARAMID_FFT_LENGTH ( 3 | ( 3 << 24 ))
+#define DFT_PARAMID_IS_REAL_TRANSFORM ( 4 | ( 3 << 24 ))
+#define DFT_PARAMID_IS_ALT_REAL_TRANSFORM ( 5 | ( 3 << 24 ))
+#define DFT_PARAMID_NO_BIT_REVERSAL ( 6 | ( 3 << 24 ))
+#define DFT_PARAMID_TEST_RUN ( 7 | ( 3 << 24 ))
+/* Error codes stored through the errcode argument of DFT_fread(). */
+#define DFT_ERROR_NOERROR 0
+#define DFT_ERROR_FILE_VERSION 1
+#define DFT_ERROR_FILE_IO 2
+#define DFT_ERROR_UNEXPECTED_EOF 3
+#define DFT_ERROR_MODE_NOT_COMPILED_IN 4
+#define DFT_ERROR_MODE_NOT_AVAILABLE 5
+#define DFT_ERROR_UNKNOWN_MODE 6
+
+#endif
diff --git a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c
new file mode 100644
index 00000000..4985da33
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c
@@ -0,0 +1,1807 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+
+#include "SIMDBase.h"
+#include "SIMDBaseUndiff.h"
+#include "DFT.h"
+#include "DFTUndiff.h"
+
+// Math wrappers and long double constants used by this file.
+
+#define SIN(x) sin(x)
+#define COS(x) cos(x)
+// sqrt(2)/2; the 8-point butterflies below multiply by +SQRT2_2 and -SQRT2_2.
+#define SQRT2_2 .7071067811865475244008443621048490392848359376884740365883398689953L
+// Fallback value of pi, used only when the headers included above do not define M_PIl.
+#ifndef M_PIl
+#define M_PIl 3.141592653589793238462643383279502884197169399375105820974944592307L
+#endif
+
+//
+
+static inline void srBut2(DFTUndiff *p) { // works in place on the four vectors s[o] .. s[o+3], with s = p->s and o = p->offset1
+  SIMDBase_VECT *s = p->s;
+  int32_t o = p->offset1;
+  SIMDBase_VECT t0, t1;
+  // Pair (o, o+2), then pair (o+1, o+3): presumably sum and difference; SIMDBase_ADDm/SUBm are defined elsewhere, confirm.
+  t0 = SIMDBase_ADDm(&s[o  ], &s[o+2]); t1 = SIMDBase_SUBm(&s[o  ], &s[o+2]);
+  SIMDBase_STOR(&s[o  ], t0); SIMDBase_STOR(&s[o+2], t1);
+  t0 = SIMDBase_ADDm(&s[o+1], &s[o+3]); t1 = SIMDBase_SUBm(&s[o+1], &s[o+3]);
+  SIMDBase_STOR(&s[o+1], t0); SIMDBase_STOR(&s[o+3], t1);
+}
+
+static inline void srButForward4(DFTUndiff *p) { // works in place on the eight vectors s[o+0] .. s[o+7]
+  SIMDBase_VECT *s = p->s;
+  int32_t o = p->offset1;
+  SIMDBase_VECT t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i;
+  // First step: combine elements four apart (0/4, 1/5, 2/6); the last pair is taken as 7/3, operands swapped.
+  t0r = SIMDBase_ADDm(&s[o+0], &s[o+4]); t2r = SIMDBase_SUBm(&s[o+0], &s[o+4]);
+  t0i = SIMDBase_ADDm(&s[o+1], &s[o+5]); t2i = SIMDBase_SUBm(&s[o+1], &s[o+5]);
+  t1r = SIMDBase_ADDm(&s[o+2], &s[o+6]); t3i = SIMDBase_SUBm(&s[o+2], &s[o+6]);
+  t1i = SIMDBase_ADDm(&s[o+7], &s[o+3]); t3r = SIMDBase_SUBm(&s[o+7], &s[o+3]);
+  // Second step: combine the temporaries and store all eight results (the r/i suffixes suggest real/imaginary parts, confirm).
+  SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(t0r, t1r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(t0i, t1i));
+  SIMDBase_STOR(&s[o+2], SIMDBase_SUBi(t0r, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_SUBi(t0i, t1i));
+  SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(t2r, t3r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(t2i, t3i));
+  SIMDBase_STOR(&s[o+6], SIMDBase_ADDi(t2r, t3r)); SIMDBase_STOR(&s[o+7], SIMDBase_ADDi(t2i, t3i));
+}
+
+static inline void srButBackward4(DFTUndiff *p) { // works in place on the eight vectors s[o+0] .. s[o+7]
+  SIMDBase_VECT *s = p->s;
+  int32_t o = p->offset1;
+
+  SIMDBase_VECT t0r, t0i, t1r, t1i;
+  SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+0]), s1 = SIMDBase_LOAD(&s[o+1]), s2 = SIMDBase_LOAD(&s[o+2]), s3 = SIMDBase_LOAD(&s[o+3]);
+  // Combine (0,2) and (1,3) on the loaded locals, then (4,6) and (7,5) directly from memory.
+  t0r = SIMDBase_ADDi(s0, s2); t0i = SIMDBase_SUBi(s0, s2); s0 = t0r; s2 = t0i;
+  t0r = SIMDBase_ADDi(s1, s3); t0i = SIMDBase_SUBi(s1, s3); s1 = t0r; s3 = t0i;
+  t0r = SIMDBase_ADDm(&s[o+4], &s[o+6]); t1i = SIMDBase_SUBm(&s[o+4], &s[o+6]);
+  t0i = SIMDBase_ADDm(&s[o+7], &s[o+5]); t1r = SIMDBase_SUBm(&s[o+7], &s[o+5]);
+  // Merge both halves and store all eight vectors back.
+  SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(s1, t0i));
+  SIMDBase_STOR(&s[o+6], SIMDBase_SUBi(s2, t1r)); SIMDBase_STOR(&s[o+7], SIMDBase_SUBi(s3, t1i));
+  SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(s0, t0r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(s1, t0i));
+  SIMDBase_STOR(&s[o+2], SIMDBase_ADDi(s2, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_ADDi(s3, t1i));
+}
+
+static inline void srButForward8(DFTUndiff *p) { // works in place on the sixteen vectors s[o+0] .. s[o+15]
+  SIMDBase_VECT *s = p->s;
+  int32_t o = p->offset1;
+  SIMDBase_VECT t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i;
+  // Load all sixteen vectors into locals s0 .. sf.
+  SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]);
+  SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]);
+  SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]);
+  SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]);
+  // Differences of elements eight apart: 0-8, 1-9, 13-5 and 4-12.
+  t2r = SIMDBase_SUBi(s0, s8); t2i = SIMDBase_SUBi(s1, s9);
+  t3r = SIMDBase_SUBi(sd, s5); t3i = SIMDBase_SUBi(s4, sc); 
+  
+  s0 = SIMDBase_ADDi(s0, s8); s1 = SIMDBase_ADDi(s1, s9);
+  s4 = SIMDBase_ADDi(s4, sc); s5 = SIMDBase_ADDi(s5, sd);
+
+  s8 = SIMDBase_SUBi(t2r, t3r); s9 = SIMDBase_SUBi(t2i, t3i);
+  sc = SIMDBase_ADDi(t2r, t3r); sd = SIMDBase_ADDi(t2i, t3i);
+  // Same pattern for the remaining elements: 2-10, 3-11, 15-7 and 6-14.
+  t2r = SIMDBase_SUBi(s2, sa); t2i = SIMDBase_SUBi(s3, sb);
+  t3r = SIMDBase_SUBi(sf, s7); t3i = SIMDBase_SUBi(s6, se);
+
+  s2 = SIMDBase_ADDi(s2, sa); s3 = SIMDBase_ADDi(s3, sb);
+  s6 = SIMDBase_ADDi(s6, se); s7 = SIMDBase_ADDi(s7, sf);
+
+  t0r = SIMDBase_SUBi(t2r, t3r); t1r = SIMDBase_ADDi(t2r, t3r);
+  t0i = SIMDBase_SUBi(t2i, t3i); t1i = SIMDBase_ADDi(t2i, t3i);
+  // Scale the sum/difference combinations by +/- SQRT2_2.
+  sa = SIMDBase_MULi(SIMDBase_ADDi(t0r, t0i), SIMDBase_SET1( SQRT2_2));
+  sb = SIMDBase_MULi(SIMDBase_SUBi(t0i, t0r), SIMDBase_SET1( SQRT2_2));
+  se = SIMDBase_MULi(SIMDBase_SUBi(t1i, t1r), SIMDBase_SET1( SQRT2_2));
+  sf = SIMDBase_MULi(SIMDBase_ADDi(t1r, t1i), SIMDBase_SET1(-SQRT2_2));
+  // Store the upper half, s[o+8] .. s[o+15].
+  SIMDBase_STOR(&s[o+ 8], SIMDBase_ADDi(s8, sa)); SIMDBase_STOR(&s[o+ 9], SIMDBase_ADDi(s9, sb));
+  SIMDBase_STOR(&s[o+10], SIMDBase_SUBi(s8, sa)); SIMDBase_STOR(&s[o+11], SIMDBase_SUBi(s9, sb));
+
+  SIMDBase_STOR(&s[o+12], SIMDBase_ADDi(sc, se)); SIMDBase_STOR(&s[o+13], SIMDBase_ADDi(sd, sf));
+  SIMDBase_STOR(&s[o+14], SIMDBase_SUBi(sc, se)); SIMDBase_STOR(&s[o+15], SIMDBase_SUBi(sd, sf));
+  // Lower half: the same add/subtract pattern as srButForward4, applied to the sums now held in s0 .. s7.
+  t0r = SIMDBase_ADDi(s0, s4); t2r = SIMDBase_SUBi(s0, s4);
+  t0i = SIMDBase_ADDi(s1, s5); t2i = SIMDBase_SUBi(s1, s5);
+
+  t1r = SIMDBase_ADDi(s2, s6); t3i = SIMDBase_SUBi(s2, s6);
+  t1i = SIMDBase_ADDi(s3, s7); t3r = SIMDBase_SUBi(s7, s3);
+
+  SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(t0r, t1r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(t0i, t1i));
+  SIMDBase_STOR(&s[o+2], SIMDBase_SUBi(t0r, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_SUBi(t0i, t1i));
+  SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(t2r, t3r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(t2i, t3i));
+  SIMDBase_STOR(&s[o+6], SIMDBase_ADDi(t2r, t3r)); SIMDBase_STOR(&s[o+7], SIMDBase_ADDi(t2i, t3i));
+}
+
+static void srButBackward8(DFTUndiff *p) { // works in place on s[o+0] .. s[o+15]; unlike its siblings it is not declared inline
+  SIMDBase_VECT *s = p->s;
+  int32_t o = p->offset1;
+  SIMDBase_VECT t0r, t0i, t1r, t1i;
+  // Load all sixteen vectors into locals s0 .. sf.
+  SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]);
+  SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]);
+  SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]);
+  SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]);
+  // Sum/difference of elements two apart: (8,10) (9,11) (12,14) (13,15) (0,2) (1,3); t0r/t0i are scratch here.
+  t0r = SIMDBase_ADDi(s8, sa); t0i = SIMDBase_SUBi(s8, sa); s8 = t0r; sa = t0i;
+  t0r = SIMDBase_ADDi(s9, sb); t0i = SIMDBase_SUBi(s9, sb); s9 = t0r; sb = t0i;
+  t0r = SIMDBase_ADDi(sc, se); t0i = SIMDBase_SUBi(sc, se); sc = t0r; se = t0i;
+  t0r = SIMDBase_ADDi(sd, sf); t0i = SIMDBase_SUBi(sd, sf); sd = t0r; sf = t0i;
+  t0r = SIMDBase_ADDi(s0, s2); t0i = SIMDBase_SUBi(s0, s2); s0 = t0r; s2 = t0i;
+  t0r = SIMDBase_ADDi(s1, s3); t0i = SIMDBase_SUBi(s1, s3); s1 = t0r; s3 = t0i;
+
+  t0r = SIMDBase_ADDi(s4, s6); t0i = SIMDBase_ADDi(s7, s5);
+  t1r = SIMDBase_SUBi(s7, s5); t1i = SIMDBase_SUBi(s4, s6);
+
+  s4 = SIMDBase_SUBi(s0, t0r); s5 = SIMDBase_SUBi(s1, t0i);
+  s6 = SIMDBase_SUBi(s2, t1r); s7 = SIMDBase_SUBi(s3, t1i);
+  s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i);
+  s2 = SIMDBase_ADDi(s2, t1r); s3 = SIMDBase_ADDi(s3, t1i);
+
+  t0r = SIMDBase_ADDi(s8, sc); t0i = SIMDBase_ADDi(s9, sd);
+  t1r = SIMDBase_SUBi(sd, s9); t1i = SIMDBase_SUBi(s8, sc);
+
+  s8 = SIMDBase_SUBi(s0, t0r); s9 = SIMDBase_SUBi(s1, t0i);
+  sc = SIMDBase_SUBi(s4, t1r); sd = SIMDBase_SUBi(s5, t1i);
+  s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i);
+  s4 = SIMDBase_ADDi(s4, t1r); s5 = SIMDBase_ADDi(s5, t1i);
+  // Scale the sa/sb and se/sf combinations by +/- SQRT2_2.
+  t0r = SIMDBase_MULi(SIMDBase_SUBi(sa, sb), SIMDBase_SET1( SQRT2_2));
+  t0i = SIMDBase_MULi(SIMDBase_ADDi(sa, sb), SIMDBase_SET1( SQRT2_2));
+  t1r = SIMDBase_MULi(SIMDBase_ADDi(se, sf), SIMDBase_SET1(-SQRT2_2));
+  t1i = SIMDBase_MULi(SIMDBase_SUBi(se, sf), SIMDBase_SET1( SQRT2_2));
+
+  sa = t0r; sb = t0i; se = t1r; sf = t1i;
+
+  t0r = SIMDBase_ADDi(sa, se); t0i = SIMDBase_ADDi(sb, sf);
+  t1r = SIMDBase_SUBi(sf, sb); t1i = SIMDBase_SUBi(sa, se);
+
+  sa = SIMDBase_SUBi(s2, t0r); sb = SIMDBase_SUBi(s3, t0i);
+  se = SIMDBase_SUBi(s6, t1r); sf = SIMDBase_SUBi(s7, t1i);
+  s2 = SIMDBase_ADDi(s2, t0r); s3 = SIMDBase_ADDi(s3, t0i);
+  s6 = SIMDBase_ADDi(s6, t1r); s7 = SIMDBase_ADDi(s7, t1i);
+  // Store all sixteen results back to s[o+0] .. s[o+15].
+  SIMDBase_STOR(&s[o+ 0], s0); SIMDBase_STOR(&s[o+ 1], s1); SIMDBase_STOR(&s[o+ 2], s2); SIMDBase_STOR(&s[o+ 3], s3);
+  SIMDBase_STOR(&s[o+ 4], s4); SIMDBase_STOR(&s[o+ 5], s5); SIMDBase_STOR(&s[o+ 6], s6); SIMDBase_STOR(&s[o+ 7], s7);
+  SIMDBase_STOR(&s[o+ 8], s8); SIMDBase_STOR(&s[o+ 9], s9); SIMDBase_STOR(&s[o+10], sa); SIMDBase_STOR(&s[o+11], sb);
+  SIMDBase_STOR(&s[o+12], sc); SIMDBase_STOR(&s[o+13], sd); SIMDBase_STOR(&s[o+14], se); SIMDBase_STOR(&s[o+15], sf);
+}
+
+#if 0 // compiled out: one-pair-per-iteration form of the loop body that srButForwardSubUnrolled repeats at offsets +0, +2, +4, ...
+static inline void srButForwardSub(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+
+  int32_t i0 = p->offset1;
+  int32_t i1 = i0 + p->stride;
+  int32_t i2 = i1 + p->stride;
+  int32_t i3 = i2 + p->stride;
+  int32_t im = i1;
+
+  int32_t p0 = p->offset2 & (p->butlen*4-1);
+
+  while(i0 < im) {
+    SIMDBase_VECT t0r, t0i, t1r, t1i;
+    SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+    SIMDBase_VECT a0, a1, a2, a3;
+
+    s00 = SIMDBase_LOAD(&s[i0+0]), s01 = SIMDBase_LOAD(&s[i0+1]);
+    s10 = SIMDBase_LOAD(&s[i1+0]), s11 = SIMDBase_LOAD(&s[i1+1]);
+    s20 = SIMDBase_LOAD(&s[i2+0]), s21 = SIMDBase_LOAD(&s[i2+1]);
+    s30 = SIMDBase_LOAD(&s[i3+0]), s31 = SIMDBase_LOAD(&s[i3+1]);
+
+    t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+    t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+    
+    a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]);
+
+    SIMDBase_STOR(&s[i0  ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21));
+    SIMDBase_STOR(&s[i1  ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    SIMDBase_STOR(&s[i2  ], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3  ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+    SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+    SIMDBase_STOR(&s[i2  ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3  ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+    SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+    i0 += 2; i1 += 2; i2 += 2; i3 += 2;
+    p0 += 4;
+  }
+}
+#endif
+
+#if 0 // compiled out: srButBackwardSub plus an "Unrolled" variant that only forwards to it
+static inline void srButBackwardSub(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+
+  int32_t i0 = p->offset1;
+  int32_t i1 = i0 + p->stride;
+  int32_t i2 = i1 + p->stride;
+  int32_t i3 = i2 + p->stride;
+  int32_t im = i1;
+
+  int32_t p0 = p->offset2 & (p->butlen*4-1);
+
+  while(i0 < im) {
+    SIMDBase_VECT t0r, t0i, t1r, t1i, u, v;
+    SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+    SIMDBase_VECT a0, a1, a2, a3;
+
+    s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]);
+    a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]);
+    u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+
+    s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]);
+    v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+
+    t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+    u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+    v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+    t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+    s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]);
+    s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]);
+
+    SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s00, t0r));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, t0i));
+    SIMDBase_STOR(&s[i3+0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+0], SIMDBase_ADDi(s10, t1r));
+    SIMDBase_STOR(&s[i3+1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, t1i));
+
+    i0 += 2; i1 += 2; i2 += 2; i3 += 2;
+    p0 += 4;
+  }
+}
+
+static void srButBackwardSubUnrolled(DFTUndiff *p) {
+  srButBackwardSub(p);
+}
+#endif
+
+static inline void srButForwardSubUnrolled(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+
+  int32_t i0 = p->offset1;
+  int32_t i1 = i0 + p->stride;
+  int32_t i2 = i1 + p->stride;
+  int32_t i3 = i2 + p->stride;
+  int32_t im = i1;
+
+  int32_t p0 = p->offset2 & (p->butlen*4-1);
+
+  while(i0 < im) {
+    SIMDBase_VECT t0r, t0i, t1r, t1i;
+    SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+    SIMDBase_VECT a0, a1, a2, a3;
+
+    //
+
+    s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]);
+    s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]);
+    s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]);
+    s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]);
+
+    t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+    t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+    
+    a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]);
+
+    SIMDBase_STOR(&s[i0  ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21));
+    SIMDBase_STOR(&s[i1  ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    SIMDBase_STOR(&s[i2  ], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3  ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+    SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+    SIMDBase_STOR(&s[i2  ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3  ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+    SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+    //
+
+    s00 = SIMDBase_LOAD(&s[i0+2]); s01 = SIMDBase_LOAD(&s[i0+3]);
+    s10 = SIMDBase_LOAD(&s[i1+2]); s11 = SIMDBase_LOAD(&s[i1+3]);
+    s20 = SIMDBase_LOAD(&s[i2+2]); s21 = SIMDBase_LOAD(&s[i2+3]);
+    s30 = SIMDBase_LOAD(&s[i3+2]); s31 = SIMDBase_LOAD(&s[i3+3]);
+
+    t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+    t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+    
+    a0 = SIMDBase_LOAD1(&tbl[p0+4]); a1 = SIMDBase_LOAD1(&tbl[p0+5]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+6]); a3 = SIMDBase_LOAD1(&tbl[p0+7]);
+
+    SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s01, s21));
+    SIMDBase_STOR(&s[i1+2], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+3], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+2], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+    SIMDBase_STOR(&s[i3+3], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+    SIMDBase_STOR(&s[i2+2], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+2], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+    SIMDBase_STOR(&s[i3+3], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+    //
+
+    s00 = SIMDBase_LOAD(&s[i0+4]); s01 = SIMDBase_LOAD(&s[i0+5]);
+    s10 = SIMDBase_LOAD(&s[i1+4]); s11 = SIMDBase_LOAD(&s[i1+5]);
+    s20 = SIMDBase_LOAD(&s[i2+4]); s21 = SIMDBase_LOAD(&s[i2+5]);
+    s30 = SIMDBase_LOAD(&s[i3+4]); s31 = SIMDBase_LOAD(&s[i3+5]);
+
+    t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+    t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+    
+    a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]);
+
+    SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s01, s21));
+    SIMDBase_STOR(&s[i1+4], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+5], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+4], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+    SIMDBase_STOR(&s[i3+5], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+    SIMDBase_STOR(&s[i2+4], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+4], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+    SIMDBase_STOR(&s[i3+5], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+    //
+
+    s00 = SIMDBase_LOAD(&s[i0+6]); s01 = SIMDBase_LOAD(&s[i0+7]);
+    s10 = SIMDBase_LOAD(&s[i1+6]); s11 = SIMDBase_LOAD(&s[i1+7]);
+    s20 = SIMDBase_LOAD(&s[i2+6]); s21 = SIMDBase_LOAD(&s[i2+7]);
+    s30 = SIMDBase_LOAD(&s[i3+6]); s31 = SIMDBase_LOAD(&s[i3+7]);
+
+    t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+    t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+    
+    a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]);
+
+    SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s01, s21));
+    SIMDBase_STOR(&s[i1+6], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+7], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+6], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+    SIMDBase_STOR(&s[i3+7], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+    SIMDBase_STOR(&s[i2+6], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+6], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+    SIMDBase_STOR(&s[i3+7], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+    //
+
+    i0 += 8; i1 += 8; i2 += 8; i3 += 8;
+    p0 += 16;
+  }
+}
+
+#if 1
+// Unrolled backward butterfly stage operating in place on p->s.
+//
+// Inputs read from *p:
+//   s          - working array of SIMD vectors (read and written)
+//   ptTable[log2butlen] - table of scalar coefficients, broadcast with LOAD1
+//   offset1    - index in s of the first of four equally spaced cursors
+//   stride     - distance (in vectors) between consecutive cursors
+//   offset2, butlen - give the starting table index p0 = offset2 & (butlen*4-1)
+//                (assumes butlen*4 is a power of two so the AND wraps the
+//                 index - TODO confirm against the table layout)
+//
+// The four cursors i0..i3 advance together from offset1 until i0 has covered
+// one stride.  Each loop iteration is unrolled four times: it handles 8
+// vectors per cursor (four even/odd pairs; the local names *r / *i suggest
+// the even slot is the real part and the odd slot the imaginary part) and
+// consumes 16 table entries.
+//
+// For each pair, with (a0,a1) applied to the i2 pair and (a2,a3) to the i3 pair:
+//   t0r = (s20*a0 + s21*a1) + (s30*a2 + s31*a3)
+//   t1i = (s20*a0 + s21*a1) - (s30*a2 + s31*a3)
+//   t0i = (s31*a2 - s30*a3) + (s21*a0 - s20*a1)
+//   t1r = (s31*a2 - s30*a3) - (s21*a0 - s20*a1)
+//   s[i0] = s0 + t0,  s[i2] = s0 - t0,  s[i1] = s1 + t1,  s[i3] = s1 - t1
+//
+// The SIMDBase_FMADD_AVAILABLE branches compute the same expressions with
+// fused operations; presumably FMADDi(x,y,z) = x*y+z and FMSUBi(x,y,z) = z-x*y
+// (they are defined outside this file section - confirm in SIMDBase.h).
+//
+// NOTE(review): the loop advances by 8 vectors per iteration, so stride is
+// expected to be a multiple of 8; nothing here checks that.
+static void srButBackwardSubUnrolled(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+
+  // Four cursors into s, one stride apart; im marks where i0 must stop.
+  int32_t i0 = p->offset1;
+  int32_t i1 = i0 + p->stride;
+  int32_t i2 = i1 + p->stride;
+  int32_t i3 = i2 + p->stride;
+  int32_t im = i1;
+
+  // Starting index into the coefficient table, masked to butlen*4 entries.
+  int32_t p0 = p->offset2 & (p->butlen*4-1);
+
+  while(i0 < im) {
+    SIMDBase_VECT t0r, t0i, t1r, t1i, u, v;
+    SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+    SIMDBase_VECT a0, a1, a2, a3;
+
+    // ---- pair 0: vectors +0/+1 of each cursor, table entries p0+0..p0+3 ----
+
+    // u = s20*a0 + s21*a1 (coefficients applied to the i2 pair)
+    s20 = SIMDBase_LOAD(&s[i2+ 0]); s21 = SIMDBase_LOAD(&s[i2+ 1]);
+    a0 = SIMDBase_LOAD1(&tbl[p0+ 0]); a1 = SIMDBase_LOAD1(&tbl[p0+ 1]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+    u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+
+    // v = s30*a2 + s31*a3 (coefficients applied to the i3 pair)
+    s30 = SIMDBase_LOAD(&s[i3+ 0]); s31 = SIMDBase_LOAD(&s[i3+ 1]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+ 2]); a3 = SIMDBase_LOAD1(&tbl[p0+ 3]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+    v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+    t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+
+    // Cross terms: u = s31*a2 - s30*a3, v = s21*a0 - s20*a1
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+    v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+    u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+    v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+    t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+    // The i0/i1 pairs are loaded only now, after the i2/i3 pairs were consumed.
+    s00 = SIMDBase_LOAD(&s[i0+ 0]); s01 = SIMDBase_LOAD(&s[i0+ 1]);
+    s10 = SIMDBase_LOAD(&s[i1+ 0]); s11 = SIMDBase_LOAD(&s[i1+ 1]);
+
+    // Write back: differences go to i2/i3, sums go to i0/i1.
+    SIMDBase_STOR(&s[i2+ 0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 0], SIMDBase_ADDi(s00, t0r));
+    SIMDBase_STOR(&s[i2+ 1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 1], SIMDBase_ADDi(s01, t0i));
+    SIMDBase_STOR(&s[i3+ 0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 0], SIMDBase_ADDi(s10, t1r));
+    SIMDBase_STOR(&s[i3+ 1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 1], SIMDBase_ADDi(s11, t1i));
+
+    // ---- pair 1: vectors +2/+3, table entries p0+4..p0+7 (same arithmetic) ----
+
+    s20 = SIMDBase_LOAD(&s[i2+ 2]); s21 = SIMDBase_LOAD(&s[i2+ 3]);
+    a0 = SIMDBase_LOAD1(&tbl[p0+ 4]); a1 = SIMDBase_LOAD1(&tbl[p0+ 5]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+    u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+
+    s30 = SIMDBase_LOAD(&s[i3+ 2]); s31 = SIMDBase_LOAD(&s[i3+ 3]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+ 6]); a3 = SIMDBase_LOAD1(&tbl[p0+ 7]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+    v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+    t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+    v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+    u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+    v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+    t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+    s00 = SIMDBase_LOAD(&s[i0+ 2]); s01 = SIMDBase_LOAD(&s[i0+ 3]);
+    s10 = SIMDBase_LOAD(&s[i1+ 2]); s11 = SIMDBase_LOAD(&s[i1+ 3]);
+
+    SIMDBase_STOR(&s[i2+ 2], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 2], SIMDBase_ADDi(s00, t0r));
+    SIMDBase_STOR(&s[i2+ 3], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 3], SIMDBase_ADDi(s01, t0i));
+    SIMDBase_STOR(&s[i3+ 2], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 2], SIMDBase_ADDi(s10, t1r));
+    SIMDBase_STOR(&s[i3+ 3], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 3], SIMDBase_ADDi(s11, t1i));
+
+    // ---- pair 2: vectors +4/+5, table entries p0+8..p0+11 (same arithmetic) ----
+
+    s20 = SIMDBase_LOAD(&s[i2+ 4]); s21 = SIMDBase_LOAD(&s[i2+ 5]);
+    a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+    u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+
+    s30 = SIMDBase_LOAD(&s[i3+ 4]); s31 = SIMDBase_LOAD(&s[i3+ 5]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+    v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+    t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+    v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+    u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+    v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+    t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+    s00 = SIMDBase_LOAD(&s[i0+ 4]); s01 = SIMDBase_LOAD(&s[i0+ 5]);
+    s10 = SIMDBase_LOAD(&s[i1+ 4]); s11 = SIMDBase_LOAD(&s[i1+ 5]);
+
+    SIMDBase_STOR(&s[i2+ 4], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 4], SIMDBase_ADDi(s00, t0r));
+    SIMDBase_STOR(&s[i2+ 5], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 5], SIMDBase_ADDi(s01, t0i));
+    SIMDBase_STOR(&s[i3+ 4], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 4], SIMDBase_ADDi(s10, t1r));
+    SIMDBase_STOR(&s[i3+ 5], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 5], SIMDBase_ADDi(s11, t1i));
+
+    // ---- pair 3: vectors +6/+7, table entries p0+12..p0+15 (same arithmetic) ----
+
+    s20 = SIMDBase_LOAD(&s[i2+ 6]); s21 = SIMDBase_LOAD(&s[i2+ 7]);
+    a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+    u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+
+    s30 = SIMDBase_LOAD(&s[i3+ 6]); s31 = SIMDBase_LOAD(&s[i3+ 7]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+    v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+    t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+    v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+    u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+    v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+    t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+    s00 = SIMDBase_LOAD(&s[i0+ 6]); s01 = SIMDBase_LOAD(&s[i0+ 7]);
+    s10 = SIMDBase_LOAD(&s[i1+ 6]); s11 = SIMDBase_LOAD(&s[i1+ 7]);
+
+    SIMDBase_STOR(&s[i2+ 6], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 6], SIMDBase_ADDi(s00, t0r));
+    SIMDBase_STOR(&s[i2+ 7], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 7], SIMDBase_ADDi(s01, t0i));
+    SIMDBase_STOR(&s[i3+ 6], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 6], SIMDBase_ADDi(s10, t1r));
+    SIMDBase_STOR(&s[i3+ 7], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 7], SIMDBase_ADDi(s11, t1i));
+
+    // Advance all cursors past the 8 vectors just processed and the table
+    // index past the 16 coefficients just consumed.
+
+    i0 += 8; i1 += 8; i2 += 8; i3 += 8;
+    p0 += 16;
+  }
+}
+#endif
+
+// Forward two-input butterfly stage operating in place on p->s.
+//
+// Two cursors are used: i0 = p->offset1 and i2 = i0 + 2*p->stride.  For every
+// even/odd vector pair (the local names t0r / t0i suggest real / imaginary):
+//   s[i0]   <- s[i0] + s[i2]                      (stored unscaled)
+//   s[i2]   <- (s[i0] - s[i2]) combined with two table coefficients t0, t1
+//
+// The coefficient table p->ptTable[p->log2butlen] is walked from both ends at
+// once: cp counts up from 0 while sp counts down from butlen/4 in the first
+// loop, and the directions are swapped in the second loop.  Presumably this
+// is a quarter-wave table from which both cosine and sine are read - confirm
+// against the table initialisation.
+//
+//   First loop  (until sp <= 0):  re' = dr*t0 + di*t1,   im' = di*t0 - dr*t1
+//   Second loop (until cp <= 0):  re' = di*t1 - dr*t0,   im' = -(di*t0 + dr*t1)
+//
+// Each loop body is unrolled four times (8 vectors per cursor, 4 coefficients
+// from each end of the table).
+//
+// NOTE(review): both loops are do/while, so each body runs at least once, and
+// the first body reads tbl[sp-3]; this assumes butlen/4 is a positive
+// multiple of 4 and that the table has at least butlen/4 + 1 entries.
+static void r2ButForwardSub(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+
+  int32_t i0 = p->offset1;
+  int32_t i2 = i0 + p->stride*2;
+  // cp walks the table upward from index 0, sp downward from butlen/4.
+  int32_t cp = 0, sp = p->butlen/4;
+
+  // Phase 1: cp ascending, sp descending.
+  do {
+    SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+
+    // s0/s2 = even/odd vector at i0, s1/s3 = even/odd vector at i2.
+    s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+    s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]);
+    // Differences are kept in registers; sums overwrite the i0 pair.
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3));
+    // Differences combined with (t0, t1) overwrite the i2 pair.
+    SIMDBase_STOR(&s[i2+0], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+    s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+    s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+2], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+    s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+    s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+4], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+    s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+    s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = SIMDBase_LOAD1(&tbl[sp-3]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+6], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+    // Step past the 8 vectors and 4 coefficients handled above.
+
+    i0 += 8; i2 += 8; cp += 4; sp -= 4;
+  } while(sp > 0);
+
+  // Phase 2: table directions reversed (cp descending, sp ascending) and the
+  // signs of the combination changed accordingly.
+  do {
+    SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+
+    s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+    s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+    s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+    s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = SIMDBase_LOAD1(&tbl[sp+1]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+    s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+    s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+    s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+    s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+    // Step past the 8 vectors and 4 coefficients handled above.
+
+    i0 += 8; i2 += 8; cp -= 4; sp += 4;
+  } while(cp > 0);
+}
+
+// Backward two-input butterfly stage operating in place on p->s.
+//
+// Two cursors are used: i0 = p->offset1 and i2 = i0 + 2*p->stride.  For every
+// even/odd vector pair (s0,s2) at i0 and (s1,s3) at i2, the i2 pair is first
+// combined with two table coefficients t0, t1 to give (t0r, t0i), then
+//   s[i0] <- s[i0] + t0      s[i2] <- s[i0] - t0
+//
+// The coefficient table p->ptTable[p->log2butlen] is read from both ends:
+// cp counts up from 0 and sp down from butlen/4 in the first loop, and the
+// directions are swapped in the second loop.  Presumably a quarter-wave table
+// yielding cosine and sine - confirm against the table initialisation.
+//
+//   First loop  (until sp <= 0):  t0r = s1*t0 - s3*t1,     t0i = s1*t1 + s3*t0
+//   Second loop (until cp <= 0):  t0r = -(s1*t0 + s3*t1),  t0i = s1*t1 - s3*t0
+//
+// Each loop body is unrolled four times (8 vectors per cursor, 4 coefficients
+// from each end of the table).
+//
+// NOTE(review): both loops are do/while, so each body runs at least once, and
+// the first body reads tbl[sp-3]; this assumes butlen/4 is a positive
+// multiple of 4 and that the table has at least butlen/4 + 1 entries.
+static void r2ButBackwardSub(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+
+  int i0 = p->offset1;
+  int i2 = i0 + p->stride*2;
+
+  // cp walks the table upward from index 0, sp downward from butlen/4.
+  int cp = 0, sp = p->butlen/4;
+
+  // Phase 1: cp ascending, sp descending.
+  do {
+    SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+
+    // s0/s2 = even/odd vector at i0, s1/s3 = even/odd vector at i2.
+    s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+    s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]);
+    // Apply the coefficients to the i2 pair before the add/subtract.
+    t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+    t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    // Difference goes to the i2 pair, sum to the i0 pair.
+    SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+    s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]);
+    t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+    t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+    s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]);
+    t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+    t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+    s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = SIMDBase_LOAD1(&tbl[sp-3]);
+    t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+    t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i));
+
+    // Step past the 8 vectors and 4 coefficients handled above.
+    i0 += 8; i2 += 8; cp += 4; sp -= 4;
+  } while(sp > 0);
+
+  // Phase 2: table directions reversed (cp descending, sp ascending) and the
+  // signs of the combination changed accordingly.
+  do {
+    SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+
+    s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+    s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]);
+    t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+    t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+    s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = SIMDBase_LOAD1(&tbl[sp+1]);
+    t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+    t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+    s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]);
+    t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+    t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+    s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]);
+    t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+    t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i));
+
+    // Step past the 8 vectors and 4 coefficients handled above.
+    i0 += 8; i2 += 8; cp -= 4; sp += 4;
+  } while(cp > 0);
+}
+
+// Forward butterfly for a 16-point block whose first vector is at p->offset1.
+// Order of work: the unrolled length-16 stage over the whole block, then the
+// 4-point routine at vector offsets +24 and +16, then the 8-point routine at
+// +0.  p->butlen, p->log2butlen and p->stride are overwritten; p->offset1 is
+// reset to its entry value before the final call.
+static void srButForward16(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+
+  // Configure and run the outermost stage first.
+  p->butlen     = 16;
+  p->log2butlen = 4;
+  p->stride     = p->butlen/2;
+  srButForwardSubUnrolled(p);
+
+  // Two 4-point sub-blocks in the upper half (offsets 24 and 16).
+  p->offset1 = base + 16*6/4;
+  srButForward4(p);
+  p->offset1 = base + 16*4/4;
+  srButForward4(p);
+
+  // One 8-point sub-block in the lower half.
+  p->offset1 = base;
+  srButForward8(p);
+}
+
+// Backward butterfly for a 16-point block whose first vector is at p->offset1.
+// Order of work: the 4-point routine at vector offsets +24 and +16, then the
+// 8-point routine at +0, and finally the unrolled length-16 stage over the
+// whole block.  p->offset1 is reset to its entry value before the 8-point
+// call; p->butlen, p->log2butlen and p->stride are overwritten.
+static void srButBackward16(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+
+  // Two 4-point sub-blocks in the upper half (offsets 24 and 16).
+  p->offset1 = base + 16*6/4;
+  srButBackward4(p);
+  p->offset1 = base + 16*4/4;
+  srButBackward4(p);
+
+  // One 8-point sub-block in the lower half.
+  p->offset1 = base;
+  srButBackward8(p);
+
+  // Outermost stage runs last, with the block start still in p->offset1.
+  p->butlen     = 16;
+  p->log2butlen = 4;
+  p->stride     = p->butlen/2;
+  srButBackwardSubUnrolled(p);
+}
+
+// Forward butterfly for a 32-point block whose first vector is at p->offset1.
+// Order of work: the unrolled length-32 stage over the whole block, then the
+// 8-point routine at vector offsets +48 and +32, then the 16-point routine at
+// +0.  p->butlen, p->log2butlen and p->stride are overwritten; p->offset1 is
+// reset to its entry value before the final call.
+static void srButForward32(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+
+  // Configure and run the outermost stage first.
+  p->butlen     = 32;
+  p->log2butlen = 5;
+  p->stride     = p->butlen/2;
+  srButForwardSubUnrolled(p);
+
+  // Two 8-point sub-blocks in the upper half (offsets 48 and 32).
+  p->offset1 = base + 32*6/4;
+  srButForward8(p);
+  p->offset1 = base + 32*4/4;
+  srButForward8(p);
+
+  // One 16-point sub-block in the lower half.
+  p->offset1 = base;
+  srButForward16(p);
+}
+
+// Backward butterfly for a 32-point block whose first vector is at p->offset1.
+// Order of work: the 8-point routine at vector offsets +48 and +32, then the
+// 16-point routine at +0, and finally the unrolled length-32 stage over the
+// whole block.  p->offset1 is reset to its entry value before the 16-point
+// call; p->butlen, p->log2butlen and p->stride are overwritten.
+static void srButBackward32(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+
+  // Two 8-point sub-blocks in the upper half (offsets 48 and 32).
+  p->offset1 = base + 32*6/4;
+  srButBackward8(p);
+  p->offset1 = base + 32*4/4;
+  srButBackward8(p);
+
+  // One 16-point sub-block in the lower half.
+  p->offset1 = base;
+  srButBackward16(p);
+
+  // Outermost stage runs last.
+  p->butlen     = 32;
+  p->log2butlen = 5;
+  p->stride     = p->butlen/2;
+  srButBackwardSubUnrolled(p);
+}
+
+//
+
+#if 1
+// Exchanges the two consecutive vectors starting at p with the two starting
+// at q.  All four vectors are loaded before any store is issued.
+static inline void bitReversalUnit(SIMDBase_VECT *p, SIMDBase_VECT *q) {
+  SIMDBase_VECT fromP0 = SIMDBase_LOAD(p);
+  SIMDBase_VECT fromP1 = SIMDBase_LOAD(p+1);
+  SIMDBase_VECT fromQ0 = SIMDBase_LOAD(q);
+  SIMDBase_VECT fromQ1 = SIMDBase_LOAD(q+1);
+
+  SIMDBase_STOR(q, fromP0);
+  SIMDBase_STOR(q+1, fromP1);
+  SIMDBase_STOR(p, fromQ0);
+  SIMDBase_STOR(p+1, fromQ1);
+}
+#else
+// Macro form of the same exchange; currently compiled out by the "#if 1" above.
+#define bitReversalUnit(p0, q0) {                    \
+  SIMDBase_VECT *px = (p0), *qx = (q0);              \
+  SIMDBase_VECT wx, xx, yx, zx;                      \
+                                                     \
+  wx = SIMDBase_LOAD(px); xx = SIMDBase_LOAD(px+1);  \
+  yx = SIMDBase_LOAD(qx); zx = SIMDBase_LOAD(qx+1);  \
+                                                     \
+  SIMDBase_STOR(qx, wx); SIMDBase_STOR(qx+1, xx);    \
+  SIMDBase_STOR(px, yx); SIMDBase_STOR(px+1, zx);    \
+}
+#endif
+
+// 4-element bit reversal inside a block (bitReversalRecursive calls this
+// when o1 == o2).  An element is a pair of vectors and elements are sc
+// apart, so element k of the block at offset o starts at s[(o + k*sc)*2].
+// Only elements 1 and 2 trade places; 0 and 3 are their own bit reversals.
+static inline void bitReversal4s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  int elem1 = sc*2*1;      // vector offset of element 1
+  int elem2 = elem1*2;     // vector offset of element 2
+
+  bitReversalUnit(&s[o1*2] + elem1, &s[o2*2] + elem2);
+}
+
+// 8-element bit reversal inside a block (bitReversalRecursive calls this
+// when o1 == o2).  An element is a pair of vectors and elements are sc
+// apart.  Two exchanges are performed: element 1 <-> 4 and element 3 <-> 6.
+static inline void bitReversal8s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  int one = sc*2*1, two = one*2, four = two*2;   // vector offsets of elements 1, 2, 4
+  SIMDBase_VECT *lhs = &s[o1*2] + one;
+  SIMDBase_VECT *rhs = &s[o2*2] + four;
+
+  bitReversalUnit(lhs, rhs);                 // 1 <-> 4
+  bitReversalUnit(lhs + two, rhs + two);     // 3 <-> 6
+}
+
+// 8-element bit-reversal exchange between two blocks (bitReversalRecursive
+// calls this when o1 != o2).  An element is a pair of vectors and elements
+// are sc apart, so element k of a block at offset o starts at s[(o + k*sc)*2].
+// Every element i of the o1 block is swapped with element bitrev3(i) of the
+// o2 block; p and q are walked incrementally so that each step needs only one
+// add or subtract per pointer.  The trailing comments give the
+// (p element <-> q element) pair swapped by the call on that line.
+static inline void bitReversal8d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2];
+  int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2;   // vector offsets of elements 1, 2, 4
+  bitReversalUnit(p, q); p += b1; q += b4;   // 0 <-> 0
+  bitReversalUnit(p, q); p += b2; q += b2;   // 1 <-> 4
+  bitReversalUnit(p, q); p -= b1; q -= b4;   // 3 <-> 6
+  bitReversalUnit(p, q); p += b4; q += b1;   // 2 <-> 2
+  bitReversalUnit(p, q); p += b1; q += b4;   // 6 <-> 3
+  bitReversalUnit(p, q); p -= b2; q -= b2;   // 7 <-> 7
+  bitReversalUnit(p, q); p -= b1; q -= b4;   // 5 <-> 5
+  bitReversalUnit(p, q);                     // 4 <-> 1
+}
+
+// 16-element bit reversal inside a block (bitReversalRecursive calls this
+// when o1 == o2).  An element is a pair of vectors and elements are sc apart.
+// Only the six pairs {i, bitrev4(i)} with i != bitrev4(i) are exchanged, each
+// once; elements 0, 6, 9 and 15 are their own bit reversals and stay put.
+// The trailing comments give the (p element <-> q element) pair swapped by
+// the call on that line.
+static inline void bitReversal16s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2];
+  int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2;   // vector offsets of elements 1, 2, 4, 8
+  p += b1; q += b8;
+  bitReversalUnit(p, q); p += b2; q += b4;             //  1 <->  8
+  bitReversalUnit(p, q); p -= b1; q -= b8;             //  3 <-> 12
+  bitReversalUnit(p, q); p += b1 + b4; q += b2 + b8;   //  2 <->  4
+  bitReversalUnit(p, q); p -= b2; q -= b4;             //  7 <-> 14
+  bitReversalUnit(p, q); p += b2 + b4; q += b1 + b2;   //  5 <-> 10
+  bitReversalUnit(p, q);                               // 11 <-> 13
+}
+
+// 16-element bit-reversal exchange between two blocks (bitReversalRecursive
+// calls this when o1 != o2).  An element is a pair of vectors and elements
+// are sc apart.  Every element i of the o1 block is swapped with element
+// bitrev4(i) of the o2 block, 16 exchanges in total.  The trailing comments
+// give the (p element <-> q element) pair swapped by the call on that line.
+static inline void bitReversal16d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2];
+  int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2;   // vector offsets of elements 1, 2, 4, 8
+  bitReversalUnit(p, q); p += b1; q += b8;   //  0 <->  0
+  bitReversalUnit(p, q); p += b2; q += b4;   //  1 <->  8
+  bitReversalUnit(p, q); p -= b1; q -= b8;   //  3 <-> 12
+  bitReversalUnit(p, q); p += b4; q += b2;   //  2 <->  4
+  bitReversalUnit(p, q); p += b1; q += b8;   //  6 <->  6
+  bitReversalUnit(p, q); p -= b2; q -= b4;   //  7 <-> 14
+  bitReversalUnit(p, q); p -= b1; q -= b8;   //  5 <-> 10
+  bitReversalUnit(p, q); p += b8; q += b1;   //  4 <->  2
+  bitReversalUnit(p, q); p += b1; q += b8;   // 12 <->  3
+  bitReversalUnit(p, q); p += b2; q += b4;   // 13 <-> 11
+  bitReversalUnit(p, q); p -= b1; q -= b8;   // 15 <-> 15
+  bitReversalUnit(p, q); p -= b4; q -= b2;   // 14 <->  7
+  bitReversalUnit(p, q); p += b1; q += b8;   // 10 <->  5
+  bitReversalUnit(p, q); p -= b2; q -= b4;   // 11 <-> 13
+  bitReversalUnit(p, q); p -= b1; q -= b8;   //  9 <->  9
+  bitReversalUnit(p, q);                     //  8 <->  1
+}
+
+// 32-element bit reversal inside a block (bitReversalRecursive calls this
+// when o1 == o2).  An element is a pair of vectors and elements are sc apart.
+// Only the twelve pairs {i, bitrev5(i)} with i != bitrev5(i) are exchanged,
+// each once; the eight elements that are their own bit reversals stay put.
+// The trailing comments give the (p element <-> q element) pair swapped by
+// the call on that line.
+static inline void bitReversal32s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2];
+  int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2, b16 = b8*2;   // vector offsets of elements 1, 2, 4, 8, 16
+  p += b1; q += b16;
+  bitReversalUnit(p, q); p += b2; q += b8;                   //  1 <-> 16
+  bitReversalUnit(p, q); p -= b1; q -= b16;                  //  3 <-> 24
+  bitReversalUnit(p, q); p += b4; q += b4;                   //  2 <->  8
+  bitReversalUnit(p, q); p += b1; q += b16;                  //  6 <-> 12
+  bitReversalUnit(p, q); p -= b2; q -= b8;                   //  7 <-> 28
+  bitReversalUnit(p, q); p += b8; q += b2;                   //  5 <-> 20
+  bitReversalUnit(p, q); p += b2; q += b8;                   // 13 <-> 22
+  bitReversalUnit(p, q); p -= b4; q -= b4;                   // 15 <-> 30
+  bitReversalUnit(p, q); p -= b2; q -= b8;                   // 11 <-> 26
+  bitReversalUnit(p, q); p += b16 - b2; q += b1 + b2 + b8;   //  9 <-> 18
+  bitReversalUnit(p, q); p -= b4; q -= b4;                   // 23 <-> 29
+  bitReversalUnit(p, q);                                     // 19 <-> 25
+}
+
+// 32-element bit-reversal exchange between two blocks, expressed as four
+// 8-element exchanges at doubled spacing (2*sc).  The 8-element routine
+// handles the three middle index bits; the lowest and highest index bits are
+// handled here by pairing a sub-block offset of sc*16 on one side with sc*1
+// on the other (and 0 with 0, sc*17 with sc*17).
+static void bitReversal32d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  const int32_t k = 32;
+  const int32_t step    = 2*sc;
+  const int32_t lowOff  = sc*1;          // sub-block selected by the lowest index bit
+  const int32_t highOff = sc*(k/2);      // sub-block selected by the highest index bit
+  const int32_t bothOff = sc*(k/2+1);    // both bits set
+
+  bitReversal8d(s, step, highOff+o1, lowOff +o2);
+  bitReversal8d(s, step, sc*0   +o1, sc*0   +o2);
+  bitReversal8d(s, step, lowOff +o1, highOff+o2);
+  bitReversal8d(s, step, bothOff+o1, bothOff+o2);
+}
+
+// Bit-reversal exchange of n elements between the blocks at offsets o1 and
+// o2 of s (elements are sc apart; o1 == o2 means a single block is permuted
+// in place).
+//
+// For n >= 64 the problem is split into quarter-size sub-problems at doubled
+// spacing, pairing the sub-block picked by the lowest index bit on one side
+// with the one picked by the highest index bit on the other.  When
+// o1 == o2 the (high, low) sub-problem is skipped because the (low, high)
+// call already covers that exchange.
+//
+// For n < 64 a fixed-size routine is dispatched: the "s" variants for a
+// single block (n = 4, 8, 16, 32) and the "d" variants for two blocks
+// (n = 8, 16, 32).  Any other n does nothing.
+static void bitReversalRecursive(SIMDBase_VECT *s, int32_t n, int32_t sc, int32_t o1, int32_t o2) {
+  const int sameBlock = (o1 == o2);
+
+  if (n < 64) {
+    if (sameBlock) {
+      if      (n ==  4) bitReversal4s (s,sc,o1,o2);
+      else if (n ==  8) bitReversal8s (s,sc,o1,o2);
+      else if (n == 16) bitReversal16s(s,sc,o1,o2);
+      else if (n == 32) bitReversal32s(s,sc,o1,o2);
+    } else {
+      if      (n ==  8) bitReversal8d (s,sc,o1,o2);
+      else if (n == 16) bitReversal16d(s,sc,o1,o2);
+      else if (n == 32) bitReversal32d(s,sc,o1,o2);
+    }
+    return;
+  }
+
+  {
+    const int32_t sub  = n/4;
+    const int32_t step = 2*sc;
+
+    if (!sameBlock) {
+      bitReversalRecursive(s, sub, step, sc*(n/2)+o1, sc*1+o2);
+    }
+
+    bitReversalRecursive(s, sub, step, sc*0+o1,       sc*0+o2);
+    bitReversalRecursive(s, sub, step, sc*1+o1,       sc*(n/2)+o2);
+    bitReversalRecursive(s, sub, step, sc*(n/2+1)+o1, sc*(n/2+1)+o2);
+  }
+}
+
+//
+
+// Returns the value obtained by reversing the lowest logN bits of a
+// (bit 0 becomes bit logN-1 and so on).  Bits of a at position logN and
+// above are ignored.
+//
+// logN <= 0 yields 0.  The guard is needed because the caller may pass
+// logN == 0, and shifting by (logN-1) would then be a shift by a negative
+// count, which is undefined behaviour in C.  The work is done on unsigned
+// values so that no signed shift can overflow for large logN.
+static int bitR(int a, int logN) {
+  unsigned int in = (unsigned int)a;
+  unsigned int out = 0;
+  int i;
+
+  if (logN <= 0) return 0;
+
+  // Peel bits off the low end of the input and push them onto the low end
+  // of the output; after logN steps the order is reversed.
+  for(i=0;i<logN;i++) {
+    out = (out << 1) | (in & 1u);
+    in >>= 1;
+  }
+  return (int)out;
+}
+
+static void bitReversalCobraInplace(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  int cobraQ = p->cobraQ;
+  SIMDBase_VECT *cobraT = p->cobraT;
+  int *cobraR = p->cobraR;
+  int logN = p->log2len;
+
+  int b;
+
+  for(b=0;b<(1 << (logN-2*cobraQ));b++) {
+    int a,c;
+    int b2 = bitR(b, logN-2*cobraQ);
+
+    if (b2 < b) continue;
+
+    if (b2 == b) {
+      for(a=0;a<(1 << cobraQ);a++) {
+	int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1);
+
+	int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2;
+
+	while(a2c < a2cm) {
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	}
+      }
+
+      for(c=0;c<(1 << cobraQ);c++) {
+	int c2 = cobraR[c];
+	int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1);
+
+	int a2c = c << 1;
+	int a2ci = 1 << (cobraQ+1);
+	int c2b2a2m = c2b2a2 + (1 << cobraQ)*2;
+
+	while(c2b2a2 < c2b2a2m) {
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	}
+      }
+    } else {
+      for(a=0;a<(1 << cobraQ);a++) {
+	int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2;
+	int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1);
+
+	while(a2c < a2cm) {
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	}
+      }
+
+      for(c=0;c<(1 << cobraQ);c++) {
+	int c2 = cobraR[c];
+	int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1);
+
+	int a2c = c << 1;
+	int a2ci = 1 << (cobraQ+1);
+	int c2b2a2m = c2b2a2 + (1 << cobraQ)*2;
+
+	while(c2b2a2 < c2b2a2m) {
+	  SIMDBase_VECT t0, t1, t2, t3, t4, t5, t6, t7;
+
+	  t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+	  t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+	  t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+	  t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t0);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t2);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t4);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t6);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+
+	  t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+	  t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+	  t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+	  t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t0);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t2);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t4);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t6);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+
+	  t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+	  t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+	  t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+	  t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t0);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t2);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t4);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t6);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+
+	  t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+	  t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+	  t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+	  t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t0);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t2);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t4);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t6);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+	}
+      }
+
+      for(a=0;a<(1 << cobraQ);a++) {
+	int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2;
+	int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1);
+
+	while(a2c < a2cm) {
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	}
+      }
+    }
+  }
+}
+
+//
+
+/*
+ * Recursive driver for the forward transform of one sub-block.
+ *
+ * On entry p->offset1, p->butlen and p->log2butlen describe the sub-block.
+ * A butterfly pass with stride butlen/2 is applied first, then the pieces
+ * are processed in this order:
+ *   - butlen >= p->radix2thres: r2ButForwardSub, then recursion on the
+ *     pieces at offset+butlen and at offset, both of length butlen/2;
+ *   - otherwise: srButForwardSubUnrolled, then the two quarter-length
+ *     pieces at offset+butlen*6/4 and offset+butlen, then the half-length
+ *     piece at offset.  Pieces of length 32 and 16 go to the fixed-size
+ *     kernels instead of recursing.
+ *
+ * The plan fields serve as the argument channel: they are rewritten before
+ * every call and are not restored on return.
+ */
+static void srForwardMain2(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+  const int32_t len = p->butlen;
+  const int32_t loglen = p->log2butlen;
+
+  // Every case opens with a butterfly pass at half the block length.
+  p->stride = p->butlen/2;
+
+  if (len >= p->radix2thres) {
+    r2ButForwardSub(p);
+
+    p->offset1 = base + len*4/4;
+    p->butlen = len/2;
+    p->log2butlen = loglen-1;
+    srForwardMain2(p);
+
+    p->offset1 = base;
+    p->butlen = len/2;
+    p->log2butlen = loglen-1;
+    srForwardMain2(p);
+
+    return;
+  }
+
+  srButForwardSubUnrolled(p);
+
+  if (len >= 256) {
+    p->offset1 = base + len*6/4;
+    p->butlen = len/4;
+    p->log2butlen = loglen-2;
+    srForwardMain2(p);
+
+    p->offset1 = base + len*4/4;
+    p->butlen = len/4;
+    p->log2butlen = loglen-2;
+    srForwardMain2(p);
+  } else if (len == 128) {
+    // quarter pieces have length 32: use the fixed-size kernel
+    p->offset1 = base + len*6/4;
+    srButForward32(p);
+
+    p->offset1 = base + len*4/4;
+    srButForward32(p);
+  } else {
+    // len == 64: every piece is handled by a fixed-size kernel
+    p->offset1 = base + len*6/4;
+    srButForward16(p);
+
+    p->offset1 = base + len*4/4;
+    srButForward16(p);
+
+    p->offset1 = base;
+    srButForward32(p);
+
+    return;
+  }
+
+  // len >= 128: the half-length piece at the block start is still recursive
+  p->offset1 = base;
+  p->butlen = len/2;
+  p->log2butlen = loglen-1;
+  srForwardMain2(p);
+}
+
+/*
+ * Recursive driver for the backward transform of one sub-block.
+ *
+ * Mirror image of the forward driver: the pieces of the block described by
+ * p->offset1, p->butlen and p->log2butlen are processed first, and the
+ * butterfly pass over the whole block runs last.
+ *   - butlen >= p->radix2thres: two half-length pieces (offset+butlen, then
+ *     offset), finished by r2ButBackwardSub;
+ *   - otherwise: two quarter-length pieces (offset+butlen*6/4, then
+ *     offset+butlen) and one half-length piece at offset, finished by
+ *     srButBackwardSubUnrolled.  Pieces of length 32 and 16 go to the
+ *     fixed-size kernels.
+ *
+ * Before the final pass butlen, stride and log2butlen are set back to this
+ * level's values, because the calls on the pieces overwrite them.
+ */
+static void srBackwardMain2(DFTUndiff *p) {
+  const int32_t base = p->offset1;
+  const int32_t len = p->butlen;
+  const int32_t loglen = p->log2butlen;
+  const int radix2 = (len >= p->radix2thres);
+
+  if (radix2) {
+    p->offset1 = base + len*4/4;
+    p->butlen = len/2;
+    p->log2butlen = loglen-1;
+    srBackwardMain2(p);
+
+    p->offset1 = base;
+    p->butlen = len/2;
+    p->log2butlen = loglen-1;
+    srBackwardMain2(p);
+  } else if (len >= 256) {
+    p->offset1 = base + len*6/4;
+    p->butlen = len/4;
+    p->log2butlen = loglen-2;
+    srBackwardMain2(p);
+
+    p->offset1 = base + len*4/4;
+    p->butlen = len/4;
+    p->log2butlen = loglen-2;
+    srBackwardMain2(p);
+
+    p->offset1 = base;
+    p->butlen = len/2;
+    p->log2butlen = loglen-1;
+    srBackwardMain2(p);
+  } else if (len == 128) {
+    p->offset1 = base + len*6/4;
+    srButBackward32(p);
+
+    p->offset1 = base + len*4/4;
+    srButBackward32(p);
+
+    p->offset1 = base;
+    p->butlen = len/2;
+    p->log2butlen = loglen-1;
+    srBackwardMain2(p);
+  } else {
+    // len == 64
+    p->offset1 = base + len*6/4;
+    srButBackward16(p);
+
+    p->offset1 = base + len*4/4;
+    srButBackward16(p);
+
+    p->offset1 = base;
+    srButBackward32(p);
+  }
+
+  // Back at this level: restore its parameters and run the closing pass.
+  p->butlen = len;
+  p->stride = p->butlen/2;
+  p->log2butlen = loglen;
+
+  if (radix2) {
+    r2ButBackwardSub(p);
+  } else {
+    srButBackwardSubUnrolled(p);
+  }
+}
+
+/*
+ * Entry point of the forward transform.  Lengths below 64 are dispatched
+ * straight to the matching fixed-size kernel (32, 16, 8, 4 or 2; any other
+ * length does nothing); longer transforms start the recursive driver on the
+ * whole buffer.
+ */
+static void srForwardMain(DFTUndiff *p) {
+  const uint32_t len = p->length;
+
+  if (len < 64) {
+    if (len == 32) {
+      srButForward32(p);
+    } else if (len == 16) {
+      srButForward16(p);
+    } else if (len == 8) {
+      srButForward8(p);
+    } else if (len == 4) {
+      srButForward4(p);
+    } else if (len == 2) {
+      srBut2(p);
+    }
+    return;
+  }
+
+  // The top-level block covers the whole transform, starting at offset 0.
+  p->butlen = p->length;
+  p->log2butlen = p->log2len;
+  p->offset2 = 0;
+  p->offset1 = 0;
+
+  srForwardMain2(p);
+}
+
+/*
+ * Entry point of the backward transform.  Lengths below 64 are dispatched
+ * straight to the matching fixed-size kernel (32, 16, 8, 4 or 2; any other
+ * length does nothing); longer transforms start the recursive driver on the
+ * whole buffer.
+ */
+static void srBackwardMain(DFTUndiff *p) {
+  const uint32_t len = p->length;
+
+  if (len < 64) {
+    if (len == 32) {
+      srButBackward32(p);
+    } else if (len == 16) {
+      srButBackward16(p);
+    } else if (len == 8) {
+      srButBackward8(p);
+    } else if (len == 4) {
+      srButBackward4(p);
+    } else if (len == 2) {
+      srBut2(p);
+    }
+    return;
+  }
+
+  // The top-level block covers the whole transform, starting at offset 0.
+  p->butlen = p->length;
+  p->log2butlen = p->log2len;
+  p->offset2 = 0;
+  p->offset1 = 0;
+
+  srBackwardMain2(p);
+}
+
+/*
+ * Real-transform helper.  With n = 2 * p->length, every pair of elements
+ * k and n/2-k (k = 1 .. n/4-1, two vectors per element) is combined using
+ * the coefficient pair p->rtTable[ts][2k], p->rtTable[ts][2k+1].  Afterwards
+ * s[0] and s[1] are replaced by their sum and difference.
+ *
+ * p  : plan; only length and rtTable are read here
+ * s  : data buffer, updated in place
+ * ts : index of the coefficient table to use (callers pass 0 or 1)
+ */
+static void realSub0(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) {
+  const int32_t n = p->length*2;
+  int32_t k;
+
+  for(k=1;k<n/4;k++) {
+    SIMDBase_VECT *lo = &s[k*2];
+    SIMDBase_VECT *hi = &s[(n/2-k)*2];
+    SIMDBase_REAL *w = &(p->rtTable[ts][k*2]);
+
+    SIMDBase_VECT loRe = SIMDBase_LOAD(&lo[0]), loIm = SIMDBase_LOAD(&lo[1]);
+    SIMDBase_VECT hiRe = SIMDBase_LOAD(&hi[0]), hiIm = SIMDBase_LOAD(&hi[1]);
+
+    SIMDBase_VECT dr = SIMDBase_SUBi(loRe, hiRe);
+    SIMDBase_VECT di = SIMDBase_ADDi(loIm, hiIm);
+    SIMDBase_VECT wr = SIMDBase_LOAD1(&w[0]);
+    SIMDBase_VECT wi = SIMDBase_LOAD1(&w[1]);
+
+    // (mr, mi) = (dr, di) * (wr, wi), complex product
+    SIMDBase_VECT mr = SIMDBase_SUBi(SIMDBase_MULi(dr, wr), SIMDBase_MULi(di, wi));
+    SIMDBase_VECT mi = SIMDBase_ADDi(SIMDBase_MULi(dr, wi), SIMDBase_MULi(di, wr));
+
+    SIMDBase_STOR(&lo[0], SIMDBase_SUBi(loRe, mr));
+    SIMDBase_STOR(&lo[1], SIMDBase_SUBi(loIm, mi));
+    SIMDBase_STOR(&hi[0], SIMDBase_ADDi(hiRe, mr));
+    SIMDBase_STOR(&hi[1], SIMDBase_SUBi(hiIm, mi));
+  }
+
+  {
+    SIMDBase_VECT first = SIMDBase_LOAD(&s[0]);
+    SIMDBase_VECT second = SIMDBase_LOAD(&s[1]);
+
+    SIMDBase_STOR(&s[0], SIMDBase_ADDi(first, second));
+    SIMDBase_STOR(&s[1], SIMDBase_SUBi(first, second));
+  }
+}
+
+/*
+ * Real-transform helper, counterpart of realSub0.  s[0] and s[1] are first
+ * replaced by half their sum and half their difference.  Then, with
+ * n = 2 * p->length, every pair of elements k and n/2-k (k = 1 .. n/4-1,
+ * two vectors per element) is combined using the coefficient pair
+ * p->rtTable[ts][2k], p->rtTable[ts][2k+1]; the two results are written
+ * back to positions k and n/2-k.
+ *
+ * p  : plan; only length and rtTable are read here
+ * s  : data buffer, updated in place
+ * ts : index of the coefficient table to use (callers pass 0 or 1)
+ */
+static void realSub1(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) {
+  const int32_t n = p->length*2;
+  int32_t k;
+
+  {
+    SIMDBase_VECT first = SIMDBase_LOAD(&s[0]);
+    SIMDBase_VECT second = SIMDBase_LOAD(&s[1]);
+
+    SIMDBase_STOR(&s[0], SIMDBase_MULi(SIMDBase_ADDi(first, second), SIMDBase_SET1(0.5)));
+    SIMDBase_STOR(&s[1], SIMDBase_MULi(SIMDBase_SUBi(first, second), SIMDBase_SET1(0.5)));
+  }
+
+  for(k=1;k<n/4;k++) {
+    SIMDBase_VECT *lo = &s[k*2];
+    SIMDBase_VECT *hi = &s[(n/2-k)*2];
+    SIMDBase_REAL *w = &(p->rtTable[ts][k*2]);
+
+    SIMDBase_VECT loRe = SIMDBase_LOAD(&lo[0]), loIm = SIMDBase_LOAD(&lo[1]);
+    SIMDBase_VECT hiRe = SIMDBase_LOAD(&hi[0]), hiIm = SIMDBase_LOAD(&hi[1]);
+
+    SIMDBase_VECT dr = SIMDBase_SUBi(loRe, hiRe);
+    SIMDBase_VECT di = SIMDBase_ADDi(loIm, hiIm);
+    SIMDBase_VECT wr = SIMDBase_LOAD1(&w[0]);
+    SIMDBase_VECT wi = SIMDBase_LOAD1(&w[1]);
+
+    // (mr, mi) = (dr, di) * (wr, wi), complex product
+    SIMDBase_VECT mr = SIMDBase_SUBi(SIMDBase_MULi(dr, wr), SIMDBase_MULi(di, wi));
+    SIMDBase_VECT mi = SIMDBase_ADDi(SIMDBase_MULi(dr, wi), SIMDBase_MULi(di, wr));
+
+    // values destined for position n/2-k, computed before anything is stored
+    SIMDBase_VECT outHiRe = SIMDBase_SUBi(loRe, mr);
+    SIMDBase_VECT outHiIm = SIMDBase_SUBi(mi, loIm);
+
+    SIMDBase_STOR(&lo[0], SIMDBase_ADDi(mr, hiRe));
+    SIMDBase_STOR(&lo[1], SIMDBase_SUBi(mi, hiIm));
+    SIMDBase_STOR(&hi[0], outHiRe);
+    SIMDBase_STOR(&hi[1], outHiIm);
+  }
+}
+
+/*
+ * Runs a transform in place on the buffer s2 using plan p2.
+ *
+ * p2  : plan created by the plan-making functions; the process is aborted
+ *       when its magic number does not match MAGIC_DFT
+ * s2  : data buffer (array of SIMDBase_VECT), recorded in the plan as p->s
+ * dir : -1 selects the forward path, any other value the backward path
+ *
+ * Forward:  [realSub1 if ALT_REAL] -> forward passes -> bit reversal ->
+ *           [realSub0 and negation of s[length+1] if REAL]
+ * Backward: [negation of s[length+1] and realSub1 if REAL] -> bit reversal ->
+ *           backward passes -> [realSub0 if ALT_REAL]
+ * Bit reversal is skipped when DFT_FLAG_NO_BITREVERSAL is set.
+ */
+void DFTUndiff_EXECUTE(void *p2, void *s2, int32_t dir) {
+  DFTUndiff *p = (DFTUndiff *)p2;
+  SIMDBase_VECT *s = (SIMDBase_VECT *)s2;
+  const int forward = (dir == -1);
+
+  if (p->magic != MAGIC_DFT) abort();
+
+  p->s = s;
+
+  // Stage 1: everything that has to happen before the bit reversal.
+  if (forward) {
+    if ((p->flags & DFT_FLAG_ALT_REAL) != 0) {
+      realSub1(p, s, 0);
+    }
+
+    srForwardMain(p);
+  } else if ((p->flags & DFT_FLAG_REAL) != 0) {
+    s[p->length+1] = SIMDBase_NEGi(s[p->length+1]);
+    realSub1(p, s, 1);
+  }
+
+  // Stage 2: bit reversal, identical for both directions.
+  if ((p->flags & DFT_FLAG_NO_BITREVERSAL) == 0) {
+    if (p->useCobra) {
+      bitReversalCobraInplace(p);
+    } else {
+      bitReversalRecursive(p->s, p->length, 1, 0, 0);
+    }
+  }
+
+  // Stage 3: everything that has to happen after the bit reversal.
+  if (forward) {
+    if ((p->flags & DFT_FLAG_REAL) != 0) {
+      realSub0(p, s, 0);
+      s[p->length+1] = SIMDBase_NEGi(s[p->length+1]);
+    }
+    return;
+  }
+
+  srBackwardMain(p);
+
+  if ((p->flags & DFT_FLAG_ALT_REAL) != 0) {
+    realSub0(p, s, 1);
+  }
+}
+
+/*
+ * Releases a plan and every table it owns.  The process is aborted when the
+ * plan's magic number does not match MAGIC_DFT.  The magic number is cleared
+ * before the plan structure itself is freed.
+ */
+void DFTUndiff_DESTROYPLAN(void *p2) {
+  DFTUndiff *d = (DFTUndiff *)p2;
+
+  if (d->magic != MAGIC_DFT) abort();
+
+  // Tables for real transforms exist only when the plan was made with them.
+  if (d->rtTable != NULL) {
+    free(d->rtTable[0]);
+    free(d->rtTable[1]);
+    free(d->rtTable);
+  }
+
+  // Bit-reversal work areas.
+  free(d->cobraR);
+  free(d->cobraT);
+
+  // ptTable[0] is the start of the single trig-table allocation.
+  free(d->ptTable[0]);
+  free(d->ptTable);
+
+  d->magic = 0;
+  free(d);
+}
+
+/*
+ * Builds a plan with an explicit radix-2 threshold and bit-reversal choice.
+ *
+ * n           : transform length; halved internally when DFT_FLAG_REAL or
+ *               DFT_FLAG_ALT_REAL is set
+ * radix2thres : sub-block length from which the radix-2 butterflies are used
+ * useCobra    : non-zero to request the Cobra bit reversal; forced to 0 when
+ *               the Cobra work areas are not allocated (see below)
+ * flags       : DFT_FLAG_* bits, stored in the plan
+ *
+ * Returns the new plan.  The process is aborted when the plan structure or
+ * one of its pointer tables cannot be allocated.
+ */
+DFTUndiff *DFTUndiff_MAKEPLANSUB(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags) {
+  int32_t i, j, k;
+
+  uint32_t linesize = SIMDBase_sizeOfCachelineInByte();
+  uint32_t cachesize = SIMDBase_sizeOfDataCacheInByte();
+
+  //
+
+  if ((flags & DFT_FLAG_REAL) != 0 || (flags & DFT_FLAG_ALT_REAL) != 0) n /= 2;
+
+  DFTUndiff *d = calloc(1, sizeof(DFTUndiff));
+  if (d == NULL) abort();
+
+  d->magic = MAGIC_DFT;
+  d->mode = SIMDBase_MODE;
+  d->flags = flags;
+
+  d->radix2thres = radix2thres;
+  d->useCobra = useCobra;
+
+  d->length = (uint32_t) n;
+  d->log2len = DFT_ilog2((uint32_t) n);
+
+  // Trig tables: one allocation, with ptTable[j] pointing at the part that
+  // belongs to sub-block length 2^j.
+
+  SIMDBase_REAL *trigTable = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*n*2);
+  d->ptTable = malloc(sizeof(SIMDBase_REAL *) * (d->log2len+1));
+  if (d->ptTable == NULL) abort();
+
+  SIMDBase_REAL *p = trigTable, **pp = d->ptTable;
+
+  for(j=0;j<(int32_t)d->log2len+1;j++) {
+    *pp++ = p;
+
+    if ((1 << j) >= d->radix2thres) {
+      // radix-2 levels: cosines of the first quarter turn only
+      for(i=0;i<(1 << j)/4+1;i++) {
+        *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j));
+      }
+      // pad so that the next level starts on a cache-line multiple
+      const int32_t step = linesize / sizeof(SIMDBase_REAL);
+      p += (step - (p - trigTable) % step) % step;
+    } else {
+      // other levels: cos/sin at angle 2*pi*i/2^j and at three times it
+      for(i=0;i<(1 << j)/4;i++) {
+        *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j));
+        *p++ = (SIMDBase_REAL)SIN(-2*M_PIl*i/(1 << j));
+        *p++ = (SIMDBase_REAL)COS(-6*M_PIl*i/(1 << j));
+        *p++ = (SIMDBase_REAL)SIN(-6*M_PIl*i/(1 << j));
+      }
+    }
+  }
+
+  // Cobra bit reversal: pick the largest cobraQ whose 2^(2*cobraQ)-element
+  // work area stays within half of the reported data cache.
+
+  int32_t cobraQ;
+
+  cobraQ = linesize / (sizeof(SIMDBase_VECT) * 2);
+
+  for(;;) {
+    if (1 << (cobraQ*2) >
+        (cachesize / (sizeof(SIMDBase_VECT) * 2)/2))
+      break;
+
+    cobraQ++;
+  }
+  cobraQ--;
+
+  d->cobraQ = cobraQ;
+
+  if (cobraQ >= 4 && d->log2len >= 2*cobraQ) {
+    SIMDBase_VECT *cobraT;
+    int32_t *cobraR;
+
+    if (d->log2len <= 2*cobraQ) cobraQ = d->log2len / 2;
+
+    cobraT = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*2 * (1 << (cobraQ*2)));
+    cobraR = (int32_t *)SIMDBase_alignedMalloc(sizeof(int32_t) * (1 << cobraQ));
+
+    for(i=0;i<(1 << cobraQ);i++) cobraR[i] = bitR(i, cobraQ);
+
+    d->cobraT = cobraT; d->cobraR = cobraR;
+  } else {
+    // no work areas: Cobra cannot be used, whatever the caller requested
+    d->useCobra = 0;
+  }
+
+  // Coefficient tables for real transforms: index 0 and index 1 hold the
+  // same values computed for opposite signs of the angle.
+
+  if ((d->flags & DFT_FLAG_REAL) != 0 || (d->flags & DFT_FLAG_ALT_REAL) != 0) {
+    int32_t m = n*2;
+
+    d->rtTable = malloc(sizeof(SIMDBase_REAL *)*2);
+    if (d->rtTable == NULL) abort();
+    d->rtTable[0] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2);
+    d->rtTable[1] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2);
+
+    for(k=0;k<m/4;k++) {
+      d->rtTable[0][k*2+0] = 0.5-0.5*SIN(-2*M_PIl*k/m);
+      d->rtTable[0][k*2+1] =     0.5*COS(-2*M_PIl*k/m);
+      d->rtTable[1][k*2+0] = 0.5-0.5*SIN( 2*M_PIl*k/m);
+      d->rtTable[1][k*2+1] =     0.5*COS( 2*M_PIl*k/m);
+    }
+  }
+
+  //
+
+  return (void *)d;
+}
+
+/*
+ * Creates a plan for a transform of length n.
+ *
+ * For n <= 256, or when neither of the two low flag bits is set, the plan is
+ * built directly with the radix-2 threshold at n*2.  Otherwise the function
+ * calibrates by measurement:
+ *   1. when the Cobra work areas are available, the Cobra and the recursive
+ *      bit reversal are timed against each other (DFT_FLAG_FORCE_RECURSIVE
+ *      and DFT_FLAG_FORCE_COBRA override the result);
+ *   2. every power-of-two radix-2 threshold from 1024 to n*2 is timed with a
+ *      forward + backward transform pair, and the fastest one is kept.
+ *
+ * n     : transform length
+ * flags : DFT_FLAG_* bits; DFT_FLAG_VERBOSE prints progress to stdout
+ *
+ * Returns the plan, to be released with the matching destroy function.
+ */
+void *DFTUndiff_MAKEPLAN(uint64_t n, uint64_t flags) {
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\n--------------------------------\n");
+    printf("Making plan, mode = %s, dft length = %d\n", SIMDBase_NAME, (int)n);
+    printf("Processor : %s\n", SIMDBase_getProcessorNameString());
+    printf("Cache size (L2 + L3) : %d kbytes / thread\n", SIMDBase_sizeOfDataCacheInByte() / 1024);
+    printf("Cache Line Size : %d bytes\n", SIMDBase_sizeOfCachelineInByte());
+  }
+
+  // NOTE(review): "flags & 3" tests the two low flag bits, presumably the
+  // calibration-request flags from DFT.h -- confirm and use the named flags.
+  if (n <= 256 || (flags & 3) == 0) {
+    return DFTUndiff_MAKEPLANSUB(n, n*2, (flags & DFT_FLAG_FORCE_COBRA) != 0, flags);
+  }
+
+  // Scratch buffer the calibration transforms run on.
+  SIMDBase_REAL *s1 = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*n*2);
+
+  int32_t i, j, pass, ts, tsbest, useCobra = 0;
+  double tick, tickmin;
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\nWarming up before calibration ...");
+    fflush(stdout);
+  }
+
+  // warming up: busy-wait for half a second
+  tick = DFT_timeofday();
+  while(DFT_timeofday() - tick < 0.5)
+    ;
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf(" done\n");
+  }
+
+  int32_t ntimes = 20000000.0 / n / DFT_ilog2(n);
+  if (ntimes == 0) ntimes = 1;
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("nTimes = %d\n", ntimes);
+  }
+
+  //
+
+  DFTUndiff *plan = DFTUndiff_MAKEPLANSUB(n, n*2, 0, flags);
+
+  for(i=0;i<n*2*SIMDBase_VECTLEN;i++) {
+    s1[i] = 0;
+  }
+
+  plan->s = (SIMDBase_VECT *)s1;
+
+  if (plan->cobraT != NULL) {
+    double tcobra = 0, trecur = 0;
+
+    if (flags & DFT_FLAG_VERBOSE) {
+      printf("\nChecking which bit-reversal method is faster\n");
+    }
+
+    // Two interleaved measurement passes per method; each measurement is
+    // preceded by one untimed call.
+    for(pass=0;pass<2;pass++) {
+      bitReversalCobraInplace(plan);
+
+      tick = DFT_timeofday();
+
+      for(j=0;j<ntimes*4;j++) {
+        bitReversalCobraInplace(plan);
+      }
+
+      tcobra += DFT_timeofday() - tick;
+
+      //
+
+      bitReversalRecursive(plan->s, plan->length, 1, 0, 0);
+
+      tick = DFT_timeofday();
+
+      for(j=0;j<ntimes*4;j++) {
+        bitReversalRecursive(plan->s, plan->length, 1, 0, 0);
+      }
+
+      trecur += DFT_timeofday() - tick;
+    }
+
+    //
+
+    useCobra = tcobra < trecur;
+
+    if ((flags & DFT_FLAG_FORCE_RECURSIVE) != 0) useCobra = 0;
+    if ((flags & DFT_FLAG_FORCE_COBRA) != 0) useCobra = 1;
+
+    if (flags & DFT_FLAG_VERBOSE) {
+      printf("cobra : %g\n", tcobra);
+      printf("recur : %g\n", trecur);
+      if (useCobra) {
+        printf("will use Cobra\n");
+      } else {
+        printf("will use the recursive reverser\n");
+      }
+    }
+  }
+
+  DFTUndiff_DESTROYPLAN(plan);
+
+  //
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\nDetermining radix 2 threshold\n");
+  }
+
+  // Untimed run so that the first timed candidate is not penalised.
+  plan = DFTUndiff_MAKEPLANSUB(n, n*2, useCobra, flags);
+
+  for(j=0;j<ntimes;j++) {
+    DFTUndiff_EXECUTE(plan, s1, -1);
+    DFTUndiff_EXECUTE(plan, s1,  1);
+  }
+
+  DFTUndiff_DESTROYPLAN(plan);
+
+  tsbest = -1;
+  tickmin = 0;
+
+  for(ts = 1024;ts <= n*2;ts *= 2) {
+    plan = DFTUndiff_MAKEPLANSUB(n, ts, useCobra, flags);
+
+    tick = DFT_timeofday();
+
+    for(j=0;j<ntimes;j++) {
+      DFTUndiff_EXECUTE(plan, s1, -1);
+      DFTUndiff_EXECUTE(plan, s1,  1);
+    }
+
+    tick = DFT_timeofday() - tick;
+
+    DFTUndiff_DESTROYPLAN(plan);
+
+    if (flags & DFT_FLAG_VERBOSE) {
+      printf("%d : %g\n",ts, (double)tick);
+    }
+
+    // The first candidate always becomes the initial best, so that it is
+    // selected when no later threshold beats it.
+    if (tsbest == -1 || tick < tickmin) {
+      tickmin = tick;
+      tsbest = ts;
+    }
+  }
+
+  // Calibration is over: release the scratch buffer.  free() is used in the
+  // same way the destroy function releases SIMDBase_alignedMalloc buffers.
+  free(s1);
+
+  if (tsbest == -1) tsbest = n*2;
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("radix 2 threshold : %d\n\n", tsbest);
+
+    double t = tickmin / ntimes / 2;
+    double nf = 5 * n * log(n) / log(2) / (t * 1000000);
+
+    printf("nFlops = %d x %g\n", SIMDBase_VECTLEN, nf);
+  }
+
+  plan = DFTUndiff_MAKEPLANSUB(n, tsbest, useCobra, flags);
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\nDone making plan\n--------------------------------\n");
+  }
+
+  return plan;
+}
diff --git a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h
new file mode 100644
index 00000000..d26b0d9b
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h
@@ -0,0 +1,114 @@
+#ifndef __DFTIMPL_H__
+#define __DFTIMPL_H__
+
+#include "SIMDBaseUndiff.h"
+
+#define MAGIC_DFT 0x18839f6d82bb02b6ULL
+
+/*
+ * Execution plan of one transform.  Besides the precomputed tables it also
+ * carries the per-call working state that the transform routines pass to
+ * each other through the plan.
+ */
+typedef struct {
+  /* Expected to equal MAGIC_DFT for a live plan. */
+  uint64_t magic;
+
+  /* Per-call working state: data buffer being transformed, then position,
+   * length, log2 of the length and stride of the sub-block being processed. */
+  SIMDBase_VECT *s;
+  uint32_t offset1, offset2;
+  uint32_t butlen, log2butlen;
+  uint32_t stride;
+
+  /* Trig tables, one pointer per log2 of the sub-block length. */
+  SIMDBase_REAL **ptTable;
+  /* Transform length and its base-2 logarithm. */
+  uint32_t length, log2len;
+
+  /* radix2thres: sub-block length from which radix-2 butterflies are used.
+   * useCobra: non-zero selects the Cobra bit reversal.
+   * NOTE(review): the purpose of flagTrans is not evident here -- confirm. */
+  int32_t radix2thres, flagTrans, useCobra;
+
+  /* Cobra bit reversal: block-size exponent, scratch buffer and
+   * bit-reversed index table. */
+  int32_t cobraQ;
+  SIMDBase_VECT *cobraT;
+  int32_t *cobraR;
+
+  /* Coefficient tables for real transforms; NULL when unused. */
+  SIMDBase_REAL **rtTable;
+
+  /* DFT_FLAG_* bits the plan was made with, and the SIMD mode id. */
+  uint64_t flags;
+  int32_t mode;
+} DFTUndiff;
+
+/*
+ * Map the generic DFTUndiff_* names onto mode-specific symbol names.  The
+ * first ENABLE_* macro that is defined selects the suffix; when none is
+ * defined the generic names are left unmapped.
+ */
+#if defined(ENABLE_PUREC_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_float
+#define DFTUndiff_EXECUTE execute_purec_float
+#define DFTUndiff_MAKEPLAN makePlan_purec_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_purec_float
+
+#elif defined(ENABLE_PUREC_DOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_double
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_double
+#define DFTUndiff_EXECUTE execute_purec_double
+#define DFTUndiff_MAKEPLAN makePlan_purec_double
+#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_double
+#define DFTUndiff_DESTROYPLAN destroyPlan_purec_double
+
+#elif defined(ENABLE_PUREC_LONGDOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_longdouble
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_longdouble
+#define DFTUndiff_EXECUTE execute_purec_longdouble
+#define DFTUndiff_MAKEPLAN makePlan_purec_longdouble
+#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_longdouble
+#define DFTUndiff_DESTROYPLAN destroyPlan_purec_longdouble
+
+#elif defined(ENABLE_SSE_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse_float
+#define DFTUndiff_EXECUTE execute_sse_float
+#define DFTUndiff_MAKEPLAN makePlan_sse_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_sse_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_sse_float
+
+#elif defined(ENABLE_SSE2_DOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse2_double
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse2_double
+#define DFTUndiff_EXECUTE execute_sse2_double
+#define DFTUndiff_MAKEPLAN makePlan_sse2_double
+#define DFTUndiff_MAKEPLANSUB makePlanSub_sse2_double
+#define DFTUndiff_DESTROYPLAN destroyPlan_sse2_double
+
+#elif defined(ENABLE_NEON_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_neon_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_neon_float
+#define DFTUndiff_EXECUTE execute_neon_float
+#define DFTUndiff_MAKEPLAN makePlan_neon_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_neon_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_neon_float
+
+#elif defined(ENABLE_AVX_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_float
+#define DFTUndiff_EXECUTE execute_avx_float
+#define DFTUndiff_MAKEPLAN makePlan_avx_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_avx_float
+
+#elif defined(ENABLE_AVX_DOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_double
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_double
+#define DFTUndiff_EXECUTE execute_avx_double
+#define DFTUndiff_MAKEPLAN makePlan_avx_double
+#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_double
+#define DFTUndiff_DESTROYPLAN destroyPlan_avx_double
+
+#elif defined(ENABLE_ALTIVEC_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_altivec_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_altivec_float
+#define DFTUndiff_EXECUTE execute_altivec_float
+#define DFTUndiff_MAKEPLAN makePlan_altivec_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_altivec_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_altivec_float
+
+#endif ////////////////////////////////////////////////////////////////////
+
+#endif
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile b/plugins/supereq/nsfft-1.00/dft/Makefile
new file mode 120000
index 00000000..5d253498
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile
@@ -0,0 +1 @@
+Makefile.x86avx
+\ No newline at end of file
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.altivec b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec
new file mode 100644
index 00000000..fe7fc993
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec
@@ -0,0 +1,26 @@
+# Builds libDFT.a with the pure-C modes plus the AltiVec float mode.
+# DFTUndiff.c is compiled once per mode, selected with a single -DENABLE_*
+# define; DFT.c is compiled once with all of those defines set.
+CC=gcc
+BASEOPT=-Wall  -I ../simd -maltivec -mabi=altivec
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTaltivecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_ALTIVEC_FLOAT DFTUndiff.c -c -o DFTaltivecfloat.o
+
+DFT.o : DFT.c DFT.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_ALTIVEC_FLOAT DFT.c -c -o DFT.o
+
+# The archive is removed first because "ar -q" appends without replacing.
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.neon b/plugins/supereq/nsfft-1.00/dft/Makefile.neon
new file mode 100644
index 00000000..111a04ae
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.neon
@@ -0,0 +1,26 @@
+# Builds libDFT.a with the pure-C modes plus the NEON float mode.
+# DFTUndiff.c is compiled once per mode, selected with a single -DENABLE_*
+# define; DFT.c is compiled once with all of those defines set.
+# Only the NEON object is built with -mfpu=neon.
+CC=gcc
+BASEOPT=-Wall -I ../simd -mfloat-abi=softfp
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a
+
+# ../simd/SIMDBaseUndiff.h is listed because DFTUndiff.h includes it; without
+# it, edits to that header would not trigger a rebuild of these objects.
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTneonfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -mfpu=neon -DENABLE_NEON_FLOAT DFTUndiff.c -c -o DFTneonfloat.o
+
+DFT.o : DFT.c DFT.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_NEON_FLOAT DFT.c -c -o DFT.o
+
+# The archive is removed first because "ar -q" appends without replacing.
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.purec b/plugins/supereq/nsfft-1.00/dft/Makefile.purec
new file mode 100644
index 00000000..2c8b04f1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.purec
@@ -0,0 +1,41 @@
+# Self-contained pure C build: no -I path is passed, so the DFT and SIMDBase sources and headers
+# are all expected in this directory; everything is archived into a single libDFT.a.
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+
+# all and clean are commands, not files to be built.
+.PHONY : all clean
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE SIMDBase.c -c -o SIMDBase.o
+
+# DFT.c includes SIMDBase.h, DFT.h and DFTUndiff.h, so rebuild DFT.o when any of them changes.
+DFT.o : DFT.c DFT.h DFTUndiff.h SIMDBase.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE DFT.c -c -o DFT.o
+
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86 b/plugins/supereq/nsfft-1.00/dft/Makefile.x86
new file mode 100644
index 00000000..6ecbacec
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86
@@ -0,0 +1,35 @@
+# Builds libDFT.a from the pure C float/double/long double kernels plus the SSE float and SSE2 double kernels.
+# DFTUndiff.c is compiled once per kernel; the -DENABLE_* macro selects which one.
+CC=gcc
+BASEOPT=-Wall -I ../simd
+OPT=$(BASEOPT) -O3
+
+# all and clean are commands, not files to be built.
+.PHONY : all clean
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o
+
+DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE DFTUndiff.c -c -o DFTsse2double.o
+
+# DFT.c includes SIMDBase.h, DFT.h and DFTUndiff.h (DFTUndiff.h is assumed to sit beside DFT.c - confirm).
+DFT.o : DFT.c DFT.h DFTUndiff.h ../simd/SIMDBase.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE DFT.c -c -o DFT.o
+
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o
+
+clean :
+	rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx
new file mode 100644
index 00000000..b38909cb
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx
@@ -0,0 +1,41 @@
+# Builds libDFT.a from the pure C kernels plus the SSE float, SSE2 double and AVX float/double kernels.
+# DFTUndiff.c is compiled once per kernel; the -DENABLE_* macro selects which one.
+CC=gcc
+BASEOPT=-Wall -I ../simd
+OPT=$(BASEOPT) -O3
+
+# all and clean are commands, not files to be built.
+.PHONY : all clean
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o
+
+DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE DFTUndiff.c -c -o DFTsse2double.o
+
+DFTavxfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT DFTUndiff.c -c -o DFTavxfloat.o
+
+DFTavxdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE DFTUndiff.c -c -o DFTavxdouble.o
+
+# DFT.c includes SIMDBase.h, DFT.h and DFTUndiff.h (DFTUndiff.h is assumed to sit beside DFT.c - confirm).
+DFT.o : DFT.c DFT.h DFTUndiff.h ../simd/SIMDBase.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE -DENABLE_AVX_FLOAT -DENABLE_AVX_DOUBLE DFT.c -c -o DFT.o
+
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o
+
+clean :
+	rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c b/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c
new file mode 100644
index 00000000..78ff14dc
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c
@@ -0,0 +1,88 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <complex.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+
+#define THRES 1e-3
+
+double complex omega(double n, double kn) {  // twiddle factor exp(-2*pi*i*kn/n)
+  return cexp((-2 * M_PI * _Complex_I / n) * kn);
+}
+
+void forward(double complex *ts, double complex *fs, int len) {
+  // Reference O(len^2) forward DFT: fs[k] = sum over t of ts[t] * omega(len, t*k).
+  for (int k = 0; k < len; k++) {
+    fs[k] = 0;
+
+    for (int t = 0; t < len; t++) {
+      // Form t*k in double: as an int product it overflows once len exceeds 46340.
+      fs[k] += ts[t] * omega(len, (double)t * k);
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  const int n = 256;  // transform length used by this example
+
+  // Let the library choose the mode for TYPE and report it.
+  int mode = SIMDBase_chooseBestMode(TYPE);
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  int i, j;
+
+  // Plan and work buffer. Stop here if either is missing instead of passing NULL on.
+  DFT *p = DFT_init(mode, n, 0);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+  if (p == NULL || sx == NULL) {
+    fprintf(stderr, "DFT_init or SIMDBase_alignedMalloc failed\n");
+    exit(-1);
+  }
+
+  // Random complex input: sample i of lane j has re at sx[(i*2+0)*veclen+j], im at sx[(i*2+1)*veclen+j].
+  double complex ts[veclen][n], fs[veclen][n];
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+      sx[(i*2+0)*veclen+j] = creal(ts[j][i]);
+      sx[(i*2+1)*veclen+j] = cimag(ts[j][i]);
+    }
+  }
+
+  // Transform with the library (direction -1), then with the naive reference.
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    forward(ts[j], fs[j], n);
+  }
+
+  // Every component of every lane must agree within THRES.
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if ((fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) > THRES) ||
+          (fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) > THRES)) {
+        success = 0;
+      }
+    }
+  }
+
+  printf("%s\n", success ? "OK" : "NG");
+
+  // Release the buffer and the plan.
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  exit(0);
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c
new file mode 100644
index 00000000..42825ed9
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c
@@ -0,0 +1,317 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <time.h>
+#include <complex.h>
+
+#include <fftw3.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+#if 1
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+#else
+typedef double REAL;
+#define TYPE SIMDBase_TYPE_DOUBLE
+#endif
+
+#define THRES 1e-3
+
+// complex forward: library transform vs FFTW (FFTW_FORWARD); returns 1 if all outputs agree within THRES, else 0
+int check_cf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+  fftw_plan w[veclen];  // indexed by lane (j < veclen), so it is sized by veclen rather than by n
+
+  fftw_complex *in[veclen], *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  // Per lane: FFTW buffers and plan, plus the same random input written lane-interleaved into sx
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    w[j] = fftw_plan_dft_1d(n, in[j], out[j], FFTW_FORWARD, FFTW_ESTIMATE);
+
+    for(i=0;i<n;i++) {
+      double re = random() / (double)RAND_MAX;
+      double im = random() / (double)RAND_MAX;
+      sx[(i*2+0)*veclen+j] = re;
+      sx[(i*2+1)*veclen+j] = im;
+      in[j][i] = re + im * _Complex_I;
+    }
+  }
+
+  // Transform with the library (direction -1, in place on sx), then with FFTW lane by lane
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  // Any real or imaginary part off by more than THRES fails the check
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0;
+      if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0;
+    }
+  }
+
+  // Release the FFTW plans and buffers, then the library buffer and plan
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  // 1 = every lane matched
+
+  return success;
+}
+
+// complex backward: library transform vs FFTW (FFTW_BACKWARD); returns 1 if all outputs agree within THRES, else 0
+int check_cb(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+  fftw_plan w[veclen];  // indexed by lane (j < veclen), so it is sized by veclen rather than by n
+
+  fftw_complex *in[veclen], *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  // Per lane: FFTW buffers and plan, plus the same random input written lane-interleaved into sx
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    w[j] = fftw_plan_dft_1d(n, in[j], out[j], FFTW_BACKWARD, FFTW_ESTIMATE);
+
+    for(i=0;i<n;i++) {
+      double re = random() / (double)RAND_MAX;
+      double im = random() / (double)RAND_MAX;
+      sx[(i*2+0)*veclen+j] = re;
+      sx[(i*2+1)*veclen+j] = im;
+      in[j][i] = re + im * _Complex_I;
+    }
+  }
+
+  // Transform with the library (direction +1, in place on sx), then with FFTW lane by lane
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  // Any real or imaginary part off by more than THRES fails the check
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0;
+      if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0;
+    }
+  }
+
+  // Release the FFTW plans and buffers, then the library buffer and plan
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  // 1 = every lane matched
+
+  return success;
+}
+
+// real forward: DFT_FLAG_REAL plan vs FFTW r2c; returns 1 if bins 0..n/2 agree within THRES, else 0
+int check_rf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+  fftw_plan w[veclen];  // indexed by lane (j < veclen), so it is sized by veclen rather than by n
+
+  double *in[veclen];
+  fftw_complex *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  // Per lane: FFTW buffers and plan, plus the same random real input at sx[i*veclen+j]
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (double *) fftw_malloc(sizeof(double) * n);
+    out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
+    w[j] = fftw_plan_dft_r2c_1d(n, in[j], out[j], FFTW_ESTIMATE);
+
+    for(i=0;i<n;i++) {
+      double re = random() / (double)RAND_MAX;
+      sx[i*veclen+j] = re;
+      in[j][i] = re;
+    }
+  }
+
+  // Transform with the library (direction -1, in place on sx), then with FFTW lane by lane
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  // Bin i is compared as (re, im) at slots 2i and 2i+1; for bin 0 the second slot is compared with re of bin n/2
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][0])) > THRES) success = 0;
+        if (fabs(sx[(i*2+1)*veclen+j] - creal(out[j][n/2])) > THRES) success = 0;
+      } else {
+        if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0;
+        if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0;
+      }
+    }
+  }
+
+  // Release the FFTW plans and buffers, then the library buffer and plan
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  // 1 = every lane matched
+
+  return success;
+}
+
+// real backward: DFT_FLAG_REAL plan vs FFTW c2r; returns 1 if twice the library output matches FFTW within THRES, else 0
+int check_rb(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+  fftw_plan w[veclen];  // indexed by lane (j < veclen), so it is sized by veclen rather than by n
+
+  fftw_complex *in[veclen];
+  double *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  // Per lane: random half spectrum for FFTW (bins 0 and n/2 real), then the same bins packed into sx
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
+    out[j] = (double *) fftw_malloc(sizeof(double) * n);
+    w[j] = fftw_plan_dft_c2r_1d(n, in[j], out[j], FFTW_ESTIMATE);
+
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        in[j][0  ] = (random() / (double)RAND_MAX);
+        in[j][n/2] = (random() / (double)RAND_MAX);
+      } else {
+        in[j][i  ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+      }
+    }
+
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        sx[(2*0+0) * veclen + j] = creal(in[j][0  ]);
+        sx[(2*0+1) * veclen + j] = creal(in[j][n/2]);
+      } else {
+        sx[(2*i+0) * veclen + j] = creal(in[j][i]);
+        sx[(2*i+1) * veclen + j] = cimag(in[j][i]);
+      }
+    }
+  }
+
+  // Transform with the library (direction +1, in place on sx), then with FFTW lane by lane
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  // Compare all n real outputs (not just the first half); the library result is half of FFTW's
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if ((fabs(sx[i * veclen + j]*2 - out[j][i]) > THRES)) {
+        success = 0;
+      }
+    }
+  }
+
+  // Release the FFTW plans and buffers, then the library buffer and plan
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  // 1 = every lane matched
+
+  return success;
+}
+
+int main(int argc, char **argv) {
+  // log2n must parse completely and keep 1 << log2n inside a positive int (0..30).
+  char *end = NULL;
+  long log2n = (argc == 2) ? strtol(argv[1], &end, 10) : -1;
+  if (argc != 2 || end == argv[1] || *end != '\0' || log2n < 0 || log2n > 30) {
+    fprintf(stderr, "%s <log2n>\n", argv[0]);
+    exit(-1);
+  }
+
+  const int n = 1 << log2n;
+  srandom(time(NULL));
+
+  // Let the library choose the mode for TYPE, then read that mode's veclen and sizeOfVect.
+  int mode = SIMDBase_chooseBestMode(TYPE);
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  printf("complex forward   : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("complex backward  : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real forward      : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real backward     : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+
+  exit(0);
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c
new file mode 100644
index 00000000..9d4bdaae
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c
@@ -0,0 +1,419 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <time.h>
+#include <complex.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+#if 1
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+#else
+typedef double REAL;
+#define TYPE SIMDBase_TYPE_DOUBLE
+#endif
+
+#define THRES 1e-3
+
+double complex omega(double n, double kn) {  // twiddle factor exp(-2*pi*i*kn/n); a negative n flips the sign
+  return cexp((-2 * M_PI * _Complex_I / n) * kn);
+}
+
+void forward(double complex *ts, double complex *fs, int len) {
+  // Reference O(len^2) forward DFT: fs[k] = sum over t of ts[t] * omega(len, t*k).
+  for (int k = 0; k < len; k++) {
+    fs[k] = 0;
+
+    for (int t = 0; t < len; t++) {
+      // Form t*k in double: as an int product it overflows once len exceeds 46340.
+      fs[k] += ts[t] * omega(len, (double)t * k);
+    }
+  }
+}
+
+void backward(double complex *fs, double complex *ts, int len) {
+  // Reference O(len^2) unnormalized backward DFT: ts[k] = sum over f of fs[f] * omega(-len, f*k).
+  for (int k = 0; k < len; k++) {
+    ts[k] = 0;
+
+    for (int f = 0; f < len; f++) {
+      // Form f*k in double: as an int product it overflows once len exceeds 46340.
+      ts[k] += fs[f] * omega(-len, (double)f * k);
+    }
+  }
+}
+
+// complex forward: run DFT_execute(-1) and the naive forward() on the same
+// random data; returns 1 when they agree within THRES, 0 otherwise.
+int check_cf(int n, int mode, int veclen, int sizeOfVect) {
+  DFT *p = DFT_init(mode, n, 0);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  // ts holds the reference input and fs the reference output, one row per lane.
+  double complex ts[veclen][n], fs[veclen][n];
+
+  // Fill every lane with random complex samples. Sample k of a lane keeps its
+  // real part at sx[2*k*veclen + lane] and its imaginary part veclen further on.
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n; k++) {
+      REAL *slot = &sx[(k*2+0)*veclen + lane];
+
+      ts[lane][k] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+      slot[0] = creal(ts[lane][k]);
+      slot[veclen] = cimag(ts[lane][k]);
+    }
+  }
+
+  // Transform with the library, then with the reference, lane by lane.
+  DFT_execute(p, mode, sx, -1);
+
+  for (int lane = 0; lane < veclen; lane++) {
+    forward(ts[lane], fs[lane], n);
+  }
+
+  // A single component off by more than THRES fails the whole check.
+  int success = 1;
+
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n; k++) {
+      const REAL *slot = &sx[(k*2+0)*veclen + lane];
+      const double errRe = fabs(slot[0] - creal(fs[lane][k]));
+      const double errIm = fabs(slot[veclen] - cimag(fs[lane][k]));
+
+      if (errRe > THRES || errIm > THRES) {
+        success = 0;
+      }
+    }
+  }
+
+  // Release the work buffer and the plan before reporting.
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  return success;
+}
+
+// complex backward: run DFT_execute(+1) and the naive backward() on the same
+// random data; returns 1 when they agree within THRES, 0 otherwise.
+int check_cb(int n, int mode, int veclen, int sizeOfVect) {
+  DFT *p = DFT_init(mode, n, 0);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  // fs holds the reference input and ts the reference output, one row per lane.
+  double complex fs[veclen][n], ts[veclen][n];
+
+  // Fill every lane with random complex samples. Sample k of a lane keeps its
+  // real part at sx[2*k*veclen + lane] and its imaginary part veclen further on.
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n; k++) {
+      REAL *slot = &sx[(k*2+0)*veclen + lane];
+
+      fs[lane][k] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+
+      slot[0] = creal(fs[lane][k]);
+      slot[veclen] = cimag(fs[lane][k]);
+    }
+  }
+
+  // Transform with the library, then with the reference, lane by lane.
+  DFT_execute(p, mode, sx, 1);
+
+  for (int lane = 0; lane < veclen; lane++) {
+    backward(fs[lane], ts[lane], n);
+  }
+
+  // A single component off by more than THRES fails the whole check.
+  int success = 1;
+
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n; k++) {
+      const REAL *slot = &sx[(k*2+0)*veclen + lane];
+      const double errRe = fabs(slot[0] - creal(ts[lane][k]));
+      const double errIm = fabs(slot[veclen] - cimag(ts[lane][k]));
+
+      if (errRe > THRES || errIm > THRES) {
+        success = 0;
+      }
+    }
+  }
+
+  // Release the work buffer and the plan before reporting.
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  return success;
+}
+
+// real forward: run DFT_execute(-1) on a DFT_FLAG_REAL plan and the naive forward()
+// on the same random real data; returns 1 when they agree within THRES, 0 otherwise.
+int check_rf(int n, int mode, int veclen, int sizeOfVect) {
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  // ts holds the reference input (imaginary parts zero), fs the reference output.
+  double complex ts[veclen][n], fs[veclen][n];
+
+  // Sample k of a lane is stored at sx[k*veclen + lane].
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n; k++) {
+      const double sample = random() / (double)RAND_MAX;
+
+      ts[lane][k] = sample;
+      sx[k*veclen + lane] = sample;
+    }
+  }
+
+  // Transform with the library, then with the reference, lane by lane.
+  DFT_execute(p, mode, sx, -1);
+
+  for (int lane = 0; lane < veclen; lane++) {
+    forward(ts[lane], fs[lane], n);
+  }
+
+  // Only bins 0..n/2-1 are compared. Bin k sits at slots 2k and 2k+1 as (re, im),
+  // except bin 0, whose second slot is compared with the real part of bin n/2.
+  int success = 1;
+
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n/2; k++) {
+      const REAL *slot = &sx[(2*k+0) * veclen + lane];
+      const double wantFirst = creal(fs[lane][k]);
+      const double wantSecond = (k == 0) ? creal(fs[lane][n/2]) : cimag(fs[lane][k]);
+
+      if (fabs(slot[0] - wantFirst) > THRES) {
+        success = 0;
+      }
+      if (fabs(slot[veclen] - wantSecond) > THRES) {
+        success = 0;
+      }
+    }
+  }
+
+  // Release the work buffer and the plan before reporting.
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  return success;
+}
+
+// real backward: feed a random conjugate-symmetric spectrum to DFT_execute(+1) on a
+// DFT_FLAG_REAL plan and to the naive backward(); returns 1 when twice the library
+// output matches the reference within THRES, 0 otherwise.
+int check_rb(int n, int mode, int veclen, int sizeOfVect) {
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  // fs is the full spectrum given to the reference, ts the reference output.
+  double complex fs[veclen][n], ts[veclen][n];
+
+  // Bins 0 and n/2 get real values; every other bin k < n/2 is random and is
+  // mirrored as its conjugate at n-k.
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n/2; k++) {
+      if (k == 0) {
+        fs[lane][0  ] = (random() / (double)RAND_MAX);
+        fs[lane][n/2] = (random() / (double)RAND_MAX);
+      } else {
+        fs[lane][k  ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+        fs[lane][n-k] = conj(fs[lane][k]);
+      }
+    }
+  }
+
+  // The library input holds bins 0..n/2-1 only: bin k goes to slots 2k and 2k+1
+  // as (re, im), except that the second slot of bin 0 carries the real part of bin n/2.
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n/2; k++) {
+      REAL *slot = &sx[(2*k+0) * veclen + lane];
+      const double complex bin = fs[lane][k];
+
+      slot[0] = creal(bin);
+      slot[veclen] = (k == 0) ? creal(fs[lane][n/2]) : cimag(bin);
+    }
+  }
+
+  // Reference first, then the library transform.
+  for (int lane = 0; lane < veclen; lane++) {
+    backward(fs[lane], ts[lane], n);
+  }
+
+  DFT_execute(p, mode, sx, 1);
+
+  // The reference output must be real, and twice the library output at
+  // sx[k*veclen + lane] must match its real part.
+  int success = 1;
+
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n; k++) {
+      const double refImag = fabs(cimag(ts[lane][k]));
+      const double realGap = fabs(sx[k * veclen + lane]*2 - creal(ts[lane][k]));
+
+      if (refImag > THRES) {
+        success = 0;
+      }
+
+      if (realGap > THRES) {
+        success = 0;
+      }
+    }
+  }
+
+  // Release the work buffer and the plan before reporting.
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  return success;
+}
+
+// alt real forward: run DFT_execute(+1) on a DFT_FLAG_ALT_REAL plan and the naive backward()
+// on the same random real data; returns 1 when they agree within THRES, 0 otherwise.
+int check_arf(int n, int mode, int veclen, int sizeOfVect) {
+  DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  // ts holds the reference input (imaginary parts zero), fs the reference output.
+  double complex ts[veclen][n], fs[veclen][n];
+
+  // Sample k of a lane is stored at sx[k*veclen + lane].
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n; k++) {
+      const double sample = random() / (double)RAND_MAX;
+
+      ts[lane][k] = sample;
+      sx[k*veclen + lane] = sample;
+    }
+  }
+
+  // Transform with the library, then with the reference, lane by lane.
+  DFT_execute(p, mode, sx, 1);
+
+  for (int lane = 0; lane < veclen; lane++) {
+    backward(ts[lane], fs[lane], n);
+  }
+
+  // Only bins 0..n/2-1 are compared. Bin k sits at slots 2k and 2k+1 as (re, im),
+  // except bin 0, whose second slot is compared with the real part of bin n/2.
+  int success = 1;
+
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n/2; k++) {
+      const REAL *slot = &sx[(2*k+0) * veclen + lane];
+      const double wantFirst = creal(fs[lane][k]);
+      const double wantSecond = (k == 0) ? creal(fs[lane][n/2]) : cimag(fs[lane][k]);
+
+      if (fabs(slot[0] - wantFirst) > THRES) {
+        success = 0;
+      }
+      if (fabs(slot[veclen] - wantSecond) > THRES) {
+        success = 0;
+      }
+    }
+  }
+
+  // Release the work buffer and the plan before reporting.
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  return success;
+}
+
+// alt real backward: feed a random conjugate-symmetric spectrum to DFT_execute(-1) on a
+// DFT_FLAG_ALT_REAL plan and to the naive forward(); returns 1 when twice the library
+// output matches the reference within THRES, 0 otherwise.
+int check_arb(int n, int mode, int veclen, int sizeOfVect) {
+  DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  // fs is the full spectrum given to the reference, ts the reference output.
+  double complex fs[veclen][n], ts[veclen][n];
+
+  // Bins 0 and n/2 get real values; every other bin k < n/2 is random and is
+  // mirrored as its conjugate at n-k.
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n/2; k++) {
+      if (k == 0) {
+        fs[lane][0  ] = (random() / (double)RAND_MAX);
+        fs[lane][n/2] = (random() / (double)RAND_MAX);
+      } else {
+        fs[lane][k  ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+        fs[lane][n-k] = conj(fs[lane][k]);
+      }
+    }
+  }
+
+  // The library input holds bins 0..n/2-1 only: bin k goes to slots 2k and 2k+1
+  // as (re, im), except that the second slot of bin 0 carries the real part of bin n/2.
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n/2; k++) {
+      REAL *slot = &sx[(2*k+0) * veclen + lane];
+      const double complex bin = fs[lane][k];
+
+      slot[0] = creal(bin);
+      slot[veclen] = (k == 0) ? creal(fs[lane][n/2]) : cimag(bin);
+    }
+  }
+
+  // Reference first, then the library transform.
+  for (int lane = 0; lane < veclen; lane++) {
+    forward(fs[lane], ts[lane], n);
+  }
+
+  DFT_execute(p, mode, sx, -1);
+
+  // The reference output must be real, and twice the library output at
+  // sx[k*veclen + lane] must match its real part.
+  int success = 1;
+
+  for (int lane = 0; lane < veclen; lane++) {
+    for (int k = 0; k < n; k++) {
+      const double refImag = fabs(cimag(ts[lane][k]));
+      const double realGap = fabs(sx[k * veclen + lane]*2 - creal(ts[lane][k]));
+
+      if (refImag > THRES) {
+        success = 0;
+      }
+
+      if (realGap > THRES) {
+        success = 0;
+      }
+    }
+  }
+
+  // Release the work buffer and the plan before reporting.
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  return success;
+}
+
+int main(int argc, char **argv) {
+  // log2n must parse completely and keep 1 << log2n inside a positive int (0..30).
+  char *end = NULL;
+  long log2n = (argc == 2) ? strtol(argv[1], &end, 10) : -1;
+  if (argc != 2 || end == argv[1] || *end != '\0' || log2n < 0 || log2n > 30) {
+    fprintf(stderr, "%s <log2n>\n", argv[0]);
+    exit(-1);
+  }
+
+  const int n = 1 << log2n;
+  srandom(time(NULL));
+
+  // Let the library choose the mode for TYPE, then read that mode's veclen and sizeOfVect.
+  int mode = SIMDBase_chooseBestMode(TYPE);
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  printf("complex forward   : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("complex backward  : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real forward      : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real backward     : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("alt real forward  : %s\n", check_arf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("alt real backward : %s\n", check_arb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+
+  exit(0);
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c
new file mode 100644
index 00000000..08c8315f
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c
@@ -0,0 +1,260 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <time.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+void cdft(int, int, double *, int *, double *);
+void rdft(int, int, double *, int *, double *);
+
+#if 1
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+#else
+typedef double REAL;
+#define TYPE SIMDBase_TYPE_DOUBLE
+#endif
+
+#define THRES 1e-3
+
+// complex forward: library transform vs Ooura cdft() with isgn = -1; returns 1 if all outputs agree within THRES, else 0
+int check_cf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+
+  int *ip = calloc(n + 2, sizeof(int));  // Ooura work area: documented minimum is 2+sqrt(n) ints, which exceeds n for tiny n; zeroed so ip[0] == 0
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n*2);
+
+  // Same random data for both: lane-interleaved in sx, one contiguous run of 2*n doubles per lane in sy (copied from sx after rounding to REAL)
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n*2;i++) {
+      sx[i*veclen + j] = random() / (double)RAND_MAX;
+      sy[j*n*2 + i] = sx[i*veclen + j];
+    }
+  }
+
+  // Transform with the library (direction -1), then with Ooura lane by lane; ip and trigTable are shared by all lanes
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    cdft(n*2, -1, &sy[j*n*2], ip, trigTable);
+  }
+
+  // Any value off by more than THRES fails the check
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n*2;i++) {
+      if (fabs(sx[i*veclen+j] - sy[j*n*2 + i]) > THRES) success = 0;
+    }
+  }
+
+  // Release both data buffers, the Ooura tables and the plan
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(p, mode);
+
+  // 1 = every lane matched
+
+  return success;
+}
+
+// complex backward: library transform vs Ooura cdft() with isgn = +1; returns 1 if all outputs agree within THRES, else 0
+int check_cb(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+
+  int *ip = calloc(n + 2, sizeof(int));  // Ooura work area: documented minimum is 2+sqrt(n) ints, which exceeds n for tiny n; zeroed so ip[0] == 0
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n*2);
+
+  // Same random data for both: lane-interleaved in sx, one contiguous run of 2*n doubles per lane in sy (copied from sx after rounding to REAL)
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n*2;i++) {
+      sx[i*veclen + j] = random() / (double)RAND_MAX;
+      sy[j*n*2 + i] = sx[i*veclen + j];
+    }
+  }
+
+  // Transform with the library (direction +1), then with Ooura lane by lane; ip and trigTable are shared by all lanes
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    cdft(n*2, 1, &sy[j*n*2], ip, trigTable);
+  }
+
+  // Any value off by more than THRES fails the check
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n*2;i++) {
+      if (fabs(sx[i*veclen+j] - sy[j*n*2 + i]) > THRES) success = 0;
+    }
+  }
+
+  // Release both data buffers, the Ooura tables and the plan
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(p, mode);
+
+  // 1 = every lane matched
+
+  return success;
+}
+
+// real forward: DFT_FLAG_ALT_REAL plan (direction -1) vs Ooura rdft() with isgn = -1; returns 1 if all outputs agree within THRES, else 0
+int check_rf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+
+  int *ip = calloc(n + 2, sizeof(int));  // Ooura work area: documented minimum is 2+sqrt(n/2) ints, which exceeds n for n == 1; zeroed so ip[0] == 0
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n);
+
+  // Same random data for both: lane-interleaved in sx, one contiguous run of n doubles per lane in sy (copied from sx after rounding to REAL)
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      sx[i*veclen + j] = random() / (double)RAND_MAX;
+      sy[j*n + i] = sx[i*veclen + j];
+    }
+  }
+
+  // Transform with the library, then with Ooura lane by lane; ip and trigTable are shared by all lanes
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    rdft(n, -1, &sy[j*n], ip, trigTable);
+  }
+
+  // Any value off by more than THRES fails the check
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (fabs(sx[i*veclen+j] - sy[j*n + i]) > THRES) success = 0;
+    }
+  }
+
+  // Release both data buffers, the Ooura tables and the plan
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(p, mode);
+
+  // 1 = every lane matched
+
+  return success;
+}
+
+// real backward: DFT_FLAG_ALT_REAL plan (direction +1) vs Ooura rdft() with isgn = +1; returns 1 if all outputs agree within THRES, else 0
+int check_rb(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+
+  int *ip = calloc(n + 2, sizeof(int));  // Ooura work area: documented minimum is 2+sqrt(n/2) ints, which exceeds n for n == 1; zeroed so ip[0] == 0
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n);
+
+  // Same random data for both: lane-interleaved in sx, one contiguous run of n doubles per lane in sy (copied from sx after rounding to REAL)
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      sx[i*veclen + j] = random() / (double)RAND_MAX;
+      sy[j*n + i] = sx[i*veclen + j];
+    }
+  }
+
+  // Transform with the library, then with Ooura lane by lane; ip and trigTable are shared by all lanes
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    rdft(n, 1, &sy[j*n], ip, trigTable);
+  }
+
+  // Any value off by more than THRES fails the check
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (fabs(sx[i*veclen+j] - sy[j*n + i]) > THRES) success = 0;
+    }
+  }
+
+  // Release both data buffers, the Ooura tables and the plan
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(p, mode);
+
+  // 1 = every lane matched
+
+  return success;
+}
+
+int main(int argc, char **argv) {
+  // log2n must parse completely and keep 1 << log2n inside a positive int (0..30).
+  char *end = NULL;
+  long log2n = (argc == 2) ? strtol(argv[1], &end, 10) : -1;
+  if (argc != 2 || end == argv[1] || *end != '\0' || log2n < 0 || log2n > 30) {
+    fprintf(stderr, "%s <log2n>\n", argv[0]);
+    exit(-1);
+  }
+
+  const int n = 1 << log2n;
+  srandom(time(NULL));
+
+  // Let the library choose the mode for TYPE, then read that mode's veclen and sizeOfVect.
+  int mode = SIMDBase_chooseBestMode(TYPE);
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  printf("complex forward   : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("complex backward  : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real forward      : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real backward     : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+
+  exit(0);
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/Makefile b/plugins/supereq/nsfft-1.00/dfttest/Makefile
new file mode 100644
index 00000000..924b8656
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/Makefile
@@ -0,0 +1,41 @@
+# Builds the example and test programs against ../simd/libSIMD.a and ../dft/libDFT.a.
+# Only DFTExample and DFTTestNaive are built by default; the other programs must be requested by name.
+CC=gcc
+BASEOPT=-Wall -g -I ../simd -I ../dft -L../simd -L../dft
+OPT=$(BASEOPT) -O
+
+# all and clean are commands, not files to be built.
+.PHONY : all clean
+
+all : DFTExample DFTTestNaive
+
+clean :
+	rm -f *~ *.o nsfftplan.*.txt *.log *.dat a.out DFTExample DFTTestNaive DFTTestOoura DFTTestFFTW pi_fft_mod pi_fft_mod.c
+
+# Sub-builds go through $(MAKE) -C so options and the jobserver are passed down, and a missing directory is an error.
+../simd/libSIMD.a :
+	@$(MAKE) -C ../simd
+
+../dft/libDFT.a :
+	@$(MAKE) -C ../dft
+
+../ooura/fftsg.o :
+	@$(MAKE) -C ../ooura
+
+DFTExample : DFTExample.c ../simd/libSIMD.a ../dft/libDFT.a
+	$(CC) $(OPT) DFTExample.c -lDFT -lSIMD -lm -o DFTExample
+
+DFTTestNaive : DFTTestNaive.c ../simd/libSIMD.a ../dft/libDFT.a
+	$(CC) $(OPT) DFTTestNaive.c -lDFT -lSIMD -lm -o DFTTestNaive
+
+DFTTestOoura : DFTTestOoura.c ../ooura/fftsg.o ../simd/libSIMD.a ../dft/libDFT.a
+	$(CC) $(OPT) DFTTestOoura.c ../ooura/fftsg.o -lDFT -lSIMD -lm -o DFTTestOoura
+
+DFTTestFFTW : DFTTestFFTW.c ../simd/libSIMD.a ../dft/libDFT.a
+	$(CC) $(OPT) DFTTestFFTW.c -lDFT -lSIMD -lfftw3 -lm -o DFTTestFFTW
+
+pi_fft_mod.c : ../ooura/pi_fft.c pi_fft.c.patch
+	patch -o pi_fft_mod.c ../ooura/pi_fft.c pi_fft.c.patch
+
+pi_fft_mod : ../simd/libSIMD.a ../dft/libDFT.a pi_fft_mod.c
+	$(CC) $(OPT) pi_fft_mod.c -I ../dft -I ../simd -L../dft -L../simd -lm -lDFT -lSIMD -o pi_fft_mod
diff --git a/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch b/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch
new file mode 100644
index 00000000..c50133cc
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch
@@ -0,0 +1,131 @@
+--- pi_fft.c	2010-07-30 13:04:25.000000000 +0900
++++ pi_fft_mod.c	2010-07-31 20:50:11.000000000 +0900
+@@ -25,7 +25,75 @@
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <time.h>
++#include <sys/time.h>
++#include <unistd.h>
+ 
++/****/
++
++#include <stdint.h>
++#include "SIMDBase.h"
++#include "DFT.h"
++
++DFT* dft[64]; /* dft[k] is the plan for transform length 1 << k; filled by initdft, read by rdft */
++
++/* Fill dft[1..floor(log2(n))] with alternative-real plans, cached in nsfftplan.<hostname>.txt. */
++void initdft(int n) {
++  int i, logn = 31 - __builtin_clz(n), writeflag = 0;
++  char buf[20] = "", fn[256];  /* zero-filled: gethostname need not terminate a truncated name */
++  gethostname(buf, sizeof(buf) - 1);
++  snprintf(fn, sizeof(fn), "nsfftplan.%s.txt", buf);
++  FILE *fp = fopen(fn, "r");
++  if (fp != NULL) {
++    for(i=1;i<=logn;i++) {
++      int err = DFT_ERROR_NOERROR;
++      dft[i] = DFT_fread(fp, &err);
++      if (dft[i] == NULL || err != DFT_ERROR_NOERROR) {
++        printf("error when reading plan %d : %d\n", i, err);
++        break;
++      }
++      if (DFT_getPlanParamInt(DFT_PARAMID_MODE, dft[i]) != SIMDBase_MODE_PUREC_DOUBLE ||
++          DFT_getPlanParamInt(DFT_PARAMID_FFT_LENGTH, dft[i]) != (1 << i) ||
++          DFT_getPlanParamInt(DFT_PARAMID_IS_ALT_REAL_TRANSFORM, dft[i]) != 1) {
++        fprintf(stderr, "plan not compatible : %d\n", i);
++        /* Drop the plan so the loop below rebuilds it instead of rdft using it. */
++        DFT_dispose(dft[i], DFT_getPlanParamInt(DFT_PARAMID_MODE, dft[i]));
++        dft[i] = NULL;
++        break;
++      }
++    }
++    fclose(fp);
++  }
++
++  /* Plan every length the cache did not supply, and remember to save the result. */
++  for(i=1;i<=logn;i++) {
++    if (dft[i] != NULL) continue;
++    dft[i] = DFT_init(SIMDBase_MODE_PUREC_DOUBLE, 1 << i, DFT_FLAG_ALT_REAL | DFT_FLAG_LIGHT_TEST_RUN | DFT_FLAG_VERBOSE);
++    if (dft[i] == NULL) {
++      printf("dft[%d] == NULL\n", i);
++      exit(-1);
++    }
++    writeflag = 1;
++  }
++
++  if (!writeflag) return;
++  fp = fopen(fn, "w");  /* best effort: without the file the plans are simply rebuilt next run */
++  if (fp == NULL) return;
++  for(i=1;i<=logn;i++) DFT_fwrite(dft[i], fp);
++  fclose(fp);
++}
++
++void rdft(int n, int isgn, double *a, int *ip, double *w) {
++  /* floor(log2(n)) selects the plan; ip and w are accepted but not used here. */
++  DFT_execute(dft[31 - __builtin_clz(n)], SIMDBase_MODE_PUREC_DOUBLE, a, isgn);
++}
++
++double timeofday(void) { /* gettimeofday() result as seconds, tv_usec as the fraction */
++  struct timeval now;
++  gettimeofday(&now, NULL);
++  return (double)now.tv_sec + (1e-6) * now.tv_usec;
++}
++
++/****/
+ 
+ void mp_load_0(int n, int radix, int out[]);
+ void mp_load_1(int n, int radix, int out[]);
+@@ -67,7 +135,7 @@
+     double err, d_time, n_op;
+     int *a, *b, *c, *e, *i1, *i2, *ip;
+     double *d1, *d2, *d3, *w;
+-    time_t t_1, t_2;
++    double t_1, t_2;
+     FILE *f_log, *f_out;
+     
+     f_log = fopen("pi.log", "w");
+@@ -96,6 +164,8 @@
+         exit(1);
+     }
+     ip[0] = 0;
++
++    initdft(nfft);
+     /* ---- radix test ---- */
+     log10_radix = 1;
+     radix = 10;
+@@ -111,7 +181,7 @@
+     printf("calculating %d digits of PI...\n", log10_radix * (n - 2));
+     fprintf(f_log, "calculating %d digits of PI...\n", log10_radix * (n - 2));
+     /* ---- time check ---- */
+-    time(&t_1);
++    t_1 = timeofday();
+     /*
+      * ---- a formula based on the AGM (Arithmetic-Geometric Mean) ----
+      *   c = sqrt(0.125);
+@@ -216,10 +286,10 @@
+     mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w);
+     mp_idiv(n, radix, a, npow, a);
+     /* ---- time check ---- */
+-    time(&t_2);
++    t_2 = timeofday();
+     /* ---- output ---- */
+     f_out = fopen("pi_mod.dat", "w");
+-    printf("writing pi.dat...\n");
++    printf("writing pi_mod.dat...\n");
+     mp_fprintf(n - 1, log10_radix, a, f_out);
+     fclose(f_out);
+     free(d3);
+@@ -238,9 +308,9 @@
+     printf("floating point operation: %g op.\n", n_op);
+     fprintf(f_log, "floating point operation: %g op.\n", n_op);
+     /* ---- difftime ---- */
+-    d_time = difftime(t_2, t_1);
+-    printf("execution time: %g sec. (real time)\n", d_time);
+-    fprintf(f_log, "execution time: %g sec. (real time)\n", d_time);
++    d_time = t_2 - t_1;
++    printf("execution time: %.5g sec. (real time)\n", d_time);
++    fprintf(f_log, "execution time: %.5g sec. (real time)\n", d_time);
+     fclose(f_log);
+     return 0;
+ }
diff --git a/plugins/supereq/nsfft-1.00/doc/default.css b/plugins/supereq/nsfft-1.00/doc/default.css
new file mode 100644
index 00000000..09721163
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/doc/default.css
@@ -0,0 +1,34 @@
+body {margin-left: 1.5cm; padding-left: 0.1cm; margin-right: 1.5cm; padding-right: 0.1cm; margin-top: 2.5cm; padding-top: 0.5cm; margin-bottom: 1cm; padding-bottom: 1.0cm; border-top-style:solid; border-bottom-style:solid; }
+h1 {font-family: arial, sans-serif; font-weight: bold; font-style: italic; margin-top: 0.8cm; }
+h2 {font-family: arial, sans-serif; font-weight: bold; font-style: italic; margin-top: 0.8cm; }
+h3 {font-family: arial, sans-serif; font-weight: bold; margin-top: 1.2cm; margin-bottom: 0.8cm; }
+h4 {font-family: arial, sans-serif; font-weight: bold; margin-top: 1.2cm; margin-bottom: 0.8cm; }
+p {font-family: Georgia, "Times New Roman", times, serif; margin-top: 0.3cm; margin-left: 0.5cm; margin-bottom: 0.3cm;}
+p.dir {font-family: arial, sans-serif; margin-top: 0cm; margin-bottom: 0cm;}
+dl { margin-left: 0.5cm; }
+dt { font-weight: bold; }
+a:link {color: black;}
+a:visited {color: black;}
+ul.disc {list-style-type: disc; font-family: times, serif;}
+ul.circle {list-style-type: circle; font-family: times, serif;}
+ul.square {list-style-type: square; font-family: times, serif;}
+ul.none {list-style-type: none; font-family: times, serif;}
+pre.code { margin-top: 1.0cm; margin-bottom: 1.0cm; margin-left: 1.0cm; margin-right: 1.0cm; border:3px solid #c0c0c0; padding: 0.5cm; font-family: tahoma, sans-serif; font-weight: normal; background-color:#f8f8f8; }
+pre.command { margin-top: 1.0cm; margin-bottom: 1.0cm; margin-left: 1.5cm; margin-right: 0.0cm; border:0px; padding:0.0cm; font-family: tahoma, sans-serif; font-weight: bold; background-color:#f8fffc; }
+ol.level1 { font-family: arial, sans-serif; font-weight: bold;  font-style: italic; font-size:1.5em; }
+ol.level2 { font-family: "Times New Roman", serif; font-weight: normal; font-style: normal; font-size:0.85em; margin-top: 0.2cm; margin-bottom: 0.5cm; }
+table.figure { margin-left:auto; margin-right:auto; margin-top:1.0cm; margin-bottom:1.0cm; }
+/* Table cells: captions use a small sans-serif face, every other cell a serif face. */
+td.caption { font-family: arial, sans-serif; font-size: 75%; color: black; }
+td { font-family: times, serif; }
+/* "lt" tables: the class suffix lists the bordered sides (b = bottom, l = left, r = right); lt-hl is a 2px-high cell with only a bottom border. */
+table.lt { border-collapse: collapse; border-style: none; }
+td.lt- { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-width: 1px; border-style: none; }
+td.lt-r { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-right-style: solid; border-width: 1px; border-color: black; }
+td.lt-l { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-left-style: solid; border-width: 1px; border-color: black; }
+td.lt-lr { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-right-style: solid; border-left-style: solid; border-width: 1px; border-color: black; }
+td.lt-b { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-width: 1px; border-color: black; }
+td.lt-hl { margin: 0px; border-style: none; border-bottom-style: solid; border-width: 1px; border-color: black; height: 2px; }
+td.lt-bl { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-left-style: solid; border-width: 1px; border-color: black; }
+td.lt-br { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-right-style: solid; border-width: 1px; border-color: black; }
+td.lt-blr { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-left-style: solid; border-right-style: solid; border-width: 1px; border-color: black; }
diff --git a/plugins/supereq/nsfft-1.00/doc/index.xhtml b/plugins/supereq/nsfft-1.00/doc/index.xhtml
new file mode 100644
index 00000000..8b7e2c97
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/doc/index.xhtml
@@ -0,0 +1,2016 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+<link rel="stylesheet" type="text/css" href="default.css"/>
+<title>NSFFT Reference Manual</title>
+</head>
+<body>
+<h1>NSFFT Reference Manual</h1>
+
+<h3>Introduction</h3>
+
+<p>
+This is a library for performing 1-dimensional discrete Fourier
+transforms. NSFFT is a simple, small and portable library, and it is
+efficient since it can utilize SIMD instruction sets in modern
+processors. It performs multiple transforms simultaneously, and thus
+it is especially suitable for digital signal processing. It does not
+need so much computation to make a good execution plan. This library
+is in public domain, so that you can incorporate this library into
+your product without any obligation.
+</p>
+
+<h3>API Reference</h3>
+
+<p>
+In this section, the API functions are explained.
+</p>
+
+<h4>Include files</h4>
+
+<p>
+You have to include two include files in dft directory.
+</p>
+
+<pre class="code">
+#include &lt;stdint.h&gt;
+#include "SIMDBase.h"
+#include "DFT.h"
+</pre>
+
+<h4>Data types</h4>
+
+<p>
+First, you have to choose a data type to represent an element in the
+input and output sequence of numbers. You can choose from the
+following three types.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-b" align="center">Data Type</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_TYPE_FLOAT</td>
+	  <td class="lt-" align="left">float type in C language</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_TYPE_DOUBLE</td>
+	  <td class="lt-" align="left">double type in C language</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">SIMDBase_TYPE_LONGDOUBLE</td>
+	  <td class="lt-b" align="left">long double type in C language</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 1 Data types</td>
+  </tr>
+</table>
+
+
+<h4>Computation modes</h4>
+
+<p>
+Next, a computation mode has to be chosen. You can choose from the
+following modes.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-br" align="center">Type</td>
+	  <td class="lt-br" align="center">Vector Length</td>
+	  <td class="lt-b" align="center">Computation Mode</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_PUREC_FLOAT</td>
+	  <td class="lt-r" align="center">float</td>
+	  <td class="lt-r" align="center">1</td>
+	  <td class="lt-" align="center">Scalar float</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_PUREC_DOUBLE</td>
+	  <td class="lt-r" align="center">double</td>
+	  <td class="lt-r" align="center">1</td>
+	  <td class="lt-" align="center">Scalar double</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_PUREC_LONGDOUBLE</td>
+	  <td class="lt-r" align="center">long double</td>
+	  <td class="lt-r" align="center">1</td>
+	  <td class="lt-" align="center">Scalar long double</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_SSE_FLOAT</td>
+	  <td class="lt-r" align="center">float</td>
+	  <td class="lt-r" align="center">4</td>
+	  <td class="lt-" align="center">x86 SSE</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_SSE2_DOUBLE</td>
+	  <td class="lt-r" align="center">double</td>
+	  <td class="lt-r" align="center">2</td>
+	  <td class="lt-" align="center">x86 SSE2</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_NEON_FLOAT</td>
+	  <td class="lt-r" align="center">float</td>
+	  <td class="lt-r" align="center">4</td>
+	  <td class="lt-" align="center">ARM NEON</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_AVX_FLOAT</td>
+	  <td class="lt-r" align="center">float</td>
+	  <td class="lt-r" align="center">8</td>
+	  <td class="lt-" align="center">x86 AVX (float)</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_AVX_DOUBLE</td>
+	  <td class="lt-r" align="center">double</td>
+	  <td class="lt-r" align="center">4</td>
+	  <td class="lt-" align="center">x86 AVX (double)</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">SIMDBase_MODE_ALTIVEC_FLOAT</td>
+	  <td class="lt-br" align="center">float</td>
+	  <td class="lt-br" align="center">4</td>
+	  <td class="lt-b" align="center">PowerPC Altivec</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 2 Computation modes</td>
+  </tr>
+</table>
+
+<p>
+The following function automatically checks the availability of each
+instruction set on your computer, and chooses the best computation
+mode.
+</p>
+
+<pre class="code">
+int32_t SIMDBase_chooseBestMode(int32_t type);
+</pre>
+
+<p>
+The return value is the best mode chosen by this routine.
+<i>type</i> is the data type you chose.
+</p>
+
+
+<h4>Retrieving parameters</h4>
+
+<p>
+You can make queries for any mode using the following function.
+</p>
+
+<pre class="code">
+int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode);
+</pre>
+
+<p>
+<i>mode</i> is the computation mode you chose. <i>paramId</i> is one
+of the following.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-b" align="center">Meaning</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_PARAMID_SIZE_OF_REAL</td>
+	  <td class="lt-" align="left">Size of an element in a vector in bytes</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_PARAMID_SIZE_OF_VECT</td>
+	  <td class="lt-" align="left">Size of the vector in bytes</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_PARAMID_VECTOR_LEN</td>
+	  <td class="lt-" align="left">Number of elements in a vector</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">SIMDBase_PARAMID_MODE_AVAILABILITY</td>
+	  <td class="lt-b" align="left">Whether the given mode is available or not</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 3 Querying parameter for computation mode</td>
+  </tr>
+</table>
+
+<p>
+Here, a vector is a set of multiple primitive data elements (single or
+double precision FP number) which can be stored in one SIMD register,
+and can be processed by one SIMD instruction at the same time.
+</p>
+
+<p>
+You can get the mode name as a string. In this
+case, <i>paramId</i> must be SIMDBase_PARAMID_MODE_NAME.
+</p>
+
+<pre class="code">
+char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode);
+</pre>
+
+<p>
+You should not modify the data returned by the above function.
+</p>
+
+
+<h4>Making and destroying execution plan</h4>
+
+<p>
+An execution plan can be made by the following function.
+</p>
+
+<pre class="code">
+DFT *DFT_init(int32_t mode, int32_t n, int32_t flags);
+</pre>
+
+<p>
+The return value is a pointer to a newly made plan.
+<i>mode</i> is the mode you chose above. <i>n</i> is the length of a
+transform. You can specify a bitwise OR of the following symbols
+as <i>flags</i>. You should not specify more than one flag regarding
+the test run. You should not specify DFT_FLAG_FORCE_RECURSIVE and
+DFT_FLAG_FORCE_COBRA at the same time. If neither DFT_FLAG_REAL nor
+DFT_FLAG_ALT_REAL is specified, an execution plan for complex
+transforms is made.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-b" align="center">Meaning</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_NO_TEST_RUN</td>
+	  <td class="lt-" align="left">Make execution plan without performing a test run</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_LIGHT_TEST_RUN</td>
+	  <td class="lt-" align="left">Perform small amount of test run to make an execution plan</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_HEAVY_TEST_RUN</td>
+	  <td class="lt-" align="left">Perform large amount of test run to make an execution plan</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_EXHAUSTIVE_TEST_RUN</td>
+	  <td class="lt-" align="left">Perform exhaustive search of parameters and find the optimal execution plan</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_REAL</td>
+	  <td class="lt-" align="left">Make an execution plan for a real transform</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_ALT_REAL</td>
+	  <td class="lt-" align="left">Make an execution plan for an alternative real transform</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_VERBOSE</td>
+	  <td class="lt-" align="left">Make some noise during making an execution plan</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_NOBITREVERSAL</td>
+	  <td class="lt-" align="left">Do not perform the bit-reversal operation during a transform</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_FORCE_RECURSIVE</td>
+	  <td class="lt-" align="left">Force using the recursive bit-reversal routine. This routine is suited for small transforms.</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">DFT_FLAG_FORCE_COBRA</td>
+	  <td class="lt-b" align="left">Force using the Cobra bit-reversal routine. This routine is suited for large transforms.</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 4 Options for making execution plan</td>
+  </tr>
+</table>
+
+<p>
+You can destroy the plan you made by the following function.
+</p>
+
+<pre class="code">
+void DFT_dispose(DFT *p, int32_t mode);
+</pre>
+
+<p>
+<i>p</i> is a pointer to the execution plan. <i>mode</i> is the
+corresponding execution mode.
+</p>
+
+<p>
+You can retrieve parameters of a plan using the following function.
+</p>
+
+<pre class="code">
+int32_t DFT_getPlanParamInt(int32_t paramId, void *p);
+</pre>
+
+<p>
+<i>p</i> is a pointer to an execution plan. <i>paramId</i> is one
+of the following.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-b" align="center">Meaning</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_PARAMID_TYPE</td>
+	  <td class="lt-" align="left">Data type</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_PARAMID_MODE</td>
+	  <td class="lt-" align="left">Computation mode</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_PARAMID_FFT_LENGTH</td>
+	  <td class="lt-" align="left">Length of the transform</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_PARAMID_IS_REAL_TRANSFORM</td>
+	  <td class="lt-" align="left">Whether the plan is for real transforms</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_PARAMID_NO_BIT_REVERSAL</td>
+	  <td class="lt-" align="left">Whether the plan does not perform bit reversal operation</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">DFT_PARAMID_TEST_RUN</td>
+	  <td class="lt-b" align="left">How much test run is performed when making this plan</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 5 Querying parameter for execution plan</td>
+  </tr>
+</table>
+
+<h4>Writing and reading execution plan to/from file</h4>
+
+<p>
+You can write or read an execution plan to/from a file using the following functions.
+</p>
+
+<pre class="code">
+int32_t DFT_fwrite(DFT *p, FILE *fp);
+DFT *DFT_fread(FILE *fp, int32_t *errcode);
+</pre>
+
+<p>
+<i>p</i> is a pointer to a plan. <i>fp</i> is a file
+pointer. DFT_fwrite returns 1 if the plan is successfully written, and
+0 if an error occurs. DFT_fread returns the pointer to the read plan
+if the plan is successfully read, and NULL if an error occurs. If an
+error occurs, an error code is returned to a variable whose pointer is
+specified by <i>errcode</i>. The interpretation of error codes is
+given below.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-b" align="center">Meaning</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_NOERROR</td>
+	  <td class="lt-" align="left">No error</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_FILE_VERSION</td>
+	  <td class="lt-" align="left">File format version mismatch</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_FILE_IO</td>
+	  <td class="lt-" align="left">I/O error</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_UNEXPECTED_EOF</td>
+	  <td class="lt-" align="left">Unexpected EOF</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_MODE_NOT_COMPILED_IN</td>
+	  <td class="lt-" align="left">Tried to read a plan with mode that is not compiled in</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_MODE_NOT_AVAILABLE</td>
+	  <td class="lt-" align="left">Tried to read a plan with mode that is not supported by hardware</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">DFT_ERROR_UNKNOWN_MODE</td>
+	  <td class="lt-b" align="left">Tried to read a plan with mode that is unknown by library</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 6 Errors that may happen during file I/O</td>
+  </tr>
+</table>
+
+
+<h4>Allocating and freeing buffers for transforms</h4>
+
+<p>
+In order to allocate word-aligned buffers for storing data which is
+fed to the FFT routine, you have to use the following function.
+</p>
+
+<pre class="code">
+void *DFT_alignedMalloc(uint64_t size);
+</pre>
+
+<p>
+This function allocates <i>size</i> bytes of word-aligned memory and
+returns the pointer. In order to free this memory, you have to use the
+following function.
+</p>
+
+<pre class="code">
+void DFT_alignedFree(void *ptr);
+</pre>
+
+<p>
+<i>ptr</i> is the pointer returned from DFT_alignedMalloc function.
+</p>
+
+<h4>Executing transform</h4>
+
+<p>
+By the following function, the planned transform can be executed.
+</p>
+
+<pre class="code">
+void DFT_execute(DFT *p, int32_t mode, void *s, int32_t dir);
+</pre>
+
+<p>
+<i>p</i> is a pointer to the plan. <i>mode</i> is the computation
+mode. <i>s</i> is the pointer to the buffer in which the sequence of
+input values is stored. This pointer must be a pointer returned from
+DFT_alignedMalloc function.
+<i>dir</i> specifies the direction of transform.
+</p>
+
+<p>
+The forward and backward discrete Fourier transforms are defined by
+the following formula (1) and (2), respectively.
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <msub><mi>X</mi><mi>k</mi></msub>
+	  <mo>=</mo>
+	  <munderover>
+	    <mo style="font-size:140%;">&Sum;</mo>
+	    <mrow><mi>n</mi><mo>=</mo><mn>0</mn></mrow>
+	    <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+	  </munderover>
+	  <msub><mi>x</mi><mi>n</mi></msub>
+	  <msup>
+	    <mi>e</mi>
+	    <mrow>
+	      <mo>-</mo>
+	      <mfrac>
+		<mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+		<mi>N</mi>
+	      </mfrac>
+	      <mi>k</mi><mi>n</mi>
+	    </mrow>
+	  </msup>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mi>k</mi>
+	  <mo>=</mo>
+	  <mn>0</mn>
+	  <mo>,</mo>
+	  <mo>&middot;</mo>
+	  <mo>&middot;</mo>
+	  <mo>&middot;</mo>
+	  <mo>,</mo>
+	  <mi>N</mi>
+	  <mo>-</mo>
+	  <mn>1</mn>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(1)</p>
+    </td>
+  </tr>
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <msub><mi>x</mi><mi>n</mi></msub>
+	  <mo>=</mo>
+	  <mfrac>
+	    <mn>1</mn>
+	    <mi>N</mi>
+	  </mfrac>
+	  <munderover>
+	    <mo style="font-size:140%;">&Sum;</mo>
+	    <mrow><mi>k</mi><mo>=</mo><mn>0</mn></mrow>
+	    <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+	  </munderover>
+	  <msub><mi>X</mi><mi>k</mi></msub>
+	  <msup>
+	    <mi>e</mi>
+	    <mrow>
+	      <mfrac>
+		<mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+		<mi>N</mi>
+	      </mfrac>
+	      <mi>k</mi><mi>n</mi>
+	    </mrow>
+	  </msup>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mi>n</mi>
+	  <mo>=</mo>
+	  <mn>0</mn>
+	  <mo>,</mo>
+	  <mo>&middot;</mo>
+	  <mo>&middot;</mo>
+	  <mo>&middot;</mo>
+	  <mo>,</mo>
+	  <mi>N</mi>
+	  <mo>-</mo>
+	  <mn>1</mn>
+
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(2)</p>
+    </td>
+  </tr>
+</table>
+
+<p>
+The complex forward and backward transforms perform the transforms
+defined by the following formula (3) and (4), respectively. <i>V</i>
+is the vector length mentioned above. Again, calling DFT_execute once
+performs <i>V</i> forward or backward transforms at a time. Please
+note that (4) gives values multiplied by <i>N</i> compared to
+(2). Specifying -1 as the direction of transform performs the
+transform defined by (3). In this case, the input should be given as
+in (5), and the output is given as in (6).  Specifying 1 as the
+direction of transform performs the transform defined by (4), and in
+this case, the input should be given as in (6), and the output is
+given as in (5).
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <mo>=</mo>
+	  <munderover>
+	    <mo style="font-size:140%;">&Sum;</mo>
+	    <mrow><mi>n</mi><mo>=</mo><mn>0</mn></mrow>
+	    <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+	  </munderover>
+	  <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <msup>
+	    <mi>e</mi>
+	    <mrow>
+	      <mo>-</mo>
+	      <mfrac>
+		<mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+		<mi>N</mi>
+	      </mfrac>
+	      <mi>k</mi><mi>n</mi>
+	    </mrow>
+	  </msup>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>k</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(3)</p>
+    </td>
+  </tr>
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <mo>=</mo>
+	  <munderover>
+	    <mo style="font-size:140%;">&Sum;</mo>
+	    <mrow><mi>k</mi><mo>=</mo><mn>0</mn></mrow>
+	    <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+	  </munderover>
+	  <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <msup>
+	    <mi>e</mi>
+	    <mrow>
+	      <mfrac>
+		<mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+		<mi>N</mi>
+	      </mfrac>
+	      <mi>k</mi><mi>n</mi>
+	    </mrow>
+	  </msup>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>n</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(4)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>n</mi>
+		    <mo>+</mo>
+		    <mn>0</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+
+		    <mo>=</mo>
+
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>n</mi>
+		    <mo>+</mo>
+		    <mn>1</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+
+		    <mo>=</mo>
+
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>n</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(5)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>k</mi>
+		    <mo>+</mo>
+		    <mn>0</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+
+		    <mo>=</mo>
+
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>k</mi>
+		    <mo>+</mo>
+		    <mn>1</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+
+		    <mo>=</mo>
+
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>k</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(6)</p>
+    </td>
+  </tr>
+</table>
+
+<p>
+The real forward transform performs the transform defined by (3) when
+the condition (7) is satisfied. In this case, the output satisfies
+(8). You should specify -1 as the direction of transform, and the
+input should be given as in (9), and the output is given as in (10).
+The real backward transform is the opposite of the real forward
+transform. The input should satisfy (8) and the output satisfies (7).
+You should specify 1 as the direction of transform, and the input
+should be given as in (10), and the output is given as in (11).
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mi>Im</mi>
+	  <mo>(</mo>
+	  <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <mo>)</mo>
+	  <mo>=</mo>
+	  <mn>0</mn>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>n</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(7)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>=</mo>
+		    <msubsup>
+		      <mi>X</mi>
+		      <mrow><mi>N</mi><mo>-</mo><mi>k</mi><mo>,</mo><mi>v</mi></mrow>
+		      <mo>*</mo>
+		    </msubsup>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>k</mi>
+		    <mo>=</mo>
+		    <mn>1</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>k</mi>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(8)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mrow>
+	    <mi>s</mi>
+	    <mo>[</mo>
+	    <mi>n</mi>
+	    <mi>V</mi>
+	    <mo>+</mo>
+	    <mi>v</mi>
+	    <mo>]</mo>
+
+	    <mo>=</mo>
+
+	    <mi>Re</mi>
+	    <mo>(</mo>
+	    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	    <mo>)</mo>
+	  </mrow>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>n</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(9)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>k</mi>
+		    <mo>+</mo>
+		    <mn>0</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>k</mi>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>N</mi><mo>/</mo><mn>2</mn><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>k</mi>
+		    <mo>+</mo>
+		    <mn>1</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>k</mi>
+		    <mo>=</mo>
+		    <mn>1</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(10)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mrow>
+	    <mn>2</mn>
+	    <mo> &nbsp; </mo>
+	    <mi>s</mi>
+	    <mo>[</mo>
+	    <mi>n</mi>
+	    <mi>V</mi>
+	    <mo>+</mo>
+	    <mi>v</mi>
+	    <mo>]</mo>
+
+	    <mo>=</mo>
+
+	    <mi>Re</mi>
+	    <mo>(</mo>
+	    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	    <mo>)</mo>
+	  </mrow>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>n</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(11)</p>
+    </td>
+  </tr>
+</table>
+
+<p>
+The alternative real transforms are defined by (12) to (16), similarly
+to the real transforms. The alternative transforms are handy if you
+are migrating from the FFT library made by Prof. Takuya Ooura.  You
+should specify 1 as the direction in order to perform a forward
+transform, and -1 when you perform a backward transform.
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mi>Im</mi>
+	  <mo>(</mo>
+	  <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <mo>)</mo>
+	  <mo>=</mo>
+	  <mn>0</mn>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>k</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(12)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>=</mo>
+		    <msubsup>
+		      <mi>x</mi>
+		      <mrow><mi>N</mi><mo>-</mo><mi>n</mi><mo>,</mo><mi>v</mi></mrow>
+		      <mo>*</mo>
+		    </msubsup>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>n</mi>
+		    <mo>=</mo>
+		    <mn>1</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>n</mi>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(13)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mrow>
+	    <mi>s</mi>
+	    <mo>[</mo>
+	    <mi>k</mi>
+	    <mi>V</mi>
+	    <mo>+</mo>
+	    <mi>v</mi>
+	    <mo>]</mo>
+
+	    <mo>=</mo>
+
+	    <mi>Re</mi>
+	    <mo>(</mo>
+	    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	    <mo>)</mo>
+	  </mrow>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>k</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(14)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>n</mi>
+		    <mo>+</mo>
+		    <mn>0</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>n</mi>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>N</mi><mo>/</mo><mn>2</mn><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>n</mi>
+		    <mo>+</mo>
+		    <mn>1</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>n</mi>
+		    <mo>=</mo>
+		    <mn>1</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(15)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mrow>
+	    <mn>2</mn>
+	    <mo> &nbsp; </mo>
+	    <mi>s</mi>
+	    <mo>[</mo>
+	    <mi>k</mi>
+	    <mi>V</mi>
+	    <mo>+</mo>
+	    <mi>v</mi>
+	    <mo>]</mo>
+
+	    <mo>=</mo>
+
+	    <mi>Re</mi>
+	    <mo>(</mo>
+	    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	    <mo>)</mo>
+	  </mrow>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>k</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(16)</p>
+    </td>
+  </tr>
+</table>
+
+
+<h3>Examples</h3>
+
+<p>
+Below is example code that uses the nsfft library.
+</p>
+
+<pre class="code">
+#include &lt;stdio.h&gt;
+#include &lt;stdlib.h&gt;
+#include &lt;math.h&gt;
+#include &lt;stdint.h&gt;
+#include &lt;complex.h&gt;
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+
+#define THRES 1e-3
+// Twiddle factor exp(-2*pi*i*kn/n) used by the reference DFT below.
+double complex omega(double n, double kn) {
+  return cexp((-2 * M_PI * _Complex_I / n) * kn);
+}
+// Naive O(len^2) reference DFT: fs[k] = sum over n of ts[n] * omega(len, n*k).
+void forward(double complex *ts, double complex *fs, int len) {
+  int k, n;
+
+  for(k=0;k&lt;len;k++) {
+    fs[k] = 0;
+
+    for(n=0;n&lt;len;n++) {
+      fs[k] += ts[n] * omega(len, n*k);
+    }
+  }
+}
+// Checks one nsfft transform (direction -1) against the reference DFT and prints OK or NG.
+int main(int argc, char **argv) {
+  const int n = 256;
+
+  int mode = SIMDBase_chooseBestMode(TYPE);
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  // Create the plan for length n and allocate the work buffer with SIMDBase_alignedMalloc.
+
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  // Fill veclen random complex sequences and interleave them into sx as in (5).
+
+  double complex ts[veclen][n], fs[veclen][n];
+
+  for(j=0;j&lt;veclen;j++) {
+    for(i=0;i&lt;n;i++) {
+      ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+      sx[(i*2+0)*veclen+j] = creal(ts[j][i]);
+      sx[(i*2+1)*veclen+j] = cimag(ts[j][i]);
+    }
+  }
+
+  // Transform sx in place with nsfft (direction -1), then compute the reference result.
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j&lt;veclen;j++) {
+    forward(ts[j], fs[j], n);
+  }
+
+  // Compare real and imaginary parts of every output element, with tolerance THRES.
+
+  int success = 1;
+
+  for(j=0;j&lt;veclen;j++) {
+    for(i=0;i&lt;n;i++) {
+      if ((fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) &gt; THRES) ||
+	  (fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) &gt; THRES)) {
+	success = 0;
+      }
+    }
+  }
+
+  printf("%s\n", success ? "OK" : "NG");
+
+  // Release the buffer and the plan.
+
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  exit(0);
+}
+</pre>
+
+<p>
+Save this code as DFTExample.c in a subdirectory directly under the
+library's root directory, so that ../simd and ../dft refer to the
+library directories. You can then compile it with the following command.
+</p>
+
+<pre class="code">
+gcc -Wall -g -I ../simd -I ../dft -L../simd -L../dft -O DFTExample.c -lDFT -lSIMD -lm -o DFTExample
+</pre>
+
+<h3>Compilation</h3>
+
+<p>
+The nsfft source package includes a few makefiles for various
+architectures. In each of the <i>dft</i> and <i>simd</i> directories, make a
+symbolic link named Makefile to the makefile suited to your computer.
+</p>
+
+</body>
+</html>
diff --git a/plugins/supereq/nsfft-1.00/doc/nsfft.pdf b/plugins/supereq/nsfft-1.00/doc/nsfft.pdf
new file mode 100644
index 00000000..ed4ad5db
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/doc/nsfft.pdf
diff --git a/plugins/supereq/nsfft-1.00/ooura/Makefile b/plugins/supereq/nsfft-1.00/ooura/Makefile
new file mode 100644
index 00000000..bad1679e
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/ooura/Makefile
@@ -0,0 +1,11 @@
+CC=gcc
+BASEOPT=-Wall -g
+OPT=$(BASEOPT) -O3
+# Default target: compile fftsg.c into fftsg.o.
+all : fftsg.o
+# Remove editor backups, objects and a.out.
+clean :
+	rm -f *~ *.o a.out
+# fftsg.c has to be placed in this directory first (see the README here).
+fftsg.o : fftsg.c
+	$(CC) $(OPT) -c fftsg.c
diff --git a/plugins/supereq/nsfft-1.00/ooura/README b/plugins/supereq/nsfft-1.00/ooura/README
new file mode 100644
index 00000000..d7ddefc2
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/ooura/README
@@ -0,0 +1,2 @@
+Please put fftsg.c and pi_fft.c, which are included in Prof. Takuya
+Ooura's FFT package, in this directory.
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile b/plugins/supereq/nsfft-1.00/simd/Makefile
new file mode 120000
index 00000000..5d253498
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile
@@ -0,0 +1 @@
+Makefile.x86avx
+\ No newline at end of file
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.altivec b/plugins/supereq/nsfft-1.00/simd/Makefile.altivec
new file mode 100644
index 00000000..eeaed6a1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.altivec
@@ -0,0 +1,26 @@
+CC=gcc
+BASEOPT=-Wall -maltivec -mabi=altivec
+OPT=$(BASEOPT) -O3
+# Default target: the static library with the pure C and AltiVec modes.
+all : libSIMD.a
+# SIMDBaseUndiff.c is compiled once per mode; a -DENABLE_* macro selects the mode.
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+SIMDBaseUndiff_altivecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_ALTIVEC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_altivecfloat.o
+# SIMDBase.c is compiled with -O and with the macros of all modes in this library.
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_ALTIVEC_FLOAT SIMDBase.c -c -o SIMDBase.o
+# Delete any old archive first, because "ar q" appends without replacing members.
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_altivecfloat.o
+	rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_altivecfloat.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.neon b/plugins/supereq/nsfft-1.00/simd/Makefile.neon
new file mode 100644
index 00000000..ace704f1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.neon
@@ -0,0 +1,26 @@
+CC=gcc
+BASEOPT=-Wall -mfloat-abi=softfp
+OPT=$(BASEOPT) -O3
+# Default target: the static library with the pure C and NEON modes.
+all : libSIMD.a
+# SIMDBaseUndiff.c is compiled once per mode; a -DENABLE_* macro selects the mode.
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+# Only the NEON object is compiled with -mfpu=neon.
+SIMDBaseUndiff_neonfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -mfpu=neon -DENABLE_NEON_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_neonfloat.o
+# SIMDBase.c is compiled with -O and with the macros of all modes in this library.
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_NEON_FLOAT SIMDBase.c -c -o SIMDBase.o
+# Delete any old archive first, because "ar q" appends without replacing members.
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_neonfloat.o
+	rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_neonfloat.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.purec b/plugins/supereq/nsfft-1.00/simd/Makefile.purec
new file mode 100644
index 00000000..2c8b04f1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.purec
@@ -0,0 +1,35 @@
+# Pure C build of the SIMD base library (no SIMD instruction set required).
+# Select it by pointing the Makefile symbolic link in this directory at it.
+#
+# Like the other makefiles in this directory, this one produces libSIMD.a.
+# libDFT.a and the DFT objects are not built here: the DFT sources are not in
+# this directory, so rules that depend on them cannot succeed. Build them
+# with the makefiles in ../dft instead.
+
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+
+all : libSIMD.a
+
+# SIMDBaseUndiff.c is compiled once per mode; a -DENABLE_* macro selects the mode.
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+# SIMDBase.c is compiled with -O and with the macros of all modes in this library.
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE SIMDBase.c -c -o SIMDBase.o
+
+# Delete any old archive first, because "ar q" appends without replacing members.
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o
+	rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o
+
+# Remove editor backups, objects, assembly output and archives.
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.x86 b/plugins/supereq/nsfft-1.00/simd/Makefile.x86
new file mode 100644
index 00000000..02f49610
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.x86
@@ -0,0 +1,35 @@
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+# Default target: the static library with the pure C, SSE and SSE2 modes.
+all : libSIMD.a
+# SIMDBaseUndiff.c is compiled once per mode; a -DENABLE_* macro selects the mode.
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBase_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_purecdouble.o
+
+SIMDBase_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBase_pureclongdouble.o
+
+SIMDBase_ssefloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse -DENABLE_SSE_FLOAT SIMDBaseUndiff.c -c -o SIMDBase_ssefloat.o
+
+SIMDBase_sse2double.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_sse2double.o
+# NOTE: the AVX objects below are not part of libSIMD.a here; see Makefile.x86avx.
+SIMDBase_avxfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT SIMDBaseUndiff.c -c -o SIMDBase_avxfloat.o
+
+SIMDBase_avxdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_avxdouble.o
+# SIMDBase.c is compiled with -O and with the macros of the modes archived in libSIMD.a.
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE SIMDBase.c -c -o SIMDBase.o
+# Delete any old archive first, because "ar q" appends without replacing members.
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBase_purecdouble.o SIMDBase_pureclongdouble.o SIMDBase_ssefloat.o SIMDBase_sse2double.o
+	rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBase_purecdouble.o SIMDBase_pureclongdouble.o SIMDBase_ssefloat.o SIMDBase_sse2double.o
+
+clean :
+	rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx b/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx
new file mode 100644
index 00000000..d9d27a2e
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx
@@ -0,0 +1,35 @@
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+# Default target: the static library with the pure C, SSE, SSE2 and AVX modes.
+all : libSIMD.a
+# SIMDBaseUndiff.c is compiled once per mode; a -DENABLE_* macro selects the mode.
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+# The SIMD objects additionally get the matching -msse, -msse2 or -mavx option.
+SIMDBaseUndiff_ssefloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse -DENABLE_SSE_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_ssefloat.o
+
+SIMDBaseUndiff_sse2double.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_sse2double.o
+
+SIMDBaseUndiff_avxfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_avxfloat.o
+
+SIMDBaseUndiff_avxdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_avxdouble.o
+# SIMDBase.c is compiled with -O and with the macros of all modes in this library.
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE -DENABLE_AVX_FLOAT -DENABLE_AVX_DOUBLE SIMDBase.c -c -o SIMDBase.o
+# Delete any old archive first, because "ar q" appends without replacing members.
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_ssefloat.o SIMDBaseUndiff_sse2double.o SIMDBaseUndiff_avxfloat.o SIMDBaseUndiff_avxdouble.o
+	rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_ssefloat.o SIMDBaseUndiff_sse2double.o SIMDBaseUndiff_avxfloat.o SIMDBaseUndiff_avxdouble.o
+
+clean :
+	rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBase.c b/plugins/supereq/nsfft-1.00/simd/SIMDBase.c
new file mode 100644
index 00000000..eb51ee10
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBase.c
@@ -0,0 +1,454 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <string.h>
+
+#include "SIMDBase.h"
+
+// Per-mode probe routines. Each one is produced by compiling
+// SIMDBaseUndiff.c with the matching ENABLE_* macro, which renames
+// SIMDBaseUndiff_DETECT to the name declared here.
+void detect_purec_float(void);
+void detect_purec_double(void);
+void detect_purec_longdouble(void);
+void detect_sse_float(void);
+void detect_sse2_double(void);
+void detect_neon_float(void);
+void detect_avx_float(void);
+void detect_avx_double(void);
+void detect_altivec_float(void);
+
+// Per-mode integer parameter queries (SIMDBaseUndiff_GETMODEPARAMINT renamed
+// per mode); dispatched to by SIMDBase_getModeParamInt().
+int32_t getModeParamInt_purec_float(int32_t paramId);
+int32_t getModeParamInt_purec_double(int32_t paramId);
+int32_t getModeParamInt_purec_longdouble(int32_t paramId);
+int32_t getModeParamInt_sse_float(int32_t paramId);
+int32_t getModeParamInt_sse2_double(int32_t paramId);
+int32_t getModeParamInt_neon_float(int32_t paramId);
+int32_t getModeParamInt_avx_float(int32_t paramId);
+int32_t getModeParamInt_avx_double(int32_t paramId);
+int32_t getModeParamInt_altivec_float(int32_t paramId);
+
+// Per-mode string parameter queries (SIMDBaseUndiff_GETMODEPARAMSTRING
+// renamed per mode); dispatched to by SIMDBase_getModeParamString().
+char * getModeParamString_purec_float(int32_t paramId);
+char * getModeParamString_purec_double(int32_t paramId);
+char * getModeParamString_purec_longdouble(int32_t paramId);
+char * getModeParamString_sse_float(int32_t paramId);
+char * getModeParamString_sse2_double(int32_t paramId);
+char * getModeParamString_neon_float(int32_t paramId);
+char * getModeParamString_avx_float(int32_t paramId);
+char * getModeParamString_avx_double(int32_t paramId);
+char * getModeParamString_altivec_float(int32_t paramId);
+
+// Scratch memory for the per-mode detect_* probes: SIMDBaseUndiff_DETECT
+// loads vectors from offsets 0 and 64 and stores their sum at offset 128.
+uint8_t detectBuffer[256];
+// Shared text buffer: holds the current /proc/cpuinfo line in
+// tryReadingProcCpuinfo() and the CPUID vendor/brand text on x86.
+char SIMDBase_processorNameString[256];
+
+/*
+ * Prefix test. When str1 begins with str2, returns a pointer to the first
+ * character of str1 that follows the prefix; otherwise returns NULL.
+ */
+static char *startsWith(char *str1, char *str2) {
+  const size_t prefixLen = strlen(str2);
+
+  if (strncmp(str1, str2, prefixLen) != 0) return NULL;
+
+  return str1 + prefixLen;
+}
+
+#if defined(__linux__)
+/*
+ * Scans at most the first 100 lines of /proc/cpuinfo for a line that begins
+ * with `entry` and returns a pointer to the text after its ": " separator.
+ *
+ * Each line is read into the global SIMDBase_processorNameString buffer and
+ * the returned pointer points into that buffer, so the result is only valid
+ * until the buffer is written again. Newlines in the buffer are replaced by
+ * spaces before returning.
+ *
+ * Returns NULL when the file cannot be opened, when no line matches, or when
+ * the matching line has no ": " within its first 200 characters.
+ */
+static char *tryReadingProcCpuinfo(char *entry) {
+  int i;
+
+  FILE *fp = fopen("/proc/cpuinfo", "r");
+  if (fp == NULL) return NULL;
+
+  for(i=0;i<100;i++) {
+    char *q;
+    // memset() is declared by <string.h>, which this file includes; bzero()
+    // needs <strings.h> and would otherwise be implicitly declared.
+    memset(SIMDBase_processorNameString, 0, sizeof(SIMDBase_processorNameString));
+    if (fgets(SIMDBase_processorNameString, 255, fp) == NULL) break;
+
+    if ((q = startsWith(SIMDBase_processorNameString, entry)) != NULL) {
+      int j;
+      fclose(fp);
+
+      for(j=0;j<256;j++) {
+        if (SIMDBase_processorNameString[j] == '\n') SIMDBase_processorNameString[j] = ' ';
+      }
+      // Advance to the ':' that separates the key from its value.
+      while(*q != '\0' && *q != ':' && q - SIMDBase_processorNameString < 200) q++;
+      if (q - SIMDBase_processorNameString >= 200) return NULL;
+      if (*q == ':' && *(q+1) == ' ') return q + 2;
+      return NULL;
+    }
+  }
+
+  fclose(fp);
+  return NULL;
+}
+#else
+/* /proc/cpuinfo is only consulted on Linux; elsewhere nothing is found. */
+static char *tryReadingProcCpuinfo(char *entry) { return NULL; }
+#endif
+
+#if defined(__i386__)
+/*
+ * Executes CPUID with EAX=eax and ECX=ecx and stores the resulting
+ * EAX, EBX, ECX and EDX in out[0], out[1], out[2] and out[3].
+ *
+ * EBX is parked in ESI around the instruction rather than being named as an
+ * operand, because on i386 it can be reserved as the PIC register. Only
+ * register operands are used and the stack pointer is left untouched inside
+ * the asm: pushing registers and then writing "=m" outputs is unsafe, since
+ * stack-pointer-relative operand addresses shift after each push.
+ */
+static void SIMDBase_x86cpuid(uint32_t out[4], uint32_t eax, uint32_t ecx) {
+  uint32_t a, b, c, d;
+  __asm__ __volatile__("movl %%ebx, %%esi;  \n\t"
+                       "cpuid;              \n\t"
+                       "xchgl %%ebx, %%esi; \n\t"
+                       : "=a"(a), "=S"(b), "=c"(c), "=d"(d)
+                       : "a"(eax), "c"(ecx)
+                       : "cc");
+  out[0] = a; out[1] = b; out[2] = c; out[3] = d;
+}
+#endif
+
+#if defined(__x86_64__)
+/*
+ * x86-64 CPUID wrapper: runs CPUID with EAX=eax and ECX=ecx and writes the
+ * returned EAX, EBX, ECX, EDX to out[0..3] in that order.
+ */
+static void SIMDBase_x86cpuid(uint32_t out[4], uint32_t eax, uint32_t ecx) {
+  uint32_t regA, regB, regC, regD;
+
+  __asm__ __volatile__ ("cpuid"
+                        : "=a" (regA), "=b" (regB), "=c" (regC), "=d" (regD)
+                        : "a" (eax), "c"(ecx));
+
+  out[0] = regA;
+  out[1] = regB;
+  out[2] = regC;
+  out[3] = regD;
+}
+#endif
+
+#if defined(__i386__) || defined(__x86_64__)
+/*
+ * Fills *p with the cache line size and with the size and associativity of
+ * the data/unified caches reported by CPUID.
+ *
+ * p->size[k] and p->assoc[k] describe cache level k+1. Entries that CPUID
+ * does not report stay 0, except that level 1 falls back to 16 KiB / 4-way
+ * and level 2 to 256 KiB / 4-way. p->linesize falls back to 64 when CPUID
+ * reports 0, so callers never receive a zero line size.
+ *
+ * CPUID leaf 4 is used when its first sub-leaf describes a cache; otherwise
+ * the extended leaves 0x80000005, 0x80000006 and 0x80000008 are used.
+ */
+static void getCacheParam(CacheParam *p) {
+  // Way count for each 4-bit associativity code of leaf 0x80000006.
+  static int l2assoc[] = {0,1,2,0,4,0,8,0,16,0,32,48,64,96,128,-1};
+  int32_t i;
+  uint32_t out[4];
+
+  for(i=0;i<8;i++) {
+    p->size[i] = p->assoc[i] = 0;
+  }
+
+  SIMDBase_x86cpuid(out, 4, 0);
+
+  if ((out[0] & 0xf) != 0) {
+    p->linesize = ((out[1] >> 0) & 2047)+1;
+    for(i=0;i<8;i++) {
+      SIMDBase_x86cpuid(out, 4, i);
+      if ((out[0] & 0xf) == 0) break;
+      int level = (out[0] >> 5) & 0x7;
+      int type  = (out[0] >> 0) & 0xf;
+      int assoc = ((out[1] >> 22) & 1023)+1;
+      int part  = ((out[1] >> 12) & 1023)+1;
+      int lsize = ((out[1] >> 0) & 2047)+1;
+      int nsets = ((out[2] >> 0))+1;
+      int nthre = ((out[0] >> 14) & 1023)+1;
+
+      // Only type 1 and type 3 entries are recorded.
+      if (type != 1 && type != 3) continue;
+      // level is a 3-bit field (0..7); level 0 would index element -1.
+      if (level < 1) continue;
+      p->assoc[level-1] = assoc;
+      p->size[level-1] = (uint64_t)assoc * part * lsize * nsets / nthre;
+    }
+  } else {
+    SIMDBase_x86cpuid(out, 0x80000008U, 0);
+    int ncores = (out[2] & 0xff) + 1;
+
+    SIMDBase_x86cpuid(out, 0x80000005U, 0);
+    p->linesize = out[2] & 255;
+    p->size[0] = (out[2] >> 24) * 1024 / ncores;
+    p->assoc[0] = (out[2] >> 16) & 0xff;
+
+    SIMDBase_x86cpuid(out, 0x80000006U, 0);
+    p->size[1] = (out[2] >> 16) * 1024 / ncores;
+    p->assoc[1] = l2assoc[(out[2] >> 12) & 0xf];
+    p->size[2] = (out[3] >> 18) * 512 * 1024 / ncores;
+    p->assoc[2] = l2assoc[(out[3] >> 12) & 0xf];
+  }
+
+  // A zero line size would later be passed to posix_memalign() as the
+  // alignment and make every aligned allocation fail.
+  if (p->linesize == 0) {
+    p->linesize = 64;
+  }
+
+  if (p->size[0] == 0) {
+    p->size[0] = 16 * 1024;
+    p->assoc[0] = 4;
+  }
+
+  if (p->size[1] == 0) {
+    p->size[1] = 256 * 1024;
+    p->assoc[1] = 4;
+  }
+}
+
+/*
+ * Builds a processor description in the global SIMDBase_processorNameString
+ * buffer and returns that buffer.
+ *
+ * Layout: the 12 bytes returned by CPUID leaf 0 (EBX, EDX, ECX order), a
+ * space, the 48 bytes returned by leaves 0x80000002..0x80000004, a newline
+ * and a terminating NUL (63 bytes in total).
+ */
+char *SIMDBase_getProcessorNameString() {
+  union {
+    uint32_t info[4];
+    uint8_t str[16];
+  } u;
+  int i,j;
+  char *p;
+
+  p = SIMDBase_processorNameString;
+
+  SIMDBase_x86cpuid(u.info, 0, 0);
+
+  // str[4..7] = EBX, str[12..15] = EDX, str[8..11] = ECX
+  for(i=0;i<4;i++) *p++ = u.str[i+4];
+  for(i=0;i<4;i++) *p++ = u.str[i+12];
+  for(i=0;i<4;i++) *p++ = u.str[i+8];
+
+  *p++ = ' ';
+
+  for(i=0;i<3;i++) {
+    SIMDBase_x86cpuid(u.info, i + 0x80000002, 0);
+
+    for(j=0;j<16;j++) {
+      *p++ = u.str[j];
+    }
+  }
+
+  *p++ = '\n';
+  // Terminate explicitly instead of relying on the buffer still being zero.
+  *p = '\0';
+
+  return SIMDBase_processorNameString;
+}
+#else
+/*
+ * Processor name for non-x86 targets. On PowerPC and ARM the "cpu" /
+ * "Processor" entry of /proc/cpuinfo is returned when it can be read, with
+ * "PowerPC" / "ARM" as the fallback; every other target yields "Unknown".
+ */
+char *SIMDBase_getProcessorNameString() {
+#if defined(__powerpc__)
+  char *name = tryReadingProcCpuinfo("cpu");
+  if (name == NULL) name = "PowerPC";
+  return name;
+#elif defined(__arm__)
+  char *name = tryReadingProcCpuinfo("Processor");
+  if (name == NULL) name = "ARM";
+  return name;
+#else
+  return "Unknown";
+#endif
+}
+#endif
+
+/*
+ * Cache line size in bytes: the value reported by getCacheParam() on x86,
+ * and a fixed 64 on every other target.
+ */
+int32_t SIMDBase_sizeOfCachelineInByte() {
+#if !defined(__i386__) && !defined(__x86_64__)
+  return 64;
+#else
+  CacheParam cache;
+
+  getCacheParam(&cache);
+  return cache.linesize;
+#endif
+}
+
+/*
+ * Data cache size in bytes: the sum of the level-2 and level-3 sizes
+ * reported by getCacheParam() on x86, and a fixed 256 KiB elsewhere.
+ */
+int32_t SIMDBase_sizeOfDataCacheInByte() {
+#if !defined(__i386__) && !defined(__x86_64__)
+  return 256 * 1024;
+#else
+  CacheParam cache;
+
+  getCacheParam(&cache);
+  return cache.size[1] + cache.size[2]; // L2 + L3
+#endif
+}
+
+/* Jump target that SIMDBase_detect() arms before running a probe. */
+static jmp_buf sigjmp;
+
+/*
+ * Signal handler installed for SIGILL by SIMDBase_detect(): abandons the
+ * probe by jumping back to sigjmp with value 1.
+ */
+static void sighandler(int signum) {
+  (void)signum;
+  longjmp(sigjmp, 1);
+}
+
+#if defined(ENABLE_AVX_FLOAT) || defined(ENABLE_AVX_DOUBLE)
+/*
+ * Returns 1 when AVX can be used, 0 otherwise. Besides the AVX feature bit
+ * (CPUID.1:ECX bit 28) this requires OSXSAVE (bit 27) and XCR0 bits 1 and 2
+ * set, i.e. the operating system saves XMM and YMM state. Without the XCR0
+ * check AVX would be reported on systems where executing it faults.
+ */
+static int32_t SIMDBase_avxUsable(void) {
+  uint32_t reg[4];
+  uint32_t xcr0lo, xcr0hi;
+
+  SIMDBase_x86cpuid(reg, 1, 0);
+  if ((reg[2] & (1u << 28)) == 0) return 0; // CPU has no AVX
+  if ((reg[2] & (1u << 27)) == 0) return 0; // OSXSAVE clear: XGETBV unusable
+
+  // XGETBV with ECX=0 reads XCR0; emitted as raw bytes so that assemblers
+  // without the mnemonic still accept it.
+  __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0lo), "=d"(xcr0hi) : "c"(0));
+  (void)xcr0hi;
+
+  return (xcr0lo & 0x6) == 0x6;
+}
+#endif
+
+/*
+ * Reports whether the mode `paramId` (a SIMDBase_MODE_* value) can be used.
+ *
+ * Returns  1 when the mode is compiled in and usable on this machine,
+ *          0 when it is compiled in but the processor/OS does not support it,
+ *         -1 when the mode is not compiled in or is unknown.
+ *
+ * Pure C modes only depend on the ENABLE_* macros. SSE/SSE2/AVX are decided
+ * with CPUID. NEON and AltiVec are decided by running the mode's detect_*
+ * probe with a temporary SIGILL handler; the handler that was installed
+ * before the call is put back afterwards, and the signal disposition is not
+ * touched at all when no probe has to run.
+ */
+int32_t SIMDBase_detect(int32_t paramId) {
+#if defined(__i386__) || defined(__x86_64__)
+  uint32_t reg[4];
+#endif
+  void (*probe)(void) = NULL;
+  void (*previous)(int);
+
+  switch(paramId) {
+  case SIMDBase_MODE_PUREC_FLOAT:
+#if defined(ENABLE_PUREC_FLOAT)
+    return 1;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_PUREC_DOUBLE:
+#if defined(ENABLE_PUREC_DOUBLE)
+    return 1;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_PUREC_LONGDOUBLE:
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+    return 1;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_SSE_FLOAT:
+#if defined(ENABLE_SSE_FLOAT)
+    SIMDBase_x86cpuid(reg, 1, 0);
+    return (reg[3] & (1u << 25)) != 0;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_SSE2_DOUBLE:
+#if defined(ENABLE_SSE2_DOUBLE)
+    SIMDBase_x86cpuid(reg, 1, 0);
+    return (reg[3] & (1u << 26)) != 0;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_AVX_FLOAT:
+#if defined(ENABLE_AVX_FLOAT)
+    return SIMDBase_avxUsable();
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_AVX_DOUBLE:
+#if defined(ENABLE_AVX_DOUBLE)
+    return SIMDBase_avxUsable();
+#else
+    return -1;
+#endif
+  default:
+    break;
+  }
+
+  // Remaining modes are probed by executing one of their instructions.
+  switch(paramId) {
+#if defined(ENABLE_NEON_FLOAT)
+  case SIMDBase_MODE_NEON_FLOAT:
+    probe = detect_neon_float;
+    break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case SIMDBase_MODE_ALTIVEC_FLOAT:
+    probe = detect_altivec_float;
+    break;
+#endif
+  default:
+    break;
+  }
+
+  if (probe == NULL) return -1;
+
+  previous = signal(SIGILL, sighandler);
+  if (previous == SIG_ERR) previous = SIG_DFL;
+
+  // probe and previous are not modified after setjmp(), so they are still
+  // valid when sighandler() jumps back here.
+  if (setjmp(sigjmp) == 0) {
+    probe();
+    signal(SIGILL, previous);
+    return 1;
+  }
+
+  signal(SIGILL, previous);
+  return 0;
+}
+
+/*
+ * Picks the preferred usable mode for the element type `typeId` (a
+ * SIMDBase_TYPE_* value). Candidates are tried in a fixed preference order
+ * and the first one for which SIMDBase_detect() returns 1 wins. Returns
+ * SIMDBase_MODE_NONE when no candidate is usable or the type has no modes.
+ */
+int32_t SIMDBase_chooseBestMode(int32_t typeId) {
+  static const int32_t floatOrder[] = {
+    SIMDBase_MODE_AVX_FLOAT,
+    SIMDBase_MODE_SSE_FLOAT,
+    SIMDBase_MODE_NEON_FLOAT,
+    SIMDBase_MODE_ALTIVEC_FLOAT,
+    SIMDBase_MODE_PUREC_FLOAT
+  };
+  static const int32_t doubleOrder[] = {
+    SIMDBase_MODE_AVX_DOUBLE,
+    SIMDBase_MODE_SSE2_DOUBLE,
+    SIMDBase_MODE_PUREC_DOUBLE
+  };
+  static const int32_t longDoubleOrder[] = {
+    SIMDBase_MODE_PUREC_LONGDOUBLE
+  };
+  const int32_t *order;
+  int32_t count, k;
+
+  if (typeId == SIMDBase_TYPE_FLOAT) {
+    order = floatOrder;
+    count = (int32_t)(sizeof(floatOrder) / sizeof(floatOrder[0]));
+  } else if (typeId == SIMDBase_TYPE_DOUBLE) {
+    order = doubleOrder;
+    count = (int32_t)(sizeof(doubleOrder) / sizeof(doubleOrder[0]));
+  } else if (typeId == SIMDBase_TYPE_LONGDOUBLE) {
+    order = longDoubleOrder;
+    count = (int32_t)(sizeof(longDoubleOrder) / sizeof(longDoubleOrder[0]));
+  } else {
+    // HALF, EXTENDED, QUAD and unknown types have no modes.
+    return SIMDBase_MODE_NONE;
+  }
+
+  for (k = 0; k < count; k++) {
+    if (SIMDBase_detect(order[k]) == 1) return order[k];
+  }
+
+  return SIMDBase_MODE_NONE;
+}
+
+/*
+ * Forwards an integer parameter query to the implementation of `mode` (a
+ * SIMDBase_MODE_* value). Returns -1 when that mode is not compiled in or
+ * is unknown.
+ */
+int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode) {
+  switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+  case SIMDBase_MODE_PUREC_FLOAT:
+    return getModeParamInt_purec_float(paramId);
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case SIMDBase_MODE_PUREC_DOUBLE:
+    return getModeParamInt_purec_double(paramId);
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case SIMDBase_MODE_PUREC_LONGDOUBLE:
+    return getModeParamInt_purec_longdouble(paramId);
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case SIMDBase_MODE_SSE_FLOAT:
+    return getModeParamInt_sse_float(paramId);
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case SIMDBase_MODE_SSE2_DOUBLE:
+    return getModeParamInt_sse2_double(paramId);
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case SIMDBase_MODE_NEON_FLOAT:
+    return getModeParamInt_neon_float(paramId);
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case SIMDBase_MODE_AVX_FLOAT:
+    return getModeParamInt_avx_float(paramId);
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case SIMDBase_MODE_AVX_DOUBLE:
+    return getModeParamInt_avx_double(paramId);
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case SIMDBase_MODE_ALTIVEC_FLOAT:
+    return getModeParamInt_altivec_float(paramId);
+#endif
+  default:
+    break;
+  }
+
+  return -1;
+}
+
+/*
+ * Forwards a string parameter query to the implementation of `mode` (a
+ * SIMDBase_MODE_* value). Returns NULL when that mode is not compiled in or
+ * is unknown.
+ */
+char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode) {
+  switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+  case SIMDBase_MODE_PUREC_FLOAT:
+    return getModeParamString_purec_float(paramId);
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case SIMDBase_MODE_PUREC_DOUBLE:
+    return getModeParamString_purec_double(paramId);
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case SIMDBase_MODE_PUREC_LONGDOUBLE:
+    return getModeParamString_purec_longdouble(paramId);
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case SIMDBase_MODE_SSE_FLOAT:
+    return getModeParamString_sse_float(paramId);
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case SIMDBase_MODE_SSE2_DOUBLE:
+    return getModeParamString_sse2_double(paramId);
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case SIMDBase_MODE_NEON_FLOAT:
+    return getModeParamString_neon_float(paramId);
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case SIMDBase_MODE_AVX_FLOAT:
+    return getModeParamString_avx_float(paramId);
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case SIMDBase_MODE_AVX_DOUBLE:
+    return getModeParamString_avx_double(paramId);
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case SIMDBase_MODE_ALTIVEC_FLOAT:
+    return getModeParamString_altivec_float(paramId);
+#endif
+  default:
+    break;
+  }
+
+  return NULL;
+}
+
+#ifdef ANDROID
+/*
+ * Stand-in for posix_memalign() on Android builds. The block comes from
+ * plain malloc(); `alignment` is not used. Stores the block in *memptr and
+ * returns 0 on success or -1 when the allocation fails.
+ * NOTE(review): the result only has malloc()'s alignment - confirm that is
+ * enough for the vector loads/stores used on this target.
+ */
+int posix_memalign (void **memptr, size_t alignment, size_t size) {
+    void *block = malloc (size);
+
+    *memptr = block;
+    if (block == NULL) return -1;
+    return 0;
+}
+#endif
+
+/*
+ * Allocates `size` bytes aligned to the cache line size and returns the
+ * block; release it with SIMDBase_alignedFree(). Never returns NULL: the
+ * process is aborted when the allocation fails or when `size` does not fit
+ * in size_t.
+ */
+void *SIMDBase_alignedMalloc(uint64_t size) {
+  void *p = NULL;
+  size_t alignment = (size_t)SIMDBase_sizeOfCachelineInByte();
+
+  // posix_memalign() only accepts a power of two that is a multiple of
+  // sizeof(void *); fall back to 64 when the detected line size is unusable.
+  if (alignment < sizeof(void *) || (alignment & (alignment - 1)) != 0) alignment = 64;
+
+  // Refuse requests that would be silently truncated on 32-bit targets.
+  if ((uint64_t)(size_t)size != size) abort();
+
+  if (posix_memalign(&p, alignment, (size_t)size) != 0) abort();
+  return p;
+}
+
+/* Releases a block obtained from SIMDBase_alignedMalloc() by calling free(). */
+void SIMDBase_alignedFree(void *ptr) {
+  free(ptr);
+}
+
+/*
+ * Library-wide integer parameters. SIMDBase_PARAMID_MODE_MAX yields
+ * SIMDBase_LAST_MODE + 1; every other id yields -1.
+ */
+int32_t SIMDBase_getParamInt(int32_t paramId) {
+  if (paramId == SIMDBase_PARAMID_MODE_MAX) {
+    return SIMDBase_LAST_MODE + 1;
+  }
+
+  return -1;
+}
+
+/*
+ * Per-type integer parameters. No type currently defines any, so the result
+ * is -1 for every combination of arguments.
+ */
+int32_t SIMDBase_getTypeParamInt(int32_t paramId, int32_t typeId) {
+  (void)paramId;
+  (void)typeId;
+
+  return -1;
+}
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBase.h b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h
new file mode 100644
index 00000000..5382b4d1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h
@@ -0,0 +1,51 @@
+#ifndef _SIMDBase_H_
+#define _SIMDBase_H_
+
+// The declarations below use fixed-width integer types, so the header pulls
+// in <stdint.h> itself instead of relying on the including file.
+#include <stdint.h>
+
+// Element type ids: a small index tagged with 1 in bits 24 and up.
+#define SIMDBase_TYPE_FLOAT ( 1 | ( 1 << 24 ))
+#define SIMDBase_TYPE_DOUBLE ( 2 | ( 1 << 24 ))
+#define SIMDBase_TYPE_LONGDOUBLE ( 3 | ( 1 << 24 ))
+#define SIMDBase_TYPE_EXTENDED ( 4 | ( 1 << 24 ))
+#define SIMDBase_TYPE_QUAD ( 5 | ( 1 << 24 ))
+#define SIMDBase_TYPE_HALF ( 6 | ( 1 << 24 ))
+
+// Mode ids: one per (instruction set, element type) implementation.
+// SIMDBase_MODE_NONE means "no usable mode".
+#define SIMDBase_MODE_NONE 0
+#define SIMDBase_MODE_PUREC_FLOAT 1
+#define SIMDBase_MODE_PUREC_DOUBLE 2
+#define SIMDBase_MODE_PUREC_LONGDOUBLE 3
+#define SIMDBase_MODE_SSE_FLOAT 4
+#define SIMDBase_MODE_SSE2_DOUBLE 5
+#define SIMDBase_MODE_NEON_FLOAT 6
+#define SIMDBase_MODE_AVX_FLOAT 7
+#define SIMDBase_MODE_AVX_DOUBLE 8
+#define SIMDBase_MODE_ALTIVEC_FLOAT 9
+
+// Highest mode id; SIMDBase_getParamInt(SIMDBase_PARAMID_MODE_MAX) returns
+// this value plus one.
+#define SIMDBase_LAST_MODE SIMDBase_MODE_ALTIVEC_FLOAT
+
+// Parameter ids for the get*Param* functions: a small index tagged with 2 in
+// bits 24 and up.
+#define SIMDBase_PARAMID_MODE_MAX ( 1 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_TYPE_AVAILABILITY ( 2 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_SIZE_OF_REAL ( 3 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_SIZE_OF_VECT ( 4 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_VECTOR_LEN ( 5 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_MODE_AVAILABILITY ( 6 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_MODE_NAME ( 7 | ( 2 << 24 ))
+
+//
+
+// Cache description: line size in bytes, plus size in bytes and
+// associativity per cache level (index 0 is level 1).
+typedef struct {
+  uint32_t linesize;
+  uint32_t size[8], assoc[8];
+} CacheParam;
+
+// Cache-line-aligned allocation; aborts instead of returning NULL.
+void *SIMDBase_alignedMalloc(uint64_t size);
+// Releases a block obtained from SIMDBase_alignedMalloc().
+void SIMDBase_alignedFree(void *ptr);
+// Cache line size in bytes.
+int32_t SIMDBase_sizeOfCachelineInByte();
+// Data cache size in bytes (L2 + L3 on x86).
+int32_t SIMDBase_sizeOfDataCacheInByte();
+// Preferred usable SIMDBase_MODE_* for a SIMDBase_TYPE_*, or SIMDBase_MODE_NONE.
+int32_t SIMDBase_chooseBestMode(int32_t typeId);
+// Processor description; the returned string must not be freed.
+char *SIMDBase_getProcessorNameString();
+// 1 = mode usable, 0 = compiled in but unsupported here, -1 = not compiled in.
+int32_t SIMDBase_detect(int32_t paramId);
+// Library-wide integer parameter, or -1 for an unknown id.
+int32_t SIMDBase_getParamInt(int32_t paramId);
+// Per-type integer parameter, or -1 for an unknown id.
+int32_t SIMDBase_getTypeParamInt(int32_t paramId, int32_t typeId);
+// Per-mode integer parameter, or -1 for an unknown id or mode.
+int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode);
+// Per-mode string parameter, or NULL for an unknown id or mode.
+char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode);
+
+#endif
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c
new file mode 100644
index 00000000..257a5ff0
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "SIMDBase.h"
+#include "SIMDBaseUndiff.h"
+
+/*
+ * Probe for the mode this file is compiled for: loads one vector from
+ * offset 0 and one from offset 64 of detectBuffer, adds them and stores the
+ * sum at offset 128, thereby executing the mode's load/add/store operations.
+ */
+void SIMDBaseUndiff_DETECT() {
+  extern uint8_t detectBuffer[256];
+  SIMDBase_VECT *lhs = (SIMDBase_VECT *)&detectBuffer[0];
+  SIMDBase_VECT *rhs = (SIMDBase_VECT *)&detectBuffer[64];
+  SIMDBase_VECT *dst = (SIMDBase_VECT *)&detectBuffer[128];
+
+  SIMDBase_STOR(dst, SIMDBase_ADDi(SIMDBase_LOAD(lhs), SIMDBase_LOAD(rhs)));
+}
+
+/*
+ * Integer parameters of the mode this file is compiled for.
+ *
+ *   SIMDBase_PARAMID_SIZE_OF_REAL       sizeof(SIMDBase_REAL)
+ *   SIMDBase_PARAMID_SIZE_OF_VECT       sizeof(SIMDBase_VECT)
+ *   SIMDBase_PARAMID_VECTOR_LEN         SIMDBase_VECTLEN
+ *   SIMDBase_PARAMID_MODE_AVAILABILITY  SIMDBase_detect() result for this mode
+ *
+ * Any other id yields -1.
+ */
+int32_t SIMDBaseUndiff_GETMODEPARAMINT(int32_t paramId) {
+  switch(paramId) {
+  case SIMDBase_PARAMID_SIZE_OF_REAL:
+    return sizeof(SIMDBase_REAL);
+  case SIMDBase_PARAMID_SIZE_OF_VECT:
+    return sizeof(SIMDBase_VECT);
+  case SIMDBase_PARAMID_VECTOR_LEN:
+    return SIMDBase_VECTLEN;
+  case SIMDBase_PARAMID_MODE_AVAILABILITY:
+    // SIMDBase_detect() expects a mode id; passing paramId (a PARAMID
+    // constant) always made it answer -1.
+    return SIMDBase_detect(SIMDBase_MODE);
+  }
+
+  return -1;
+}
+
+/*
+ * String parameters of the mode this file is compiled for:
+ * SIMDBase_PARAMID_MODE_NAME yields SIMDBase_NAME, any other id yields NULL.
+ */
+char * SIMDBaseUndiff_GETMODEPARAMSTRING(int32_t paramId) {
+  if (paramId == SIMDBase_PARAMID_MODE_NAME) {
+    return SIMDBase_NAME;
+  }
+
+  return NULL;
+}
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h
new file mode 100644
index 00000000..1af849a8
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h
@@ -0,0 +1,231 @@
+#ifndef _SIMDBaseUndiff_H_
+#define _SIMDBaseUndiff_H_
+
+#if defined(ENABLE_PUREC_FLOAT) ////////////////////////////////////////////
+
+// Pure C float mode: a "vector" is a single float, so every operation below
+// is the corresponding scalar C operator.
+typedef float SIMDBase_REAL;
+typedef float SIMDBase_VECT;
+
+#define SIMDBase_MODE 1
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 1
+#define SIMDBase_NAME "Pure C float"
+// Names under which SIMDBaseUndiff.c exports its functions for this mode.
+#define SIMDBaseUndiff_DETECT detect_purec_float
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_float
+
+// LOAD/STOR move a whole vector, SET1/LOAD1 fill a vector from one scalar,
+// ADDi/SUBi/MULi/NEGi operate on vector values.
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; }
+
+#elif defined(ENABLE_PUREC_DOUBLE) ////////////////////////////////////////////
+
+// Pure C double mode: a "vector" is a single double, so every operation
+// below is the corresponding scalar C operator.
+typedef double SIMDBase_REAL;
+typedef double SIMDBase_VECT;
+
+#define SIMDBase_MODE 2
+#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE
+#define SIMDBase_VECTLEN 1
+#define SIMDBase_NAME "Pure C double"
+// Names under which SIMDBaseUndiff.c exports its functions for this mode.
+#define SIMDBaseUndiff_DETECT detect_purec_double
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_double
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_double
+
+// LOAD/STOR move a whole vector, SET1/LOAD1 fill a vector from one scalar,
+// ADDi/SUBi/MULi/NEGi operate on vector values.
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; }
+
+#elif defined(ENABLE_PUREC_LONGDOUBLE) ////////////////////////////////////////////
+
+// Pure C long double mode: a "vector" is a single long double, so every
+// operation below is the corresponding scalar C operator.
+typedef long double SIMDBase_REAL;
+typedef long double SIMDBase_VECT;
+
+#define SIMDBase_MODE 3
+#define SIMDBase_TYPE SIMDBase_TYPE_LONGDOUBLE
+#define SIMDBase_VECTLEN 1
+#define SIMDBase_NAME "Pure C long double"
+// Names under which SIMDBaseUndiff.c exports its functions for this mode.
+#define SIMDBaseUndiff_DETECT detect_purec_longdouble
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_longdouble
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_longdouble
+
+// LOAD/STOR move a whole vector, SET1/LOAD1 fill a vector from one scalar,
+// ADDi/SUBi/MULi/NEGi operate on vector values.
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; }
+
+#elif defined(ENABLE_SSE_FLOAT) ////////////////////////////////////////////
+
+// x86 SSE float mode: a vector is an __m128 holding 4 floats.
+#include <xmmintrin.h>
+
+typedef float SIMDBase_REAL;
+typedef __m128 SIMDBase_VECT;
+
+#define SIMDBase_MODE 4
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 4
+#define SIMDBase_NAME "x86 SSE float"
+// Names under which SIMDBaseUndiff.c exports its functions for this mode.
+#define SIMDBaseUndiff_DETECT detect_sse_float
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_sse_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_sse_float
+
+// NOTE(review): LOAD/STOR use _mm_load_ps/_mm_store_ps, presumably the
+// aligned variants - confirm callers always pass suitably aligned pointers.
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm_load_ps((float *)p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm_store_ps((float *)p, u); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm_set1_ps(f); }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm_load1_ps(p); }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_add_ps(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_sub_ps(u, v); }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_mul_ps(u, v); }
+// Negation is done by XOR with a vector of -0.0f (only the sign bits set).
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm_xor_ps(u, _mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f)); }
+
+#elif defined(ENABLE_SSE2_DOUBLE) ////////////////////////////////////////////
+
+// x86 SSE2 double mode: a vector is an __m128d holding 2 doubles.
+#include <emmintrin.h>
+
+typedef double SIMDBase_REAL;
+typedef __m128d SIMDBase_VECT;
+
+#define SIMDBase_MODE 5
+#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE
+#define SIMDBase_VECTLEN 2
+#define SIMDBase_NAME "x86 SSE2 double"
+// Names under which SIMDBaseUndiff.c exports its functions for this mode.
+#define SIMDBaseUndiff_DETECT detect_sse2_double
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_sse2_double
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_sse2_double
+
+// NOTE(review): LOAD/STOR use _mm_load_pd/_mm_store_pd, presumably the
+// aligned variants - confirm callers always pass suitably aligned pointers.
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm_load_pd((double *)p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm_store_pd((double *)p, u); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm_set1_pd(f); }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm_load1_pd(p); }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_add_pd(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_sub_pd(u, v); }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_mul_pd(u, v); }
+// Negation is done by XOR with a vector of -0.0 (only the sign bits set).
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm_xor_pd(u, _mm_set_pd(-0.0, -0.0)); }
+
+#elif defined(ENABLE_NEON_FLOAT) ////////////////////////////////////////////
+
+// ARM NEON float mode: a vector is a float32x4_t holding 4 floats.
+#include <arm_neon.h>
+
+typedef float32_t SIMDBase_REAL;
+typedef float32x4_t SIMDBase_VECT;
+
+#define SIMDBase_MODE 6
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 4
+#define SIMDBase_NAME "ARM NEON float"
+// Names under which SIMDBaseUndiff.c exports its functions for this mode.
+#define SIMDBaseUndiff_DETECT detect_neon_float
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_neon_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_neon_float
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return vld1q_f32((float32_t *)p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { vst1q_f32((float32_t *)p, u); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return vdupq_n_f32(f); }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return vdupq_n_f32(*p); }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return vaddq_f32(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return vsubq_f32(u, v); }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return vmulq_f32(u, v); }
+// Negation reinterprets the lanes as 32-bit integers and XORs each with
+// 0x80000000.
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { 
+  return vreinterpretq_f32_u32( veorq_u32(vreinterpretq_u32_f32(u), vdupq_n_u32(0x80000000U)));
+}
+
+// This mode also provides multiply-add/multiply-subtract operations.
+#define SIMDBase_FMADD_AVAILABLE
+
+static inline SIMDBase_VECT SIMDBase_FMADDi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vmlaq_f32(w, u, v); } // w + u * v
+static inline SIMDBase_VECT SIMDBase_FMSUBi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vmlsq_f32(w, u, v); } // w - u * v
+
+#elif defined(ENABLE_AVX_FLOAT) ////////////////////////////////////////////
+
+// x86 AVX float mode: a vector is an __m256 holding 8 floats.
+#include <immintrin.h>
+
+typedef float SIMDBase_REAL;
+typedef __m256 SIMDBase_VECT;
+
+#define SIMDBase_MODE 7
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 8
+#define SIMDBase_NAME "x86 AVX float"
+// Names under which SIMDBaseUndiff.c exports its functions for this mode.
+#define SIMDBaseUndiff_DETECT detect_avx_float
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_avx_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_avx_float
+
+// NOTE(review): LOAD/STOR use _mm256_load_ps/_mm256_store_ps, presumably the
+// aligned variants - confirm callers always pass suitably aligned pointers.
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm256_load_ps((float *)p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm256_store_ps((float *)p, u); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm256_set1_ps(f); }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm256_set1_ps(*p); }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_add_ps(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_sub_ps(u, v); }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_mul_ps(u, v); }
+// Negation is done by XOR with a vector of -0.0f (only the sign bits set).
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm256_xor_ps(u, _mm256_set1_ps(-0.0f)); }
+
+#elif defined(ENABLE_AVX_DOUBLE) ////////////////////////////////////////////
+
+// x86 AVX double mode: a vector is an __m256d holding 4 doubles.
+#include <immintrin.h>
+
+typedef double SIMDBase_REAL;
+typedef __m256d SIMDBase_VECT;
+
+#define SIMDBase_MODE 8
+#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE
+#define SIMDBase_VECTLEN 4
+#define SIMDBase_NAME "x86 AVX double"
+// Names under which SIMDBaseUndiff.c exports its functions for this mode.
+#define SIMDBaseUndiff_DETECT detect_avx_double
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_avx_double
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_avx_double
+
+// NOTE(review): LOAD/STOR use _mm256_load_pd/_mm256_store_pd, presumably the
+// aligned variants - confirm callers always pass suitably aligned pointers.
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm256_load_pd((double *)p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm256_store_pd((double *)p, u); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm256_set1_pd(f); }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm256_set1_pd(*p); }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_add_pd(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_sub_pd(u, v); }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_mul_pd(u, v); }
+// Negation is done by XOR with a vector of -0.0 (only the sign bits set).
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm256_xor_pd(u, _mm256_set1_pd(-0.0)); }
+
+#elif defined(ENABLE_ALTIVEC_FLOAT) ////////////////////////////////////////////
+
+// PowerPC AltiVec float mode: a vector is a `vector float` holding 4 floats.
+#include <altivec.h>
+
+typedef float SIMDBase_REAL;
+typedef vector float SIMDBase_VECT;
+
+#define SIMDBase_MODE 9
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 4
+#define SIMDBase_NAME "PowerPC AltiVec float"
+// Names under which SIMDBaseUndiff.c exports its functions for this mode.
+#define SIMDBaseUndiff_DETECT detect_altivec_float
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_altivec_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_altivec_float
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return vec_ld(0, p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { vec_st(u, 0, p); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return (vector float){f, f, f, f}; }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return (vector float){*p, *p, *p, *p}; }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_add(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_sub(u, v); }
+// Multiplication is expressed as a multiply-add with a zero addend.
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_madd(u, v, (vector float){0, 0, 0, 0}); }
+// Negation is done by XOR with a vector of -0.0f (only the sign bits set).
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return vec_xor(u, (vector float){-0.0f, -0.0f, -0.0f, -0.0f}); }
+
+// This mode also provides multiply-add/multiply-subtract operations.
+#define SIMDBase_FMADD_AVAILABLE
+
+static inline SIMDBase_VECT SIMDBase_FMADDi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vec_madd(u, v, w); } // w + u * v
+static inline SIMDBase_VECT SIMDBase_FMSUBi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vec_nmsub(u, v, w); } // w - u * v
+
+#endif ////////////////////////////////////////////////////////////////////
+
+// Memory-operand add: loads the vectors at p and q and returns their sum.
+static inline SIMDBase_VECT SIMDBase_ADDm(SIMDBase_VECT *p, SIMDBase_VECT *q) {
+  const SIMDBase_VECT lhs = SIMDBase_LOAD(p);
+  const SIMDBase_VECT rhs = SIMDBase_LOAD(q);
+
+  return SIMDBase_ADDi(lhs, rhs);
+}
+
+// Memory-operand subtract: loads the vectors at p and q and returns *p - *q.
+static inline SIMDBase_VECT SIMDBase_SUBm(SIMDBase_VECT *p, SIMDBase_VECT *q) {
+  const SIMDBase_VECT lhs = SIMDBase_LOAD(p);
+  const SIMDBase_VECT rhs = SIMDBase_LOAD(q);
+
+  return SIMDBase_SUBi(lhs, rhs);
+}
+
+#endif