73 files changed, 17440 insertions, 623 deletions
diff --git a/plugins/supereq/Equ.cpp b/plugins/supereq/Equ.cpp
index f53b99d1..0aff4f8a 100644
--- a/plugins/supereq/Equ.cpp
+++ b/plugins/supereq/Equ.cpp
@@ -1,37 +1,92 @@
+/*
+    DeaDBeeF - ultimate music player for GNU/Linux systems with X11
+    Copyright (C) 2009-2011 Alexey Yakovenko <waker@users.sourceforge.net>
+    Original SuperEQ code (C) Naoki Shibata <shibatch@users.sf.net>
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License
+    as published by the Free Software Foundation; either version 2
+    of the License, or (at your option) any later version.
+    
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+*/
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
 #include <assert.h>
 #include "paramlist.hpp"
+#include "Equ.h"
+
+int _Unwind_Resume_or_Rethrow; /* NOTE(review): these eleven definitions only create int data symbols */
+int _Unwind_RaiseException; /* named like the unwinder entry points; presumably they exist so the */
+int _Unwind_GetLanguageSpecificData; /* plugin links with -nostdlib (see Makefile.am) -- confirm. */
+int _Unwind_Resume; /* They are objects, not functions, so nothing may ever call them. */
+int _Unwind_DeleteException;
+int _Unwind_GetTextRelBase;
+int _Unwind_SetIP;
+int _Unwind_GetDataRelBase;
+int _Unwind_GetRegionStart;
+int _Unwind_SetGR;
+int _Unwind_GetIPInfo;
+
+#ifdef USE_OOURA
+extern "C" void rdft(int, int, REAL *, int *, REAL *);
+void rfft(int n,int isign,REAL *x) /* Wrapper around rdft(): n is log2 of the transform length, n == 0 frees the cached work areas. */
+{
+    static int ipsize = 0,wsize=0; /* current capacities of ip[] and w[] */
+    static int *ip = NULL; /* work areas live in statics, so they are shared by every caller */
+    static REAL *w = NULL;
+    int newipsize,newwsize;
+    if (n == 0) {
+        free(ip); ip = NULL; ipsize = 0;
+        free(w);  w  = NULL; wsize  = 0;
+        return;
+    }
 
-typedef float REAL;
-void rfft(int n,int isign,REAL x[]);
+    n = 1 << n; /* from here on n is the number of samples, not the bit count */
 
-#define M 15
 
-#define PI 3.1415926535897932384626433832795
+    newipsize = 2+sqrt(n/2);
+    if (newipsize > ipsize) { /* work areas only ever grow */
+        ipsize = newipsize;
+        ip = (int *)realloc(ip,sizeof(int)*ipsize); /* NOTE(review): result is not checked before the write below */
+        ip[0] = 0; /* presumably asks rdft() to rebuild its tables -- confirm against the Ooura FFT notes */
+    }
 
-#define RINT(x) ((x) >= 0 ? ((int)((x) + 0.5)) : ((int)((x) - 0.5)))
+    newwsize = n/2;
+    if (newwsize > wsize) {
+        wsize = newwsize;
+        w = (REAL *)realloc(w,sizeof(REAL)*wsize); /* NOTE(review): result is not checked either */
+    }
 
-#define DITHERLEN 65536
+    rdft(n,isign,x,ip,w); /* transform x in place; isign is passed through unchanged */
+}
+#elif defined(USE_FFMPEG) || defined(USE_SHIBATCH)
+extern "C" void rfft(int n,int isign,REAL *x);
+#endif
 
-// play -c 2 -r 44100 -fs -sw
+#if defined(USE_SHIBATCH)
+extern "C" {
+#include "SIMDBase.h"
+}
+#endif
+
+
+#define PI 3.1415926535897932384626433832795
+
+#define DITHERLEN 65536
 
+#define M 15
 static REAL fact[M+1];
 static REAL aa = 96;
-static REAL iza;
-static REAL *lires,*lires1,*lires2,*rires,*rires1,*rires2,*irest;
-static REAL *fsamples;
-static REAL *ditherbuf;
-static int ditherptr = 0;
-static volatile int chg_ires,cur_ires;
-static int winlen,winlenbit,tabsize,nbufsamples;
-static short *inbuf;
-static REAL *outbuf;
-static int maxamp;
-int enable = 1, dither = 0;
-
-#define NCH 2
+static REAL iza = 0;
 
 #define NBANDS 17
 static REAL bands[] = {
@@ -62,49 +117,75 @@ static REAL izero(REAL x)
   return ret;
 }
 
-extern "C" void equ_init(int wb)
+void *equ_malloc (int size) { /* Allocate size bytes for an equalizer buffer; release the result with equ_free(). */
+#ifdef USE_SHIBATCH
+    return SIMDBase_alignedMalloc (size); /* USE_SHIBATCH build: allocator declared in SIMDBase.h */
+#else
+    return malloc (size); /* all other builds: plain malloc(), contents are not initialised */
+#endif
+}
+
+void equ_free (void *mem) { /* Release a buffer obtained from equ_malloc(), using the matching deallocator. */
+#ifdef USE_SHIBATCH
+    SIMDBase_alignedFree (mem); /* counterpart of SIMDBase_alignedMalloc() */
+#else
+    free (mem); /* counterpart of malloc() */
+#endif
+}
+
+extern "C" void equ_init(SuperEqState *state, int wb, int channels) /* (Re)initialise *state for channels channels and an FFT of 2^wb points. */
 {
   int i,j;
 
-  if (lires1 != NULL)   free(lires1);
-  if (lires2 != NULL)   free(lires2);
-  if (rires1 != NULL)   free(rires1);
-  if (rires2 != NULL)   free(rires2);
-  if (irest != NULL)    free(irest);
-  if (fsamples != NULL) free(fsamples);
-  if (inbuf != NULL)    free(inbuf);
-  if (outbuf != NULL)   free(outbuf);
-  if (ditherbuf != NULL) free(ditherbuf);
-
-  winlen = (1 << (wb-1))-1;
-  winlenbit = wb;
-  tabsize  = 1 << wb;
-
-  lires1   = (REAL *)malloc(sizeof(REAL)*tabsize);
-  lires2   = (REAL *)malloc(sizeof(REAL)*tabsize);
-  rires1   = (REAL *)malloc(sizeof(REAL)*tabsize);
-  rires2   = (REAL *)malloc(sizeof(REAL)*tabsize);
-  irest    = (REAL *)malloc(sizeof(REAL)*tabsize);
-  fsamples = (REAL *)malloc(sizeof(REAL)*tabsize);
-  inbuf    = (short *)calloc(winlen*NCH,sizeof(int));
-  outbuf   = (REAL *)calloc(tabsize*NCH,sizeof(REAL));
-  ditherbuf = (REAL *)malloc(sizeof(REAL)*DITHERLEN);
-
-  lires = lires1;
-  rires = rires1;
-  cur_ires = 1;
-  chg_ires = 1;
+  if (state->lires1 != NULL)   equ_free(state->lires1); /* buffers come from equ_malloc(), so they must go back through equ_free() */
+  if (state->lires2 != NULL)   equ_free(state->lires2);
+  if (state->irest != NULL)    equ_free(state->irest);
+  if (state->fsamples != NULL) equ_free(state->fsamples);
+  if (state->finbuf != NULL)    equ_free(state->finbuf);
+  if (state->outbuf != NULL)   equ_free(state->outbuf);
+  if (state->ditherbuf != NULL) equ_free(state->ditherbuf);
+  /* The pointers above are read before the memset below, so *state must be zeroed
+     by the caller before its first equ_init() (or hold buffers from an earlier one). */
+  memset (state, 0, sizeof (SuperEqState));
+  state->channels = channels;
+  state->enable = 1;
+
+  state->winlen = (1 << (wb-1))-1; /* frames per processing block */
+  state->winlenbit = wb;
+  state->tabsize  = 1 << wb; /* FFT length in samples */
+  state->fft_bits = wb; /* what rfft() is given as its size argument */
+  /* lires1/lires2, finbuf and outbuf hold one region per channel; irest and fsamples are single-channel scratch. */
+  state->lires1   = (REAL *)equ_malloc(sizeof(REAL)*state->tabsize * state->channels);
+  state->lires2   = (REAL *)equ_malloc(sizeof(REAL)*state->tabsize * state->channels);
+  state->irest    = (REAL *)equ_malloc(sizeof(REAL)*state->tabsize);
+  state->fsamples = (REAL *)equ_malloc(sizeof(REAL)*state->tabsize);
+  state->finbuf    = (REAL *)equ_malloc(state->winlen*state->channels*sizeof(REAL));
+  state->outbuf   = (REAL *)equ_malloc(state->tabsize*state->channels*sizeof(REAL));
+  state->ditherbuf = (REAL *)equ_malloc(sizeof(REAL)*DITHERLEN);
+  /* NOTE(review): allocation results are not checked before the memsets below. */
+  memset (state->lires1, 0, sizeof(REAL)*state->tabsize * state->channels);
+  memset (state->lires2, 0, sizeof(REAL)*state->tabsize * state->channels);
+  memset (state->irest, 0, sizeof(REAL)*state->tabsize);
+  memset (state->fsamples, 0, sizeof(REAL)*state->tabsize);
+  memset (state->finbuf, 0, state->winlen*state->channels*sizeof(REAL));
+  memset (state->outbuf, 0, state->tabsize*state->channels*sizeof(REAL));
+  memset (state->ditherbuf, 0, sizeof(REAL)*DITHERLEN);
+  /* Table set 1 is active; chg_ires = 1 makes the next processing call re-select it. */
+  state->lires = state->lires1;
+  state->cur_ires = 1;
+  state->chg_ires = 1;
 
   for(i=0;i<DITHERLEN;i++)
-	ditherbuf[i] = (float(rand())/RAND_MAX-0.5);
-
-  for(i=0;i<=M;i++)
-    {
-      fact[i] = 1;
-      for(j=1;j<=i;j++) fact[i] *= j;
-    }
-
-  iza = izero(alpha(aa));
+	state->ditherbuf[i] = (float(rand())/RAND_MAX-0.5);
+  /* fact[] and iza are file-level statics shared by all states; fact[0] stays 0 until this block has run once. */
+  if (fact[0] < 1) {
+      for(i=0;i<=M;i++)
+      {
+          fact[i] = 1;
+          for(j=1;j<=i;j++) fact[i] *= j;
+      }
+      iza = izero(alpha(aa));
+  }
 }
 
 // -(N-1)/2 <= n <= (N-1)/2
@@ -168,7 +249,6 @@ void process_param(REAL *bc,paramlist *param,paramlist &param2,REAL fs,int ch)
   
   for(e = param->elm;e != NULL;e = e->next)
   {
-	if ((ch == 0 && !e->left) || (ch == 1 && !e->right)) continue;
 	if (e->lower >= e->upper) continue;
 
 	for(p=param2.elm;p != NULL;p = p->next)
@@ -231,414 +311,164 @@ void process_param(REAL *bc,paramlist *param,paramlist &param2,REAL fs,int ch)
   }
 }
 
-extern "C" void equ_makeTable(REAL *lbc,REAL *rbc,paramlist *param,REAL fs)
+extern "C" void equ_makeTable(SuperEqState *state, REAL *lbc,void *_param,REAL fs) /* Build the inactive filter-table set from lbc/_param at rate fs and request a switch to it. */
 {
-  int i,cires = cur_ires;
+  paramlist *param = (paramlist *)_param; /* _param is an opaque paramlist pointer (void * in the C header) */
+  int i,cires = state->cur_ires; /* snapshot of the table set currently in use (1 or 2) */
   REAL *nires;
 
   if (fs <= 0) return;
 
   paramlist param2;
 
-  // L
-
-  process_param(lbc,param,param2,fs,0);
-  
-  for(i=0;i<winlen;i++)
-    irest[i] = hn(i-winlen/2,param2,fs)*win(i-winlen/2,winlen);
-
-  for(;i<tabsize;i++)
-    irest[i] = 0;
+  for (int ch = 0; ch < state->channels; ch++) { /* the same lbc array is used for every channel */
+      process_param(lbc,param,param2,fs,ch);
 
-  rfft(tabsize,1,irest);
+      for(i=0;i<state->winlen;i++) /* windowed impulse response, winlen taps centred on winlen/2 */
+          state->irest[i] = hn(i-state->winlen/2,param2,fs)*win(i-state->winlen/2,state->winlen);
 
-  nires = cires == 1 ? lires2 : lires1;
+      for(;i<state->tabsize;i++) /* zero-pad up to the FFT length */
+          state->irest[i] = 0;
 
-  for(i=0;i<tabsize;i++)
-    nires[i] = irest[i];
+      rfft(state->fft_bits,1,state->irest); /* forward transform in place */
 
-  process_param(rbc,param,param2,fs,1);
-
-  // R
-  
-  for(i=0;i<winlen;i++)
-    irest[i] = hn(i-winlen/2,param2,fs)*win(i-winlen/2,winlen);
+      nires = cires == 1 ? state->lires2 : state->lires1; /* write into the set that is NOT in use */
+      nires += ch * state->tabsize; /* this channel's tabsize-long region */
 
-  for(;i<tabsize;i++)
-    irest[i] = 0;
-
-  rfft(tabsize,1,irest);
-
-  nires = cires == 1 ? rires2 : rires1;
-
-  for(i=0;i<tabsize;i++)
-    nires[i] = irest[i];
-   
-  //
-  
-  chg_ires = cires == 1 ? 2 : 1;
+      for(i=0;i<state->tabsize;i++)
+          nires[i] = state->irest[i];
+  }
+  state->chg_ires = cires == 1 ? 2 : 1; /* picked up at the start of equ_modifySamples_float() */
 }
 
-extern "C" void equ_quit(void)
+extern "C" void equ_quit(SuperEqState *state) /* Free every buffer owned by *state and drop the FFT work areas cached inside rfft(). */
 {
-  free(lires1);
-  free(lires2);
-  free(rires1);
-  free(rires2);
-  free(irest);
-  free(fsamples);
-  free(inbuf);
-  free(outbuf);
-  free(ditherbuf);
-
-  lires1   = NULL;
-  lires2   = NULL;
-  rires1   = NULL;
-  rires2   = NULL;
-  irest    = NULL;
-  fsamples = NULL;
-  inbuf    = NULL;
-  outbuf   = NULL;
+  equ_free(state->lires1);
+  equ_free(state->lires2);
+  equ_free(state->irest);
+  equ_free(state->fsamples);
+  equ_free(state->finbuf);
+  equ_free(state->outbuf);
+  equ_free(state->ditherbuf);
+  state->ditherbuf = NULL; /* equ_init() frees every non-NULL pointer, so none may be left dangling */
+  state->lires1   = NULL;
+  state->lires2   = NULL;
+  state->irest    = NULL;
+  state->fsamples = NULL;
+  state->finbuf    = NULL;
+  state->outbuf   = NULL;
 
   rfft(0,0,NULL);
 }
 
-extern "C" void equ_clearbuf(int bps,int srate)
+extern "C" void equ_clearbuf(SuperEqState *state) /* Discard queued input frames and zero the pending output; the filter tables are left untouched. */
 {
 	int i;
 
-	nbufsamples = 0;
-	for(i=0;i<tabsize*NCH;i++) outbuf[i] = 0;
+	state->nbufsamples = 0; /* nothing queued in finbuf any more */
+	for(i=0;i<state->tabsize*state->channels;i++) state->outbuf[i] = 0; /* clears every channel's region of outbuf */
 }
 
-extern "C" int equ_modifySamples(char *buf,int nsamples,int nch,int bps)
+extern "C" int equ_modifySamples_float (SuperEqState *state, char *buf,int nsamples,int nch) /* Filter nsamples interleaved float frames of nch channels in place in buf; returns the number of frames consumed. */
 {
   int i,p,ch;
   REAL *ires;
-  int amax =  (1 << (bps-1))-1;
-  int amin = -(1 << (bps-1));
+  float amax = 1.0f; /* output samples are clamped to [amin, amax] */
+  float amin = -1.0f;
   static float hm1 = 0, hm2 = 0;
 
-  if (chg_ires) {
-	  cur_ires = chg_ires;
-	  lires = cur_ires == 1 ? lires1 : lires2;
-	  rires = cur_ires == 1 ? rires1 : rires2;
-	  chg_ires = 0;
+  if (state->chg_ires) { /* equ_makeTable() (or equ_init()) asked for a table-set switch */
+	  state->cur_ires = state->chg_ires;
+	  state->lires = state->cur_ires == 1 ? state->lires1 : state->lires2;
+	  state->chg_ires = 0;
   }
 
   p = 0;
 
-  while(nbufsamples+nsamples >= winlen)
+  while(state->nbufsamples+nsamples >= state->winlen) /* one pass per completed block of winlen frames */
     {
-	  switch(bps)
-	  {
-	  case 8:
-		for(i=0;i<(winlen-nbufsamples)*nch;i++)
-			{
-				inbuf[nbufsamples*nch+i] = ((unsigned char *)buf)[i+p*nch] - 0x80;
-				float s = outbuf[nbufsamples*nch+i];
-				if (dither) {
-					float u;
-					s -= hm1;
-					u = s;
-					s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
-					if (s < amin) s = amin;
-					if (amax < s) s = amax;
-					s = RINT(s);
-					hm1 = s - u;
-					((unsigned char *)buf)[i+p*nch] = s + 0x80;
-				} else {
-					if (s < amin) s = amin;
-					if (amax < s) s = amax;
-					((unsigned char *)buf)[i+p*nch] = RINT(s) + 0x80;
-				}
-			}
-		for(i=winlen*nch;i<tabsize*nch;i++)
-			outbuf[i-winlen*nch] = outbuf[i];
-
-		break;
-
-	  case 16:
-		for(i=0;i<(winlen-nbufsamples)*nch;i++)
+		for(i=0;i<(state->winlen-state->nbufsamples)*nch;i++) /* top up finbuf to a full block while emitting already-computed output */
 			{
-				inbuf[nbufsamples*nch+i] = ((short *)buf)[i+p*nch];
-				float s = outbuf[nbufsamples*nch+i];
-				if (dither) {
-					float u;
-					s -= hm1;
-					u = s;
-					s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
-					if (s < amin) s = amin;
-					if (amax < s) s = amax;
-					s = RINT(s);
-					hm1 = s - u;
-					((short *)buf)[i+p*nch] = s;
-				} else {
-					if (s < amin) s = amin;
-					if (amax < s) s = amax;
-					((short *)buf)[i+p*nch] = RINT(s);
-				}
-			}
-		for(i=winlen*nch;i<tabsize*nch;i++)
-			outbuf[i-winlen*nch] = outbuf[i];
-
-		break;
-
-	  case 24:
-		for(i=0;i<(winlen-nbufsamples)*nch;i++)
-			{
-				((int *)inbuf)[nbufsamples*nch+i] =
-					(((unsigned char *)buf)[(i+p*nch)*3  ]      ) +
-					(((unsigned char *)buf)[(i+p*nch)*3+1] <<  8) +
-					(((  signed char *)buf)[(i+p*nch)*3+2] << 16) ;
-
-				float s = outbuf[nbufsamples*nch+i];
+                state->finbuf[state->nbufsamples*nch+i] = ((float *)buf)[i+p*nch]; /* save the input sample before it is overwritten below */
+				float s = state->outbuf[state->nbufsamples*nch+i]; /* output computed from earlier input */
 				//if (dither) s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
 				if (s < amin) s = amin;
 				if (amax < s) s = amax;
-				int s2 = RINT(s);
-				((signed char *)buf)[(i+p*nch)*3  ] = s2 & 255; s2 >>= 8;
-				((signed char *)buf)[(i+p*nch)*3+1] = s2 & 255; s2 >>= 8;
-				((signed char *)buf)[(i+p*nch)*3+2] = s2 & 255;
+				((float *)buf)[i+p*nch] = s; /* this loop has no state->dither branch, unlike the tail loop */
 			}
-		for(i=winlen*nch;i<tabsize*nch;i++)
-			outbuf[i-winlen*nch] = outbuf[i];
+		for(i=state->winlen*nch;i<state->tabsize*nch;i++) /* slide the remaining output down by winlen frames */
+			state->outbuf[i-state->winlen*nch] = state->outbuf[i];
 
-		break;
 
-	  default:
-		assert(0);
-	  }
-
-      p += winlen-nbufsamples;
-      nsamples -= winlen-nbufsamples;
-      nbufsamples = 0;
+      p += state->winlen-state->nbufsamples; /* frames of buf consumed so far */
+      nsamples -= state->winlen-state->nbufsamples;
+      state->nbufsamples = 0; /* finbuf now holds exactly one block, processed below */
 
       for(ch=0;ch<nch;ch++)
 		{
-			ires = ch == 0 ? lires : rires;
+            ires = state->lires + ch * state->tabsize; /* this channel's table inside the active set */
 
-			if (bps == 24) {
-				for(i=0;i<winlen;i++)
-					fsamples[i] = ((int *)inbuf)[nch*i+ch];
-			} else {
-				for(i=0;i<winlen;i++)
-					fsamples[i] = inbuf[nch*i+ch];
-			}
+            for(i=0;i<state->winlen;i++) /* de-interleave one channel of the block */
+                state->fsamples[i] = state->finbuf[nch*i+ch];
 
-			for(i=winlen;i<tabsize;i++)
-				fsamples[i] = 0;
+			for(i=state->winlen;i<state->tabsize;i++) /* zero-pad up to the FFT length */
+				state->fsamples[i] = 0;
 
-			if (enable) {
-				rfft(tabsize,1,fsamples);
+			if (state->enable) {
+				rfft(state->fft_bits,1,state->fsamples); /* forward transform; rfft() takes the bit count */
 
-				fsamples[0] = ires[0]*fsamples[0];
-				fsamples[1] = ires[1]*fsamples[1]; 
+				state->fsamples[0] = ires[0]*state->fsamples[0]; /* slots 0 and 1 are scaled as plain reals, not as a complex pair */
+				state->fsamples[1] = ires[1]*state->fsamples[1]; 
 			
-				for(i=1;i<tabsize/2;i++)
+				for(i=1;i<state->tabsize/2;i++) /* complex multiply of each remaining (re,im) pair with the table */
 					{
 						REAL re,im;
 
-						re = ires[i*2  ]*fsamples[i*2] - ires[i*2+1]*fsamples[i*2+1];
-						im = ires[i*2+1]*fsamples[i*2] + ires[i*2  ]*fsamples[i*2+1];
+						re = ires[i*2  ]*state->fsamples[i*2] - ires[i*2+1]*state->fsamples[i*2+1];
+						im = ires[i*2+1]*state->fsamples[i*2] + ires[i*2  ]*state->fsamples[i*2+1];
 
-						fsamples[i*2  ] = re;
-						fsamples[i*2+1] = im;
+						state->fsamples[i*2  ] = re;
+						state->fsamples[i*2+1] = im;
 					}
 
-				rfft(tabsize,-1,fsamples);
+				rfft(state->fft_bits,-1,state->fsamples); /* inverse transform */
 			} else {
-				for(i=winlen-1+winlen/2;i>=winlen/2;i--) fsamples[i] = fsamples[i-winlen/2]*tabsize/2;
-				for(;i>=0;i--) fsamples[i] = 0;
+				for(i=state->winlen-1+state->winlen/2;i>=state->winlen/2;i--) state->fsamples[i] = state->fsamples[i-state->winlen/2]*state->tabsize/2; /* bypass: shift by winlen/2 and pre-scale so the /tabsize*2 below cancels */
+				for(;i>=0;i--) state->fsamples[i] = 0;
 			}
 
-			for(i=0;i<winlen;i++) outbuf[i*nch+ch] += fsamples[i]/tabsize*2;
+			for(i=0;i<state->winlen;i++) state->outbuf[i*nch+ch] += state->fsamples[i]/state->tabsize*2; /* add onto the tail kept from the previous block */
 
-			for(i=winlen;i<tabsize;i++) outbuf[i*nch+ch] = fsamples[i]/tabsize*2;
+			for(i=state->winlen;i<state->tabsize;i++) state->outbuf[i*nch+ch] = state->fsamples[i]/state->tabsize*2; /* remainder becomes the new tail */
 		}
     }
 
-	switch(bps)
-	  {
-	  case 8:
-		for(i=0;i<nsamples*nch;i++)
-			{
-				inbuf[nbufsamples*nch+i] = ((unsigned char *)buf)[i+p*nch] - 0x80;
-				float s = outbuf[nbufsamples*nch+i];
-				if (dither) {
-					float u;
-					s -= hm1;
-					u = s;
-					s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
-					if (s < amin) s = amin;
-					if (amax < s) s = amax;
-					s = RINT(s);
-					hm1 = s - u;
-					((unsigned char *)buf)[i+p*nch] = s + 0x80;
-				} else {
-					if (s < amin) s = amin;
-					if (amax < s) s = amax;
-					((unsigned char *)buf)[i+p*nch] = RINT(s) + 0x80;
-				}
-			}
-		break;
-
-	  case 16:
 		for(i=0;i<nsamples*nch;i++)
 			{
-				inbuf[nbufsamples*nch+i] = ((short *)buf)[i+p*nch];
-				float s = outbuf[nbufsamples*nch+i];
-				if (dither) {
+				state->finbuf[state->nbufsamples*nch+i] = ((float *)buf)[i+p*nch]; /* leftover frames (less than one block) are queued for the next call */
+				float s = state->outbuf[state->nbufsamples*nch+i];
+				if (state->dither) { /* NOTE(review): hm1 is a function-level static, shared by every state and channel */
 					float u;
 					s -= hm1;
 					u = s;
-					s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
+//					s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
 					if (s < amin) s = amin;
 					if (amax < s) s = amax;
-					s = RINT(s);
 					hm1 = s - u;
-					((short *)buf)[i+p*nch] = s;
+					((float *)buf)[i+p*nch] = s; /* with the noise line commented out, only the clamping error is fed back via hm1 */
 				} else {
 					if (s < amin) s = amin;
 					if (amax < s) s = amax;
-					((short *)buf)[i+p*nch] = RINT(s);
+					((float *)buf)[i+p*nch] = s;
 				}
 			}
-		break;
-
-	  case 24:
-		for(i=0;i<nsamples*nch;i++)
-			{
-				((int *)inbuf)[nbufsamples*nch+i] =
-					(((unsigned char *)buf)[(i+p*nch)*3  ]      ) +
-					(((unsigned char *)buf)[(i+p*nch)*3+1] <<  8) +
-					(((  signed char *)buf)[(i+p*nch)*3+2] << 16) ;
-
-				float s = outbuf[nbufsamples*nch+i];
-				//if (dither) s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
-				if (s < amin) s = amin;
-				if (amax < s) s = amax;
-				int s2 = RINT(s);
-				((signed char *)buf)[(i+p*nch)*3  ] = s2 & 255; s2 >>= 8;
-				((signed char *)buf)[(i+p*nch)*3+1] = s2 & 255; s2 >>= 8;
-				((signed char *)buf)[(i+p*nch)*3+2] = s2 & 255;
-			}
-		break;
-
-	  default:
-		assert(0);
-	}
 
   p += nsamples;
-  nbufsamples += nsamples;
+  state->nbufsamples += nsamples; /* frames now waiting in finbuf */
 
   return p;
 }
 
-#if 0
-void usage(void)
-{
-  fprintf(stderr,"Ouch!\n");
-}
-
-int main(int argc,char **argv)
-{
-  FILE *fpi,*fpo;
-  char buf[576*2*2];
-
-  static REAL bc[] =
-  {1.0,  0,1.0,  0,1.0,  0,1.0,  0,1.0,  0,1.0,  0,1.0,  0,1.0,  0,1.0,  0};
-
-  init(14);
-  makeTable(bc,44100);
-
-  if (argc != 3 && argc != 4) exit(-1);
-
-  fpi = fopen(argv[1],"r");
-  fpo = fopen(argv[2],"w");
-
-  if (!fpi || !fpo) exit(-1);
-
-  /* generate wav header */
-
-  {
-    short word;
-    int dword;
-
-    fwrite("RIFF",4,1,fpo);
-    dword = 0;
-    fwrite(&dword,4,1,fpo);
-
-    fwrite("WAVEfmt ",8,1,fpo);
-    dword = 16;
-    fwrite(&dword,4,1,fpo);
-    word = 1;
-    fwrite(&word,2,1,fpo);  /* format category, PCM */
-    word = 2;
-    fwrite(&word,2,1,fpo);  /* channels */
-    dword = 44100;
-    fwrite(&dword,4,1,fpo); /* sampling rate */
-    dword = 44100*2*2;
-    fwrite(&dword,4,1,fpo); /* bytes per sec */
-    word = 4;
-    fwrite(&word,2,1,fpo);  /* block alignment */
-    word = 16;
-    fwrite(&word,2,1,fpo);  /* ??? */
-
-    fwrite("data",4,1,fpo);
-    dword = 0;
-    fwrite(&dword,4,1,fpo);
-  }
-
-  preamp = 65536;
-  maxamp = 0;
-
-  if (argc == 4) {
-    preamp = 32767*65536/atoi(argv[3]);
-    fprintf(stderr,"preamp = %d\n",preamp);
-  }
-
-  for(;;)
-    {
-      int n,m;
-
-      n = fread(buf,1,576*2*2,fpi);
-      if (n <= 0) break;
-      m = modifySamples((short *)buf,n/4,2);
-      fwrite(buf,4,m,fpo);
-    }
-
-#if 0
-  for(;;)
-    {
-      int n = flushbuf((short *)buf,576);
-      if (n == 0) break;
-      fwrite(buf,4,n,fpo);
-    }
-#endif
-
-  {
-    short word;
-    int dword;
-    int len = ftell(fpo);
-
-    fseek(fpo,4,SEEK_SET);
-    dword = len-8;
-    fwrite(&dword,4,1,fpo);
-
-    fseek(fpo,40,SEEK_SET);
-    dword = len-44;
-    fwrite(&dword,4,1,fpo);
-  }
-
-  if (maxamp != 0) {
-    fprintf(stderr,"maxamp = %d\n",maxamp);
-  }
-
-  quit();
-}
-#endif
-
 extern "C" void *paramlist_alloc (void) {
     return (void *)(new paramlist);
 }
diff --git a/plugins/supereq/Equ.h b/plugins/supereq/Equ.h
new file mode 100644
index 00000000..a315741a
--- /dev/null
+++ b/plugins/supereq/Equ.h
@@ -0,0 +1,56 @@
+/*
+    DeaDBeeF - ultimate music player for GNU/Linux systems with X11
+    Copyright (C) 2009-2011 Alexey Yakovenko <waker@users.sourceforge.net>
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License
+    as published by the Free Software Foundation; either version 2
+    of the License, or (at your option) any later version.
+    
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+*/
+#ifndef __EQU_H
+#define __EQU_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef float REAL;
+typedef struct { /* Per-instance equalizer state; set up by equ_init(), released by equ_quit(). */
+    REAL *lires,*lires1,*lires2; /* lires1/lires2: two sets of per-channel filter tables (rfft output); lires points at the set in use */
+    REAL *irest; /* scratch in which equ_makeTable() builds one table (tabsize entries) */
+    REAL *fsamples; /* single-channel FFT work buffer of equ_modifySamples_float() (tabsize entries) */
+    REAL *ditherbuf; /* DITHERLEN random values written by equ_init() */
+    int ditherptr;
+    volatile int chg_ires,cur_ires; /* cur_ires: table set in use (1 or 2); chg_ires: set to switch to on the next processing call, 0 if none */
+    int winlen,winlenbit,tabsize,nbufsamples; /* winlen = 2^(wb-1)-1 frames per block; winlenbit = wb; tabsize = 2^wb; nbufsamples = frames queued in finbuf */
+    REAL *finbuf; /* interleaved input frames waiting for a full block (winlen*channels entries) */
+    REAL *outbuf; /* interleaved pending output (tabsize*channels entries) */
+    int dither; /* non-zero selects the error-feedback branch in the tail loop of equ_modifySamples_float() */
+    int channels; /* channel count passed to equ_init() */
+    int enable; /* zero bypasses the FFT filtering; equ_init() sets it to 1 */
+    int fft_bits; /* size argument handed to rfft(); equ_init() sets it to wb */
+} SuperEqState;
+
+void *paramlist_alloc (void); /* returns a new paramlist as an opaque pointer */
+void paramlist_free (void *);
+void equ_makeTable(SuperEqState *state, float *lbc,void *param,float fs); /* param is cast to paramlist * inside; does nothing when fs <= 0 */
+int equ_modifySamples(SuperEqState *state, char *buf,int nsamples,int nch,int bps); /* NOTE(review): only the _float variant is defined in the Equ.cpp changes of this patch -- confirm a definition of this one exists */
+int equ_modifySamples_float (SuperEqState *state, char *buf,int nsamples,int nch); /* processes interleaved float frames in place; returns the number of frames consumed */
+void equ_clearbuf(SuperEqState *state); /* drops queued input and zeroes pending output */
+void equ_init(SuperEqState *state, int wb, int channels); /* *state must be zeroed before the first call: any non-NULL buffer pointer found in it is freed */
+void equ_quit(SuperEqState *state); /* frees the buffers of *state and the work areas cached by rfft() */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/plugins/supereq/Fftsg_fl.cpp b/plugins/supereq/Fftsg_fl.cpp
index d48debfe..636f8b8a 100644
--- a/plugins/supereq/Fftsg_fl.cpp
+++ b/plugins/supereq/Fftsg_fl.cpp
@@ -285,6 +285,7 @@ Appendix :
     w[] and ip[] are compatible with all routines.
 */
 
+extern "C" {
 
 void cdft(int n, int isgn, REAL *a, int *ip, REAL *w)
 {
@@ -2649,32 +2650,4 @@ void dstsub(int n, REAL *a, int nc, REAL *c)
     }
     a[m] *= c[0];
 }
-
-void rfft(int n,int isign,REAL x[])
-{
-  static int ipsize = 0,wsize=0;
-  static int *ip = NULL;
-  static REAL *w = NULL;
-  int newipsize,newwsize;
-
-  if (n == 0) {
-    free(ip); ip = NULL; ipsize = 0;
-    free(w);  w  = NULL; wsize  = 0;
-    return;
-  }
-
-  newipsize = 2+sqrt(n/2);
-  if (newipsize > ipsize) {
-    ipsize = newipsize;
-    ip = (int *)realloc(ip,sizeof(int)*ipsize);
-    ip[0] = 0;
-  }
-
-  newwsize = n/2;
-  if (newwsize > wsize) {
-    wsize = newwsize;
-    w = (REAL *)realloc(w,sizeof(REAL)*wsize);
-  }
-
-  rdft(n,isign,x,ip,w);
 }
diff --git a/plugins/supereq/Makefile.am b/plugins/supereq/Makefile.am
index 0fffd6d6..45010ec8 100644
--- a/plugins/supereq/Makefile.am
+++ b/plugins/supereq/Makefile.am
@@ -3,8 +3,51 @@ supereqdir = $(libdir)/$(PACKAGE)
 pkglib_LTLIBRARIES = supereq.la
 supereq_la_SOURCES = supereq.c supereq.h Equ.cpp Fftsg_fl.cpp paramlist.hpp
 
-supereq_la_LDFLAGS = -module
+#nsfft-1.00/simd/SIMDBaseUndiff.c\
+#nsfft-1.00/simd/SIMDBase.c\
+#nsfft-1.00/dft/DFT.c\
+#nsfft-1.00/dft/DFTUndiff.c\
+#nsfft-1.00/simd/SIMDBase.h\
+#nsfft-1.00/simd/SIMDBaseUndiff.h\
+#nsfft-1.00/dft/DFTUndiff.h\
+#nsfft-1.00/dft/DFT.h\
+#shibatch_rdft.c
+
+#ffmpeg_fft/libavutil/mem.c\
+#ffmpeg_fft/libavutil/mathematics.c\
+#ffmpeg_fft/libavutil/rational.c\
+#ffmpeg_fft/libavutil/intfloat_readwrite.c\
+#ffmpeg_fft/libavcodec/dct.c\
+#ffmpeg_fft/libavcodec/avfft.c\
+#ffmpeg_fft/libavcodec/fft.c\
+#ffmpeg_fft/libavcodec/dct32.c\
+#ffmpeg_fft/libavcodec/rdft.c\
+#ffmpeg_fft/libavutil/intfloat_readwrite.h\
+#ffmpeg_fft/libavutil/avutil.h\
+#ffmpeg_fft/libavutil/common.h\
+#ffmpeg_fft/libavutil/attributes.h\
+#ffmpeg_fft/libavutil/mem.h\
+#ffmpeg_fft/libavutil/avconfig.h\
+#ffmpeg_fft/libavutil/mathematics.h\
+#ffmpeg_fft/libavutil/rational.h\
+#ffmpeg_fft/publik.h\
+#ffmpeg_fft/ffmpeg_fft.h\
+#ffmpeg_fft/libavcodec/dct32.h\
+#ffmpeg_fft/libavcodec/fft.h\
+#ffmpeg_fft/libavcodec/avfft.h\
+#ffmpeg_fft/config.h\
+#ff_rdft.c
+
+#AM_CFLAGS = $(CFLAGS) -I ffmpeg_fft -I ffmpeg_fft/libavcodec -I ffmpeg_fft/libavutil -std=c99
+#AM_CPPFLAGS = $(CXXFLAGS) -fno-exceptions -fno-rtti -nostdlib -fno-unwind-tables -I ffmpeg_fft -I ffmpeg_fft/libavcodec -I ffmpeg_fft/libavutil
+
+#AM_CFLAGS = $(CFLAGS) -I nsfft-1.00/dft -I nsfft-1.00/simd -std=c99 -msse -DENABLE_SSE_FLOAT -DUSE_SHIBATCH
+#AM_CPPFLAGS = $(CXXFLAGS) -fno-exceptions -fno-rtti -nostdlib -fno-unwind-tables -I nsfft-1.00/dft -I nsfft-1.00/simd -msse -DENABLE_SSE_FLOAT -DUSE_SHIBATCH
+
+AM_CFLAGS = $(CFLAGS) -std=c99 -DUSE_OOURA
+AM_CPPFLAGS = $(CXXFLAGS) -fno-exceptions -fno-rtti -nostdlib -fno-unwind-tables -DUSE_OOURA
+
+supereq_la_LDFLAGS = -module -nostdlib -lsupc++
 
 supereq_la_LIBADD = $(LDADD)
-AM_CFLAGS = -std=c99
 endif
diff --git a/plugins/supereq/ff_rdft.c b/plugins/supereq/ff_rdft.c
new file mode 100644
index 00000000..70a09350
--- /dev/null
+++ b/plugins/supereq/ff_rdft.c
@@ -0,0 +1,63 @@
+#include <stdint.h>
+#include <complex.h>
+#include "libavcodec/avfft.h"
+#include "libavutil/avutil.h"
+
+/* Real FFT through the bundled ffmpeg RDFT code.
+ *
+ * n      log2 of the transform length (passed to av_rdft_init() as nbits);
+ *        n == 0 releases the cached contexts and returns.
+ * isign  1 selects the forward (DFT_R2C) transform, any other value the
+ *        inverse (IDFT_C2R) one.
+ * x      sample buffer, transformed in place by av_rdft_calc().
+ *
+ * The two contexts are cached in function-local statics, so they are shared
+ * by every caller in the process.
+ */
+void rfft(int n,int isign,float *x)
+{
+  static int cur_bits = 0;        /* nbits the cached contexts were built for */
+  static RDFTContext *s = NULL;   /* forward context */
+  static RDFTContext *si = NULL;  /* inverse context */
+  RDFTContext *ctx;
+
+  /* Drop the cached contexts on shutdown (n == 0) and whenever a different
+     size is requested.  The exact bit count has to be compared: 14 and 15
+     bits share the same n/2, and a smaller transform must never run on a
+     context that was built for a larger one. */
+  if (n != cur_bits) {
+      if (s) {
+          av_rdft_end (s);
+          s = NULL;
+      }
+      if (si) {
+          av_rdft_end (si);
+          si = NULL;
+      }
+      cur_bits = n;
+  }
+
+  if (n == 0) {
+      return;
+  }
+
+  if (!s) {
+      s = av_rdft_init(n,DFT_R2C);
+  }
+  if (!si) {
+      si = av_rdft_init(n,IDFT_C2R);
+  }
+
+  if (isign == 1) {
+      ctx = s;
+  }
+  else {
+      ctx = si;
+  }
+  /* av_rdft_init() returns NULL on failure; leave x untouched rather than
+     dereference a missing context. */
+  if (ctx) {
+      av_rdft_calc (ctx, x);
+  }
+}
+
diff --git a/plugins/supereq/ffmpeg_fft/README b/plugins/supereq/ffmpeg_fft/README
new file mode 100644
index 00000000..f53b2447
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/README
@@ -0,0 +1,9 @@
+purpose:
+
+* compare fftw and ffmpeg fourier transforms using benchfft and / or libbench
+* note: this is very specifically for neon. if you want to use ffmpeg_fft with
+  some other arch / fpu, then you will need to do some reorganization
+
+todo: 
+
+1) fix benchees/ffmpeg/doitr.c 
diff --git a/plugins/supereq/ffmpeg_fft/config.h b/plugins/supereq/ffmpeg_fft/config.h
new file mode 100644
index 00000000..0f36b47c
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/config.h
@@ -0,0 +1,904 @@
+/* Automatically generated by configure - do not modify! */
+#ifndef FFMPEG_CONFIG_H
+#define FFMPEG_CONFIG_H
+#define FFMPEG_CONFIGURATION "--prefix=/usr --enable-neon --enable-pic --cpu=cortex-a8 --arch=arm --cross-prefix=arm-none-linux-gnueabi- --enable-cross-compile --target-os=linux --extra-cflags='-mfpu=neon -mcpu=cortex-a8 -mfloat-abi=softfp' --enable-shared --disable-debug"
+#define FFMPEG_LICENSE "LGPL version 2.1 or later"
+#define FFMPEG_DATADIR "/usr/share/ffmpeg"
+#define CC_TYPE "gcc"
+#define CC_VERSION __VERSION__
+#define restrict restrict
+#define ASMALIGN(ZEROBITS) ".p2align " #ZEROBITS "\n\t"
+#define EXTERN_PREFIX ""
+#define EXTERN_ASM 
+#define ARCH_ALPHA 0 /* ARCH_*: target architecture selectors; only ARCH_X86 and ARCH_X86_32 are 1 in this list */
+#define ARCH_ARM 0 /* NOTE(review): 0 although FFMPEG_CONFIGURATION says --arch=arm; looks hand-edited -- confirm intended target */
+#define ARCH_AVR32 0
+#define ARCH_AVR32_AP 0
+#define ARCH_AVR32_UC 0
+#define ARCH_BFIN 0
+#define ARCH_IA64 0
+#define ARCH_M68K 0
+#define ARCH_MIPS 0
+#define ARCH_MIPS64 0
+#define ARCH_PARISC 0
+#define ARCH_PPC 0
+#define ARCH_PPC64 0
+#define ARCH_S390 0
+#define ARCH_SH4 0
+#define ARCH_SPARC 0
+#define ARCH_SPARC64 0
+#define ARCH_TOMI 0
+#define ARCH_X86 1 /* NOTE(review): x86 selected despite the ARM cross-compile configure string -- confirm */
+#define ARCH_X86_32 1
+#define ARCH_X86_64 0
+#define HAVE_ALTIVEC 0 /* HAVE_* entries from here to HAVE_VIS: CPU extension flags */
+#define HAVE_AMD3DNOW 0
+#define HAVE_AMD3DNOWEXT 0
+#define HAVE_ARMV5TE 1 /* NOTE(review): the ARM feature flags stay 1 while ARCH_ARM is 0 -- verify they are only consulted by ARM-specific code */
+#define HAVE_ARMV6 1
+#define HAVE_ARMV6T2 1
+#define HAVE_ARMVFP 1
+#define HAVE_IWMMXT 0
+#define HAVE_MMI 0
+#define HAVE_MMX 0
+#define HAVE_MMX2 0
+#define HAVE_NEON 1
+#define HAVE_PPC4XX 0
+#define HAVE_SSE 1 /* NOTE(review): SSE/SSSE3 are 1 while MMX/MMX2 are 0 -- unusual mix, confirm it is intentional */
+#define HAVE_SSSE3 1
+#define HAVE_VIS 0
+#define HAVE_BIGENDIAN 0
+#define HAVE_PTHREADS 1
+#define HAVE_W32THREADS 0
+#define HAVE_ALSA_ASOUNDLIB_H 0
+#define HAVE_ALTIVEC_H 0
+#define HAVE_ARPA_INET_H 1
+#define HAVE_ATTRIBUTE_MAY_ALIAS 1
+#define HAVE_ATTRIBUTE_PACKED 1
+#define HAVE_BSWAP 0
+#define HAVE_CLOSESOCKET 0
+#define HAVE_CMOV 0
+#define HAVE_CONIO_H 0
+#define HAVE_DCBZL 0
+#define HAVE_DEV_BKTR_IOCTL_BT848_H 0
+#define HAVE_DEV_BKTR_IOCTL_METEOR_H 0
+#define HAVE_DEV_IC_BT8XX_H 0
+#define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0
+#define HAVE_DEV_VIDEO_BKTR_IOCTL_BT848_H 0
+#define HAVE_DLFCN_H 1
+#define HAVE_DLOPEN 1
+#define HAVE_DOS_PATHS 0
+#define HAVE_EBP_AVAILABLE 0
+#define HAVE_EBX_AVAILABLE 0
+#define HAVE_EXP2 1
+#define HAVE_EXP2F 1
+#define HAVE_FAST_64BIT 0
+#define HAVE_FAST_CLZ 1
+#define HAVE_FAST_CMOV 0
+#define HAVE_FAST_UNALIGNED 1
+#define HAVE_FCNTL 1
+#define HAVE_FORK 1
+#define HAVE_GETADDRINFO 1
+#define HAVE_GETHRTIME 0
+#define HAVE_GETPROCESSMEMORYINFO 0
+#define HAVE_GETPROCESSTIMES 0
+#define HAVE_GETRUSAGE 1
+#define HAVE_GNU_AS 1
+#define HAVE_STRUCT_RUSAGE_RU_MAXRSS 1
+#define HAVE_IBM_ASM 0
+#define HAVE_INET_ATON 1
+#define HAVE_INLINE_ASM 1
+#define HAVE_ISATTY 1
+#define HAVE_LDBRX 0
+#define HAVE_LIBDC1394_1 0
+#define HAVE_LIBDC1394_2 0
+#define HAVE_LLRINT 1
+#define HAVE_LLRINTF 1
+#define HAVE_LOCAL_ALIGNED_16 0
+#define HAVE_LOCAL_ALIGNED_8 0
+#define HAVE_LOG2 1
+#define HAVE_LOG2F 1
+#define HAVE_LOONGSON 0
+#define HAVE_LRINT 1
+#define HAVE_LRINTF 1
+#define HAVE_LZO1X_999_COMPRESS 0
+#define HAVE_MACHINE_IOCTL_BT848_H 0
+#define HAVE_MACHINE_IOCTL_METEOR_H 0
+#define HAVE_MALLOC_H 1
+#define HAVE_MEMALIGN 1
+#define HAVE_MKSTEMP 1
+#define HAVE_PLD 1
+#define HAVE_POSIX_MEMALIGN 1
+#define HAVE_ROUND 1
+#define HAVE_ROUNDF 1
+#define HAVE_SDL 0
+#define HAVE_SDL_VIDEO_SIZE 0
+#define HAVE_SETMODE 0
+#define HAVE_SOCKLEN_T 1
+#define HAVE_SOUNDCARD_H 0
+#define HAVE_POLL_H 1
+#define HAVE_SETRLIMIT 1
+#define HAVE_STRERROR_R 1
+#define HAVE_STRUCT_ADDRINFO 1
+#define HAVE_STRUCT_IPV6_MREQ 1
+#define HAVE_STRUCT_SOCKADDR_IN6 1
+#define HAVE_STRUCT_SOCKADDR_SA_LEN 0
+#define HAVE_STRUCT_SOCKADDR_STORAGE 1
+#define HAVE_SYMVER 1
+#define HAVE_SYMVER_GNU_ASM 1
+#define HAVE_SYMVER_ASM_LABEL 0
+#define HAVE_SYS_MMAN_H 1
+#define HAVE_SYS_RESOURCE_H 1
+#define HAVE_SYS_SELECT_H 1
+#define HAVE_SYS_SOUNDCARD_H 1
+#define HAVE_SYS_VIDEOIO_H 0
+#define HAVE_TEN_OPERANDS 0
+#define HAVE_TERMIOS_H 1
+#define HAVE_THREADS 1
+#define HAVE_TRUNCF 1
+#define HAVE_VFP_ARGS 0
+#define HAVE_VIRTUALALLOC 0
+#define HAVE_WINSOCK2_H 0
+#define HAVE_XFORM_ASM 0
+#define HAVE_YASM 0
+#define CONFIG_BSFS 1
+#define CONFIG_DECODERS 1
+#define CONFIG_DEMUXERS 1
+#define CONFIG_ENCODERS 1
+#define CONFIG_FILTERS 1
+#define CONFIG_HWACCELS 0
+#define CONFIG_INDEVS 1
+#define CONFIG_MUXERS 1
+#define CONFIG_OUTDEVS 1
+#define CONFIG_PARSERS 1
+#define CONFIG_PROTOCOLS 1
+#define CONFIG_AANDCT 1
+#define CONFIG_AVCODEC 1
+#define CONFIG_AVDEVICE 1
+#define CONFIG_AVFILTER 1
+#define CONFIG_AVFILTER_LAVF 0
+#define CONFIG_AVFORMAT 1
+#define CONFIG_AVISYNTH 0
+#define CONFIG_BZLIB 0
+#define CONFIG_DCT 1
+#define CONFIG_DOC 0
+#define CONFIG_DWT 1
+#define CONFIG_DXVA2 0
+#define CONFIG_FASTDIV 1
+#define CONFIG_FFMPEG 1
+#define CONFIG_FFPLAY 0
+#define CONFIG_FFPROBE 1
+#define CONFIG_FFSERVER 1
+#define CONFIG_FFT 1
+#define CONFIG_GOLOMB 1
+#define CONFIG_GPL 0
+#define CONFIG_GRAY 0
+#define CONFIG_H264DSP 1
+#define CONFIG_HARDCODED_TABLES 0
+#define CONFIG_LIBDC1394 0
+#define CONFIG_LIBDIRAC 0
+#define CONFIG_LIBFAAC 0
+#define CONFIG_LIBGSM 0
+#define CONFIG_LIBMP3LAME 0
+#define CONFIG_LIBNUT 0
+#define CONFIG_LIBOPENCORE_AMRNB 0
+#define CONFIG_LIBOPENCORE_AMRWB 0
+#define CONFIG_LIBOPENJPEG 0
+#define CONFIG_LIBRTMP 0
+#define CONFIG_LIBSCHROEDINGER 0
+#define CONFIG_LIBSPEEX 0
+#define CONFIG_LIBTHEORA 0
+#define CONFIG_LIBVORBIS 0
+#define CONFIG_LIBVPX 0
+#define CONFIG_LIBX264 0
+#define CONFIG_LIBXVID 0
+#define CONFIG_LPC 1
+#define CONFIG_LSP 1
+//#define CONFIG_MDCT 1
+#define CONFIG_MEMALIGN_HACK 0
+#define CONFIG_MLIB 0
+#define CONFIG_MPEGAUDIO_HP 1
+#define CONFIG_NETWORK 1
+#define CONFIG_NONFREE 0
+#define CONFIG_PIC 1
+#define CONFIG_POSTPROC 0
+#define CONFIG_RDFT 1
+#define CONFIG_RUNTIME_CPUDETECT 0
+#define CONFIG_SHARED 1
+#define CONFIG_SMALL 0
+#define CONFIG_SRAM 0
+#define CONFIG_STATIC 1
+#define CONFIG_SWSCALE 1
+#define CONFIG_SWSCALE_ALPHA 1
+#define CONFIG_VAAPI 0
+#define CONFIG_VDPAU 0
+#define CONFIG_VERSION3 0
+#define CONFIG_X11GRAB 0
+#define CONFIG_ZLIB 0
+#define CONFIG_AVUTIL 1
+#define CONFIG_GPLV3 0
+#define CONFIG_LGPLV3 0
+#define CONFIG_AASC_DECODER 1
+#define CONFIG_AMV_DECODER 1
+#define CONFIG_ANM_DECODER 1
+#define CONFIG_ASV1_DECODER 1
+#define CONFIG_ASV2_DECODER 1
+#define CONFIG_AURA_DECODER 1
+#define CONFIG_AURA2_DECODER 1
+#define CONFIG_AVS_DECODER 1
+#define CONFIG_BETHSOFTVID_DECODER 1
+#define CONFIG_BFI_DECODER 1
+#define CONFIG_BINK_DECODER 1
+#define CONFIG_BMP_DECODER 1
+#define CONFIG_C93_DECODER 1
+#define CONFIG_CAVS_DECODER 1
+#define CONFIG_CDGRAPHICS_DECODER 1
+#define CONFIG_CINEPAK_DECODER 1
+#define CONFIG_CLJR_DECODER 1
+#define CONFIG_CSCD_DECODER 1
+#define CONFIG_CYUV_DECODER 1
+#define CONFIG_DNXHD_DECODER 1
+#define CONFIG_DPX_DECODER 1
+#define CONFIG_DSICINVIDEO_DECODER 1
+#define CONFIG_DVVIDEO_DECODER 1
+#define CONFIG_DXA_DECODER 0
+#define CONFIG_EACMV_DECODER 1
+#define CONFIG_EAMAD_DECODER 1
+#define CONFIG_EATGQ_DECODER 1
+#define CONFIG_EATGV_DECODER 1
+#define CONFIG_EATQI_DECODER 1
+#define CONFIG_EIGHTBPS_DECODER 1
+#define CONFIG_EIGHTSVX_EXP_DECODER 1
+#define CONFIG_EIGHTSVX_FIB_DECODER 1
+#define CONFIG_ESCAPE124_DECODER 1
+#define CONFIG_FFV1_DECODER 1
+#define CONFIG_FFVHUFF_DECODER 1
+#define CONFIG_FLASHSV_DECODER 0
+#define CONFIG_FLIC_DECODER 1
+#define CONFIG_FLV_DECODER 1
+#define CONFIG_FOURXM_DECODER 1
+#define CONFIG_FRAPS_DECODER 1
+#define CONFIG_FRWU_DECODER 1
+#define CONFIG_GIF_DECODER 1
+#define CONFIG_H261_DECODER 1
+#define CONFIG_H263_DECODER 1
+#define CONFIG_H263I_DECODER 1
+#define CONFIG_H264_DECODER 1
+#define CONFIG_H264_VDPAU_DECODER 0
+#define CONFIG_HUFFYUV_DECODER 1
+#define CONFIG_IDCIN_DECODER 1
+#define CONFIG_IFF_BYTERUN1_DECODER 1
+#define CONFIG_IFF_ILBM_DECODER 1
+#define CONFIG_INDEO2_DECODER 1
+#define CONFIG_INDEO3_DECODER 1
+#define CONFIG_INDEO5_DECODER 1
+#define CONFIG_INTERPLAY_VIDEO_DECODER 1
+#define CONFIG_JPEGLS_DECODER 1
+#define CONFIG_KGV1_DECODER 1
+#define CONFIG_KMVC_DECODER 1
+#define CONFIG_LOCO_DECODER 1
+#define CONFIG_MDEC_DECODER 1
+#define CONFIG_MIMIC_DECODER 1
+#define CONFIG_MJPEG_DECODER 1
+#define CONFIG_MJPEGB_DECODER 1
+#define CONFIG_MMVIDEO_DECODER 1
+#define CONFIG_MOTIONPIXELS_DECODER 1
+#define CONFIG_MPEG_XVMC_DECODER 0
+#define CONFIG_MPEG1VIDEO_DECODER 1
+#define CONFIG_MPEG2VIDEO_DECODER 1
+#define CONFIG_MPEG4_DECODER 1
+#define CONFIG_MPEG4_VDPAU_DECODER 0
+#define CONFIG_MPEGVIDEO_DECODER 1
+#define CONFIG_MPEG_VDPAU_DECODER 0
+#define CONFIG_MPEG1_VDPAU_DECODER 0
+#define CONFIG_MSMPEG4V1_DECODER 1
+#define CONFIG_MSMPEG4V2_DECODER 1
+#define CONFIG_MSMPEG4V3_DECODER 1
+#define CONFIG_MSRLE_DECODER 1
+#define CONFIG_MSVIDEO1_DECODER 1
+#define CONFIG_MSZH_DECODER 1
+#define CONFIG_NUV_DECODER 1
+#define CONFIG_PAM_DECODER 1
+#define CONFIG_PBM_DECODER 1
+#define CONFIG_PCX_DECODER 1
+#define CONFIG_PGM_DECODER 1
+#define CONFIG_PGMYUV_DECODER 1
+#define CONFIG_PICTOR_DECODER 1
+#define CONFIG_PNG_DECODER 0
+#define CONFIG_PPM_DECODER 1
+#define CONFIG_PTX_DECODER 1
+#define CONFIG_QDRAW_DECODER 1
+#define CONFIG_QPEG_DECODER 1
+#define CONFIG_QTRLE_DECODER 1
+#define CONFIG_R210_DECODER 1
+#define CONFIG_RAWVIDEO_DECODER 1
+#define CONFIG_RL2_DECODER 1
+#define CONFIG_ROQ_DECODER 1
+#define CONFIG_RPZA_DECODER 1
+#define CONFIG_RV10_DECODER 1
+#define CONFIG_RV20_DECODER 1
+#define CONFIG_RV30_DECODER 1
+#define CONFIG_RV40_DECODER 1
+#define CONFIG_SGI_DECODER 1
+#define CONFIG_SMACKER_DECODER 1
+#define CONFIG_SMC_DECODER 1
+#define CONFIG_SNOW_DECODER 1
+#define CONFIG_SP5X_DECODER 1
+#define CONFIG_SUNRAST_DECODER 1
+#define CONFIG_SVQ1_DECODER 1
+#define CONFIG_SVQ3_DECODER 1
+#define CONFIG_TARGA_DECODER 1
+#define CONFIG_THEORA_DECODER 1
+#define CONFIG_THP_DECODER 1
+#define CONFIG_TIERTEXSEQVIDEO_DECODER 1
+#define CONFIG_TIFF_DECODER 1
+#define CONFIG_TMV_DECODER 1
+#define CONFIG_TRUEMOTION1_DECODER 1
+#define CONFIG_TRUEMOTION2_DECODER 1
+#define CONFIG_TSCC_DECODER 0
+#define CONFIG_TXD_DECODER 1
+#define CONFIG_ULTI_DECODER 1
+#define CONFIG_V210_DECODER 1
+#define CONFIG_V210X_DECODER 1
+#define CONFIG_VB_DECODER 1
+#define CONFIG_VC1_DECODER 1
+#define CONFIG_VC1_VDPAU_DECODER 0
+#define CONFIG_VCR1_DECODER 1
+#define CONFIG_VMDVIDEO_DECODER 1
+#define CONFIG_VMNC_DECODER 1
+#define CONFIG_VP3_DECODER 1
+#define CONFIG_VP5_DECODER 1
+#define CONFIG_VP6_DECODER 1
+#define CONFIG_VP6A_DECODER 1
+#define CONFIG_VP6F_DECODER 1
+#define CONFIG_VP8_DECODER 1
+#define CONFIG_VQA_DECODER 1
+#define CONFIG_WMV1_DECODER 1
+#define CONFIG_WMV2_DECODER 1
+#define CONFIG_WMV3_DECODER 1
+#define CONFIG_WMV3_VDPAU_DECODER 0
+#define CONFIG_WNV1_DECODER 1
+#define CONFIG_XAN_WC3_DECODER 1
+#define CONFIG_XL_DECODER 1
+#define CONFIG_YOP_DECODER 1
+#define CONFIG_ZLIB_DECODER 0
+#define CONFIG_ZMBV_DECODER 0
+#define CONFIG_AAC_DECODER 1
+#define CONFIG_AC3_DECODER 1
+#define CONFIG_ALAC_DECODER 1
+#define CONFIG_ALS_DECODER 1
+#define CONFIG_AMRNB_DECODER 1
+#define CONFIG_APE_DECODER 1
+#define CONFIG_ATRAC1_DECODER 1
+#define CONFIG_ATRAC3_DECODER 1
+#define CONFIG_BINKAUDIO_DCT_DECODER 1
+#define CONFIG_BINKAUDIO_RDFT_DECODER 1
+#define CONFIG_COOK_DECODER 1
+/* #define CONFIG_DCA_DECODER 1 */
+#define CONFIG_DSICINAUDIO_DECODER 1
+#define CONFIG_EAC3_DECODER 1
+#define CONFIG_FLAC_DECODER 1
+#define CONFIG_GSM_DECODER 1
+#define CONFIG_GSM_MS_DECODER 1
+#define CONFIG_IMC_DECODER 1
+#define CONFIG_MACE3_DECODER 1
+#define CONFIG_MACE6_DECODER 1
+#define CONFIG_MLP_DECODER 1
+#define CONFIG_MP1_DECODER 1
+#define CONFIG_MP1FLOAT_DECODER 1
+#define CONFIG_MP2_DECODER 1
+#define CONFIG_MP2FLOAT_DECODER 1
+#define CONFIG_MP3_DECODER 1
+#define CONFIG_MP3FLOAT_DECODER 1
+#define CONFIG_MP3ADU_DECODER 1
+#define CONFIG_MP3ADUFLOAT_DECODER 1
+#define CONFIG_MP3ON4_DECODER 1
+#define CONFIG_MP3ON4FLOAT_DECODER 1
+#define CONFIG_MPC7_DECODER 1
+#define CONFIG_MPC8_DECODER 1
+#define CONFIG_NELLYMOSER_DECODER 1
+#define CONFIG_QCELP_DECODER 1
+#define CONFIG_QDM2_DECODER 1
+#define CONFIG_RA_144_DECODER 1
+#define CONFIG_RA_288_DECODER 1
+#define CONFIG_SHORTEN_DECODER 1
+#define CONFIG_SIPR_DECODER 1
+#define CONFIG_SMACKAUD_DECODER 1
+#define CONFIG_SONIC_DECODER 1
+#define CONFIG_TRUEHD_DECODER 1
+#define CONFIG_TRUESPEECH_DECODER 1
+#define CONFIG_TTA_DECODER 1
+#define CONFIG_TWINVQ_DECODER 1
+#define CONFIG_VMDAUDIO_DECODER 1
+#define CONFIG_VORBIS_DECODER 1
+#define CONFIG_WAVPACK_DECODER 1
+#define CONFIG_WMAPRO_DECODER 1
+#define CONFIG_WMAV1_DECODER 1
+#define CONFIG_WMAV2_DECODER 1
+#define CONFIG_WMAVOICE_DECODER 1
+#define CONFIG_WS_SND1_DECODER 1
+#define CONFIG_PCM_ALAW_DECODER 1
+#define CONFIG_PCM_BLURAY_DECODER 1
+#define CONFIG_PCM_DVD_DECODER 1
+#define CONFIG_PCM_F32BE_DECODER 1
+#define CONFIG_PCM_F32LE_DECODER 1
+#define CONFIG_PCM_F64BE_DECODER 1
+#define CONFIG_PCM_F64LE_DECODER 1
+#define CONFIG_PCM_MULAW_DECODER 1
+#define CONFIG_PCM_S8_DECODER 1
+#define CONFIG_PCM_S16BE_DECODER 1
+#define CONFIG_PCM_S16LE_DECODER 1
+#define CONFIG_PCM_S16LE_PLANAR_DECODER 1
+#define CONFIG_PCM_S24BE_DECODER 1
+#define CONFIG_PCM_S24DAUD_DECODER 1
+#define CONFIG_PCM_S24LE_DECODER 1
+#define CONFIG_PCM_S32BE_DECODER 1
+#define CONFIG_PCM_S32LE_DECODER 1
+#define CONFIG_PCM_U8_DECODER 1
+#define CONFIG_PCM_U16BE_DECODER 1
+#define CONFIG_PCM_U16LE_DECODER 1
+#define CONFIG_PCM_U24BE_DECODER 1
+#define CONFIG_PCM_U24LE_DECODER 1
+#define CONFIG_PCM_U32BE_DECODER 1
+#define CONFIG_PCM_U32LE_DECODER 1
+#define CONFIG_PCM_ZORK_DECODER 1
+#define CONFIG_INTERPLAY_DPCM_DECODER 1
+#define CONFIG_ROQ_DPCM_DECODER 1
+#define CONFIG_SOL_DPCM_DECODER 1
+#define CONFIG_XAN_DPCM_DECODER 1
+#define CONFIG_ADPCM_4XM_DECODER 1
+#define CONFIG_ADPCM_ADX_DECODER 1
+#define CONFIG_ADPCM_CT_DECODER 1
+#define CONFIG_ADPCM_EA_DECODER 1
+#define CONFIG_ADPCM_EA_MAXIS_XA_DECODER 1
+#define CONFIG_ADPCM_EA_R1_DECODER 1
+#define CONFIG_ADPCM_EA_R2_DECODER 1
+#define CONFIG_ADPCM_EA_R3_DECODER 1
+#define CONFIG_ADPCM_EA_XAS_DECODER 1
+#define CONFIG_ADPCM_G726_DECODER 1
+#define CONFIG_ADPCM_IMA_AMV_DECODER 1
+#define CONFIG_ADPCM_IMA_DK3_DECODER 1
+#define CONFIG_ADPCM_IMA_DK4_DECODER 1
+#define CONFIG_ADPCM_IMA_EA_EACS_DECODER 1
+#define CONFIG_ADPCM_IMA_EA_SEAD_DECODER 1
+#define CONFIG_ADPCM_IMA_ISS_DECODER 1
+#define CONFIG_ADPCM_IMA_QT_DECODER 1
+#define CONFIG_ADPCM_IMA_SMJPEG_DECODER 1
+#define CONFIG_ADPCM_IMA_WAV_DECODER 1
+#define CONFIG_ADPCM_IMA_WS_DECODER 1
+#define CONFIG_ADPCM_MS_DECODER 1
+#define CONFIG_ADPCM_SBPRO_2_DECODER 1
+#define CONFIG_ADPCM_SBPRO_3_DECODER 1
+#define CONFIG_ADPCM_SBPRO_4_DECODER 1
+#define CONFIG_ADPCM_SWF_DECODER 1
+#define CONFIG_ADPCM_THP_DECODER 1
+#define CONFIG_ADPCM_XA_DECODER 1
+#define CONFIG_ADPCM_YAMAHA_DECODER 1
+#define CONFIG_DVBSUB_DECODER 1
+#define CONFIG_DVDSUB_DECODER 1
+#define CONFIG_PGSSUB_DECODER 1
+#define CONFIG_XSUB_DECODER 1
+#define CONFIG_LIBDIRAC_DECODER 0
+#define CONFIG_LIBGSM_DECODER 0
+#define CONFIG_LIBGSM_MS_DECODER 0
+#define CONFIG_LIBOPENCORE_AMRNB_DECODER 0
+#define CONFIG_LIBOPENCORE_AMRWB_DECODER 0
+#define CONFIG_LIBOPENJPEG_DECODER 0
+#define CONFIG_LIBSCHROEDINGER_DECODER 0
+#define CONFIG_LIBSPEEX_DECODER 0
+#define CONFIG_LIBVPX_DECODER 0
+#define CONFIG_ASV1_ENCODER 1
+#define CONFIG_ASV2_ENCODER 1
+#define CONFIG_BMP_ENCODER 1
+#define CONFIG_DNXHD_ENCODER 1
+#define CONFIG_DVVIDEO_ENCODER 1
+#define CONFIG_FFV1_ENCODER 1
+#define CONFIG_FFVHUFF_ENCODER 1
+#define CONFIG_FLASHSV_ENCODER 0
+#define CONFIG_FLV_ENCODER 1
+#define CONFIG_GIF_ENCODER 1
+#define CONFIG_H261_ENCODER 1
+#define CONFIG_H263_ENCODER 1
+#define CONFIG_H263P_ENCODER 1
+#define CONFIG_HUFFYUV_ENCODER 1
+#define CONFIG_JPEGLS_ENCODER 1
+#define CONFIG_LJPEG_ENCODER 1
+#define CONFIG_MJPEG_ENCODER 1
+#define CONFIG_MPEG1VIDEO_ENCODER 1
+#define CONFIG_MPEG2VIDEO_ENCODER 1
+#define CONFIG_MPEG4_ENCODER 1
+#define CONFIG_MSMPEG4V1_ENCODER 1
+#define CONFIG_MSMPEG4V2_ENCODER 1
+#define CONFIG_MSMPEG4V3_ENCODER 1
+#define CONFIG_PAM_ENCODER 1
+#define CONFIG_PBM_ENCODER 1
+#define CONFIG_PCX_ENCODER 1
+#define CONFIG_PGM_ENCODER 1
+#define CONFIG_PGMYUV_ENCODER 1
+#define CONFIG_PNG_ENCODER 0
+#define CONFIG_PPM_ENCODER 1
+#define CONFIG_QTRLE_ENCODER 1
+#define CONFIG_RAWVIDEO_ENCODER 1
+#define CONFIG_ROQ_ENCODER 1
+#define CONFIG_RV10_ENCODER 1
+#define CONFIG_RV20_ENCODER 1
+#define CONFIG_SGI_ENCODER 1
+#define CONFIG_SNOW_ENCODER 1
+#define CONFIG_SVQ1_ENCODER 1
+#define CONFIG_TARGA_ENCODER 1
+#define CONFIG_TIFF_ENCODER 1
+#define CONFIG_V210_ENCODER 1
+#define CONFIG_WMV1_ENCODER 1
+#define CONFIG_WMV2_ENCODER 1
+#define CONFIG_ZLIB_ENCODER 0
+#define CONFIG_ZMBV_ENCODER 0
+#define CONFIG_AAC_ENCODER 1
+#define CONFIG_AC3_ENCODER 1
+#define CONFIG_ALAC_ENCODER 1
+#define CONFIG_FLAC_ENCODER 1
+#define CONFIG_MP2_ENCODER 1
+#define CONFIG_NELLYMOSER_ENCODER 1
+#define CONFIG_RA_144_ENCODER 1
+#define CONFIG_SONIC_ENCODER 1
+#define CONFIG_SONIC_LS_ENCODER 1
+#define CONFIG_VORBIS_ENCODER 1
+#define CONFIG_WMAV1_ENCODER 1
+#define CONFIG_WMAV2_ENCODER 1
+#define CONFIG_PCM_ALAW_ENCODER 1
+#define CONFIG_PCM_F32BE_ENCODER 1
+#define CONFIG_PCM_F32LE_ENCODER 1
+#define CONFIG_PCM_F64BE_ENCODER 1
+#define CONFIG_PCM_F64LE_ENCODER 1
+#define CONFIG_PCM_MULAW_ENCODER 1
+#define CONFIG_PCM_S8_ENCODER 1
+#define CONFIG_PCM_S16BE_ENCODER 1
+#define CONFIG_PCM_S16LE_ENCODER 1
+#define CONFIG_PCM_S24BE_ENCODER 1
+#define CONFIG_PCM_S24DAUD_ENCODER 1
+#define CONFIG_PCM_S24LE_ENCODER 1
+#define CONFIG_PCM_S32BE_ENCODER 1
+#define CONFIG_PCM_S32LE_ENCODER 1
+#define CONFIG_PCM_U8_ENCODER 1
+#define CONFIG_PCM_U16BE_ENCODER 1
+#define CONFIG_PCM_U16LE_ENCODER 1
+#define CONFIG_PCM_U24BE_ENCODER 1
+#define CONFIG_PCM_U24LE_ENCODER 1
+#define CONFIG_PCM_U32BE_ENCODER 1
+#define CONFIG_PCM_U32LE_ENCODER 1
+#define CONFIG_PCM_ZORK_ENCODER 1
+#define CONFIG_ROQ_DPCM_ENCODER 1
+#define CONFIG_ADPCM_ADX_ENCODER 1
+#define CONFIG_ADPCM_G726_ENCODER 1
+#define CONFIG_ADPCM_IMA_QT_ENCODER 1
+#define CONFIG_ADPCM_IMA_WAV_ENCODER 1
+#define CONFIG_ADPCM_MS_ENCODER 1
+#define CONFIG_ADPCM_SWF_ENCODER 1
+#define CONFIG_ADPCM_YAMAHA_ENCODER 1
+#define CONFIG_DVBSUB_ENCODER 1
+#define CONFIG_DVDSUB_ENCODER 1
+#define CONFIG_XSUB_ENCODER 1
+#define CONFIG_LIBDIRAC_ENCODER 0
+#define CONFIG_LIBFAAC_ENCODER 0
+#define CONFIG_LIBGSM_ENCODER 0
+#define CONFIG_LIBGSM_MS_ENCODER 0
+#define CONFIG_LIBMP3LAME_ENCODER 0
+#define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0
+#define CONFIG_LIBSCHROEDINGER_ENCODER 0
+#define CONFIG_LIBTHEORA_ENCODER 0
+#define CONFIG_LIBVORBIS_ENCODER 0
+#define CONFIG_LIBVPX_ENCODER 0
+#define CONFIG_LIBX264_ENCODER 0
+#define CONFIG_LIBXVID_ENCODER 0
+#define CONFIG_H263_VAAPI_HWACCEL 0
+#define CONFIG_H264_DXVA2_HWACCEL 0
+#define CONFIG_H264_VAAPI_HWACCEL 0
+#define CONFIG_MPEG2_DXVA2_HWACCEL 0
+#define CONFIG_MPEG2_VAAPI_HWACCEL 0
+#define CONFIG_MPEG4_VAAPI_HWACCEL 0
+#define CONFIG_VC1_DXVA2_HWACCEL 0
+#define CONFIG_VC1_VAAPI_HWACCEL 0
+#define CONFIG_WMV3_DXVA2_HWACCEL 0
+#define CONFIG_WMV3_VAAPI_HWACCEL 0
+#define CONFIG_AAC_PARSER 1
+#define CONFIG_AC3_PARSER 1
+#define CONFIG_CAVSVIDEO_PARSER 1
+#define CONFIG_DCA_PARSER 1
+#define CONFIG_DIRAC_PARSER 1
+#define CONFIG_DNXHD_PARSER 1
+#define CONFIG_DVBSUB_PARSER 1
+#define CONFIG_DVDSUB_PARSER 1
+#define CONFIG_H261_PARSER 1
+#define CONFIG_H263_PARSER 1
+#define CONFIG_H264_PARSER 1
+#define CONFIG_MJPEG_PARSER 1
+#define CONFIG_MLP_PARSER 1
+#define CONFIG_MPEG4VIDEO_PARSER 1
+#define CONFIG_MPEGAUDIO_PARSER 1
+#define CONFIG_MPEGVIDEO_PARSER 1
+#define CONFIG_PNM_PARSER 1
+#define CONFIG_VC1_PARSER 1
+#define CONFIG_VP3_PARSER 1
+#define CONFIG_VP8_PARSER 1
+#define CONFIG_AAC_ADTSTOASC_BSF 1
+#define CONFIG_CHOMP_BSF 1
+#define CONFIG_DUMP_EXTRADATA_BSF 1
+#define CONFIG_H264_MP4TOANNEXB_BSF 1
+#define CONFIG_IMX_DUMP_HEADER_BSF 1
+#define CONFIG_MJPEGA_DUMP_HEADER_BSF 1
+#define CONFIG_MP3_HEADER_COMPRESS_BSF 1
+#define CONFIG_MP3_HEADER_DECOMPRESS_BSF 1
+#define CONFIG_MOV2TEXTSUB_BSF 1
+#define CONFIG_NOISE_BSF 1
+#define CONFIG_REMOVE_EXTRADATA_BSF 1
+#define CONFIG_TEXT2MOVSUB_BSF 1
+#define CONFIG_AAC_DEMUXER 1
+#define CONFIG_AC3_DEMUXER 1
+#define CONFIG_AEA_DEMUXER 1
+#define CONFIG_AIFF_DEMUXER 1
+#define CONFIG_AMR_DEMUXER 1
+#define CONFIG_ANM_DEMUXER 1
+#define CONFIG_APC_DEMUXER 1
+#define CONFIG_APE_DEMUXER 1
+#define CONFIG_ASF_DEMUXER 1
+#define CONFIG_ASS_DEMUXER 1
+#define CONFIG_AU_DEMUXER 1
+#define CONFIG_AVI_DEMUXER 1
+#define CONFIG_AVISYNTH_DEMUXER 0
+#define CONFIG_AVS_DEMUXER 1
+#define CONFIG_BETHSOFTVID_DEMUXER 1
+#define CONFIG_BFI_DEMUXER 1
+#define CONFIG_BINK_DEMUXER 1
+#define CONFIG_C93_DEMUXER 1
+#define CONFIG_CAF_DEMUXER 1
+#define CONFIG_CAVSVIDEO_DEMUXER 1
+#define CONFIG_CDG_DEMUXER 1
+#define CONFIG_DAUD_DEMUXER 1
+#define CONFIG_DIRAC_DEMUXER 1
+#define CONFIG_DNXHD_DEMUXER 1
+#define CONFIG_DSICIN_DEMUXER 1
+#define CONFIG_DTS_DEMUXER 1
+#define CONFIG_DV_DEMUXER 1
+#define CONFIG_DXA_DEMUXER 1
+#define CONFIG_EA_DEMUXER 1
+#define CONFIG_EA_CDATA_DEMUXER 1
+#define CONFIG_EAC3_DEMUXER 1
+#define CONFIG_FFM_DEMUXER 1
+#define CONFIG_FILMSTRIP_DEMUXER 1
+#define CONFIG_FLAC_DEMUXER 1
+#define CONFIG_FLIC_DEMUXER 1
+#define CONFIG_FLV_DEMUXER 1
+#define CONFIG_FOURXM_DEMUXER 1
+#define CONFIG_GSM_DEMUXER 1
+#define CONFIG_GXF_DEMUXER 1
+#define CONFIG_H261_DEMUXER 1
+#define CONFIG_H263_DEMUXER 1
+#define CONFIG_H264_DEMUXER 1
+#define CONFIG_IDCIN_DEMUXER 1
+#define CONFIG_IFF_DEMUXER 1
+#define CONFIG_IMAGE2_DEMUXER 1
+#define CONFIG_IMAGE2PIPE_DEMUXER 1
+#define CONFIG_INGENIENT_DEMUXER 1
+#define CONFIG_IPMOVIE_DEMUXER 1
+#define CONFIG_ISS_DEMUXER 1
+#define CONFIG_IV8_DEMUXER 1
+#define CONFIG_IVF_DEMUXER 1
+#define CONFIG_LMLM4_DEMUXER 1
+#define CONFIG_M4V_DEMUXER 1
+#define CONFIG_MATROSKA_DEMUXER 1
+#define CONFIG_MJPEG_DEMUXER 1
+#define CONFIG_MLP_DEMUXER 1
+#define CONFIG_MM_DEMUXER 1
+#define CONFIG_MMF_DEMUXER 1
+#define CONFIG_MOV_DEMUXER 1
+#define CONFIG_MP3_DEMUXER 1
+#define CONFIG_MPC_DEMUXER 1
+#define CONFIG_MPC8_DEMUXER 1
+#define CONFIG_MPEGPS_DEMUXER 1
+#define CONFIG_MPEGTS_DEMUXER 1
+#define CONFIG_MPEGTSRAW_DEMUXER 1
+#define CONFIG_MPEGVIDEO_DEMUXER 1
+#define CONFIG_MSNWC_TCP_DEMUXER 1
+#define CONFIG_MTV_DEMUXER 1
+#define CONFIG_MVI_DEMUXER 1
+#define CONFIG_MXF_DEMUXER 1
+#define CONFIG_NC_DEMUXER 1
+#define CONFIG_NSV_DEMUXER 1
+#define CONFIG_NUT_DEMUXER 1
+#define CONFIG_NUV_DEMUXER 1
+#define CONFIG_OGG_DEMUXER 1
+#define CONFIG_OMA_DEMUXER 1
+#define CONFIG_PCM_ALAW_DEMUXER 1
+#define CONFIG_PCM_MULAW_DEMUXER 1
+#define CONFIG_PCM_F64BE_DEMUXER 1
+#define CONFIG_PCM_F64LE_DEMUXER 1
+#define CONFIG_PCM_F32BE_DEMUXER 1
+#define CONFIG_PCM_F32LE_DEMUXER 1
+#define CONFIG_PCM_S32BE_DEMUXER 1
+#define CONFIG_PCM_S32LE_DEMUXER 1
+#define CONFIG_PCM_S24BE_DEMUXER 1
+#define CONFIG_PCM_S24LE_DEMUXER 1
+#define CONFIG_PCM_S16BE_DEMUXER 1
+#define CONFIG_PCM_S16LE_DEMUXER 1
+#define CONFIG_PCM_S8_DEMUXER 1
+#define CONFIG_PCM_U32BE_DEMUXER 1
+#define CONFIG_PCM_U32LE_DEMUXER 1
+#define CONFIG_PCM_U24BE_DEMUXER 1
+#define CONFIG_PCM_U24LE_DEMUXER 1
+#define CONFIG_PCM_U16BE_DEMUXER 1
+#define CONFIG_PCM_U16LE_DEMUXER 1
+#define CONFIG_PCM_U8_DEMUXER 1
+#define CONFIG_PVA_DEMUXER 1
+#define CONFIG_QCP_DEMUXER 1
+#define CONFIG_R3D_DEMUXER 1
+#define CONFIG_RAWVIDEO_DEMUXER 1
+#define CONFIG_RL2_DEMUXER 1
+#define CONFIG_RM_DEMUXER 1
+#define CONFIG_ROQ_DEMUXER 1
+#define CONFIG_RPL_DEMUXER 1
+#define CONFIG_RTSP_DEMUXER 1
+#define CONFIG_SDP_DEMUXER 1
+#define CONFIG_SEGAFILM_DEMUXER 1
+#define CONFIG_SHORTEN_DEMUXER 1
+#define CONFIG_SIFF_DEMUXER 1
+#define CONFIG_SMACKER_DEMUXER 1
+#define CONFIG_SOL_DEMUXER 1
+#define CONFIG_SOX_DEMUXER 1
+#define CONFIG_STR_DEMUXER 1
+#define CONFIG_SWF_DEMUXER 1
+#define CONFIG_THP_DEMUXER 1
+#define CONFIG_TIERTEXSEQ_DEMUXER 1
+#define CONFIG_TMV_DEMUXER 1
+#define CONFIG_TRUEHD_DEMUXER 1
+#define CONFIG_TTA_DEMUXER 1
+#define CONFIG_TXD_DEMUXER 1
+#define CONFIG_VC1_DEMUXER 1
+#define CONFIG_VC1T_DEMUXER 1
+#define CONFIG_VMD_DEMUXER 1
+#define CONFIG_VOC_DEMUXER 1
+#define CONFIG_VQF_DEMUXER 1
+#define CONFIG_W64_DEMUXER 1
+#define CONFIG_WAV_DEMUXER 1
+#define CONFIG_WC3_DEMUXER 1
+#define CONFIG_WSAUD_DEMUXER 1
+#define CONFIG_WSVQA_DEMUXER 1
+#define CONFIG_WV_DEMUXER 1
+#define CONFIG_XA_DEMUXER 1
+#define CONFIG_YOP_DEMUXER 1
+#define CONFIG_YUV4MPEGPIPE_DEMUXER 1
+#define CONFIG_LIBNUT_DEMUXER 0
+#define CONFIG_AC3_MUXER 1
+#define CONFIG_ADTS_MUXER 1
+#define CONFIG_AIFF_MUXER 1
+#define CONFIG_AMR_MUXER 1
+#define CONFIG_ASF_MUXER 1
+#define CONFIG_ASS_MUXER 1
+#define CONFIG_ASF_STREAM_MUXER 1
+#define CONFIG_AU_MUXER 1
+#define CONFIG_AVI_MUXER 1
+#define CONFIG_AVM2_MUXER 1
+#define CONFIG_CRC_MUXER 1
+#define CONFIG_DAUD_MUXER 1
+#define CONFIG_DIRAC_MUXER 1
+#define CONFIG_DNXHD_MUXER 1
+#define CONFIG_DTS_MUXER 1
+#define CONFIG_DV_MUXER 1
+#define CONFIG_EAC3_MUXER 1
+#define CONFIG_FFM_MUXER 1
+#define CONFIG_FILMSTRIP_MUXER 1
+#define CONFIG_FLAC_MUXER 1
+#define CONFIG_FLV_MUXER 1
+#define CONFIG_FRAMECRC_MUXER 1
+#define CONFIG_FRAMEMD5_MUXER 1
+#define CONFIG_GIF_MUXER 1
+#define CONFIG_GXF_MUXER 1
+#define CONFIG_H261_MUXER 1
+#define CONFIG_H263_MUXER 1
+#define CONFIG_H264_MUXER 1
+#define CONFIG_IMAGE2_MUXER 1
+#define CONFIG_IMAGE2PIPE_MUXER 1
+#define CONFIG_IPOD_MUXER 1
+#define CONFIG_M4V_MUXER 1
+#define CONFIG_MD5_MUXER 1
+#define CONFIG_MATROSKA_MUXER 1
+#define CONFIG_MATROSKA_AUDIO_MUXER 1
+#define CONFIG_MJPEG_MUXER 1
+#define CONFIG_MLP_MUXER 1
+#define CONFIG_MMF_MUXER 1
+#define CONFIG_MOV_MUXER 1
+#define CONFIG_MP2_MUXER 1
+#define CONFIG_MP3_MUXER 1
+#define CONFIG_MP4_MUXER 1
+#define CONFIG_MPEG1SYSTEM_MUXER 1
+#define CONFIG_MPEG1VCD_MUXER 1
+#define CONFIG_MPEG1VIDEO_MUXER 1
+#define CONFIG_MPEG2DVD_MUXER 1
+#define CONFIG_MPEG2SVCD_MUXER 1
+#define CONFIG_MPEG2VIDEO_MUXER 1
+#define CONFIG_MPEG2VOB_MUXER 1
+#define CONFIG_MPEGTS_MUXER 1
+#define CONFIG_MPJPEG_MUXER 1
+#define CONFIG_MXF_MUXER 1
+#define CONFIG_MXF_D10_MUXER 1
+#define CONFIG_NULL_MUXER 1
+#define CONFIG_NUT_MUXER 1
+#define CONFIG_OGG_MUXER 1
+#define CONFIG_PCM_ALAW_MUXER 1
+#define CONFIG_PCM_MULAW_MUXER 1
+#define CONFIG_PCM_F64BE_MUXER 1
+#define CONFIG_PCM_F64LE_MUXER 1
+#define CONFIG_PCM_F32BE_MUXER 1
+#define CONFIG_PCM_F32LE_MUXER 1
+#define CONFIG_PCM_S32BE_MUXER 1
+#define CONFIG_PCM_S32LE_MUXER 1
+#define CONFIG_PCM_S24BE_MUXER 1
+#define CONFIG_PCM_S24LE_MUXER 1
+#define CONFIG_PCM_S16BE_MUXER 1
+#define CONFIG_PCM_S16LE_MUXER 1
+#define CONFIG_PCM_S8_MUXER 1
+#define CONFIG_PCM_U32BE_MUXER 1
+#define CONFIG_PCM_U32LE_MUXER 1
+#define CONFIG_PCM_U24BE_MUXER 1
+#define CONFIG_PCM_U24LE_MUXER 1
+#define CONFIG_PCM_U16BE_MUXER 1
+#define CONFIG_PCM_U16LE_MUXER 1
+#define CONFIG_PCM_U8_MUXER 1
+#define CONFIG_PSP_MUXER 1
+#define CONFIG_RAWVIDEO_MUXER 1
+#define CONFIG_RM_MUXER 1
+#define CONFIG_ROQ_MUXER 1
+#define CONFIG_RTP_MUXER 1
+#define CONFIG_RTSP_MUXER 1
+#define CONFIG_SOX_MUXER 1
+#define CONFIG_SPDIF_MUXER 1
+#define CONFIG_SWF_MUXER 1
+#define CONFIG_TG2_MUXER 1
+#define CONFIG_TGP_MUXER 1
+#define CONFIG_TRUEHD_MUXER 1
+#define CONFIG_VC1T_MUXER 1
+#define CONFIG_VOC_MUXER 1
+#define CONFIG_WAV_MUXER 1
+#define CONFIG_WEBM_MUXER 1
+#define CONFIG_YUV4MPEGPIPE_MUXER 1
+#define CONFIG_LIBNUT_MUXER 0
+#define CONFIG_ASPECT_FILTER 1
+#define CONFIG_CROP_FILTER 1
+#define CONFIG_FORMAT_FILTER 1
+#define CONFIG_NOFORMAT_FILTER 1
+#define CONFIG_NULL_FILTER 1
+#define CONFIG_PAD_FILTER 1
+#define CONFIG_PIXDESCTEST_FILTER 1
+#define CONFIG_PIXELASPECT_FILTER 1
+#define CONFIG_SCALE_FILTER 1
+#define CONFIG_SLICIFY_FILTER 1
+#define CONFIG_UNSHARP_FILTER 1
+#define CONFIG_VFLIP_FILTER 1
+#define CONFIG_BUFFER_FILTER 1
+#define CONFIG_NULLSRC_FILTER 1
+#define CONFIG_NULLSINK_FILTER 1
+#define CONFIG_FILE_PROTOCOL 1
+#define CONFIG_GOPHER_PROTOCOL 1
+#define CONFIG_HTTP_PROTOCOL 1
+#define CONFIG_MMST_PROTOCOL 1
+#define CONFIG_PIPE_PROTOCOL 1
+#define CONFIG_RTMP_PROTOCOL 1
+#define CONFIG_RTMPT_PROTOCOL 1
+#define CONFIG_RTMPE_PROTOCOL 1
+#define CONFIG_RTMPTE_PROTOCOL 1
+#define CONFIG_RTMPS_PROTOCOL 1
+#define CONFIG_RTP_PROTOCOL 1
+#define CONFIG_TCP_PROTOCOL 1
+#define CONFIG_UDP_PROTOCOL 1
+#define CONFIG_CONCAT_PROTOCOL 1
+#define CONFIG_ALSA_INDEV 0
+#define CONFIG_BKTR_INDEV 0
+#define CONFIG_DV1394_INDEV 1
+#define CONFIG_JACK_INDEV 0
+#define CONFIG_OSS_INDEV 1
+#define CONFIG_V4L2_INDEV 1
+#define CONFIG_V4L_INDEV 1
+#define CONFIG_VFWCAP_INDEV 0
+#define CONFIG_X11_GRAB_DEVICE_INDEV 0
+#define CONFIG_LIBDC1394_INDEV 0
+#define CONFIG_ALSA_OUTDEV 0
+#define CONFIG_OSS_OUTDEV 1
+#endif /* FFMPEG_CONFIG_H */
diff --git a/plugins/supereq/ffmpeg_fft/ffmpeg_fft.h b/plugins/supereq/ffmpeg_fft/ffmpeg_fft.h
new file mode 100644
index 00000000..b98313d2
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/ffmpeg_fft.h
@@ -0,0 +1,95 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AVFFT_H
+#define AVCODEC_AVFFT_H
+
+typedef float FFTSample;
+
+typedef struct FFTComplex {
+    FFTSample re, im;
+} FFTComplex;
+
+//#define FFTC_SZ 32
+typedef struct FFTContext FFTContext;
+
+/**
+ * Set up a complex FFT.
+ * @param nbits           log2 of the length of the input array
+ * @param inverse         if 0 perform the forward transform, if 1 perform the inverse
+ */
+FFTContext *av_fft_init(int nbits, int inverse);
+
+/**
+ * Do the permutation needed BEFORE calling av_fft_calc().
+ */
+void av_fft_permute(FFTContext *s, FFTComplex *z);
+
+/**
+ * Do a complex FFT with the parameters defined in av_fft_init(). The
+ * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
+ */
+void av_fft_calc(FFTContext *s, FFTComplex *z);
+
+void av_fft_end(FFTContext *s);
+
+/* Real Discrete Fourier Transform */
+
+enum RDFTransformType { /* transform kinds accepted by av_rdft_init(); numbered consecutively from 0 */
+    DFT_R2C  = 0,
+    IDFT_C2R = 1,
+    IDFT_R2C = 2,
+    DFT_C2R  = 3,
+};
+
+//#define RDFTC_SZ 56
+typedef struct RDFTContext RDFTContext;
+
+/**
+ * Set up a real FFT.
+ * @param nbits           log2 of the length of the input array
+ * @param trans           the type of transform
+ */
+RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans);
+void av_rdft_calc(RDFTContext *s, FFTSample *data);
+void av_rdft_end(RDFTContext *s);
+
+/* Discrete Cosine Transform */
+
+typedef struct DCTContext DCTContext;
+
+enum DCTTransformType { /* transform kinds accepted by av_dct_init(); numbered consecutively from 0 */
+    DCT_II  = 0,
+    DCT_III = 1,
+    DCT_I   = 2,
+    DST_I   = 3,
+};
+
+/**
+ * Set up DCT.
+ * @param nbits           size of the input array:
+ *                        (1 << nbits)     for DCT-II, DCT-III and DST-I
+ *                        (1 << nbits) + 1 for DCT-I
+ *
+ * @note the first element of the input of DST-I is ignored
+ */
+DCTContext *av_dct_init(int nbits, enum DCTTransformType type);
+void av_dct_calc(DCTContext *s, FFTSample *data);
+void av_dct_end (DCTContext *s);
+
+#endif /* AVCODEC_AVFFT_H */
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/asm.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/asm.S
new file mode 100644
index 00000000..6860f1cf
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/asm.S
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+/* ELF prefixes directives meant for ELF output; otherwise it expands to @, which comments the line out. */
+#ifdef __ELF__
+#   define ELF
+#else
+#   define ELF @
+#endif
+/* require8: record EABI build attribute 24 (Tag_ABI_align_needed) with the given value, default 1. */
+.macro  require8 val=1
+ELF     .eabi_attribute 24, \val
+.endm
+/* preserve8: record EABI build attribute 25 (Tag_ABI_align_preserved) with the given value, default 1. */
+.macro  preserve8 val=1
+ELF     .eabi_attribute 25, \val
+.endm
+
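+/* Disabled copy of the function macro; it differs from the active definition below only in lacking the .hidden line.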
+.macro  function name, export=0
+    .macro endfunc
+ELF     .size   \name, . - \name
+        .endfunc
+        .purgem endfunc
+    .endm
+        .text
+    .if \export
+        .global EXTERN_ASM\name
+EXTERN_ASM\name:
+    .endif
+ELF     .type   \name, %function
+        .func   \name
+\name:
+.endm
+*/
+/* function name, export=0: open a function in .text, optionally with a global alias, and define a one-shot endfunc that purges itself. */
+.macro  function name, export=0
+    .macro endfunc
+ELF     .size   \name, . - \name
+        .endfunc
+        .purgem endfunc
+    .endm
+        .text
+    .if \export
+    	.hidden EXTERN_ASM\name         @ ELF hidden visibility: not visible outside the linked component
+        .global EXTERN_ASM\name
+EXTERN_ASM\name:
+    .endif
+ELF     .type   \name, %function
+        .func   \name
+\name:
+.endm
+/* mov32 rd, val: load the 32-bit constant val into register rd. */
+.macro  mov32   rd, val
+#if HAVE_ARMV6T2
+        movw            \rd, #(\val) & 0xffff   @ low 16 bits
+    .if (\val) >> 16
+        movt            \rd, #(\val) >> 16      @ high 16 bits, emitted only when nonzero
+    .endif
+#else /* no movw/movt: let the assembler materialise the constant */
+        ldr             \rd, =\val
+#endif
+.endm
+/* movrel rd, val: load the address of symbol val into register rd. */
+.macro  movrel rd, val
+#if HAVE_ARMV6T2 && !CONFIG_PIC
+        movw            \rd, #:lower16:\val
+        movt            \rd, #:upper16:\val
+#else /* HAVE_ARMV6T2 unset or CONFIG_PIC set */
+        ldr             \rd, =\val
+#endif
+.endm
+/* With HAVE_VFP_ARGS: set EABI attribute 28 (Tag_ABI_VFP_args) to 1 and make NOVFP expand to @; otherwise VFP expands to @. */
+#if HAVE_VFP_ARGS
+        .eabi_attribute 28, 1
+#   define VFP
+#   define NOVFP @
+#else
+#   define VFP   @
+#   define NOVFP
+#endif
+/* X(s) pastes the EXTERN_ASM prefix onto s; JOIN expands its arguments first, then GLUE pastes them. */
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
+
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_init_arm.c b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_init_arm.c
new file mode 100644
index 00000000..28148e92
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_init_arm.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/fft.h"
+#if CONFIG_DCA_DECODER
+#include "libavcodec/synth_filter.h"
+#endif
+
+void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); /* defined in fft_neon.S */
+void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); /* defined in fft_neon.S */
+
+#if 0 /* MDCT/IMDCT prototypes compiled out */
+void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
+#endif
+
+void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); /* defined in rdft_neon.S */
+
+void ff_synth_filter_float_neon(FFTContext *imdct,
+                                float *synth_buf_ptr, int *synth_buf_offset,
+                                float synth_buf2[32], const float window[512],
+                                float out[32], const float in[32],
+                                float scale, float bias); /* referenced only when CONFIG_DCA_DECODER is set */
+/* Point s->fft_permute and s->fft_calc at the NEON routines when HAVE_NEON is nonzero. */
+av_cold void ff_fft_init_arm(FFTContext *s)
+{
+    if (HAVE_NEON) {
+        s->fft_permute  = ff_fft_permute_neon;
+        s->fft_calc     = ff_fft_calc_neon;
+#if 0 /* MDCT hooks compiled out */
+        s->imdct_calc   = ff_imdct_calc_neon;
+        s->imdct_half   = ff_imdct_half_neon;
+        s->mdct_calc    = ff_mdct_calc_neon;
+        s->permutation  = FF_MDCT_PERM_INTERLEAVE;
+#endif
+    }
+}
+/* Point s->rdft_calc at the NEON routine when HAVE_NEON is nonzero; built only if CONFIG_RDFT is nonzero. */
+#if CONFIG_RDFT
+av_cold void ff_rdft_init_arm(RDFTContext *s)
+{
+    if (HAVE_NEON)
+        s->rdft_calc    = ff_rdft_calc_neon;
+}
+#endif /* CONFIG_RDFT */
+/* Point s->synth_filter_float at the NEON routine when HAVE_NEON is nonzero; built only if CONFIG_DCA_DECODER is nonzero. */
+#if CONFIG_DCA_DECODER
+av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
+{
+    if (HAVE_NEON)
+        s->synth_filter_float = ff_synth_filter_float_neon;
+}
+#endif /* CONFIG_DCA_DECODER */
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_neon.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_neon.S
new file mode 100644
index 00000000..117f4fee
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_neon.S
@@ -0,0 +1,372 @@
+/*
+ * ARM NEON optimised FFT
+ *
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2009 Naotoshi Nojiri
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+#define M_SQRT1_2 0.70710678118654752440
+
+        .text
+/* fft4_neon: in-place 4-point FFT over the 4 complex floats at r0 (loaded and stored with a 128-bit alignment hint). */
+function fft4_neon
+        vld1.32         {d0-d3}, [r0,:128]      @ d0..d3 = z[0..3]
+
+        vext.32         q8,  q1,  q1,  #1       @ i2,r3 d3=i3,r2
+        vsub.f32        d6,  d0,  d1            @ r0-r1,i0-i1
+        vsub.f32        d7,  d16, d17           @ r3-r2,i2-i3
+        vadd.f32        d4,  d0,  d1            @ r0+r1,i0+i1
+        vadd.f32        d5,  d2,  d3            @ i2+i3,r2+r3
+        vadd.f32        d1,  d6,  d7            @ new z[1]
+        vsub.f32        d3,  d6,  d7            @ new z[3]
+        vadd.f32        d0,  d4,  d5            @ new z[0]
+        vsub.f32        d2,  d4,  d5            @ new z[2]
+
+        vst1.32         {d0-d3}, [r0,:128]      @ write z[0..3] back over the input
+
+        bx              lr
+endfunc
+/* fft8_neon: in-place 8-point FFT over the 8 complex floats at r0. */
+function fft8_neon
+        mov             r1,  r0                 @ r1 = load cursor, r0 keeps &z[0]
+        vld1.32         {d0-d3},   [r1,:128]!   @ z[0..3], r1 advances to &z[4]
+        vld1.32         {d16-d19}, [r1,:128]    @ z[4..7]
+
+        movw            r2,  #0x04f3            @ sqrt(1/2)
+        movt            r2,  #0x3f35
+        eor             r3,  r2,  #1<<31        @ r3 = -sqrt(1/2) (sign bit flipped)
+        vdup.32         d31, r2                 @ d31 = {w, w}
+
+        vext.32         q11, q1,  q1,  #1       @ i2,r3,i3,r2
+        vadd.f32        d4,  d16, d17           @ r4+r5,i4+i5
+        vmov            d28, r3,  r2            @ d28 = {-w, w}
+        vadd.f32        d5,  d18, d19           @ r6+r7,i6+i7
+        vsub.f32        d17, d16, d17           @ r4-r5,i4-i5
+        vsub.f32        d19, d18, d19           @ r6-r7,i6-i7
+        vrev64.32       d29, d28                @ d29 = {w, -w}
+        vadd.f32        d20, d0,  d1            @ r0+r1,i0+i1
+        vadd.f32        d21, d2,  d3            @ r2+r3,i2+i3
+        vmul.f32        d26, d17, d28           @ -a2r*w,a2i*w
+        vext.32         q3,  q2,  q2,  #1
+        vmul.f32        d27, d19, d29           @ a3r*w,-a3i*w
+        vsub.f32        d23, d22, d23           @ i2-i3,r3-r2
+        vsub.f32        d22, d0,  d1            @ r0-r1,i0-i1
+        vmul.f32        d24, d17, d31           @ a2r*w,a2i*w
+        vmul.f32        d25, d19, d31           @ a3r*w,a3i*w
+        vadd.f32        d0,  d20, d21
+        vsub.f32        d2,  d20, d21
+        vadd.f32        d1,  d22, d23
+        vrev64.32       q13, q13
+        vsub.f32        d3,  d22, d23
+        vsub.f32        d6,  d6,  d7
+        vadd.f32        d24, d24, d26           @ a2r+a2i,a2i-a2r   t1,t2
+        vadd.f32        d25, d25, d27           @ a3r-a3i,a3i+a3r   t5,t6
+        vadd.f32        d7,  d4,  d5
+        vsub.f32        d18, d2,  d6
+        vext.32         q13, q12, q12, #1
+        vadd.f32        d2,  d2,  d6
+        vsub.f32        d16, d0,  d7
+        vadd.f32        d5,  d25, d24
+        vsub.f32        d4,  d26, d27
+        vadd.f32        d0,  d0,  d7
+        vsub.f32        d17, d1,  d5
+        vsub.f32        d19, d3,  d4
+        vadd.f32        d3,  d3,  d4
+        vadd.f32        d1,  d1,  d5
+
+        vst1.32         {d16-d19}, [r1,:128]    @ store z[4..7]
+        vst1.32         {d0-d3},   [r0,:128]    @ store z[0..3]
+
+        bx              lr
+endfunc
+/* fft16_neon: in-place 16-point FFT over the 16 complex floats at r0; uses the mppm, pmmp and ff_cos_16 tables. */
+function fft16_neon
+        movrel          r1, mppm
+        vld1.32         {d16-d19}, [r0,:128]!   @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3}
+        pld             [r0, #32]
+        vld1.32         {d2-d3}, [r1,:128]      @ q1 = mppm constants
+        vext.32         q13, q9,  q9,  #1
+        vld1.32         {d22-d25}, [r0,:128]!   @ q11{r4,i4,r5,i5} q12{r6,i6,r7,i7}
+        vadd.f32        d4,  d16, d17
+        vsub.f32        d5,  d16, d17
+        vadd.f32        d18, d18, d19
+        vsub.f32        d19, d26, d27
+
+        vadd.f32        d20, d22, d23
+        vsub.f32        d22, d22, d23
+        vsub.f32        d23, d24, d25
+        vadd.f32        q8,  q2,  q9            @ {r0,i0,r1,i1}
+        vadd.f32        d21, d24, d25
+        vmul.f32        d24, d22, d2
+        vsub.f32        q9,  q2,  q9            @ {r2,i2,r3,i3}
+        vmul.f32        d25, d23, d3
+        vuzp.32         d16, d17                @ {r0,r1,i0,i1}
+        vmul.f32        q1,  q11, d2[1]
+        vuzp.32         d18, d19                @ {r2,r3,i2,i3}
+        vrev64.32       q12, q12
+        vadd.f32        q11, q12, q1            @ {t1a,t2a,t5,t6}
+        vld1.32         {d24-d27}, [r0,:128]!   @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11}
+        vzip.32         q10, q11
+        vld1.32         {d28-d31}, [r0,:128]    @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15}
+        vadd.f32        d0,  d22, d20
+        vadd.f32        d1,  d21, d23
+        vsub.f32        d2,  d21, d23
+        vsub.f32        d3,  d22, d20
+        sub             r0,  r0,  #96           @ rewind r0 to &z[0] (three 32-byte post-increments)
+        vext.32         q13, q13, q13, #1
+        vsub.f32        q10, q8,  q0            @ {r4,r5,i4,i5}
+        vadd.f32        q8,  q8,  q0            @ {r0,r1,i0,i1}
+        vext.32         q15, q15, q15, #1
+        vsub.f32        q11, q9,  q1            @ {r6,r7,i6,i7}
+        vswp            d25, d26                @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10}
+        vadd.f32        q9,  q9,  q1            @ {r2,r3,i2,i3}
+        vswp            d29, d30                @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14}
+        vadd.f32        q0,  q12, q13           @ {t1,t2,t5,t6}
+        vadd.f32        q1,  q14, q15           @ {t1a,t2a,t5a,t6a}
+        movrel          r2,  X(ff_cos_16)
+        vsub.f32        q13, q12, q13           @ {t3,t4,t7,t8}
+        vrev64.32       d1,  d1
+        vsub.f32        q15, q14, q15           @ {t3a,t4a,t7a,t8a}
+        vrev64.32       d3,  d3
+        movrel          r3,  pmmp
+        vswp            d1,  d26                @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8}
+        vswp            d3,  d30                @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a}
+        vadd.f32        q12, q0,  q13           @ {r8,i8,r9,i9}
+        vadd.f32        q14, q1,  q15           @ {r12,i12,r13,i13}
+        vld1.32         {d4-d5},  [r2,:64]      @ q2 = first four ff_cos_16 entries
+        vsub.f32        q13, q0,  q13           @ {r10,i10,r11,i11}
+        vsub.f32        q15, q1,  q15           @ {r14,i14,r15,i15}
+        vswp            d25, d28                @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13}
+        vld1.32         {d6-d7},  [r3,:128]     @ q3 = pmmp
+        vrev64.32       q1,  q14
+        vmul.f32        q14, q14, d4[1]
+        vmul.f32        q1,  q1,  q3
+        vmla.f32        q14, q1,  d5[1]         @ {t1a,t2a,t5a,t6a}
+        vswp            d27, d30                @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15}
+        vzip.32         q12, q14
+        vadd.f32        d0,  d28, d24
+        vadd.f32        d1,  d25, d29
+        vsub.f32        d2,  d25, d29
+        vsub.f32        d3,  d28, d24
+        vsub.f32        q12, q8,  q0            @ {r8,r9,i8,i9}
+        vadd.f32        q8,  q8,  q0            @ {r0,r1,i0,i1}
+        vsub.f32        q14, q10, q1            @ {r12,r13,i12,i13}
+        mov             r1,  #32                @ r1 = store stride in bytes
+        vadd.f32        q10, q10, q1            @ {r4,r5,i4,i5}
+        vrev64.32       q0,  q13
+        vmul.f32        q13, q13, d5[0]
+        vrev64.32       q1,  q15
+        vmul.f32        q15, q15, d5[1]
+        vst2.32         {d16-d17},[r0,:128], r1
+        vmul.f32        q0,  q0,  q3
+        vst2.32         {d20-d21},[r0,:128], r1
+        vmul.f32        q1,  q1,  q3
+        vmla.f32        q13, q0,  d5[0]         @ {t1,t2,t5,t6}
+        vmla.f32        q15, q1,  d4[1]         @ {t1a,t2a,t5a,t6a}
+        vst2.32         {d24-d25},[r0,:128], r1
+        vst2.32         {d28-d29},[r0,:128]
+        vzip.32         q13, q15
+        sub             r0, r0, #80             @ r0 = &z[2] (start + 96 - 80 bytes)
+        vadd.f32        d0,  d30, d26
+        vadd.f32        d1,  d27, d31
+        vsub.f32        d2,  d27, d31
+        vsub.f32        d3,  d30, d26
+        vsub.f32        q13, q9,  q0            @ {r10,r11,i10,i11}
+        vadd.f32        q9,  q9,  q0            @ {r2,r3,i2,i3}
+        vsub.f32        q15, q11, q1            @ {r14,r15,i14,i15}
+        vadd.f32        q11, q11, q1            @ {r6,r7,i6,i7}
+        vst2.32         {d18-d19},[r0,:128], r1
+        vst2.32         {d22-d23},[r0,:128], r1
+        vst2.32         {d26-d27},[r0,:128], r1
+        vst2.32         {d30-d31},[r0,:128]
+        bx              lr
+endfunc
+/* fft_pass_neon: r0 = z, r1 = wre table, r2 = n; updates z, z+o1, z+o2, z+o3 in place, two complex values per step. Uses r12 as scratch. */
+function fft_pass_neon
+        push            {r4-r6,lr}
+        mov             r6,  r2                 @ n
+        lsl             r5,  r2,  #3            @ 2 * n * sizeof FFTSample
+        lsl             r4,  r2,  #4            @ 2 * n * sizeof FFTComplex
+        lsl             r2,  r2,  #5            @ 4 * n * sizeof FFTComplex
+        add             r3,  r2,  r4            @ 6 * n * sizeof FFTComplex
+        add             r4,  r4,  r0            @ &z[o1]
+        add             r2,  r2,  r0            @ &z[o2]
+        add             r3,  r3,  r0            @ &z[o3]
+        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
+        movrel          r12, pmmp
+        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
+        add             r5,  r5,  r1            @ wim
+        vld1.32         {d6-d7},  [r12,:128]    @ pmmp
+        vswp            d21, d22
+        vld1.32         {d4},     [r1,:64]!     @ {wre[0],wre[1]}
+        sub             r5,  r5,  #4            @ wim--
+        vrev64.32       q1,  q11
+        vmul.f32        q11, q11, d4[1]
+        vmul.f32        q1,  q1,  q3
+        vld1.32         {d5[0]},  [r5,:32]      @ d5[0] = wim[-1]
+        vmla.f32        q11, q1,  d5[0]         @ {t1a,t2a,t5a,t6a}
+        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
+        sub             r6, r6, #1              @ n--
+        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
+        vzip.32         q10, q11
+        vadd.f32        d0,  d22, d20
+        vadd.f32        d1,  d21, d23
+        vsub.f32        d2,  d21, d23
+        vsub.f32        d3,  d22, d20
+        vsub.f32        q10, q8,  q0
+        vadd.f32        q8,  q8,  q0
+        vsub.f32        q11, q9,  q1
+        vadd.f32        q9,  q9,  q1
+        vst2.32         {d20-d21},[r2,:128]!    @ {z[o2],z[o2+1]}
+        vst2.32         {d16-d17},[r0,:128]!    @ {z[0],z[1]}
+        vst2.32         {d22-d23},[r3,:128]!    @ {z[o3],z[o3+1]}
+        vst2.32         {d18-d19},[r4,:128]!    @ {z[o1],z[o1+1]}
+        sub             r5,  r5,  #8            @ wim -= 2
+1:
+        vld1.32         {d20-d21},[r2,:128]     @ {z[o2],z[o2+1]}
+        vld1.32         {d22-d23},[r3,:128]     @ {z[o3],z[o3+1]}
+        vswp            d21, d22
+        vld1.32         {d4}, [r1]!             @ {wre[0],wre[1]}
+        vrev64.32       q0,  q10
+        vmul.f32        q10, q10, d4[0]
+        vrev64.32       q1,  q11
+        vmul.f32        q11, q11, d4[1]
+        vld1.32         {d5}, [r5]              @ {wim[-1],wim[0]}
+        vmul.f32        q0,  q0,  q3
+        sub             r5,  r5,  #8            @ wim -= 2
+        vmul.f32        q1,  q1,  q3
+        vmla.f32        q10, q0,  d5[1]         @ {t1,t2,t5,t6}
+        vmla.f32        q11, q1,  d5[0]         @ {t1a,t2a,t5a,t6a}
+        vld2.32         {d16-d17},[r0,:128]     @ {z[0],z[1]}
+        subs            r6,  r6,  #1            @ n--
+        vld2.32         {d18-d19},[r4,:128]     @ {z[o1],z[o1+1]}
+        vzip.32         q10, q11
+        vadd.f32        d0,  d22, d20
+        vadd.f32        d1,  d21, d23
+        vsub.f32        d2,  d21, d23
+        vsub.f32        d3,  d22, d20
+        vsub.f32        q10, q8,  q0
+        vadd.f32        q8,  q8,  q0
+        vsub.f32        q11, q9,  q1
+        vadd.f32        q9,  q9,  q1
+        vst2.32         {d20-d21}, [r2,:128]!   @ {z[o2],z[o2+1]}
+        vst2.32         {d16-d17}, [r0,:128]!   @ {z[0],z[1]}
+        vst2.32         {d22-d23}, [r3,:128]!   @ {z[o3],z[o3+1]}
+        vst2.32         {d18-d19}, [r4,:128]!   @ {z[o1],z[o1+1]}
+        bne             1b                      @ loop until the subs above reaches zero
+
+        pop             {r4-r6,pc}              @ restore and return
+endfunc
+/* def_fft n, n2, n4: define fft<n>_neon as fft<n2>_neon at z, fft<n4>_neon at z + 2*n4 and z + 3*n4, then a tail call to fft_pass_neon. */
+.macro  def_fft n, n2, n4
+        .align 6
+function fft\n\()_neon
+        push            {r4, lr}
+        mov             r4,  r0                 @ keep z across the calls
+        bl              fft\n2\()_neon
+        add             r0,  r4,  #\n4*2*8      @ z + 2*n4 complex values of 8 bytes
+        bl              fft\n4\()_neon
+        add             r0,  r4,  #\n4*3*8      @ z + 3*n4 complex values of 8 bytes
+        bl              fft\n4\()_neon
+        mov             r0,  r4
+        pop             {r4, lr}                @ restore lr before the tail call
+        movrel          r1,  X(ff_cos_\n)
+        mov             r2,  #\n4/2
+        b               fft_pass_neon           @ tail call: it returns to our caller
+endfunc
+.endm
+
+        def_fft    32,    16,     8
+        def_fft    64,    32,    16
+        def_fft   128,    64,    32
+        def_fft   256,   128,    64
+        def_fft   512,   256,   128
+        def_fft  1024,   512,   256
+        def_fft  2048,  1024,   512
+        def_fft  4096,  2048,  1024
+        def_fft  8192,  4096,  2048
+        def_fft 16384,  8192,  4096
+        def_fft 32768, 16384,  8192
+        def_fft 65536, 32768, 16384
+/* ff_fft_calc_neon: r0 = FFTContext, r1 = z; jumps to the fft<N>_neon routine selected by the first word of the context. */
+function ff_fft_calc_neon, export=1
+        ldr             r2,  [r0]               @ nbits
+        sub             r2,  r2,  #2            @ table index: entry 0 is fft4_neon
+        movrel          r3,  fft_tab_neon
+        ldr             r3,  [r3, r2, lsl #2]   @ r3 = fft_tab_neon[nbits - 2]
+        mov             r0,  r1                 @ the selected routine takes z in r0
+        bx              r3                      @ tail call
+endfunc
+/* ff_fft_permute_neon: r0 = FFTContext, r1 = z; scatters z into tmp_buf at the revtab indices, then copies tmp_buf back over z. */
+function ff_fft_permute_neon, export=1
+        push            {r4,lr}
+        mov             r12, #1
+        ldr             r2,  [r0]       @ nbits
+        ldr             r3,  [r0, #12]  @ tmp_buf
+        ldr             r0,  [r0, #8]   @ revtab
+        lsl             r12, r12, r2    @ r12 = 1 << nbits
+        mov             r2,  r12        @ keep the count for the copy-back loop
+1:
+        vld1.32         {d0-d1}, [r1,:128]!     @ two complex values from z
+        ldr             r4,  [r0], #4           @ one word = two 16-bit revtab entries
+        uxth            lr,  r4                 @ lr = low halfword
+        uxth            r4,  r4,  ror #16       @ r4 = high halfword
+        add             lr,  r3,  lr,  lsl #3   @ tmp_buf + 8 * index
+        add             r4,  r3,  r4,  lsl #3   @ tmp_buf + 8 * index
+        vst1.32         {d0}, [lr,:64]
+        vst1.32         {d1}, [r4,:64]
+        subs            r12, r12, #2
+        bgt             1b
+
+        sub             r1,  r1,  r2,  lsl #3   @ rewind r1 to the start of z
+1:
+        vld1.32         {d0-d3}, [r3,:128]!     @ copy four complex values per step
+        vst1.32         {d0-d3}, [r1,:128]!
+        subs            r2,  r2,  #4
+        bgt             1b
+
+        pop             {r4,pc}
+endfunc
+
+        .section .rodata
+        .align 4
+fft_tab_neon:                                   @ indexed by nbits - 2 in ff_fft_calc_neon
+        .word fft4_neon
+        .word fft8_neon
+        .word fft16_neon
+        .word fft32_neon
+        .word fft64_neon
+        .word fft128_neon
+        .word fft256_neon
+        .word fft512_neon
+        .word fft1024_neon
+        .word fft2048_neon
+        .word fft4096_neon
+        .word fft8192_neon
+        .word fft16384_neon
+        .word fft32768_neon
+        .word fft65536_neon
+ELF     .size fft_tab_neon, . - fft_tab_neon
+
+        .align 4                                @ both tables below are loaded with a :128 alignment hint
+pmmp:   .float  +1.0, -1.0, -1.0, +1.0          @ sign multipliers
+mppm:   .float  -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
+
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/rdft_neon.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/rdft_neon.S
new file mode 100644
index 00000000..4f8a1032
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/rdft_neon.S
@@ -0,0 +1,151 @@
+/*
+ * ARM NEON optimised RDFT
+ * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        preserve8
+/* ff_rdft_calc_neon: r0 = RDFTContext, r1 = data, processed in place; the complex FFT runs first when bit 0 of inverse is clear, last when it is set. */
+function ff_rdft_calc_neon, export=1
+        push            {r4-r8,lr}
+
+        ldr             r6,  [r0, #4]           @ inverse
+        mov             r4,  r0
+        mov             r5,  r1
+
+        lsls            r6,  r6,  #31           @ r6 = bit 0 of inverse moved to the sign bit
+        bne             1f                      @ inverse: the FFT is done at the end instead
+        add             r0,  r4,  #20           @ r0 = s + 20, passed as the FFTContext
+        bl              X(ff_fft_permute_neon)
+        add             r0,  r4,  #20
+        mov             r1,  r5
+        bl              X(ff_fft_calc_neon)
+1:
+        ldr             r12, [r4, #0]           @ nbits
+        mov             r2,  #1
+        lsl             r12, r2,  r12           @ r12 = 1 << nbits
+        add             r0,  r5,  #8            @ forward read cursor: data + 8 bytes
+        add             r1,  r5,  r12, lsl #2   @ data + 4 * (1 << nbits) bytes
+        lsr             r12, r12, #2
+        ldr             r2,  [r4, #12]          @ tcos
+        sub             r12, r12, #2
+        ldr             r3,  [r4, #16]          @ tsin
+        mov             r7,  r0                 @ r7 = forward store cursor
+        sub             r1,  r1,  #8            @ backward read cursor: last 8 bytes of data
+        mov             lr,  r1                 @ lr = backward store cursor
+        mov             r8,  #-8                @ step for the backward cursors
+        vld1.32         {d0},     [r0,:64]!     @ d1[0,1]
+        vld1.32         {d1},     [r1,:64], r8  @ d2[0,1]
+        vld1.32         {d4},     [r2,:64]!     @ tcos[i]
+        vld1.32         {d5},     [r3,:64]!     @ tsin[i]
+        vmov.f32        d18, #0.5               @ k1
+        vdup.32         d19, r6
+        pld             [r0, #32]
+        veor            d19, d18, d19           @ k2
+        vmov.i32        d16, #0
+        vmov.i32        d17, #1<<31
+        pld             [r1, #-32]
+        vtrn.32         d16, d17
+        pld             [r2, #32]
+        vrev64.32       d16, d16                @ d16=1,0 d17=0,1
+        pld             [r3, #32]
+2:
+        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
+        vld1.32         {d24},    [r0,:64]!     @  d1[0,1]
+        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
+        vld1.32         {d25},    [r1,:64], r8  @  d2[0,1]
+        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
+        veor            q3,  q12, q8            @ -d1[0],d1[1], d2[0],-d2[1]
+        pld             [r0, #32]
+        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
+        pld             [r1, #-32]
+        vadd.f32        d0,  d24, d7            @  d1[0]+d2[0], d1[1]-d2[1]
+        vadd.f32        d1,  d6,  d25           @ -d1[0]+d2[0], d1[1]+d2[1]
+        vmul.f32        q11, q0,  q9            @  ev.re, ev.im, od.im, od.re
+        veor            d7,  d21, d16           @ -od.im, od.re
+        vrev64.32       d3,  d21                @  od.re, od.im
+        veor            d6,  d20, d17           @  ev.re,-ev.im
+        veor            d2,  d3,  d16           @ -od.re, od.im
+        vmla.f32        d20, d3,  d4[1]
+        vmla.f32        d20, d7,  d5[1]
+        vmla.f32        d6,  d2,  d4[1]
+        vmla.f32        d6,  d21, d5[1]
+        vld1.32         {d4},     [r2,:64]!     @  tcos[i]
+        veor            d7,  d23, d16           @ -od.im, od.re
+        vld1.32         {d5},     [r3,:64]!     @  tsin[i]
+        veor            d24, d22, d17           @  ev.re,-ev.im
+        vrev64.32       d3,  d23                @  od.re, od.im
+        pld             [r2, #32]
+        veor            d2,  d3,  d16           @ -od.re, od.im
+        pld             [r3, #32]
+        vmla.f32        d22, d3,  d4[0]
+        vmla.f32        d22, d7,  d5[0]
+        vmla.f32        d24, d2,  d4[0]
+        vmla.f32        d24, d23, d5[0]
+        vld1.32         {d0},     [r0,:64]!     @  d1[0,1]
+        vld1.32         {d1},     [r1,:64], r8  @  d2[0,1]
+        vst1.32         {d20},    [r7,:64]!
+        vst1.32         {d6},     [lr,:64], r8
+        vst1.32         {d22},    [r7,:64]!
+        vst1.32         {d24},    [lr,:64], r8
+        subs            r12, r12, #2
+        bgt             2b
+
+        veor            q1,  q0,  q8            @ -d1[0],d1[1], d2[0],-d2[1]
+        vadd.f32        d0,  d0,  d3            @  d1[0]+d2[0], d1[1]-d2[1]
+        vadd.f32        d1,  d2,  d1            @ -d1[0]+d2[0], d1[1]+d2[1]
+        ldr             r2,  [r4, #8]           @  sign_convention
+        vmul.f32        q10, q0,  q9            @  ev.re, ev.im, od.im, od.re
+        add             r0,  r0,  #4
+        bfc             r2,  #0,  #31           @  keep only the sign bit
+        vld1.32         {d0[0]},  [r0,:32]
+        veor            d7,  d21, d16           @ -od.im, od.re
+        vrev64.32       d3,  d21                @  od.re, od.im
+        veor            d6,  d20, d17           @  ev.re,-ev.im
+        vld1.32         {d22},    [r5,:64]      @  first two values of data
+        vdup.32         d1,  r2
+        vmov            d23, d22
+        veor            d2,  d3,  d16           @ -od.re, od.im
+        vtrn.32         d22, d23
+        veor            d0,  d0,  d1            @  flip the sign of the loaded value when that bit is set
+        veor            d23, d23, d17
+        vmla.f32        d20, d3,  d4[1]
+        vmla.f32        d20, d7,  d5[1]
+        vmla.f32        d6,  d2,  d4[1]
+        vmla.f32        d6,  d21, d5[1]
+        vadd.f32        d22, d22, d23
+        vst1.32         {d20},    [r7,:64]
+        vst1.32         {d6},     [lr,:64]
+        vst1.32         {d0[0]},  [r0,:32]
+        vst1.32         {d22},    [r5,:64]
+
+        cmp             r6,  #0
+        popeq           {r4-r8,pc}              @ forward transform: done
+
+        vmul.f32        d22, d22, d18           @ inverse only: scale the first pair by k1 = 0.5
+        vst1.32         {d22},    [r5,:64]
+        add             r0,  r4,  #20           @ r0 = s + 20, passed as the FFTContext
+        mov             r1,  r5
+        bl              X(ff_fft_permute_neon)
+        add             r0,  r4,  #20
+        mov             r1,  r5
+        pop             {r4-r8,lr}              @ restore lr before the tail call
+        b               X(ff_fft_calc_neon)
+endfunc
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/simple_idct_neon.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/simple_idct_neon.S
new file mode 100644
index 00000000..17cde583
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/simple_idct_neon.S
@@ -0,0 +1,372 @@
+/*
+ * ARM NEON IDCT
+ *
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * Based on Simple IDCT
+ * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+/*
+ * Fixed-point cosine coefficients, scaled by 1<<14 (see the formula after
+ * each value; i is the digit in the macro name).
+ * NOTE: for i = 4 the formula evaluates to 16384, the table uses 16383.
+ */
+#define W1  22725  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W2  21407  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W3  19266  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W4  16383  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W5  12873  //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W6  8867   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+#define W7  4520   //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
+/* Column-pass rounding bias, pre-divided by W4 so that it can be added to
+ * col[0] before the multiplication by W4 (see idct_col4_neon). */
+#define W4c ((1<<(COL_SHIFT-1))/W4)
+/* Total right shifts applied to the row-pass and column-pass results. */
+#define ROW_SHIFT 11
+#define COL_SHIFT 20
+
+/* Lane names for the coefficient table once idct_start has loaded
+ * idct_coeff_neon into d0/d1. */
+#define w1 d0[0]
+#define w2 d0[1]
+#define w3 d0[2]
+#define w4 d0[3]
+#define w5 d1[0]
+#define w6 d1[1]
+#define w7 d1[2]
+#define w4c d1[3]
+
+/*
+ * Shared first half of a 4-lane IDCT pass (used by idct_row4_neon and
+ * idct_col4_neon).
+ *
+ * In:  q15 = W4 * col[0] plus rounding term, d4 = col[1], d6 = col[2],
+ *      d8 = col[3] (naming follows the per-line comments below).
+ * Out: q11 = q15 + W2*col[2], q12 = q15 + W6*col[2],
+ *      q13 = q15 - W6*col[2], q14 = q15 - W2*col[2],
+ *      q9, q10, q5, q6 = odd-input accumulators built from col[1] and col[3].
+ * Clobbers q7 and q8. Contains NEON instructions only.
+ */
+        .macro idct_col4_top
+        vmull.s16       q7,  d6,  w2    /* q7   = W2 * col[2] */
+        vmull.s16       q8,  d6,  w6    /* q8   = W6 * col[2] */
+        vmull.s16       q9,  d4,  w1    /* q9   = W1 * col[1] */
+        vadd.i32        q11, q15, q7
+        vmull.s16       q10, d4,  w3    /* q10  = W3 * col[1] */
+        vadd.i32        q12, q15, q8
+        vmull.s16       q5,  d4,  w5    /* q5   = W5 * col[1] */
+        vsub.i32        q13, q15, q8
+        vmull.s16       q6,  d4,  w7    /* q6   = W7 * col[1] */
+        vsub.i32        q14, q15, q7
+
+        vmlal.s16       q9,  d8,  w3    /* q9  += W3 * col[3] */
+        vmlsl.s16       q10, d8,  w7    /* q10 -= W7 * col[3] */
+        vmlsl.s16       q5,  d8,  w1    /* q5  -= W1 * col[3] */
+        vmlsl.s16       q6,  d8,  w5    /* q6  -= W5 * col[3] */
+        .endm
+
+        .text
+        .align 6
+
+/*
+ * Issue preload hints for the eight addresses r0 + k*r1, k = 0..7
+ * (r3 is used as a scratch pointer), i.e. the eight destination lines when
+ * called from the put/add entry points with r0 = dst, r1 = line_size.
+ *
+ * NOTE(review): there is no return instruction here, so execution appears
+ * to fall through into idct_row4_neon below (assuming endfunc emits no
+ * code); a "bl idct_row4_pld_neon" therefore also performs one row pass.
+ */
+function idct_row4_pld_neon
+        pld             [r0]
+        add             r3,  r0,  r1,  lsl #2
+        pld             [r0, r1]
+        pld             [r0, r1, lsl #1]
+        pld             [r3, -r1]
+        pld             [r3]
+        pld             [r3, r1]
+        add             r3,  r3,  r1,  lsl #1
+        pld             [r3]
+        pld             [r3, r1]
+endfunc
+
+/*
+ * First (ROW_SHIFT) pass over one 64-byte half of the coefficient block.
+ *
+ * In:  r2 = pointer to 64 bytes of 16-bit coefficients (the :128 qualifiers
+ *      request 16-byte alignment); w1..w7 in d0/d1.
+ * Out: the 64 bytes are overwritten with the shifted, lane-transposed
+ *      results and r2 is left advanced by 64.
+ * Clobbers r3, r4 and q1-q15.
+ */
+function idct_row4_neon
+        vmov.i32        q15, #(1<<(ROW_SHIFT-1))
+        vld1.64         {d2-d5},  [r2,:128]!
+        vmlal.s16       q15, d2,  w4    /* q15  += W4 * col[0] */
+        vld1.64         {d6,d7},  [r2,:128]!
+        vorr            d10, d3,  d5
+        vld1.64         {d8,d9},  [r2,:128]!
+        /* rewind r2 to the start of the 64 bytes just loaded */
+        add             r2,  r2,  #-64
+
+        /* d10 = d3 | d5 | d7 | d9: non-zero iff any of inputs 4..7 is set */
+        vorr            d11, d7,  d9
+        vorr            d10, d10, d11
+        vmov            r3,  r4,  d10
+
+        idct_col4_top
+
+        /* skip the contributions of inputs 4..7 when they are all zero */
+        orrs            r3,  r3,  r4
+        beq             1f
+
+        vmull.s16       q7,  d3,  w4    /* q7   = W4 * col[4] */
+        vmlal.s16       q9,  d5,  w5    /* q9  += W5 * col[5] */
+        vmlsl.s16       q10, d5,  w1    /* q10 -= W1 * col[5] */
+        vmull.s16       q8,  d7,  w2    /* q8   = W2 * col[6] */
+        vmlal.s16       q5,  d5,  w7    /* q5  += W7 * col[5] */
+        vadd.i32        q11, q11, q7
+        vsub.i32        q12, q12, q7
+        vsub.i32        q13, q13, q7
+        vadd.i32        q14, q14, q7
+        vmlal.s16       q6,  d5,  w3    /* q6  += W3 * col[5] */
+        vmull.s16       q7,  d7,  w6    /* q7   = W6 * col[6] */
+        vmlal.s16       q9,  d9,  w7
+        vmlsl.s16       q10, d9,  w5
+        vmlal.s16       q5,  d9,  w3
+        vmlsl.s16       q6,  d9,  w1
+        vadd.i32        q11, q11, q7
+        vsub.i32        q12, q12, q8
+        vadd.i32        q13, q13, q8
+        vsub.i32        q14, q14, q7
+
+        /* combine even (q11-q14) and odd (q9,q10,q5,q6) parts, narrow with
+         * >> ROW_SHIFT and transpose the 4x4 groups with vtrn */
+1:      vadd.i32        q3,  q11, q9
+        vadd.i32        q4,  q12, q10
+        vshrn.i32       d2,  q3,  #ROW_SHIFT
+        vshrn.i32       d4,  q4,  #ROW_SHIFT
+        vadd.i32        q7,  q13, q5
+        vadd.i32        q8,  q14, q6
+        vtrn.16         d2,  d4
+        vshrn.i32       d6,  q7,  #ROW_SHIFT
+        vshrn.i32       d8,  q8,  #ROW_SHIFT
+        vsub.i32        q14, q14, q6
+        vsub.i32        q11, q11, q9
+        vtrn.16         d6,  d8
+        vsub.i32        q13, q13, q5
+        vshrn.i32       d3,  q14, #ROW_SHIFT
+        vtrn.32         d2,  d6
+        vsub.i32        q12, q12, q10
+        vtrn.32         d4,  d8
+        vshrn.i32       d5,  q13, #ROW_SHIFT
+        vshrn.i32       d7,  q12, #ROW_SHIFT
+        vshrn.i32       d9,  q11, #ROW_SHIFT
+
+        vtrn.16         d3,  d5
+        vtrn.16         d7,  d9
+        vtrn.32         d3,  d7
+        vtrn.32         d5,  d9
+
+        /* write the 64 bytes back in place; r2 ends up 64 bytes further */
+        vst1.64         {d2-d5},  [r2,:128]!
+        vst1.64         {d6-d9},  [r2,:128]!
+
+        bx              lr
+endfunc
+
+/*
+ * Second pass over four 16-bit columns.
+ *
+ * In:  r2 = pointer to the first of eight 8-byte groups spaced 16 bytes
+ *      apart (col[0]..col[7]); w1..w7 and w4c in d0/d1.
+ * Out: d2..d9 = outputs 0..7, each the high 16 bits of the 32-bit
+ *      sum/difference (callers apply the remaining COL_SHIFT-16 shift);
+ *      r2 is advanced by 128 on every path.
+ * Inputs 4..7 are first read into core registers; when a group is all zero
+ * its NEON load and multiply steps are skipped and r2 is bumped by hand.
+ * Clobbers r4-r7, ip and q5-q15.
+ */
+function idct_col4_neon
+        mov             ip,  #16
+        vld1.64         {d2}, [r2,:64], ip /* d2 = col[0] */
+        vdup.16         d30, w4c
+        vld1.64         {d4}, [r2,:64], ip /* d4 = col[1] */
+        vadd.i16        d30, d30, d2
+        vld1.64         {d6}, [r2,:64], ip /* d6 = col[2] */
+        vmull.s16       q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/
+        vld1.64         {d8}, [r2,:64], ip /* d8 = col[3] */
+
+        /* r4:r5 = col[4], r6:r7 = col[5] as raw 64-bit values for zero tests */
+        ldrd            r4,  [r2]
+        ldrd            r6,  [r2, #16]
+        orrs            r4,  r4,  r5
+
+        /* the macro is NEON-only, so the flags from orrs survive it */
+        idct_col4_top
+        addeq           r2,  r2,  #16
+        beq             1f
+
+        vld1.64         {d3}, [r2,:64], ip /* d3 = col[4] */
+        vmull.s16       q7,  d3,  w4    /* q7   = W4 * col[4] */
+        vadd.i32        q11, q11, q7
+        vsub.i32        q12, q12, q7
+        vsub.i32        q13, q13, q7
+        vadd.i32        q14, q14, q7
+
+        /* test col[5]; meanwhile fetch col[6] into r4:r5 */
+1:      orrs            r6,  r6,  r7
+        ldrd            r4,  [r2, #16]
+        addeq           r2,  r2,  #16
+        beq             2f
+
+        vld1.64         {d5}, [r2,:64], ip /* d5 = col[5] */
+        vmlal.s16       q9,  d5,  w5    /* q9  += W5 * col[5] */
+        vmlsl.s16       q10, d5,  w1    /* q10 -= W1 * col[5] */
+        vmlal.s16       q5,  d5,  w7    /* q5  += W7 * col[5] */
+        vmlal.s16       q6,  d5,  w3    /* q6  += W3 * col[5] */
+
+        /* test col[6]; meanwhile fetch col[7] into r4:r5 */
+2:      orrs            r4,  r4,  r5
+        ldrd            r4,  [r2, #16]
+        addeq           r2,  r2,  #16
+        beq             3f
+
+        vld1.64         {d7}, [r2,:64], ip /* d7 = col[6] */
+        vmull.s16       q7,  d7,  w6    /* q7   = W6 * col[6] */
+        vmull.s16       q8,  d7,  w2    /* q8   = W2 * col[6] */
+        vadd.i32        q11, q11, q7
+        vsub.i32        q14, q14, q7
+        vsub.i32        q12, q12, q8
+        vadd.i32        q13, q13, q8
+
+        /* test col[7] */
+3:      orrs            r4,  r4,  r5
+        addeq           r2,  r2,  #16
+        beq             4f
+
+        vld1.64         {d9}, [r2,:64], ip /* d9 = col[7] */
+        vmlal.s16       q9,  d9,  w7
+        vmlsl.s16       q10, d9,  w5
+        vmlal.s16       q5,  d9,  w3
+        vmlsl.s16       q6,  d9,  w1
+
+        /* outputs 0..3 = even + odd, outputs 7..4 = even - odd (high halves) */
+4:      vaddhn.i32      d2,  q11, q9
+        vaddhn.i32      d3,  q12, q10
+        vaddhn.i32      d4,  q13, q5
+        vaddhn.i32      d5,  q14, q6
+        vsubhn.i32      d9,  q11, q9
+        vsubhn.i32      d8,  q12, q10
+        vsubhn.i32      d7,  q13, q5
+        vsubhn.i32      d6,  q14, q6
+
+        bx              lr
+endfunc
+
+        .align 6
+
+/*
+ * Store the result of idct_col4_neon as unsigned bytes ("put").
+ *
+ * In:  q1-q4 (d2..d9) = eight groups of four 16-bit values,
+ *      r0 = destination, r1 = byte stride between stores.
+ * Each q register is shifted right by COL_SHIFT-16 with unsigned saturation
+ * and narrowed to 8 bits; the eight 4-byte groups are then written to
+ * r0, r0+r1, ... so r0 is left advanced by 8*r1.
+ */
+function idct_col4_st8_neon
+        vqshrun.s16     d2,  q1,  #COL_SHIFT-16
+        vqshrun.s16     d3,  q2,  #COL_SHIFT-16
+        vqshrun.s16     d4,  q3,  #COL_SHIFT-16
+        vqshrun.s16     d5,  q4,  #COL_SHIFT-16
+        vst1.32         {d2[0]}, [r0,:32], r1
+        vst1.32         {d2[1]}, [r0,:32], r1
+        vst1.32         {d3[0]}, [r0,:32], r1
+        vst1.32         {d3[1]}, [r0,:32], r1
+        vst1.32         {d4[0]}, [r0,:32], r1
+        vst1.32         {d4[1]}, [r0,:32], r1
+        vst1.32         {d5[0]}, [r0,:32], r1
+        vst1.32         {d5[1]}, [r0,:32], r1
+
+        bx              lr
+endfunc
+
+        .section .rodata
+        .align 4
+/* Coefficient table loaded into d0,d1 by idct_start; the element order
+ * matches the w1..w7, w4c lane aliases defined at the top of the file. */
+idct_coeff_neon:
+        .short W1, W2, W3, W4, W5, W6, W7, W4c
+
+/*
+ * Common prologue of the exported entry points.
+ * \data: register holding the coefficient block pointer.
+ * Saves r4-r7, lr and d8-d15, issues preload hints for the first 128 bytes
+ * at \data and loads the coefficient table into d0,d1 (r3 is clobbered).
+ * Must be paired with idct_end.
+ */
+        .macro idct_start data
+        push            {r4-r7, lr}
+        pld             [\data]
+        pld             [\data, #64]
+        vpush           {d8-d15}
+        movrel          r3,  idct_coeff_neon
+        vld1.64         {d0,d1}, [r3,:128]
+        .endm
+
+/* Common epilogue: restores what idct_start saved and returns to the caller
+ * by popping the saved lr into pc. */
+        .macro idct_end
+        vpop            {d8-d15}
+        pop             {r4-r7, pc}
+        .endm
+
+/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data);
+ *
+ * r0 = dst, r1 = line_size, r2 = data.
+ * Runs the row pass over both 64-byte halves of data, then the column pass
+ * on columns 0-3 and 4-7, storing each result as saturated bytes to dst.
+ */
+function ff_simple_idct_put_neon, export=1
+        idct_start      r2
+
+        /* preload dst lines; this call also runs the first row pass
+         * (see the fall-through note on idct_row4_pld_neon) */
+        bl              idct_row4_pld_neon
+        bl              idct_row4_neon
+        /* both row passes advanced r2 by 64: back to the start of data */
+        add             r2,  r2,  #-128
+        bl              idct_col4_neon
+        bl              idct_col4_st8_neon
+        /* rewind dst by the 8 lines just written and step 4 bytes right */
+        sub             r0,  r0,  r1, lsl #3
+        add             r0,  r0,  #4
+        /* col4 advanced r2 by 128: net +8 bytes selects the next 4 columns */
+        add             r2,  r2,  #-120
+        bl              idct_col4_neon
+        bl              idct_col4_st8_neon
+
+        idct_end
+endfunc
+
+        .align 6
+
+/*
+ * Add the result of idct_col4_neon to existing bytes ("add").
+ *
+ * In:  q1-q4 (d2..d9) = eight groups of four 16-bit values,
+ *      r0 = destination, r1 = byte stride.
+ * Reads eight 4-byte groups from r0 (stride r1) into d10-d13, shifts q1-q4
+ * right by COL_SHIFT-16, adds the zero-extended bytes, saturates to
+ * unsigned 8 bit and writes the groups back through ip (the initial r0).
+ * r0 is left advanced by 8*r1. Clobbers ip, d10-d13.
+ * Loads, arithmetic and stores are interleaved.
+ */
+function idct_col4_add8_neon
+        mov             ip,  r0
+
+        vld1.32         {d10[0]}, [r0,:32], r1
+        vshr.s16        q1,  q1,  #COL_SHIFT-16
+        vld1.32         {d10[1]}, [r0,:32], r1
+        vshr.s16        q2,  q2,  #COL_SHIFT-16
+        vld1.32         {d11[0]}, [r0,:32], r1
+        vshr.s16        q3,  q3,  #COL_SHIFT-16
+        vld1.32         {d11[1]}, [r0,:32], r1
+        vshr.s16        q4,  q4,  #COL_SHIFT-16
+        vld1.32         {d12[0]}, [r0,:32], r1
+        vaddw.u8        q1,  q1,  d10
+        vld1.32         {d12[1]}, [r0,:32], r1
+        vaddw.u8        q2,  q2,  d11
+        vld1.32         {d13[0]}, [r0,:32], r1
+        vqmovun.s16     d2,  q1
+        vld1.32         {d13[1]}, [r0,:32], r1
+        vaddw.u8        q3,  q3,  d12
+        vst1.32         {d2[0]},  [ip,:32], r1
+        vqmovun.s16     d3,  q2
+        vst1.32         {d2[1]},  [ip,:32], r1
+        vaddw.u8        q4,  q4,  d13
+        vst1.32         {d3[0]},  [ip,:32], r1
+        vqmovun.s16     d4,  q3
+        vst1.32         {d3[1]},  [ip,:32], r1
+        vqmovun.s16     d5,  q4
+        vst1.32         {d4[0]},  [ip,:32], r1
+        vst1.32         {d4[1]},  [ip,:32], r1
+        vst1.32         {d5[0]},  [ip,:32], r1
+        vst1.32         {d5[1]},  [ip,:32], r1
+
+        bx              lr
+endfunc
+
+/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data);
+ *
+ * r0 = dst, r1 = line_size, r2 = data.
+ * Same sequence as ff_simple_idct_put_neon, except that each column result
+ * is added to the bytes already at dst (idct_col4_add8_neon).
+ */
+function ff_simple_idct_add_neon, export=1
+        idct_start      r2
+
+        /* preload dst lines; this call also runs the first row pass
+         * (see the fall-through note on idct_row4_pld_neon) */
+        bl              idct_row4_pld_neon
+        bl              idct_row4_neon
+        /* back to the start of data */
+        add             r2,  r2,  #-128
+        bl              idct_col4_neon
+        bl              idct_col4_add8_neon
+        /* rewind dst by 8 lines and step 4 bytes right */
+        sub             r0,  r0,  r1, lsl #3
+        add             r0,  r0,  #4
+        /* net +8 bytes in data: next 4 columns */
+        add             r2,  r2,  #-120
+        bl              idct_col4_neon
+        bl              idct_col4_add8_neon
+
+        idct_end
+endfunc
+
+        .align 6
+
+/*
+ * Store the result of idct_col4_neon back as 16-bit values.
+ *
+ * In:  q1-q4 (d2..d9) = eight groups of four 16-bit values,
+ *      r2 = destination of the first group.
+ * Each q register is arithmetically shifted right by COL_SHIFT-16 and the
+ * eight 8-byte groups are written 16 bytes apart; r2 is left advanced by
+ * 128. Clobbers ip.
+ */
+function idct_col4_st16_neon
+        mov             ip,  #16
+
+        vshr.s16        q1,  q1,  #COL_SHIFT-16
+        vshr.s16        q2,  q2,  #COL_SHIFT-16
+        vst1.64         {d2}, [r2,:64], ip
+        vshr.s16        q3,  q3,  #COL_SHIFT-16
+        vst1.64         {d3}, [r2,:64], ip
+        vshr.s16        q4,  q4,  #COL_SHIFT-16
+        vst1.64         {d4}, [r2,:64], ip
+        vst1.64         {d5}, [r2,:64], ip
+        vst1.64         {d6}, [r2,:64], ip
+        vst1.64         {d7}, [r2,:64], ip
+        vst1.64         {d8}, [r2,:64], ip
+        vst1.64         {d9}, [r2,:64], ip
+
+        bx              lr
+endfunc
+
+/* void ff_simple_idct_neon(DCTELEM *data);
+ *
+ * r0 = data. In-place variant: both passes read and write the coefficient
+ * block itself, so no destination preload is done and idct_row4_neon is
+ * called directly for both halves.
+ */
+function ff_simple_idct_neon, export=1
+        idct_start      r0
+
+        mov             r2,  r0
+        bl              idct_row4_neon
+        bl              idct_row4_neon
+        /* back to the start of data */
+        add             r2,  r2,  #-128
+        bl              idct_col4_neon
+        /* col4 advanced r2 by 128: rewind so the store overwrites its input */
+        add             r2,  r2,  #-128
+        bl              idct_col4_st16_neon
+        /* the store advanced r2 by 128: net +8 bytes selects columns 4-7 */
+        add             r2,  r2,  #-120
+        bl              idct_col4_neon
+        add             r2,  r2,  #-128
+        bl              idct_col4_st16_neon
+
+        idct_end
+endfunc
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/avfft.c b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.c
new file mode 100644
index 00000000..25fc4e09
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.c
@@ -0,0 +1,142 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/mem.h"
+#include "avfft.h"
+#include "fft.h"
+
+/* FFT */
+
+/**
+ * Allocate an FFTContext and initialise it with ff_fft_init().
+ *
+ * @param nbits   forwarded to ff_fft_init()
+ * @param inverse forwarded to ff_fft_init()
+ * @return the new context, or NULL if the allocation or the initialisation
+ *         failed; release a non-NULL result with av_fft_end()
+ */
+FFTContext *av_fft_init(int nbits, int inverse)
+{
+    FFTContext *s = av_malloc(sizeof(*s));
+
+    /* A non-zero result from ff_fft_init() means the context is unusable:
+     * free it instead of handing it to the caller. */
+    if (s && ff_fft_init(s, nbits, inverse)) {
+        av_free(s);
+        s = NULL;
+    }
+
+    return s;
+}
+
+/** Public wrapper: invokes the context's fft_permute callback on z.
+ *  s must be a valid, initialised context (it is dereferenced unchecked). */
+void av_fft_permute(FFTContext *s, FFTComplex *z)
+{
+    s->fft_permute(s, z);
+}
+
+/** Public wrapper: invokes the context's fft_calc callback on z.
+ *  s must be a valid, initialised context (it is dereferenced unchecked). */
+void av_fft_calc(FFTContext *s, FFTComplex *z)
+{
+    s->fft_calc(s, z);
+}
+
+/**
+ * Tear down a context obtained from av_fft_init() and free it.
+ * A NULL context is ignored.
+ */
+void av_fft_end(FFTContext *s)
+{
+    if (!s)
+        return;
+
+    ff_fft_end(s);
+    av_free(s);
+}
+
+#if CONFIG_MDCT
+
+/**
+ * Allocate an FFTContext and initialise it for MDCT use with ff_mdct_init().
+ *
+ * @param nbits   forwarded to ff_mdct_init()
+ * @param inverse forwarded to ff_mdct_init()
+ * @param scale   forwarded to ff_mdct_init()
+ * @return the new context, or NULL if the allocation or the initialisation
+ *         failed; release a non-NULL result with av_mdct_end()
+ */
+FFTContext *av_mdct_init(int nbits, int inverse, double scale)
+{
+    FFTContext *s = av_malloc(sizeof(*s));
+
+    /* A non-zero result from ff_mdct_init() means the context is unusable:
+     * free it instead of handing it to the caller. */
+    if (s && ff_mdct_init(s, nbits, inverse, scale)) {
+        av_free(s);
+        s = NULL;
+    }
+
+    return s;
+}
+
+/** Public wrapper: invokes the context's imdct_calc callback.
+ *  s must be a valid, initialised context (it is dereferenced unchecked). */
+void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    s->imdct_calc(s, output, input);
+}
+
+/** Public wrapper: invokes the context's imdct_half callback.
+ *  s must be a valid, initialised context (it is dereferenced unchecked). */
+void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    s->imdct_half(s, output, input);
+}
+
+/** Public wrapper: invokes the context's mdct_calc callback.
+ *  s must be a valid, initialised context (it is dereferenced unchecked). */
+void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    s->mdct_calc(s, output, input);
+}
+
+/**
+ * Tear down a context obtained from av_mdct_init() and free it.
+ * A NULL context is ignored.
+ */
+void av_mdct_end(FFTContext *s)
+{
+    if (!s)
+        return;
+
+    ff_mdct_end(s);
+    av_free(s);
+}
+
+#endif /* CONFIG_MDCT */
+
+#if CONFIG_RDFT
+
+/**
+ * Allocate an RDFTContext and initialise it with ff_rdft_init().
+ *
+ * @param nbits forwarded to ff_rdft_init()
+ * @param trans forwarded to ff_rdft_init()
+ * @return the new context, or NULL if the allocation or the initialisation
+ *         failed; release a non-NULL result with av_rdft_end()
+ */
+RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans)
+{
+    RDFTContext *s = av_malloc(sizeof(*s));
+
+    /* ff_rdft_init() reports failure with a negative value; do not return a
+     * half-initialised context in that case. */
+    if (s && ff_rdft_init(s, nbits, trans) < 0) {
+        av_free(s);
+        s = NULL;
+    }
+
+    return s;
+}
+
+/** Public wrapper around ff_rdft_calc(): transforms data in place using s. */
+void av_rdft_calc(RDFTContext *s, FFTSample *data)
+{
+    ff_rdft_calc(s, data);
+}
+
+/**
+ * Tear down a context obtained from av_rdft_init() and free it.
+ * A NULL context is ignored.
+ */
+void av_rdft_end(RDFTContext *s)
+{
+    if (!s)
+        return;
+
+    ff_rdft_end(s);
+    av_free(s);
+}
+
+#endif /* CONFIG_RDFT */
+
+#if CONFIG_DCT
+
+/**
+ * Allocate a DCTContext and initialise it with ff_dct_init().
+ *
+ * @param nbits   forwarded to ff_dct_init()
+ * @param inverse transform type, forwarded to ff_dct_init()
+ * @return the new context, or NULL if the allocation or the initialisation
+ *         failed; release a non-NULL result with av_dct_end()
+ */
+DCTContext *av_dct_init(int nbits, enum DCTTransformType inverse)
+{
+    DCTContext *s = av_malloc(sizeof(*s));
+
+    /* ff_dct_init() returns a negative value on failure and has already
+     * released its own table by then; only the context itself is left. */
+    if (s && ff_dct_init(s, nbits, inverse) < 0) {
+        av_free(s);
+        s = NULL;
+    }
+
+    return s;
+}
+
+/** Public wrapper around ff_dct_calc(): transforms data in place using s. */
+void av_dct_calc(DCTContext *s, FFTSample *data)
+{
+    ff_dct_calc(s, data);
+}
+
+/**
+ * Tear down a context obtained from av_dct_init() and free it.
+ * A NULL context is ignored.
+ */
+void av_dct_end(DCTContext *s)
+{
+    if (!s)
+        return;
+
+    ff_dct_end(s);
+    av_free(s);
+}
+
+#endif /* CONFIG_DCT */
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/avfft.h b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.h
new file mode 100644
index 00000000..fdf30237
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.h
@@ -0,0 +1,103 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_AVFFT_H
+#define AVCODEC_AVFFT_H
+
+#include "publik.h"
+
+/** Sample type used by all transforms declared in this header. */
+typedef float FFTSample;
+
+/** One complex sample: real part followed by imaginary part. */
+typedef struct FFTComplex {
+    FFTSample re, im;
+} FFTComplex;
+
+/** Opaque transform context; only handled through pointers here. */
+typedef struct FFTContext FFTContext;
+
+/**
+ * Set up a complex FFT.
+ * @param nbits           log2 of the length of the input array
+ * @param inverse         if 0 perform the forward transform, if 1 perform the inverse
+ * @return a newly allocated context, or NULL if the allocation failed
+ */
+PUBLIK FFTContext *av_fft_init(int nbits, int inverse);
+
+/**
+ * Do the permutation needed BEFORE calling av_fft_calc().
+ */
+PUBLIK void av_fft_permute(FFTContext *s, FFTComplex *z);
+
+/**
+ * Do a complex FFT with the parameters defined in av_fft_init(). The
+ * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
+ */
+PUBLIK void av_fft_calc(FFTContext *s, FFTComplex *z);
+
+/**
+ * Release a context created by av_fft_init(). NULL is accepted and ignored.
+ */
+PUBLIK void av_fft_end(FFTContext *s);
+
+/* The MDCT entry points are deliberately not declared here; avfft.c only
+ * defines them when CONFIG_MDCT is set. */
+#if 0
+FFTContext *av_mdct_init(int nbits, int inverse, double scale);
+void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
+void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input);
+void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input);
+void av_mdct_end(FFTContext *s);
+#endif
+
+/* Real Discrete Fourier Transform */
+
+/** Transform selector passed to av_rdft_init(). */
+enum RDFTransformType {
+    DFT_R2C,
+    IDFT_C2R,
+    IDFT_R2C,
+    DFT_C2R,
+};
+
+/** Opaque real-FFT context. */
+typedef struct RDFTContext RDFTContext;
+
+/**
+ * Set up a real FFT.
+ * @param nbits           log2 of the length of the input array
+ * @param trans           the type of transform
+ * @return a newly allocated context, or NULL if the allocation failed
+ */
+PUBLIK RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans);
+/** Run the transform configured in av_rdft_init() on data, in place. */
+PUBLIK void av_rdft_calc(RDFTContext *s, FFTSample *data);
+/** Release a context created by av_rdft_init(). NULL is accepted and ignored. */
+PUBLIK void av_rdft_end(RDFTContext *s);
+
+/* Discrete Cosine Transform */
+
+/** Opaque DCT/DST context. */
+typedef struct DCTContext DCTContext;
+
+/** Transform selector passed to av_dct_init(). */
+enum DCTTransformType {
+    DCT_II = 0,
+    DCT_III,
+    DCT_I,
+    DST_I,
+};
+
+/**
+ * Set up DCT.
+ * @param nbits           size of the input array:
+ *                        (1 << nbits)     for DCT-II, DCT-III and DST-I
+ *                        (1 << nbits) + 1 for DCT-I
+ * @param type            the transform to perform (the definition in
+ *                        avfft.c names this parameter "inverse")
+ * @return a newly allocated context, or NULL if the allocation failed
+ *
+ * @note the first element of the input of DST-I is ignored
+ */
+PUBLIK DCTContext *av_dct_init(int nbits, enum DCTTransformType type);
+/** Run the transform configured in av_dct_init() on data, in place. */
+PUBLIK void av_dct_calc(DCTContext *s, FFTSample *data);
+/** Release a context created by av_dct_init(). NULL is accepted and ignored. */
+PUBLIK void av_dct_end (DCTContext *s);
+
+#endif /* AVCODEC_AVFFT_H */
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/dct.c b/plugins/supereq/ffmpeg_fft/libavcodec/dct.c
new file mode 100644
index 00000000..6ea1936e
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/dct.c
@@ -0,0 +1,228 @@
+/*
+ * (I)DCT Transforms
+ * Copyright (c) 2009 Peter Ross <pross@xvid.org>
+ * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com>
+ * Copyright (c) 2010 Vitor Sessak
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/**
+ * @file
+ * (Inverse) Discrete Cosine Transforms. These are also known as the
+ * type II and type III DCTs respectively.
+ */
+
+#include <math.h>
+#include "libavutil/mathematics.h"
+#include "fft.h"
+#ifndef ARCH_ARM
+#include "x86/fft.h"
+#endif
+
+#define DCT32_FLOAT
+#include "dct32.h"
+
+/* sin(M_PI * x / (2*n)), read from the cosine table as costab[n - x];
+ * s is the DCTContext pointer. Note that s is not parenthesised. */
+#define SIN(s,n,x) (s->costab[(n) - (x)])
+
+/* cos(M_PI * x / (2*n)), i.e. costab[x]; n is unused. Neither s nor x is
+ * parenthesised, so pass simple expressions only. */
+#define COS(s,n,x) (s->costab[x])
+
+/**
+ * DST-I of n = 1 << ctx->nbits samples, computed in place.
+ *
+ * data[0] is overwritten with 0 before the transform (its input value is
+ * ignored); the samples are folded pairwise, run through the real FFT in
+ * ctx->rdft and then recombined. data[n-1] is set to 0 at the end.
+ */
+static void ff_dst_calc_I_c(DCTContext *ctx, FFTSample *data)
+{
+    const int n    = 1 << ctx->nbits;
+    const int half = n / 2;
+    int k;
+
+    data[0] = 0;
+
+    /* fold data[k] with its mirror data[n - k] */
+    for (k = 1; k < half; k++) {
+        float lo     = data[k];
+        float hi     = data[n - k];
+        float scaled = SIN(ctx, n, 2*k);
+
+        scaled *= lo + hi;
+        lo = (lo - hi) * 0.5f;
+        data[k]     = scaled + lo;
+        data[n - k] = scaled - lo;
+    }
+
+    data[half] *= 2;
+    ff_rdft_calc(&ctx->rdft, data);
+
+    data[0] *= 0.5f;
+
+    /* turn the FFT output into the transform coefficients */
+    for (k = 1; k < n - 2; k += 2) {
+        data[k + 1] += data[k - 1];
+        data[k]      = -data[k + 2];
+    }
+
+    data[n - 1] = 0;
+}
+
+/**
+ * DCT-I computed in place. With n = 1 << ctx->nbits the function reads and
+ * writes data[0] .. data[n], i.e. n + 1 samples.
+ *
+ * The samples are folded pairwise while an extra term is accumulated, the
+ * real FFT in ctx->rdft is run over the first n values, and the odd
+ * outputs are rebuilt by a running difference.
+ */
+static void ff_dct_calc_I_c(DCTContext *ctx, FFTSample *data)
+{
+    const int n    = 1 << ctx->nbits;
+    const int half = n / 2;
+    float acc = -0.5f * (data[0] - data[n]);
+    int k;
+
+    for (k = 0; k < half; k++) {
+        float lo = data[k];
+        float hi = data[n - k];
+        float sn = SIN(ctx, n, 2*k);
+        float cs = COS(ctx, n, 2*k);
+
+        cs *= lo - hi;
+        sn *= lo - hi;
+
+        acc += cs;
+
+        lo = (lo + hi) * 0.5f;
+        data[k]     = lo - sn;
+        data[n - k] = lo + sn;
+    }
+
+    ff_rdft_calc(&ctx->rdft, data);
+
+    /* data[1] of the FFT output belongs at index n; slot 1 gets the
+     * separately accumulated term */
+    data[n] = data[1];
+    data[1] = acc;
+
+    for (k = 3; k <= n; k += 2) {
+        data[k] = data[k - 2] - data[k];
+    }
+}
+
+/**
+ * DCT-III of n = 1 << ctx->nbits samples, computed in place.
+ *
+ * The input is rotated pairwise with the cos/sin tables, passed through the
+ * real FFT in ctx->rdft, and the result is scaled by 1/n and unfolded with
+ * the ctx->csc2 table.
+ */
+static void ff_dct_calc_III_c(DCTContext *ctx, FFTSample *data)
+{
+    const int n    = 1 << ctx->nbits;
+    const int half = n / 2;
+    int k;
+
+    float last  = data[n - 1];
+    float scale = 1.0f / n;
+
+    /* walk downwards so data[k + 1] is read before it is overwritten */
+    for (k = n - 2; k >= 2; k -= 2) {
+        float re = data[k];
+        float im = data[k - 1] - data[k + 1];
+        float cs = COS(ctx, n, k);
+        float sn = SIN(ctx, n, k);
+
+        data[k]     = cs * re + sn * im;
+        data[k + 1] = sn * re - cs * im;
+    }
+
+    data[1] = 2 * last;
+
+    ff_rdft_calc(&ctx->rdft, data);
+
+    for (k = 0; k < half; k++) {
+        float front = data[k]         * scale;
+        float back  = data[n - k - 1] * scale;
+        float csc   = ctx->csc2[k] * (front - back);
+
+        front += back;
+        data[k]         = front + csc;
+        data[n - k - 1] = front - csc;
+    }
+}
+
+/**
+ * DCT-II of n = 1 << ctx->nbits samples, computed in place.
+ *
+ * The samples are folded pairwise, passed through the real FFT in
+ * ctx->rdft, and the complex output is rotated with the cos/sin tables
+ * while the odd coefficients are produced by a running sum.
+ */
+static void ff_dct_calc_II_c(DCTContext *ctx, FFTSample *data)
+{
+    const int n    = 1 << ctx->nbits;
+    const int half = n / 2;
+    float carry;
+    int k;
+
+    for (k = 0; k < half; k++) {
+        float front = data[k];
+        float back  = data[n - k - 1];
+        float sn    = SIN(ctx, n, 2*k + 1);
+
+        sn *= front - back;
+        front = (front + back) * 0.5f;
+
+        data[k]         = front + sn;
+        data[n - k - 1] = front - sn;
+    }
+
+    ff_rdft_calc(&ctx->rdft, data);
+
+    carry = data[1] * 0.5;
+    data[1] *= -1;
+
+    /* walk downwards; each odd slot receives the sum accumulated so far */
+    for (k = n - 2; k >= 0; k -= 2) {
+        float re = data[k];
+        float im = data[k + 1];
+        float cs = COS(ctx, n, k);
+        float sn = SIN(ctx, n, k);
+
+        data[k] = cs * re + sn * im;
+
+        data[k + 1] = carry;
+
+        carry +=     sn * re - cs * im;
+    }
+}
+
+/** dct_calc adapter for the fixed 32-point transform: calls ctx->dct32 with
+ *  data as both output and input (in place). */
+static void dct32_func(DCTContext *ctx, FFTSample *data)
+{
+    ctx->dct32(data, data);
+}
+
+/** Run the transform selected by ff_dct_init() on data, in place, by
+ *  dispatching through s->dct_calc. */
+void ff_dct_calc(DCTContext *s, FFTSample *data)
+{
+    s->dct_calc(s, data);
+}
+
+/**
+ * Initialise a DCT/DST context.
+ *
+ * Selects the cosine table for nbits+2, allocates and fills the csc2 table
+ * (n/2 entries, n = 1 << nbits), initialises the embedded real FFT and picks
+ * the transform routine for the requested type. DCT-II with nbits == 5 is
+ * routed to the dedicated 32-point implementation.
+ *
+ * @param s       context to initialise (caller-allocated)
+ * @param nbits   log2 of the transform size
+ * @param inverse transform type; DCT_III makes the real FFT be initialised
+ *                with mode 1, every other type with mode 0
+ * @return 0 on success, -1 if the csc2 table cannot be allocated or
+ *         ff_rdft_init() fails (s->csc2 is not left dangling in that case)
+ */
+av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
+{
+    int n = 1 << nbits;
+    int i;
+
+    s->nbits    = nbits;
+    s->inverse  = inverse;
+
+    ff_init_ff_cos_tabs(nbits+2);
+
+    s->costab = ff_cos_tabs[nbits+2];
+
+    s->csc2 = av_malloc(n/2 * sizeof(FFTSample));
+    /* The table is filled unconditionally below: bail out instead of
+     * writing through a NULL pointer when the allocation failed. */
+    if (!s->csc2)
+        return -1;
+
+    if (ff_rdft_init(&s->rdft, nbits, inverse == DCT_III) < 0) {
+        av_free(s->csc2);
+        /* do not leave a freed pointer behind for a later ff_dct_end() */
+        s->csc2 = NULL;
+        return -1;
+    }
+
+    for (i = 0; i < n/2; i++)
+        s->csc2[i] = 0.5 / sin((M_PI / (2*n) * (2*i + 1)));
+
+    switch(inverse) {
+    case DCT_I  : s->dct_calc = ff_dct_calc_I_c; break;
+    case DCT_II : s->dct_calc = ff_dct_calc_II_c ; break;
+    case DCT_III: s->dct_calc = ff_dct_calc_III_c; break;
+    case DST_I  : s->dct_calc = ff_dst_calc_I_c; break;
+    }
+
+    /* the generic DCT-II is replaced by the fixed 32-point routine */
+    if (inverse == DCT_II && nbits == 5)
+        s->dct_calc = dct32_func;
+
+    s->dct32 = dct32;
+    if (HAVE_MMX)     ff_dct_init_mmx(s);
+
+    return 0;
+}
+
+/** Release what ff_dct_init() set up: the embedded real FFT and the csc2
+ *  table. The context structure itself is not freed here. */
+av_cold void ff_dct_end(DCTContext *s)
+{
+    ff_rdft_end(&s->rdft);
+    av_free(s->csc2);
+}
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/dct32.c b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.c
new file mode 100644
index 00000000..3e6ad78d
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.c
@@ -0,0 +1,262 @@
+/*
+ * Template for the Discrete Cosine Transform for 32 samples
+ * Copyright (c) 2001, 2002 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dct32.h"
+
+/* tab[i][j] = 1.0 / (2.0 * cos(pi*(2*k+1) / 2^(6 - j))) */
+
+/* cos(i*pi/64) */
+
+/*
+ * COSj_k holds 1 / (2*cos(pi*(2k+1) / 2^(6-j))) divided by a power of two
+ * (the "/2", "/4", ... in each definition). The butterflies below multiply
+ * the product back by 1<<s via MULH3, where s is the log2 of that divisor.
+ */
+#define COS0_0  FIXHR(0.50060299823519630134/2)
+#define COS0_1  FIXHR(0.50547095989754365998/2)
+#define COS0_2  FIXHR(0.51544730992262454697/2)
+#define COS0_3  FIXHR(0.53104259108978417447/2)
+#define COS0_4  FIXHR(0.55310389603444452782/2)
+#define COS0_5  FIXHR(0.58293496820613387367/2)
+#define COS0_6  FIXHR(0.62250412303566481615/2)
+#define COS0_7  FIXHR(0.67480834145500574602/2)
+#define COS0_8  FIXHR(0.74453627100229844977/2)
+#define COS0_9  FIXHR(0.83934964541552703873/2)
+#define COS0_10 FIXHR(0.97256823786196069369/2)
+#define COS0_11 FIXHR(1.16943993343288495515/4)
+#define COS0_12 FIXHR(1.48416461631416627724/4)
+#define COS0_13 FIXHR(2.05778100995341155085/8)
+#define COS0_14 FIXHR(3.40760841846871878570/8)
+#define COS0_15 FIXHR(10.19000812354805681150/32)
+
+#define COS1_0 FIXHR(0.50241928618815570551/2)
+#define COS1_1 FIXHR(0.52249861493968888062/2)
+#define COS1_2 FIXHR(0.56694403481635770368/2)
+#define COS1_3 FIXHR(0.64682178335999012954/2)
+#define COS1_4 FIXHR(0.78815462345125022473/2)
+#define COS1_5 FIXHR(1.06067768599034747134/4)
+#define COS1_6 FIXHR(1.72244709823833392782/4)
+#define COS1_7 FIXHR(5.10114861868916385802/16)
+
+#define COS2_0 FIXHR(0.50979557910415916894/2)
+#define COS2_1 FIXHR(0.60134488693504528054/2)
+#define COS2_2 FIXHR(0.89997622313641570463/2)
+#define COS2_3 FIXHR(2.56291544774150617881/8)
+
+#define COS3_0 FIXHR(0.54119610014619698439/2)
+#define COS3_1 FIXHR(1.30656296487637652785/4)
+
+#define COS4_0 FIXHR(0.70710678118654752439/2)
+
+/* butterfly operator */
+/* BF: val<a> = val<a> + val<b>; val<b> = (val<a> - val<b>) * c * 2^s.
+ * Uses the locals tmp0/tmp1 of the enclosing function. */
+#define BF(a, b, c, s)\
+{\
+    tmp0 = val##a + val##b;\
+    tmp1 = val##a - val##b;\
+    val##a = tmp0;\
+    val##b = MULH3(tmp1, c, 1<<(s));\
+}
+
+/* BF0: same butterfly, but the operands are read from the input array
+ * tab[] and the results initialise val<a> / val<b>. */
+#define BF0(a, b, c, s)\
+{\
+    tmp0 = tab[a] + tab[b];\
+    tmp1 = tab[a] - tab[b];\
+    val##a = tmp0;\
+    val##b = MULH3(tmp1, c, 1<<(s));\
+}
+
+/* BF1: two COS4_0 butterflies on (a,b) and (c,d), then val<c> += val<d>. */
+#define BF1(a, b, c, d)\
+{\
+    BF(a, b, COS4_0, 1);\
+    BF(c, d,-COS4_0, 1);\
+    val##c += val##d;\
+}
+
+/* BF2: as BF1, followed by the additional cross additions below. */
+#define BF2(a, b, c, d)\
+{\
+    BF(a, b, COS4_0, 1);\
+    BF(c, d,-COS4_0, 1);\
+    val##c += val##d;\
+    val##a += val##c;\
+    val##c += val##b;\
+    val##b += val##d;\
+}
+
+/* ADD: val<a> += val<b> */
+#define ADD(a, b) val##a += val##b
+
+/* DCT32 without 1/sqrt(2) coef zero scaling. */
+/**
+ * Fully unrolled 32-point DCT.
+ *
+ * @param out receives the 32 output values
+ * @param tab the 32 input values
+ *
+ * Every tab[] element is read by one of the BF0 butterflies (the "pass 1"
+ * groups) before the first store to out[], so out and tab may be the same
+ * array (dct32_func in dct.c calls it that way).
+ * The passes are interleaved rather than run strictly one after another;
+ * the statement order matters because the butterflies update the val<N>
+ * locals in place.
+ */
+void dct32(INTFLOAT *out, const INTFLOAT *tab)
+{
+    INTFLOAT tmp0, tmp1;
+
+    INTFLOAT val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7 ,
+             val8 , val9 , val10, val11, val12, val13, val14, val15,
+             val16, val17, val18, val19, val20, val21, val22, val23,
+             val24, val25, val26, val27, val28, val29, val30, val31;
+
+    /* pass 1 */
+    BF0( 0, 31, COS0_0 , 1);
+    BF0(15, 16, COS0_15, 5);
+    /* pass 2 */
+    BF( 0, 15, COS1_0 , 1);
+    BF(16, 31,-COS1_0 , 1);
+    /* pass 1 */
+    BF0( 7, 24, COS0_7 , 1);
+    BF0( 8, 23, COS0_8 , 1);
+    /* pass 2 */
+    BF( 7,  8, COS1_7 , 4);
+    BF(23, 24,-COS1_7 , 4);
+    /* pass 3 */
+    BF( 0,  7, COS2_0 , 1);
+    BF( 8, 15,-COS2_0 , 1);
+    BF(16, 23, COS2_0 , 1);
+    BF(24, 31,-COS2_0 , 1);
+    /* pass 1 */
+    BF0( 3, 28, COS0_3 , 1);
+    BF0(12, 19, COS0_12, 2);
+    /* pass 2 */
+    BF( 3, 12, COS1_3 , 1);
+    BF(19, 28,-COS1_3 , 1);
+    /* pass 1 */
+    BF0( 4, 27, COS0_4 , 1);
+    BF0(11, 20, COS0_11, 2);
+    /* pass 2 */
+    BF( 4, 11, COS1_4 , 1);
+    BF(20, 27,-COS1_4 , 1);
+    /* pass 3 */
+    BF( 3,  4, COS2_3 , 3);
+    BF(11, 12,-COS2_3 , 3);
+    BF(19, 20, COS2_3 , 3);
+    BF(27, 28,-COS2_3 , 3);
+    /* pass 4 */
+    BF( 0,  3, COS3_0 , 1);
+    BF( 4,  7,-COS3_0 , 1);
+    BF( 8, 11, COS3_0 , 1);
+    BF(12, 15,-COS3_0 , 1);
+    BF(16, 19, COS3_0 , 1);
+    BF(20, 23,-COS3_0 , 1);
+    BF(24, 27, COS3_0 , 1);
+    BF(28, 31,-COS3_0 , 1);
+
+
+
+    /* pass 1 */
+    BF0( 1, 30, COS0_1 , 1);
+    BF0(14, 17, COS0_14, 3);
+    /* pass 2 */
+    BF( 1, 14, COS1_1 , 1);
+    BF(17, 30,-COS1_1 , 1);
+    /* pass 1 */
+    BF0( 6, 25, COS0_6 , 1);
+    BF0( 9, 22, COS0_9 , 1);
+    /* pass 2 */
+    BF( 6,  9, COS1_6 , 2);
+    BF(22, 25,-COS1_6 , 2);
+    /* pass 3 */
+    BF( 1,  6, COS2_1 , 1);
+    BF( 9, 14,-COS2_1 , 1);
+    BF(17, 22, COS2_1 , 1);
+    BF(25, 30,-COS2_1 , 1);
+
+    /* pass 1 */
+    BF0( 2, 29, COS0_2 , 1);
+    BF0(13, 18, COS0_13, 3);
+    /* pass 2 */
+    BF( 2, 13, COS1_2 , 1);
+    BF(18, 29,-COS1_2 , 1);
+    /* pass 1 */
+    BF0( 5, 26, COS0_5 , 1);
+    BF0(10, 21, COS0_10, 1);
+    /* pass 2 */
+    BF( 5, 10, COS1_5 , 2);
+    BF(21, 26,-COS1_5 , 2);
+    /* pass 3 */
+    BF( 2,  5, COS2_2 , 1);
+    BF(10, 13,-COS2_2 , 1);
+    BF(18, 21, COS2_2 , 1);
+    BF(26, 29,-COS2_2 , 1);
+    /* pass 4 */
+    BF( 1,  2, COS3_1 , 2);
+    BF( 5,  6,-COS3_1 , 2);
+    BF( 9, 10, COS3_1 , 2);
+    BF(13, 14,-COS3_1 , 2);
+    BF(17, 18, COS3_1 , 2);
+    BF(21, 22,-COS3_1 , 2);
+    BF(25, 26, COS3_1 , 2);
+    BF(29, 30,-COS3_1 , 2);
+
+    /* pass 5 */
+    BF1( 0,  1,  2,  3);
+    BF2( 4,  5,  6,  7);
+    BF1( 8,  9, 10, 11);
+    BF2(12, 13, 14, 15);
+    BF1(16, 17, 18, 19);
+    BF2(20, 21, 22, 23);
+    BF1(24, 25, 26, 27);
+    BF2(28, 29, 30, 31);
+
+    /* pass 6 */
+
+    /* chained additions: each ADD uses the value updated by the previous one */
+    ADD( 8, 12);
+    ADD(12, 10);
+    ADD(10, 14);
+    ADD(14,  9);
+    ADD( 9, 13);
+    ADD(13, 11);
+    ADD(11, 15);
+
+    /* even-indexed outputs come straight from val0..val15 */
+    out[ 0] = val0;
+    out[16] = val1;
+    out[ 8] = val2;
+    out[24] = val3;
+    out[ 4] = val4;
+    out[20] = val5;
+    out[12] = val6;
+    out[28] = val7;
+    out[ 2] = val8;
+    out[18] = val9;
+    out[10] = val10;
+    out[26] = val11;
+    out[ 6] = val12;
+    out[22] = val13;
+    out[14] = val14;
+    out[30] = val15;
+
+    ADD(24, 28);
+    ADD(28, 26);
+    ADD(26, 30);
+    ADD(30, 25);
+    ADD(25, 29);
+    ADD(29, 27);
+    ADD(27, 31);
+
+    /* odd-indexed outputs are sums of pairs from val16..val31 */
+    out[ 1] = val16 + val24;
+    out[17] = val17 + val25;
+    out[ 9] = val18 + val26;
+    out[25] = val19 + val27;
+    out[ 5] = val20 + val28;
+    out[21] = val21 + val29;
+    out[13] = val22 + val30;
+    out[29] = val23 + val31;
+    out[ 3] = val24 + val20;
+    out[19] = val25 + val21;
+    out[11] = val26 + val22;
+    out[27] = val27 + val23;
+    out[ 7] = val28 + val18;
+    out[23] = val29 + val19;
+    out[15] = val30 + val17;
+    out[31] = val31;
+}
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/dct32.h b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.h
new file mode 100644
index 00000000..dc2d847a
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.h
@@ -0,0 +1,10 @@
+#ifndef DCT_32_H
+#define DCT_32_H
+
+/* Float configuration of the dct32 helpers: FIXHR() only casts to float. */
+#define FIXHR(x)       ((float)(x))
+/* Plain product of the three arguments (s * y * x). */
+#define MULH3(x, y, s) ((s)*(y)*(x))
+/* Sample type used by dct32(); float in this configuration. */
+#define INTFLOAT float
+
+void dct32(INTFLOAT *out, const INTFLOAT *tab);
+
+#endif
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/fft.c b/plugins/supereq/ffmpeg_fft/libavcodec/fft.c
new file mode 100644
index 00000000..04082bf4
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/fft.c
@@ -0,0 +1,300 @@
+/*
+ * FFT/IFFT transforms
+ * Copyright (c) 2008 Loren Merritt
+ * Copyright (c) 2002 Fabrice Bellard
+ * Partly based on libdjbfft by D. J. Bernstein
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * FFT/IFFT transforms.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "libavutil/mathematics.h"
+#include "fft.h"
+
+/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */
+/* Storage for the tables is defined here only when they are filled at run
+ * time by ff_init_ff_cos_tabs(); the COSTABLE() macro comes from fft.h. */
+#if !CONFIG_HARDCODED_TABLES
+COSTABLE(16);
+COSTABLE(32);
+COSTABLE(64);
+COSTABLE(128);
+COSTABLE(256);
+COSTABLE(512);
+COSTABLE(1024);
+COSTABLE(2048);
+COSTABLE(4096);
+COSTABLE(8192);
+COSTABLE(16384);
+COSTABLE(32768);
+COSTABLE(65536);
+#endif
+/* Indexed by log2 of the table size: slots 0..3 are NULL, slot 4 is
+ * ff_cos_16 and slot 16 is ff_cos_65536. */
+COSTABLE_CONST FFTSample * const ff_cos_tabs[] = {
+    NULL, NULL, NULL, NULL,
+    ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024,
+    ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536,
+};
+
+/*
+ * Recursively compute the split-radix permutation value for position i of
+ * an n-point transform. The result may be negative; ff_fft_init() negates
+ * it and masks it with n-1 before using it as a table index.
+ *
+ * i       position being mapped
+ * n       transform size at the current recursion level
+ * inverse selects which of the two odd branches gets +1 and which gets -1
+ */
+static int split_radix_permutation(int i, int n, int inverse)
+{
+    int half, quarter, scaled;
+
+    /* Base case: only the lowest bit of i matters. */
+    if (n <= 2)
+        return i & 1;
+
+    half = n >> 1;
+    if ((i & half) == 0)
+        return 2 * split_radix_permutation(i, half, inverse);
+
+    quarter = half >> 1;
+    scaled  = 4 * split_radix_permutation(i, quarter, inverse);
+    return (inverse == !(i & quarter)) ? scaled + 1 : scaled - 1;
+}
+
+/*
+ * Fill ff_cos_tabs[index] at run time (no-op when the tables are
+ * hardcoded). The first quarter of the period is computed with cos() and
+ * then mirrored into the upper part of the table.
+ * The slot must be non-NULL, i.e. index is expected to be in 4..16.
+ */
+av_cold void ff_init_ff_cos_tabs(int index)
+{
+#if !CONFIG_HARDCODED_TABLES
+    const int size    = 1<<index;
+    const int quarter = size/4;
+    const double step = 2*M_PI/size;
+    FFTSample *table  = ff_cos_tabs[index];
+    int k;
+
+    /* cos(2*pi*k/size) for k = 0..size/4 inclusive */
+    for (k = 0; k <= quarter; k++)
+        table[k] = cos(k*step);
+    /* mirror entries 1..size/4-1 to positions size/2-1 .. size/4+1 */
+    for (k = 1; k < quarter; k++)
+        table[size/2 - k] = table[k];
+#endif
+}
+
+/**
+ * Set up a complex FFT context.
+ *
+ * Allocates the permutation table and the scratch buffer for 1 << nbits
+ * points, installs the C implementations (which the architecture-specific
+ * init hooks may replace), fills the cosine tables for sizes 16 up to
+ * 1 << nbits and builds the split-radix permutation table.
+ *
+ * @param s       context to initialize
+ * @param nbits   log2 of the transform length; accepted range is 2..16
+ * @param inverse stored in the context and used to pick the permutation
+ * @return 0 on success, -1 if nbits is out of range or an allocation fails
+ *         (both buffers are released on failure)
+ */
+av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse)
+{
+    int i, j, n;
+
+    /* Clear both buffers up front: the fail path frees them, and without
+     * this an early failure would free whatever garbage the caller's
+     * (possibly uninitialized) context happened to contain. */
+    s->revtab  = NULL;
+    s->tmp_buf = NULL;
+
+    if (nbits < 2 || nbits > 16)
+        goto fail;
+    s->nbits = nbits;
+    n = 1 << nbits;
+
+    s->revtab = av_malloc(n * sizeof(uint16_t));
+    if (!s->revtab)
+        goto fail;
+    s->tmp_buf = av_malloc(n * sizeof(FFTComplex));
+    if (!s->tmp_buf)
+        goto fail;
+    s->inverse = inverse;
+
+    s->fft_permute = ff_fft_permute_c;
+    s->fft_calc    = ff_fft_calc_c;
+#if CONFIG_MDCT
+    s->imdct_calc  = ff_imdct_calc_c;
+    s->imdct_half  = ff_imdct_half_c;
+    s->mdct_calc   = ff_mdct_calc_c;
+#endif
+
+    /* Let an architecture-specific backend override the C defaults. */
+#if ARCH_ARM
+    ff_fft_init_arm(s);
+#elif HAVE_ALTIVEC
+    if (HAVE_ALTIVEC) ff_fft_init_altivec(s);
+#elif HAVE_MMX
+    if (HAVE_MMX)     ff_fft_init_mmx(s);
+#endif
+
+    /* Cosine tables exist only for sizes >= 16 (index 4 and up). */
+    for(j=4; j<=nbits; j++) {
+        ff_init_ff_cos_tabs(j);
+    }
+    /* The permutation value can be negative; negate and wrap it into
+     * 0..n-1 to get the table slot. */
+    for(i=0; i<n; i++)
+        s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i;
+
+    return 0;
+ fail:
+    av_freep(&s->revtab);
+    av_freep(&s->tmp_buf);
+    return -1;
+}
+
+/*
+ * Reorder the 1 << s->nbits complex samples in z according to s->revtab:
+ * sample j is moved to position revtab[j]. The reordering is staged in
+ * s->tmp_buf and then copied back over z.
+ */
+void ff_fft_permute_c(FFTContext *s, FFTComplex *z)
+{
+    const int count      = 1 << s->nbits;
+    const uint16_t *perm = s->revtab;
+    FFTComplex *scratch  = s->tmp_buf;
+    int k;
+
+    /* TODO: handle split-radix permute in a more optimal way, probably in-place */
+    for (k = 0; k < count; k++)
+        scratch[perm[k]] = z[k];
+
+    memcpy(z, scratch, count * sizeof(FFTComplex));
+}
+
+/*
+ * Release the two buffers owned by an FFT context (scratch buffer and
+ * permutation table). av_freep() receives the address of each field.
+ */
+av_cold void ff_fft_end(FFTContext *s)
+{
+    av_freep(&s->tmp_buf);
+    av_freep(&s->revtab);
+}
+
+/* M_SQRT1_2 narrowed to float; used as the twiddle value in fft8/fft16. */
+#define sqrthalf (float)M_SQRT1_2
+
+/* Basic butterfly: x receives the difference a-b, y receives the sum a+b.
+ * Arguments are pasted unparenthesized and x is written before y is
+ * computed, so the argument order at each call site matters. */
+#define BF(x,y,a,b) {\
+    x = a - b;\
+    y = a + b;\
+}
+
+/* Combine four complex values using t1, t2, t5 and t6 as inputs.
+ * Relies on t1..t6 being declared in the enclosing scope; t3 and t4 are
+ * overwritten as scratch. */
+#define BUTTERFLIES(a0,a1,a2,a3) {\
+    BF(t3, t5, t5, t1);\
+    BF(a2.re, a0.re, a0.re, t5);\
+    BF(a3.im, a1.im, a1.im, t3);\
+    BF(t4, t6, t2, t6);\
+    BF(a3.re, a1.re, a1.re, t4);\
+    BF(a2.im, a0.im, a0.im, t6);\
+}
+
+// force loading all the inputs before storing any.
+// this is slightly slower for small data, but avoids store->load aliasing
+// for addresses separated by large powers of 2.
+#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\
+    FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\
+    BF(t3, t5, t5, t1);\
+    BF(a2.re, a0.re, r0, t5);\
+    BF(a3.im, a1.im, i1, t3);\
+    BF(t4, t6, t2, t6);\
+    BF(a3.re, a1.re, r1, t4);\
+    BF(a2.im, a0.im, i0, t6);\
+}
+
+/* Rotate a2 by (wre - i*wim) into (t1,t2) and a3 by (wre + i*wim) into
+ * (t5,t6), then run BUTTERFLIES on the four values. Uses whichever
+ * BUTTERFLIES definition is active at the point of expansion. */
+#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\
+    t1 = a2.re * wre + a2.im * wim;\
+    t2 = a2.im * wre - a2.re * wim;\
+    t5 = a3.re * wre - a3.im * wim;\
+    t6 = a3.im * wre + a3.re * wim;\
+    BUTTERFLIES(a0,a1,a2,a3)\
+}
+
+/* Same as TRANSFORM but without any rotation: a2 and a3 are used as-is. */
+#define TRANSFORM_ZERO(a0,a1,a2,a3) {\
+    t1 = a2.re;\
+    t2 = a2.im;\
+    t5 = a3.re;\
+    t6 = a3.im;\
+    BUTTERFLIES(a0,a1,a2,a3)\
+}
+
+/* z[0...8n-1], w[1...2n-1] */
+/* PASS(name) defines a static function name(z, wre, n) that combines the
+ * four quarters of z (offsets 0, 2n, 4n and 6n) two elements at a time.
+ * wre is read forwards while wim starts at wre+2n and is read backwards.
+ * n is decremented once before the do/while loop, which then runs until
+ * --n reaches 0, so callers must pass n >= 2 (n is unsigned; n == 1 would
+ * wrap around). */
+#define PASS(name)\
+static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\
+{\
+    FFTSample t1, t2, t3, t4, t5, t6;\
+    int o1 = 2*n;\
+    int o2 = 4*n;\
+    int o3 = 6*n;\
+    const FFTSample *wim = wre+o1;\
+    n--;\
+\
+    TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\
+    TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
+    do {\
+        z += 2;\
+        wre += 2;\
+        wim -= 2;\
+        TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\
+        TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\
+    } while(--n);\
+}
+
+/* pass() is built with the plain BUTTERFLIES; pass_big() is the same code
+ * built with BUTTERFLIES redefined to BUTTERFLIES_BIG. */
+PASS(pass)
+#undef BUTTERFLIES
+#define BUTTERFLIES BUTTERFLIES_BIG
+PASS(pass_big)
+
+/* DECL_FFT(n,n2,n4) defines fft<n>(z) in terms of smaller kernels: one
+ * fft<n2> on the start of z, two fft<n4> on the blocks at z+2*n4 and
+ * z+3*n4, followed by a combining pass() with the ff_cos_<n> table.
+ * "pass" is resolved at expansion time, so a later #define of pass changes
+ * which function the generated code calls. */
+#define DECL_FFT(n,n2,n4)\
+static void fft##n(FFTComplex *z)\
+{\
+    fft##n2(z);\
+    fft##n4(z+n4*2);\
+    fft##n4(z+n4*3);\
+    pass(z,ff_cos_##n,n4/2);\
+}
+
+/* Size-4 kernel: combines z[0..3] in place with the BF() sum/difference
+ * macro. t1..t8 are scratch values; the statement order matters because
+ * later BF() calls consume values produced by earlier ones. */
+static void fft4(FFTComplex *z)
+{
+    FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
+
+    BF(t3, t1, z[0].re, z[1].re);
+    BF(t8, t6, z[3].re, z[2].re);
+    BF(z[2].re, z[0].re, t1, t6);
+    BF(t4, t2, z[0].im, z[1].im);
+    BF(t7, t5, z[2].im, z[3].im);
+    BF(z[3].im, z[1].im, t4, t8);
+    BF(z[3].re, z[1].re, t3, t7);
+    BF(z[2].im, z[0].im, t2, t5);
+}
+
+/* Size-8 kernel: runs fft4() on z[0..3], folds z[4..7] into the even
+ * entries with BF(), then finishes z[1], z[3], z[5], z[7] with TRANSFORM()
+ * using sqrthalf for both twiddle components. */
+static void fft8(FFTComplex *z)
+{
+    FFTSample t1, t2, t3, t4, t5, t6, t7, t8;
+
+    fft4(z);
+
+    /* Passing a negated operand as b turns BF's a-b / a+b into a sum and a
+     * difference respectively. */
+    BF(t1, z[5].re, z[4].re, -z[5].re);
+    BF(t2, z[5].im, z[4].im, -z[5].im);
+    BF(t3, z[7].re, z[6].re, -z[7].re);
+    BF(t4, z[7].im, z[6].im, -z[7].im);
+    BF(t8, t1, t3, t1);
+    BF(t7, t2, t2, t4);
+    BF(z[4].re, z[0].re, z[0].re, t1);
+    BF(z[4].im, z[0].im, z[0].im, t2);
+    BF(z[6].re, z[2].re, z[2].re, t7);
+    BF(z[6].im, z[2].im, z[2].im, t8);
+
+    TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf);
+}
+
+/* Without CONFIG_SMALL, fft16 is written out by hand: one fft8 plus two
+ * fft4 followed by four TRANSFORM steps. With CONFIG_SMALL it is generated
+ * by DECL_FFT like the larger sizes. */
+#if !CONFIG_SMALL
+static void fft16(FFTComplex *z)
+{
+    FFTSample t1, t2, t3, t4, t5, t6;
+
+    fft8(z);
+    fft4(z+8);
+    fft4(z+12);
+
+    TRANSFORM_ZERO(z[0],z[4],z[8],z[12]);
+    TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf);
+    TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]);
+    TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]);
+}
+#else
+DECL_FFT(16,8,4)
+#endif
+DECL_FFT(32,16,8)
+DECL_FFT(64,32,16)
+DECL_FFT(128,64,32)
+DECL_FFT(256,128,64)
+DECL_FFT(512,256,128)
+/* From here on, "pass" inside DECL_FFT expands to pass_big unless
+ * CONFIG_SMALL is set, so sizes >= 1024 use the BUTTERFLIES_BIG variant. */
+#if !CONFIG_SMALL
+#define pass pass_big
+#endif
+DECL_FFT(1024,512,256)
+DECL_FFT(2048,1024,512)
+DECL_FFT(4096,2048,1024)
+DECL_FFT(8192,4096,2048)
+DECL_FFT(16384,8192,4096)
+DECL_FFT(32768,16384,8192)
+DECL_FFT(65536,32768,16384)
+
+/* Kernel table indexed by nbits-2 (see ff_fft_calc_c): entry 0 is fft4,
+ * the last entry is fft65536. */
+static void (* const fft_dispatch[])(FFTComplex*) = {
+    fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024,
+    fft2048, fft4096, fft8192, fft16384, fft32768, fft65536,
+};
+
+/*
+ * C implementation of the FFT: picks the kernel for s->nbits from
+ * fft_dispatch (slot nbits-2) and runs it in place on z.
+ */
+void ff_fft_calc_c(FFTContext *s, FFTComplex *z)
+{
+    void (*const kernel)(FFTComplex*) = fft_dispatch[s->nbits-2];
+
+    kernel(z);
+}
+
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/fft.h b/plugins/supereq/ffmpeg_fft/libavcodec/fft.h
new file mode 100644
index 00000000..b2e0f540
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/fft.h
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
+ * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_FFT_H
+#define AVCODEC_FFT_H
+
+#include <stdint.h>
+#include "../config.h"
+#include "libavutil/mem.h"
+#include "avfft.h"
+
+/* FFT computation */
+
+/* State shared by the complex FFT and (when enabled) the MDCT code.
+ * ff_fft_init() fills nbits, inverse, revtab, tmp_buf and the function
+ * pointers; the mdct_*, tcos, tsin and permutation fields are not touched
+ * by the FFT code in fft.c. */
+struct FFTContext {
+    int nbits;            /* log2 of the transform length */
+    int inverse;          /* value passed to ff_fft_init() */
+    uint16_t *revtab;     /* permutation table, 1 << nbits entries */
+    FFTComplex *tmp_buf;  /* scratch buffer used by ff_fft_permute_c() */
+    int mdct_size; /* size of MDCT (i.e. number of input data * 2) */
+    int mdct_bits; /* n = 2^nbits */
+    /* pre/post rotation tables */
+    FFTSample *tcos;
+    FFTSample *tsin;
+    /* Implementation hooks; the inline ff_* wrappers below call these. */
+    void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
+    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
+    void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
+    void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
+    void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input);
+    int permutation;      /* one of the FF_MDCT_PERM_* values below */
+#define FF_MDCT_PERM_NONE       0
+#define FF_MDCT_PERM_INTERLEAVE 1
+};
+
+/* Tables are const when they are hardcoded at build time and writable when
+ * they are filled in at run time. */
+#if CONFIG_HARDCODED_TABLES
+#define COSTABLE_CONST const
+#define SINTABLE_CONST const
+#define SINETABLE_CONST const
+#else
+#define COSTABLE_CONST
+#define SINTABLE_CONST
+#define SINETABLE_CONST
+#endif
+
+/* Declarators for the lookup tables: ff_cos_<size> and ff_sin_<size> hold
+ * size/2 FFTSample entries, ff_sine_<size> holds size floats.
+ * DECLARE_ALIGNED is defined elsewhere; presumably the 16 requests 16-byte
+ * alignment -- confirm against libavutil/mem.h. */
+#define COSTABLE(size) \
+    COSTABLE_CONST DECLARE_ALIGNED(16, FFTSample, ff_cos_##size)[size/2]
+#define SINTABLE(size) \
+    SINTABLE_CONST DECLARE_ALIGNED(16, FFTSample, ff_sin_##size)[size/2]
+#define SINETABLE(size) \
+    SINETABLE_CONST DECLARE_ALIGNED(16, float, ff_sine_##size)[size]
+extern COSTABLE(16);
+extern COSTABLE(32);
+extern COSTABLE(64);
+extern COSTABLE(128);
+extern COSTABLE(256);
+extern COSTABLE(512);
+extern COSTABLE(1024);
+extern COSTABLE(2048);
+extern COSTABLE(4096);
+extern COSTABLE(8192);
+extern COSTABLE(16384);
+extern COSTABLE(32768);
+extern COSTABLE(65536);
+extern COSTABLE_CONST FFTSample* const ff_cos_tabs[17];
+
+/**
+ * Initialize the cosine table in ff_cos_tabs[index]
+ * \param index index in ff_cos_tabs array of the table to initialize
+ */
+void ff_init_ff_cos_tabs(int index);
+
+extern SINTABLE(16);
+extern SINTABLE(32);
+extern SINTABLE(64);
+extern SINTABLE(128);
+extern SINTABLE(256);
+extern SINTABLE(512);
+extern SINTABLE(1024);
+extern SINTABLE(2048);
+extern SINTABLE(4096);
+extern SINTABLE(8192);
+extern SINTABLE(16384);
+extern SINTABLE(32768);
+extern SINTABLE(65536);
+
+/**
+ * Set up a complex FFT.
+ * @param nbits           log2 of the length of the input array
+ * @param inverse         if 0 perform the forward transform, if 1 perform the inverse
+ */
+int ff_fft_init(FFTContext *s, int nbits, int inverse);
+void ff_fft_permute_c(FFTContext *s, FFTComplex *z);
+void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
+
+void ff_fft_init_altivec(FFTContext *s);
+void ff_fft_init_mmx(FFTContext *s);
+void ff_fft_init_arm(FFTContext *s);
+void ff_dct_init_mmx(DCTContext *s);
+
+/**
+ * Do the permutation needed BEFORE calling ff_fft_calc().
+ * Dispatches through s->fft_permute, which ff_fft_init() sets to
+ * ff_fft_permute_c before the architecture-specific init hooks run.
+ */
+static inline void ff_fft_permute(FFTContext *s, FFTComplex *z)
+{
+    s->fft_permute(s, z);
+}
+/**
+ * Do a complex FFT with the parameters defined in ff_fft_init(). The
+ * input data must be permuted before. No 1.0/sqrt(n) normalization is done.
+ * Dispatches through s->fft_calc; the transform is performed in place on z.
+ */
+static inline void ff_fft_calc(FFTContext *s, FFTComplex *z)
+{
+    s->fft_calc(s, z);
+}
+void ff_fft_end(FFTContext *s);
+
+/* MDCT computation */
+
+/* Dispatch through s->imdct_calc. ff_fft_init() assigns that pointer only
+ * when CONFIG_MDCT is enabled.
+ * NOTE(review): confirm whether ff_mdct_init() also sets it. */
+static inline void ff_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    s->imdct_calc(s, output, input);
+}
+/* Dispatch through s->imdct_half. ff_fft_init() assigns that pointer only
+ * when CONFIG_MDCT is enabled. */
+static inline void ff_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    s->imdct_half(s, output, input);
+}
+
+/* Dispatch through s->mdct_calc. ff_fft_init() assigns that pointer only
+ * when CONFIG_MDCT is enabled. */
+static inline void ff_mdct_calc(FFTContext *s, FFTSample *output,
+                                const FFTSample *input)
+{
+    s->mdct_calc(s, output, input);
+}
+
+/**
+ * Maximum window size for ff_kbd_window_init.
+ */
+#define FF_KBD_WINDOW_MAX 1024
+
+/**
+ * Generate a Kaiser-Bessel Derived Window.
+ * @param   window  pointer to half window
+ * @param   alpha   determines window shape
+ * @param   n       size of half window, max FF_KBD_WINDOW_MAX
+ */
+void ff_kbd_window_init(float *window, float alpha, int n);
+
+/**
+ * Generate a sine window.
+ * @param   window  pointer to half window
+ * @param   n       size of half window
+ */
+void ff_sine_window_init(float *window, int n);
+
+/**
+ * initialize the specified entry of ff_sine_windows
+ */
+void ff_init_ff_sine_windows(int index);
+extern SINETABLE(  32);
+extern SINETABLE(  64);
+extern SINETABLE( 128);
+extern SINETABLE( 256);
+extern SINETABLE( 512);
+extern SINETABLE(1024);
+extern SINETABLE(2048);
+extern SINETABLE(4096);
+extern SINETABLE_CONST float * const ff_sine_windows[13];
+
+int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale);
+void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input);
+void ff_mdct_end(FFTContext *s);
+
+/* Real Discrete Fourier Transform */
+
+/* State for the real DFT; all fields are filled by ff_rdft_init(). */
+struct RDFTContext {
+    int nbits;            /* log2 of the real transform length */
+    int inverse;          /* nonzero for the *_C2R transform types */
+    int sign_convention;  /* +1 or -1, applied to one output term */
+
+    /* pre/post rotation tables */
+    const FFTSample *tcos;
+    SINTABLE_CONST FFTSample *tsin;
+    FFTContext fft;       /* complex FFT initialized with nbits-1 */
+    void (*rdft_calc)(struct RDFTContext *s, FFTSample *z);
+};
+
+/**
+ * Set up a real FFT.
+ * @param nbits           log2 of the length of the input array
+ * @param trans           the type of transform
+ */
+int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans);
+void ff_rdft_end(RDFTContext *s);
+
+void ff_rdft_init_arm(RDFTContext *s);
+
+/* Run the real transform in place on data by dispatching through
+ * s->rdft_calc, which ff_rdft_init() sets to the C implementation before
+ * the optional ARM init hook runs. */
+static av_always_inline void ff_rdft_calc(RDFTContext *s, FFTSample *data)
+{
+    s->rdft_calc(s, data);
+}
+
+/* Discrete Cosine Transform */
+
+/* State for the DCT routines declared below; it embeds an RDFTContext.
+ * NOTE(review): the code that fills these fields (ff_dct_init) is not part
+ * of this header, so the per-field meaning should be confirmed there. */
+struct DCTContext {
+    int nbits;
+    int inverse;
+    RDFTContext rdft;
+    const float *costab;
+    FFTSample *csc2;
+    /* implementation hooks */
+    void (*dct_calc)(struct DCTContext *s, FFTSample *data);
+    void (*dct32)(FFTSample *out, const FFTSample *in);
+};
+
+/**
+ * Set up DCT.
+ * @param nbits           size of the input array:
+ *                        (1 << nbits)     for DCT-II, DCT-III and DST-I
+ *                        (1 << nbits) + 1 for DCT-I
+ *
+ * @note the first element of the input of DST-I is ignored
+ */
+int  ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType type);
+void ff_dct_calc(DCTContext *s, FFTSample *data);
+void ff_dct_end (DCTContext *s);
+
+#endif /* AVCODEC_FFT_H */
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/rdft.c b/plugins/supereq/ffmpeg_fft/libavcodec/rdft.c
new file mode 100644
index 00000000..fe6014fb
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavcodec/rdft.c
@@ -0,0 +1,137 @@
+/*
+ * (I)RDFT transforms
+ * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <stdlib.h>
+#include <math.h>
+#include "libavutil/mathematics.h"
+#include "fft.h"
+
+/**
+ * @file
+ * (Inverse) Real Discrete Fourier Transforms.
+ */
+
+/* sin(2*pi*x/n) for 0<=x<n/4, followed by n/2<=x<3n/4 */
+/* Storage is defined here only when the tables are filled at run time;
+ * ff_rdft_init() writes n/4 entries into one half of the selected table. */
+#if !CONFIG_HARDCODED_TABLES
+SINTABLE(16);
+SINTABLE(32);
+SINTABLE(64);
+SINTABLE(128);
+SINTABLE(256);
+SINTABLE(512);
+SINTABLE(1024);
+SINTABLE(2048);
+SINTABLE(4096);
+SINTABLE(8192);
+SINTABLE(16384);
+SINTABLE(32768);
+SINTABLE(65536);
+#endif
+/* Indexed by log2 of the table size: slots 0..3 are NULL, slot 4 is
+ * ff_sin_16 and slot 16 is ff_sin_65536. */
+SINTABLE_CONST FFTSample * const ff_sin_tabs[] = {
+    NULL, NULL, NULL, NULL,
+    ff_sin_16, ff_sin_32, ff_sin_64, ff_sin_128, ff_sin_256, ff_sin_512, ff_sin_1024,
+    ff_sin_2048, ff_sin_4096, ff_sin_8192, ff_sin_16384, ff_sin_32768, ff_sin_65536,
+};
+
+/** Map one real FFT into two parallel real even and odd FFTs. Then interleave
+ * the two real FFTs into one complex FFT. Unmangle the results.
+ * ref: http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM
+ *
+ * The transform is done in place on data, which holds 1 << s->nbits
+ * FFTSample values and is also accessed as an array of FFTComplex.
+ * Forward mode runs the complex FFT first and then the unpacking loop;
+ * inverse mode runs the loop first and the complex FFT last.
+ */
+static void ff_rdft_calc_c(RDFTContext* s, FFTSample* data)
+{
+    int i, i1, i2;
+    FFTComplex ev, od;
+    const int n = 1 << s->nbits;
+    const float k1 = 0.5;
+    /* 0.5 for the forward transform, -0.5 for the inverse one */
+    const float k2 = 0.5 - s->inverse;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+
+    if (!s->inverse) {
+        ff_fft_permute(&s->fft, (FFTComplex*)data);
+        ff_fft_calc(&s->fft, (FFTComplex*)data);
+    }
+    /* i=0 is a special case because of packing, the DC term is real, so we
+       are going to throw the N/2 term (also real) in with it. */
+    ev.re = data[0];
+    data[0] = ev.re+data[1];
+    data[1] = ev.re-data[1];
+    /* Each iteration processes the pair of complex entries starting at
+       data[i1] and its mirror at data[i2] = data[n - i1]. */
+    for (i = 1; i < (n>>2); i++) {
+        i1 = 2*i;
+        i2 = n-i1;
+        /* Separate even and odd FFTs */
+        ev.re =  k1*(data[i1  ]+data[i2  ]);
+        od.im = -k2*(data[i1  ]-data[i2  ]);
+        ev.im =  k1*(data[i1+1]-data[i2+1]);
+        od.re =  k2*(data[i1+1]+data[i2+1]);
+        /* Apply twiddle factors to the odd FFT and add to the even FFT */
+        data[i1  ] =  ev.re + od.re*tcos[i] - od.im*tsin[i];
+        data[i1+1] =  ev.im + od.im*tcos[i] + od.re*tsin[i];
+        data[i2  ] =  ev.re - od.re*tcos[i] + od.im*tsin[i];
+        data[i2+1] = -ev.im + od.im*tcos[i] + od.re*tsin[i];
+    }
+    /* After the loop i == n/4, so this scales data[n/2 + 1] by the sign
+       convention chosen in ff_rdft_init(). */
+    data[2*i+1]=s->sign_convention*data[2*i+1];
+    if (s->inverse) {
+        data[0] *= k1;
+        data[1] *= k1;
+        ff_fft_permute(&s->fft, (FFTComplex*)data);
+        ff_fft_calc(&s->fft, (FFTComplex*)data);
+    }
+}
+
+/**
+ * Set up a real FFT context.
+ *
+ * Records the transform parameters, initializes the embedded complex FFT
+ * for nbits-1, selects the cosine table and (when tables are not
+ * hardcoded) fills the n/4 sine entries the transform reads.
+ *
+ * @param s     context to initialize
+ * @param nbits log2 of the length of the input array; accepted range 4..16
+ * @param trans the type of transform
+ * @return 0 on success, -1 if nbits is out of range or the complex FFT
+ *         could not be initialized
+ */
+av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans)
+{
+    int n;
+    int i;
+    double theta;
+
+    s->nbits           = nbits;
+    s->inverse         = trans == IDFT_C2R || trans == DFT_C2R;
+    s->sign_convention = trans == IDFT_R2C || trans == DFT_C2R ? 1 : -1;
+
+    if (nbits < 4 || nbits > 16) {
+        return -1;
+    }
+
+    /* Only shift once nbits is known to be in range: 1 << nbits is
+     * undefined for negative or too-large shift counts. */
+    n     = 1 << nbits;
+    theta = (trans == DFT_R2C || trans == DFT_C2R ? -1 : 1)*2*M_PI/n;
+
+    if (ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C) < 0) {
+        return -1;
+    }
+
+    ff_init_ff_cos_tabs(nbits);
+    s->tcos = ff_cos_tabs[nbits];
+    /* The DFT_* transform types use the second half of the sine table. */
+    s->tsin = ff_sin_tabs[nbits]+(trans == DFT_R2C || trans == DFT_C2R)*(n>>2);
+#if !CONFIG_HARDCODED_TABLES
+    for (i = 0; i < (n>>2); i++) {
+        s->tsin[i] = sin(i*theta);
+    }
+#endif
+    s->rdft_calc   = ff_rdft_calc_c;
+
+#if ARCH_ARM
+    ff_rdft_init_arm(s);
+#endif
+
+    return 0;
+}
+
+/* Tear down a real FFT context by releasing the embedded complex FFT. */
+av_cold void ff_rdft_end(RDFTContext *s)
+{
+    ff_fft_end(&s->fft);
+}
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/attributes.h b/plugins/supereq/ffmpeg_fft/libavutil/attributes.h
new file mode 100644
index 00000000..50fbfc31
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/attributes.h
@@ -0,0 +1,122 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * Macro definitions for various function/variable attributes
+ */
+
+#ifndef AVUTIL_ATTRIBUTES_H
+#define AVUTIL_ATTRIBUTES_H
+
+/* Nonzero when the compiler defines __GNUC__ and reports at least version
+ * x.y; always 0 otherwise. Usable in #if expressions. Arguments and the
+ * && term are parenthesized so expression arguments expand as intended. */
+#ifdef __GNUC__
+#    define AV_GCC_VERSION_AT_LEAST(x,y) \
+        (__GNUC__ > (x) || (__GNUC__ == (x) && __GNUC_MINOR__ >= (y)))
+#else
+#    define AV_GCC_VERSION_AT_LEAST(x,y) 0
+#endif
+
+/* Each macro below is defined only if the including code has not already
+ * defined it, and falls back to an empty (or plain) expansion when the
+ * compiler check fails. */
+
+/* inline plus the always_inline attribute on GCC >= 3.1, else plain inline */
+#ifndef av_always_inline
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define av_always_inline __attribute__((always_inline)) inline
+#else
+#    define av_always_inline inline
+#endif
+#endif
+
+/* noinline attribute on GCC >= 3.1 */
+#ifndef av_noinline
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define av_noinline __attribute__((noinline))
+#else
+#    define av_noinline
+#endif
+#endif
+
+/* pure attribute on GCC >= 3.1 */
+#ifndef av_pure
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define av_pure __attribute__((pure))
+#else
+#    define av_pure
+#endif
+#endif
+
+/* const attribute on GCC >= 2.6 */
+#ifndef av_const
+#if AV_GCC_VERSION_AT_LEAST(2,6)
+#    define av_const __attribute__((const))
+#else
+#    define av_const
+#endif
+#endif
+
+/* cold attribute on GCC >= 4.3; skipped for ICC versions up to 1110 */
+#ifndef av_cold
+#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,3)
+#    define av_cold __attribute__((cold))
+#else
+#    define av_cold
+#endif
+#endif
+
+/* flatten attribute on GCC >= 4.1; skipped for ICC versions up to 1110 */
+#ifndef av_flatten
+#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,1)
+#    define av_flatten __attribute__((flatten))
+#else
+#    define av_flatten
+#endif
+#endif
+
+/* deprecated attribute on GCC >= 3.1 */
+#ifndef attribute_deprecated
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+#    define attribute_deprecated __attribute__((deprecated))
+#else
+#    define attribute_deprecated
+#endif
+#endif
+
+/* unused attribute on any compiler that defines __GNUC__ */
+#ifndef av_unused
+#if defined(__GNUC__)
+#    define av_unused __attribute__((unused))
+#else
+#    define av_unused
+#endif
+#endif
+
+/* may_alias attribute on GCC >= 3.3; skipped for ICC versions up to 1110 */
+#ifndef av_alias
+#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(3,3)
+#   define av_alias __attribute__((may_alias))
+#else
+#   define av_alias
+#endif
+#endif
+
+/* Expands to the self-assignment x=x on GCC (but not ICC), and to plain x
+ * elsewhere; intended for use in a declaration such as "int av_uninit(v);" */
+#ifndef av_uninit
+#if defined(__GNUC__) && !defined(__ICC)
+#    define av_uninit(x) x=x
+#else
+#    define av_uninit(x) x
+#endif
+#endif
+
+/* __builtin_constant_p where available, otherwise always 0 */
+#ifdef __GNUC__
+#    define av_builtin_constant_p __builtin_constant_p
+#else
+#    define av_builtin_constant_p(x) 0
+#endif
+
+#endif /* AVUTIL_ATTRIBUTES_H */
+
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/avconfig.h b/plugins/supereq/ffmpeg_fft/libavutil/avconfig.h
new file mode 100644
index 00000000..b028bb4f
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/avconfig.h
@@ -0,0 +1,5 @@
+/* Generated by ffconf */
+#ifndef AVUTIL_AVCONFIG_H
+#define AVUTIL_AVCONFIG_H
+#define AV_HAVE_BIGENDIAN 0
+#endif /* AVUTIL_AVCONFIG_H */
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/avutil.h b/plugins/supereq/ffmpeg_fft/libavutil/avutil.h
new file mode 100644
index 00000000..f5d364be
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/avutil.h
@@ -0,0 +1,90 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_AVUTIL_H
+#define AVUTIL_AVUTIL_H
+
+/**
+ * @file
+ * external API header
+ */
+
+
+/* AV_TOSTRING stringifies its argument as written; AV_STRINGIFY goes
+ * through one extra level so that macro arguments are expanded first. */
+#define AV_STRINGIFY(s)         AV_TOSTRING(s)
+#define AV_TOSTRING(s) #s
+
+/* AV_GLUE pastes two tokens as written; AV_JOIN expands macro arguments
+ * before pasting. */
+#define AV_GLUE(a, b) a ## b
+#define AV_JOIN(a, b) AV_GLUE(a, b)
+
+/* Emit a _Pragma with the stringified argument. */
+#define AV_PRAGMA(s) _Pragma(#s)
+
+/* Pack major/minor/micro into one integer: a in bits 16 and up, b in bits
+ * 8..15, c in the low byte. Arguments are parenthesized so that expression
+ * arguments are shifted as a whole. */
+#define AV_VERSION_INT(a, b, c) ((a)<<16 | (b)<<8 | (c))
+#define AV_VERSION_DOT(a, b, c) a ##.## b ##.## c
+#define AV_VERSION(a, b, c) AV_VERSION_DOT(a, b, c)
+
+#define LIBAVUTIL_VERSION_MAJOR 50
+#define LIBAVUTIL_VERSION_MINOR 21
+#define LIBAVUTIL_VERSION_MICRO  0
+
+#define LIBAVUTIL_VERSION_INT   AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
+                                               LIBAVUTIL_VERSION_MINOR, \
+                                               LIBAVUTIL_VERSION_MICRO)
+#define LIBAVUTIL_VERSION       AV_VERSION(LIBAVUTIL_VERSION_MAJOR,     \
+                                           LIBAVUTIL_VERSION_MINOR,     \
+                                           LIBAVUTIL_VERSION_MICRO)
+#define LIBAVUTIL_BUILD         LIBAVUTIL_VERSION_INT
+
+#define LIBAVUTIL_IDENT         "Lavu" AV_STRINGIFY(LIBAVUTIL_VERSION)
+
+/**
+ * Return the LIBAVUTIL_VERSION_INT constant.
+ */
+unsigned avutil_version(void);
+
+/**
+ * Return the libavutil build-time configuration.
+ */
+const char *avutil_configuration(void);
+
+/**
+ * Return the libavutil license.
+ */
+const char *avutil_license(void);
+
+/* Media type tags. UNKNOWN is -1 and the concrete types count up from 0,
+ * so AVMEDIA_TYPE_NB ends up equal to the number of concrete types (5). */
+enum AVMediaType {
+    AVMEDIA_TYPE_UNKNOWN = -1,
+    AVMEDIA_TYPE_VIDEO,
+    AVMEDIA_TYPE_AUDIO,
+    AVMEDIA_TYPE_DATA,
+    AVMEDIA_TYPE_SUBTITLE,
+    AVMEDIA_TYPE_ATTACHMENT,
+    AVMEDIA_TYPE_NB
+};
+
+#include "common.h"
+/* #include "error.h" */
+#include "mathematics.h"
+#include "rational.h"
+#include "intfloat_readwrite.h"
+/* #include "log.h" */
+/* #include "pixfmt.h" */
+
+#endif /* AVUTIL_AVUTIL_H */
+
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/common.h b/plugins/supereq/ffmpeg_fft/libavutil/common.h
new file mode 100644
index 00000000..9dff1435
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/common.h
@@ -0,0 +1,347 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * common internal and external API header
+ */
+
+#ifndef AVUTIL_COMMON_H
+#define AVUTIL_COMMON_H
+
+#include <ctype.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "attributes.h"
+
+/* NOTE: the function-like macros in this group expand their arguments more
+ * than once, so arguments with side effects (i++, function calls with
+ * state) must be avoided. */
+//rounded division & shift
+#define RSHIFT(a,b) ((a) > 0 ? ((a) + ((1<<(b))>>1))>>(b) : ((a) + ((1<<(b))>>1)-1)>>(b))
+/* assume b>0 */
+#define ROUNDED_DIV(a,b) (((a)>0 ? (a) + ((b)>>1) : (a) - ((b)>>1))/(b))
+/* absolute value, and sign as +1 / -1 (0 maps to -1) */
+#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
+#define FFSIGN(a) ((a) > 0 ? 1 : -1)
+
+/* maximum / minimum of two or three values */
+#define FFMAX(a,b) ((a) > (b) ? (a) : (b))
+#define FFMAX3(a,b,c) FFMAX(FFMAX(a,b),c)
+#define FFMIN(a,b) ((a) > (b) ? (b) : (a))
+#define FFMIN3(a,b,c) FFMIN(FFMIN(a,b),c)
+
+/* FFSWAP exchanges two lvalues of the given type through a temporary.
+ * FF_ARRAY_ELEMS is the element count of a true array (not a pointer).
+ * FFALIGN rounds x up to a multiple of a; the mask arithmetic is only
+ * correct when a is a power of two. */
+#define FFSWAP(type,a,b) do{type SWAP_tmp= b; b= a; a= SWAP_tmp;}while(0)
+#define FF_ARRAY_ELEMS(a) (sizeof(a) / sizeof((a)[0]))
+#define FFALIGN(x, a) (((x)+(a)-1)&~((a)-1))
+
+/* misc math functions */
+extern const uint8_t ff_log2_tab[256];
+
+extern const uint8_t av_reverse[256];
+
+/* Reduce v to a single byte by shifting out 16 and then 8 bits whenever
+ * the upper part is non-zero, and add the ff_log2_tab entry for the
+ * remaining byte to the number of bits shifted. */
+static inline av_const int av_log2_c(unsigned int v)
+{
+    int shifted = 0;
+
+    if ((v & 0xffff0000) != 0) {
+        shifted = 16;
+        v >>= 16;
+    }
+    if ((v & 0xff00) != 0) {
+        shifted += 8;
+        v >>= 8;
+    }
+
+    return shifted + ff_log2_tab[v];
+}
+
+/* Variant of av_log2_c for values that fit in 16 bits: only the upper
+ * byte is tested before the ff_log2_tab lookup, so v must be < 65536. */
+static inline av_const int av_log2_16bit_c(unsigned int v)
+{
+    int shifted = 0;
+
+    if ((v & 0xff00) != 0) {
+        shifted = 8;
+        v >>= 8;
+    }
+
+    return shifted + ff_log2_tab[v];
+}
+
+#ifdef HAVE_AV_CONFIG_H
+#   include "config.h"
+#   include "intmath.h"
+#endif
+
+/* Pull in unguarded fallback defines at the end of this file. */
+#include "common.h"
+
+/**
+ * Clip a signed integer value into the amin-amax range.
+ * @param a value to clip
+ * @param amin minimum value of the clip range
+ * @param amax maximum value of the clip range
+ * @return clipped value
+ */
+static inline av_const int av_clip_c(int a, int amin, int amax)
+{
+    /* the lower bound is checked first, exactly as before */
+    if (a < amin)
+        return amin;
+    return (a > amax) ? amax : a;
+}
+
+/**
+ * Clip a signed integer value into the 0-255 range.
+ * @param a value to clip
+ * @return clipped value
+ */
+static inline av_const uint8_t av_clip_uint8_c(int a)
+{
+    /* Any bit outside the low byte means a is negative or above 255.
+     * Pick the bound with a plain comparison: negating a overflows for
+     * INT_MIN and right-shifting a negative value is implementation-defined. */
+    if (a&(~0xFF)) return a < 0 ? 0 : 255;
+    else           return a;
+}
+
+/**
+ * Clip a signed integer value into the -128,127 range.
+ * @param a value to clip
+ * @return clipped value
+ */
+static inline av_const int8_t av_clip_int8_c(int a)
+{
+    /* non-zero when a + 128 does not fit in 8 bits, i.e. a is out of range */
+    const int overflow = (a+0x80) & ~0xFF;
+
+    /* (a>>31) ^ 0x7F picks the bound on the side of a's sign */
+    return overflow ? (a>>31) ^ 0x7F : a;
+}
+
+/**
+ * Clip a signed integer value into the 0-65535 range.
+ * @param a value to clip
+ * @return clipped value
+ */
+static inline av_const uint16_t av_clip_uint16_c(int a)
+{
+    /* Any bit outside the low 16 means a is negative or above 65535.
+     * Pick the bound with a plain comparison: negating a overflows for
+     * INT_MIN and right-shifting a negative value is implementation-defined. */
+    if (a&(~0xFFFF)) return a < 0 ? 0 : 65535;
+    else             return a;
+}
+
+/**
+ * Clip a signed integer value into the -32768,32767 range.
+ * @param a value to clip
+ * @return clipped value
+ */
+static inline av_const int16_t av_clip_int16_c(int a)
+{
+    /* non-zero when a + 32768 does not fit in 16 bits */
+    const int overflow = (a+0x8000) & ~0xFFFF;
+
+    /* (a>>31) ^ 0x7FFF picks the bound on the side of a's sign */
+    return overflow ? (a>>31) ^ 0x7FFF : a;
+}
+
+/**
+ * Clip a signed 64-bit integer value into the -2147483648,2147483647 range.
+ * @param a value to clip
+ * @return clipped value
+ */
+static inline av_const int32_t av_clipl_int32_c(int64_t a)
+{
+    /* in range when a + 2^31 has no bits above the low 32 */
+    if (((a+0x80000000u) & ~UINT64_C(0xFFFFFFFF)) == 0)
+        return a;
+
+    /* otherwise saturate towards the bound on the side of a's sign */
+    return (a>>63) ^ 0x7FFFFFFF;
+}
+
+/**
+ * Clip a float value into the amin-amax range.
+ * @param a value to clip
+ * @param amin minimum value of the clip range
+ * @param amax maximum value of the clip range
+ * @return clipped value
+ */
+static inline av_const float av_clipf_c(float a, float amin, float amax)
+{
+    /* the lower bound is checked first; a value that compares false on
+     * both tests is returned unchanged */
+    if (a < amin)
+        return amin;
+    return (a > amax) ? amax : a;
+}
+
+/** Compute ceil(log2(x)).
+ * @param x value used to compute ceil(log2(x))
+ * @return computed ceiling of log2(x)
+ */
+static inline av_const int av_ceil_log2_c(int x)
+{
+    return av_log2((x - 1U) << 1); /* unsigned arithmetic: (x - 1) << 1 left-shifts a negative int when x <= 0 */
+}
+
+#define MKTAG(a,b,c,d) ((a) | ((b) << 8) | ((c) << 16) | ((unsigned)(d) << 24)) /* a is the lowest byte; cast keeps d >= 0x80 out of int's sign bit */
+#define MKBETAG(a,b,c,d) ((d) | ((c) << 8) | ((b) << 16) | ((unsigned)(a) << 24)) /* a is the highest byte; cast keeps a >= 0x80 out of int's sign bit */
+
+/**
+ * Convert a UTF-8 character (up to 4 bytes) to its 32-bit UCS-4 encoded form.
+ *
+ * @param val      Output value, must be an lvalue of type uint32_t.
+ * @param GET_BYTE Expression reading one byte from the input.
+ *                 Evaluated up to 7 times (4 for the currently
+ *                 assigned Unicode range).  With a memory buffer
+ *                 input, this could be *ptr++.
+ * @param ERROR    Expression to be evaluated on invalid input,
+ *                 typically a goto statement.
+ */
+#define GET_UTF8(val, GET_BYTE, ERROR)\
+    val= GET_BYTE;\
+    {\
+        int ones= 7 - av_log2(val ^ 255); /* leading 1 bits of the first byte (assumes av_log2 gives the top set bit index) */\
+        if(ones==1) /* 10xxxxxx is a continuation byte, not a lead byte */\
+            ERROR\
+        val&= 127>>ones; /* keep only the payload bits of the lead byte */\
+        while(--ones > 0){ /* one pass per continuation byte */\
+            int tmp= GET_BYTE - 128;\
+            if(tmp>>6) /* nonzero unless the byte read was 0x80..0xBF */\
+                ERROR\
+            val= (val<<6) + tmp; /* append 6 payload bits */\
+        }\
+    }
+
+/**
+ * Convert a UTF-16 character (2 or 4 bytes) to its 32-bit UCS-4 encoded form.
+ *
+ * @param val       Output value, must be an lvalue of type uint32_t.
+ * @param GET_16BIT Expression returning two bytes of UTF-16 data converted
+ *                  to native byte order.  Evaluated one or two times.
+ * @param ERROR     Expression to be evaluated on invalid input,
+ *                  typically a goto statement.
+ */
+#define GET_UTF16(val, GET_16BIT, ERROR)\
+    val = GET_16BIT;\
+    {\
+        unsigned int hi = val - 0xD800;\
+        if (hi < 0x800) { /* first unit lies in the surrogate range 0xD800..0xDFFF */\
+            val = GET_16BIT - 0xDC00;\
+            if (val > 0x3FFU || hi > 0x3FFU) /* need 0xD800..0xDBFF followed by 0xDC00..0xDFFF */\
+                ERROR\
+            val += (hi<<10) + 0x10000; /* combine both 10-bit halves above the BMP */\
+        }\
+    }\
+
+/*!
+ * \def PUT_UTF8(val, tmp, PUT_BYTE)
+ * Convert a 32-bit Unicode character to its UTF-8 encoded form (up to 4 bytes long).
+ * \param val is an input-only argument and should be of type uint32_t. It holds
+ * a UCS-4 encoded Unicode character that is to be converted to UTF-8. If
+ * val is given as a function it is executed only once.
+ * \param tmp is a temporary variable and should be of type uint8_t. It
+ * represents an intermediate value during conversion that is to be
+ * output by PUT_BYTE.
+ * \param PUT_BYTE writes the converted UTF-8 bytes to any proper destination.
+ * It could be a function or a statement, and uses tmp as the input byte.
+ * For example, PUT_BYTE could be "*output++ = tmp;" PUT_BYTE will be
+ * executed up to 4 times for values in the valid UTF-8 range and up to
+ * 7 times in the general case, depending on the length of the converted
+ * Unicode character.
+ */
+#define PUT_UTF8(val, tmp, PUT_BYTE)\
+    {\
+        int bytes, shift;\
+        uint32_t in = val; /* evaluate val exactly once */\
+        if (in < 0x80) { /* single-byte (ASCII) case */\
+            tmp = in;\
+            PUT_BYTE\
+        } else {\
+            bytes = (av_log2(in) + 4) / 5; /* total number of bytes to emit */\
+            shift = (bytes - 1) * 6; /* bit position of the payload carried by the lead byte */\
+            tmp = (256 - (256 >> bytes)) | (in >> shift); /* lead byte: top `bytes` bits set, then the high payload bits */\
+            PUT_BYTE\
+            while (shift >= 6) {\
+                shift -= 6;\
+                tmp = 0x80 | ((in >> shift) & 0x3f); /* continuation byte 10xxxxxx */\
+                PUT_BYTE\
+            }\
+        }\
+    }
+
+/*!
+ * \def PUT_UTF16(val, tmp, PUT_16BIT)
+ * Convert a 32-bit Unicode character to its UTF-16 encoded form (2 or 4 bytes).
+ * \param val is an input-only argument and should be of type uint32_t. It holds
+ * a UCS-4 encoded Unicode character that is to be converted to UTF-16. If
+ * val is given as a function it is executed only once.
+ * \param tmp is a temporary variable and should be of type uint16_t. It
+ * represents an intermediate value during conversion that is to be
+ * output by PUT_16BIT.
+ * \param PUT_16BIT writes the converted UTF-16 data to any proper destination
+ * in desired endianness. It could be a function or a statement, and uses tmp
+ * as the input value.  For example, PUT_16BIT could be "*output++ = tmp;"
+ * PUT_16BIT will be executed 1 or 2 times depending on input character.
+ */
+#define PUT_UTF16(val, tmp, PUT_16BIT)\
+    {\
+        uint32_t in = val; /* evaluate val exactly once */\
+        if (in < 0x10000) { /* fits in a single 16-bit unit */\
+            tmp = in;\
+            PUT_16BIT\
+        } else {\
+            tmp = 0xD800 | ((in - 0x10000) >> 10); /* high surrogate: upper bits of the offset above 0x10000 */\
+            PUT_16BIT\
+            tmp = 0xDC00 | ((in - 0x10000) & 0x3FF); /* low surrogate: lower 10 bits of the offset */\
+            PUT_16BIT\
+        }\
+    }\
+
+
+
+#include "mem.h"
+
+#ifdef HAVE_AV_CONFIG_H
+#    include "internal.h"
+#endif /* HAVE_AV_CONFIG_H */
+
+#endif /* AVUTIL_COMMON_H */
+
+/*
+ * The following definitions are outside the multiple inclusion guard
+ * to ensure they are immediately available in intmath.h.
+ */
+
+#ifndef av_log2
+#   define av_log2       av_log2_c
+#endif
+#ifndef av_log2_16bit
+#   define av_log2_16bit av_log2_16bit_c
+#endif
+#ifndef av_ceil_log2
+#   define av_ceil_log2     av_ceil_log2_c
+#endif
+#ifndef av_clip
+#   define av_clip          av_clip_c
+#endif
+#ifndef av_clip_uint8
+#   define av_clip_uint8    av_clip_uint8_c
+#endif
+#ifndef av_clip_int8
+#   define av_clip_int8     av_clip_int8_c
+#endif
+#ifndef av_clip_uint16
+#   define av_clip_uint16   av_clip_uint16_c
+#endif
+#ifndef av_clip_int16
+#   define av_clip_int16    av_clip_int16_c
+#endif
+#ifndef av_clipl_int32
+#   define av_clipl_int32   av_clipl_int32_c
+#endif
+#ifndef av_clipf
+#   define av_clipf         av_clipf_c
+#endif
+
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/intfloat_readwrite.c b/plugins/supereq/ffmpeg_fft/libavutil/intfloat_readwrite.c
new file mode 100644
index 00000000..79fe1867
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/intfloat_readwrite.c
@@ -0,0 +1,98 @@
+/*
+ * portable IEEE float/double read/write functions
+ *
+ * Copyright (c) 2005 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * portable IEEE float/double read/write functions
+ */
+
+#include <stdint.h>
+#include <math.h>
+#include "intfloat_readwrite.h"
+
+double av_int2dbl(int64_t v){
+    if((uint64_t)v+v > 0xFFEULL<<52) /* NaN patterns: doubling drops the sign bit; unsigned so the sum cannot overflow */
+        return 0.0/0.0;
+    return ldexp(((v&((1LL<<52)-1)) + (1LL<<52)) * (v>>63|1), (v>>52&0x7FF)-1075); /* (mantissa + implicit 1) * sign, scaled by the unbiased exponent */
+}
+
+float av_int2flt(int32_t v){
+    if((uint32_t)v+v > 0xFF000000U) /* NaN patterns: doubling drops the sign bit; unsigned so the sum cannot overflow */
+        return 0.0/0.0;
+    return ldexp(((v&0x7FFFFF) + (1<<23)) * (v>>31|1), (v>>23&0xFF)-150); /* (mantissa + implicit 1) * sign, scaled by the unbiased exponent */
+}
+
+double av_ext2dbl(const AVExtFloat ext){
+    uint64_t m = 0;
+    int e, i;
+
+    for (i = 0; i < 8; i++)
+        m = (m<<8) + ext.mantissa[i]; /* mantissa bytes are stored most significant first */
+    e = (((int)ext.exponent[0]&0x7f)<<8) | ext.exponent[1]; /* 15-bit biased exponent; bit 7 of exponent[0] is the sign */
+    if (e == 0x7fff && m)
+        return 0.0/0.0;
+    e -= 16383 + 63;        /* In IEEE 80 bits, the whole (i.e. 1.xxxx)
+                             * mantissa bit is written as opposed to the
+                             * single and double precision formats. */
+    if (ext.exponent[0]&0x80)
+        return -ldexp(m, e); /* negate the result: m is unsigned, so -m would wrap to a large positive value */
+    return ldexp(m, e);
+}
+
+int64_t av_dbl2int(double d){
+    int e;
+    if     ( !d) return 0;
+    else if(d-d) return 0x7FF0000000000000ULL + ((uint64_t)(d<0)<<63) + (d!=d); /* d-d is NaN for inf/NaN inputs; unsigned so the sign bit shift is defined */
+    d= frexp(d, &e);
+    return (uint64_t)(d<0)<<63 | (uint64_t)(e+1022LL)<<52 | (uint64_t)((fabs(d)-0.5)*(1LL<<53)); /* sign | biased exponent | mantissa without its implicit bit */
+}
+
+int32_t av_flt2int(float d){
+    int e;
+    if     ( !d) return 0;
+    else if(d-d) return 0x7F800000U + ((uint32_t)(d<0)<<31) + (d!=d); /* d-d is NaN for inf/NaN inputs; unsigned so the sign bit shift is defined */
+    d= frexp(d, &e);
+    return (uint32_t)(d<0)<<31 | (uint32_t)(e+126)<<23 | (uint32_t)((fabs(d)-0.5)*(1<<24)); /* sign | biased exponent | mantissa without its implicit bit */
+}
+
+AVExtFloat av_dbl2ext(double d){
+    struct AVExtFloat ext= {{0}}; /* all-zero result is what a zero input returns (plus sign bit below) */
+    int e, i; double f; uint64_t m;
+
+    f = fabs(frexp(d, &e)); /* magnitude of the fraction; e receives the binary exponent */
+    if (f >= 0.5 && f < 1) { /* finite non-zero input */
+        e += 16382; /* apply the exponent bias */
+        ext.exponent[0] = e>>8;
+        ext.exponent[1] = e;
+        m = (uint64_t)ldexp(f, 64); /* scale the fraction to a 64-bit integer mantissa */
+        for (i=0; i < 8; i++)
+            ext.mantissa[i] = m>>(56-(i<<3)); /* store most significant byte first */
+    } else if (f != 0.0) { /* neither finite non-zero nor zero: infinity or NaN */
+        ext.exponent[0] = 0x7f; ext.exponent[1] = 0xff;
+        if (f != 1/0.0) /* not infinity, i.e. NaN */
+            ext.mantissa[0] = ~0;
+    }
+    if (d < 0)
+        ext.exponent[0] |= 0x80; /* sign lives in the top bit of the first exponent byte */
+    return ext;
+}
+
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/intfloat_readwrite.h b/plugins/supereq/ffmpeg_fft/libavutil/intfloat_readwrite.h
new file mode 100644
index 00000000..644b3e64
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/intfloat_readwrite.h
@@ -0,0 +1,41 @@
+/*
+ * copyright (c) 2005 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_INTFLOAT_READWRITE_H
+#define AVUTIL_INTFLOAT_READWRITE_H
+
+#include <stdint.h>
+#include "attributes.h"
+
+/* IEEE 80 bits extended float */
+typedef struct AVExtFloat  {
+    uint8_t exponent[2];
+    uint8_t mantissa[8];
+} AVExtFloat;
+
+double av_int2dbl(int64_t v) av_const;
+float av_int2flt(int32_t v) av_const;
+double av_ext2dbl(const AVExtFloat ext) av_const;
+int64_t av_dbl2int(double d) av_const;
+int32_t av_flt2int(float d) av_const;
+AVExtFloat av_dbl2ext(double d) av_const;
+
+#endif /* AVUTIL_INTFLOAT_READWRITE_H */
+
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/mathematics.c b/plugins/supereq/ffmpeg_fft/libavutil/mathematics.c
new file mode 100644
index 00000000..c6851cb7
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/mathematics.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2005 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * miscellaneous math routines and tables
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <limits.h>
+#include "mathematics.h"
+
+const uint8_t ff_sqrt_tab[256]={ /* NOTE(review): entries look like 16*sqrt(i) rounded up and capped at 255 - confirm against users */
+  0, 16, 23, 28, 32, 36, 40, 43, 46, 48, 51, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 77, 79, 80, 82, 84, 85, 87, 88, 90,
+ 91, 92, 94, 95, 96, 98, 99,100,102,103,104,105,107,108,109,110,111,112,114,115,116,117,118,119,120,121,122,123,124,125,126,127,
+128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,144,145,146,147,148,149,150,151,151,152,153,154,155,156,156,
+157,158,159,160,160,161,162,163,164,164,165,166,167,168,168,169,170,171,171,172,173,174,174,175,176,176,177,178,179,179,180,181,
+182,182,183,184,184,185,186,186,187,188,188,189,190,190,191,192,192,193,194,194,195,196,196,197,198,198,199,200,200,201,202,202,
+203,204,204,205,205,206,207,207,208,208,209,210,210,211,212,212,213,213,214,215,215,216,216,217,218,218,219,219,220,220,221,222,
+222,223,223,224,224,225,226,226,227,227,228,228,229,230,230,231,231,232,232,233,233,234,235,235,236,236,237,237,238,238,239,239,
+240,240,241,242,242,243,243,244,244,245,245,246,246,247,247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255,255,255
+};
+
+const uint8_t ff_log2_tab[256]={ /* entry i holds the index of the highest set bit of i (floor(log2(i))); entry 0 is 0 */
+        0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+        5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+        6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+        6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+};
+
+const uint8_t av_reverse[256]={ /* entry i holds i with its 8 bits in reversed order (0x01 -> 0x80, 0x02 -> 0x40, ...) */
+0x00,0x80,0x40,0xC0,0x20,0xA0,0x60,0xE0,0x10,0x90,0x50,0xD0,0x30,0xB0,0x70,0xF0,
+0x08,0x88,0x48,0xC8,0x28,0xA8,0x68,0xE8,0x18,0x98,0x58,0xD8,0x38,0xB8,0x78,0xF8,
+0x04,0x84,0x44,0xC4,0x24,0xA4,0x64,0xE4,0x14,0x94,0x54,0xD4,0x34,0xB4,0x74,0xF4,
+0x0C,0x8C,0x4C,0xCC,0x2C,0xAC,0x6C,0xEC,0x1C,0x9C,0x5C,0xDC,0x3C,0xBC,0x7C,0xFC,
+0x02,0x82,0x42,0xC2,0x22,0xA2,0x62,0xE2,0x12,0x92,0x52,0xD2,0x32,0xB2,0x72,0xF2,
+0x0A,0x8A,0x4A,0xCA,0x2A,0xAA,0x6A,0xEA,0x1A,0x9A,0x5A,0xDA,0x3A,0xBA,0x7A,0xFA,
+0x06,0x86,0x46,0xC6,0x26,0xA6,0x66,0xE6,0x16,0x96,0x56,0xD6,0x36,0xB6,0x76,0xF6,
+0x0E,0x8E,0x4E,0xCE,0x2E,0xAE,0x6E,0xEE,0x1E,0x9E,0x5E,0xDE,0x3E,0xBE,0x7E,0xFE,
+0x01,0x81,0x41,0xC1,0x21,0xA1,0x61,0xE1,0x11,0x91,0x51,0xD1,0x31,0xB1,0x71,0xF1,
+0x09,0x89,0x49,0xC9,0x29,0xA9,0x69,0xE9,0x19,0x99,0x59,0xD9,0x39,0xB9,0x79,0xF9,
+0x05,0x85,0x45,0xC5,0x25,0xA5,0x65,0xE5,0x15,0x95,0x55,0xD5,0x35,0xB5,0x75,0xF5,
+0x0D,0x8D,0x4D,0xCD,0x2D,0xAD,0x6D,0xED,0x1D,0x9D,0x5D,0xDD,0x3D,0xBD,0x7D,0xFD,
+0x03,0x83,0x43,0xC3,0x23,0xA3,0x63,0xE3,0x13,0x93,0x53,0xD3,0x33,0xB3,0x73,0xF3,
+0x0B,0x8B,0x4B,0xCB,0x2B,0xAB,0x6B,0xEB,0x1B,0x9B,0x5B,0xDB,0x3B,0xBB,0x7B,0xFB,
+0x07,0x87,0x47,0xC7,0x27,0xA7,0x67,0xE7,0x17,0x97,0x57,0xD7,0x37,0xB7,0x77,0xF7,
+0x0F,0x8F,0x4F,0xCF,0x2F,0xAF,0x6F,0xEF,0x1F,0x9F,0x5F,0xDF,0x3F,0xBF,0x7F,0xFF,
+};
+
+int64_t av_gcd(int64_t a, int64_t b){
+    while(b){ int64_t rem= a%b; a= b; b= rem; } /* Euclid's algorithm, iterative form */
+    return a;
+}
+
+int64_t av_rescale_rnd(int64_t a, int64_t b, int64_t c, enum AVRounding rnd){
+    int64_t r=0; /* rounding addend applied to a*b before dividing by c */
+    assert(c > 0);
+    assert(b >=0);
+    assert((unsigned)rnd<=5 && rnd!=4); /* only the values listed in enum AVRounding */
+
+    if(a<0 && a != INT64_MIN) return -av_rescale_rnd(-a, b, c, rnd ^ ((rnd>>1)&1)); /* work on |a|; the xor swaps DOWN(2) and UP(3), other modes unchanged */
+
+    if(rnd==AV_ROUND_NEAR_INF) r= c/2;
+    else if(rnd&1)             r= c-1; /* INF and UP: round the quotient upward */
+
+    if(b<=INT_MAX && c<=INT_MAX){
+        if(a<=INT_MAX)
+            return (a * b + r)/c; /* both factors fit in int, so the 64-bit product cannot overflow */
+        else
+            return a/c*b + (a%c*b + r)/c; /* split a into quotient and remainder by c to keep the products small */
+    }else{
+#if 1
+        uint64_t a0= a&0xFFFFFFFF; /* low/high 32-bit halves of a and b */
+        uint64_t a1= a>>32;
+        uint64_t b0= b&0xFFFFFFFF;
+        uint64_t b1= b>>32;
+        uint64_t t1= a0*b1 + a1*b0; /* cross terms of the 64x64 multiplication */
+        uint64_t t1a= t1<<32;
+        int i;
+
+        a0 = a0*b0 + t1a; /* low 64 bits of a*b */
+        a1 = a1*b1 + (t1>>32) + (a0<t1a); /* high 64 bits, including the carry out of the low word */
+        a0 += r;
+        a1 += a0<r; /* propagate the carry from adding r */
+
+        for(i=63; i>=0; i--){ /* bit-by-bit long division of a1:a0 by c; t1 collects the quotient bits */
+//            int o= a1 & 0x8000000000000000ULL;
+            a1+= a1 + ((a0>>i)&1);
+            t1+=t1;
+            if(/*o || */c <= a1){
+                a1 -= c;
+                t1++;
+            }
+        }
+        return t1;
+    }
+#else
+        AVInteger ai;
+        ai= av_mul_i(av_int2i(a), av_int2i(b));
+        ai= av_add_i(ai, av_int2i(r));
+
+        return av_i2int(av_div_i(ai, av_int2i(c)));
+    }
+#endif
+}
+
+int64_t av_rescale(int64_t a, int64_t b, int64_t c){
+    return av_rescale_rnd(a, b, c, AV_ROUND_NEAR_INF); /* a*b/c with the default rounding mode AV_ROUND_NEAR_INF */
+}
+
+int64_t av_rescale_q(int64_t a, AVRational bq, AVRational cq){
+    int64_t b= bq.num * (int64_t)cq.den; /* numerator of bq/cq, widened to 64 bits before multiplying */
+    int64_t c= cq.num * (int64_t)bq.den; /* denominator of bq/cq */
+    return av_rescale_rnd(a, b, c, AV_ROUND_NEAR_INF); /* a * bq / cq */
+}
+
+int av_compare_ts(int64_t ts_a, AVRational tb_a, int64_t ts_b, AVRational tb_b){
+    int64_t a= tb_a.num * (int64_t)tb_b.den; /* a/b == tb_a/tb_b, the factor that converts tb_a ticks into tb_b ticks */
+    int64_t b= tb_b.num * (int64_t)tb_a.den;
+    if (av_rescale_rnd(ts_a, a, b, AV_ROUND_DOWN) < ts_b) return -1; /* ts_a converted to tb_b, rounded down, is still below ts_b */
+    if (av_rescale_rnd(ts_b, b, a, AV_ROUND_DOWN) < ts_a) return  1; /* ts_b converted to tb_a, rounded down, is still below ts_a */
+    return 0;
+}
+
+int64_t av_compare_mod(uint64_t a, uint64_t b, uint64_t mod){
+    const uint64_t half= mod>>1;
+    int64_t delta= (a-b) & (mod-1);
+    /* residues above mod/2 are reported as negative differences */
+    return delta > half ? delta - mod : delta;
+}
+
+#ifdef TEST
+#include "integer.h"
+#undef printf
+int main(void){ /* self-test: compares av_rescale() against the AVInteger-based reference computation */
+    int64_t a,b,c,d,e;
+
+    for(a=7; a<(1LL<<62); a+=a/3+1){ /* geometrically growing samples of each operand */
+        for(b=3; b<(1LL<<62); b+=b/4+1){
+            for(c=9; c<(1LL<<62); c+=(c*2)/5+3){
+                int64_t r= c/2; /* same addend av_rescale_rnd() uses for AV_ROUND_NEAR_INF */
+                AVInteger ai;
+                ai= av_mul_i(av_int2i(a), av_int2i(b));
+                ai= av_add_i(ai, av_int2i(r));
+
+                d= av_i2int(av_div_i(ai, av_int2i(c))); /* reference result */
+
+                e= av_rescale(a,b,c); /* result under test */
+
+                if((double)a * (double)b / (double)c > (1LL<<63)) /* skip cases whose quotient is compared as too large for int64_t */
+                    continue;
+
+                if(d!=e) printf("%"PRId64"*%"PRId64"/%"PRId64"= %"PRId64"=%"PRId64"\n", a, b, c, d, e); /* report any mismatch */
+            }
+        }
+    }
+    return 0;
+}
+#endif
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/mathematics.h b/plugins/supereq/ffmpeg_fft/libavutil/mathematics.h
new file mode 100644
index 00000000..06d36e09
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/mathematics.h
@@ -0,0 +1,110 @@
+/*
+ * copyright (c) 2005 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVUTIL_MATHEMATICS_H
+#define AVUTIL_MATHEMATICS_H
+
+#include <stdint.h>
+#include <math.h>
+#include "attributes.h"
+#include "rational.h"
+
+#ifndef M_E
+#define M_E            2.7182818284590452354   /* e */
+#endif
+#ifndef M_LN2
+#define M_LN2          0.69314718055994530942  /* log_e 2 */
+#endif
+#ifndef M_LN10
+#define M_LN10         2.30258509299404568402  /* log_e 10 */
+#endif
+#ifndef M_LOG2_10
+#define M_LOG2_10      3.32192809488736234787  /* log_2 10 */
+#endif
+#ifndef M_PI
+#define M_PI           3.14159265358979323846  /* pi */
+#endif
+#ifndef M_SQRT1_2
+#define M_SQRT1_2      0.70710678118654752440  /* 1/sqrt(2) */
+#endif
+#ifndef M_SQRT2
+#define M_SQRT2        1.41421356237309504880  /* sqrt(2) */
+#endif
+#ifndef NAN
+#define NAN            (0.0/0.0)
+#endif
+#ifndef INFINITY
+#define INFINITY       (1.0/0.0)
+#endif
+
+enum AVRounding {
+    AV_ROUND_ZERO     = 0, ///< Round toward zero.
+    AV_ROUND_INF      = 1, ///< Round away from zero.
+    AV_ROUND_DOWN     = 2, ///< Round toward -infinity.
+    AV_ROUND_UP       = 3, ///< Round toward +infinity.
+    AV_ROUND_NEAR_INF = 5, ///< Round to nearest and halfway cases away from zero.
+};
+
+/**
+ * Return the greatest common divisor of a and b.
+ * If both a and b are 0 or either or both are <0 then behavior is
+ * undefined.
+ */
+int64_t av_const av_gcd(int64_t a, int64_t b);
+
+/**
+ * Rescale a 64-bit integer with rounding to nearest.
+ * A simple a*b/c isn't possible as it can overflow.
+ */
+int64_t av_rescale(int64_t a, int64_t b, int64_t c) av_const;
+
+/**
+ * Rescale a 64-bit integer with specified rounding.
+ * A simple a*b/c isn't possible as it can overflow.
+ */
+int64_t av_rescale_rnd(int64_t a, int64_t b, int64_t c, enum AVRounding) av_const;
+
+/**
+ * Rescale a 64-bit integer by 2 rational numbers.
+ */
+int64_t av_rescale_q(int64_t a, AVRational bq, AVRational cq) av_const;
+
+/**
+ * Compare 2 timestamps each in its own timebase.
+ * The result of the function is undefined if one of the timestamps
+ * is outside the int64_t range when represented in the other's timebase.
+ * @return -1 if ts_a is before ts_b, 1 if ts_a is after ts_b or 0 if they represent the same position
+ */
+int av_compare_ts(int64_t ts_a, AVRational tb_a, int64_t ts_b, AVRational tb_b);
+
+/**
+ * Compare 2 integers modulo mod.
+ * That is we compare integers a and b for which only the least
+ * significant log2(mod) bits are known.
+ *
+ * @param mod must be a power of 2
+ * @return a negative value if a is smaller than b
+ *         a positive value if a is greater than b
+ *         0                if a equals          b
+ */
+int64_t av_compare_mod(uint64_t a, uint64_t b, uint64_t mod);
+
+#endif /* AVUTIL_MATHEMATICS_H */
+
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/mem.c b/plugins/supereq/ffmpeg_fft/libavutil/mem.c
new file mode 100644
index 00000000..8cad089a
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/mem.c
@@ -0,0 +1,176 @@
+/*
+ * default memory allocator for libavutil
+ * Copyright (c) 2002 Fabrice Bellard
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * default memory allocator for libavutil
+ */
+
+#include "config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#if HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#include "avutil.h"
+#include "mem.h"
+
+/* here we can use OS-dependent allocation functions */
+#undef free
+#undef malloc
+#undef realloc
+
+#ifdef MALLOC_PREFIX
+
+#define malloc         AV_JOIN(MALLOC_PREFIX, malloc)
+#define memalign       AV_JOIN(MALLOC_PREFIX, memalign)
+#define posix_memalign AV_JOIN(MALLOC_PREFIX, posix_memalign)
+#define realloc        AV_JOIN(MALLOC_PREFIX, realloc)
+#define free           AV_JOIN(MALLOC_PREFIX, free)
+
+void *malloc(size_t size);
+void *memalign(size_t align, size_t size);
+int   posix_memalign(void **ptr, size_t align, size_t size);
+void *realloc(void *ptr, size_t size);
+void  free(void *ptr);
+
+#endif /* MALLOC_PREFIX */
+
+/* You can redefine av_malloc and av_free in your project to use your
+   memory allocator. You do not need to suppress this file because the
+   linker will do it automatically. */
+
+void *av_malloc(unsigned int size)
+{
+    void *ptr = NULL;
+#if CONFIG_MEMALIGN_HACK
+    long diff; /* byte offset from the raw malloc() block to the returned pointer */
+#endif
+
+    /* let's disallow possible ambiguous cases */
+    if(size > (INT_MAX-16) )
+        return NULL;
+
+#if CONFIG_MEMALIGN_HACK
+    ptr = malloc(size+16); /* 16 spare bytes leave room for the alignment shift */
+    if(!ptr)
+        return ptr;
+    diff= ((-(long)ptr - 1)&15) + 1; /* 1..16: distance to the next 16-byte boundary strictly above ptr */
+    ptr = (char*)ptr + diff;
+    ((char*)ptr)[-1]= diff; /* stash the offset in the byte below the block; av_free()/av_realloc() read it back */
+#elif HAVE_POSIX_MEMALIGN
+    if (posix_memalign(&ptr,16,size)) /* non-zero return means failure */
+        ptr = NULL;
+#elif HAVE_MEMALIGN
+    ptr = memalign(16,size);
+    /* Why 64?
+       Indeed, we should align it:
+         on 4 for 386
+         on 16 for 486
+         on 32 for 586, PPro - K6-III
+         on 64 for K7 (maybe for P3 too).
+       Because L1 and L2 caches are aligned on those values.
+       But I don't want to code such logic here!
+     */
+     /* Why 16?
+        Because some CPUs need alignment, for example SSE2 on P4, & most RISC CPUs
+        it will just trigger an exception and the unaligned load will be done in the
+        exception handler or it will just segfault (SSE2 on P4).
+        Why not larger? Because I did not see a difference in benchmarks ...
+     */
+     /* benchmarks with P3
+        memalign(64)+1          3071,3051,3032
+        memalign(64)+2          3051,3032,3041
+        memalign(64)+4          2911,2896,2915
+        memalign(64)+8          2545,2554,2550
+        memalign(64)+16         2543,2572,2563
+        memalign(64)+32         2546,2545,2571
+        memalign(64)+64         2570,2533,2558
+
+        BTW, malloc seems to do 8-byte alignment by default here.
+     */
+#else
+    ptr = malloc(size); /* no aligned allocator configured: plain malloc() alignment only */
+#endif
+    return ptr;
+}
+
+void *av_realloc(void *ptr, unsigned int size)
+{
+#if CONFIG_MEMALIGN_HACK
+    int diff; /* alignment offset stored in the byte below the user pointer */
+#endif
+    /* let's disallow possible ambiguous cases */
+    if(size > (INT_MAX-16) )
+        return NULL;
+
+#if CONFIG_MEMALIGN_HACK
+    //FIXME this isn't aligned correctly, though it probably isn't needed
+    if(!ptr) return av_malloc(size);
+    diff= ((char*)ptr)[-1];
+    ptr= realloc((char*)ptr - diff, size + diff); /* resize the raw block, keeping room for the offset */
+    return ptr ? (char*)ptr + diff : NULL; /* on failure report NULL, not NULL+diff */
+#else
+    return realloc(ptr, size);
+#endif
+}
+
+void av_free(void *ptr)
+{
+    /* XXX: this test should not be needed on most libcs */
+    if (ptr) /* with the memalign hack the guard also prevents reading ptr[-1] on NULL */
+#if CONFIG_MEMALIGN_HACK
+        free((char*)ptr - ((char*)ptr)[-1]); /* step back by the offset stored in the byte below ptr to reach the raw block */
+#else
+        free(ptr);
+#endif
+}
+
+void av_freep(void *arg)
+{
+    void **slot= (void**)arg; /* arg is the address of the pointer to release */
+    av_free(*slot);
+    *slot = NULL;             /* leave no dangling pointer in the caller's variable */
+}
+
+void *av_mallocz(unsigned int size)
+{
+    void *block = av_malloc(size);
+    if (!block)
+        return NULL;
+    return memset(block, 0, size); /* memset returns block, now zero-filled */
+}
+
+char *av_strdup(const char *s)
+{
+    char *ptr= NULL;
+    if(s){
+        size_t len = strlen(s) + 1; /* size_t: an int would truncate very long strings */
+        ptr = len > UINT_MAX ? NULL : av_malloc(len); /* av_malloc() takes unsigned int; refuse lengths it cannot represent */
+        if (ptr)
+            memcpy(ptr, s, len); /* copies the terminating NUL as well */
+    }
+    return ptr;
+}
+
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/mem.h b/plugins/supereq/ffmpeg_fft/libavutil/mem.h
new file mode 100644
index 00000000..7da0a15f
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/mem.h
@@ -0,0 +1,128 @@
+/*
+ * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * memory handling functions
+ */
+
+#ifndef AVUTIL_MEM_H
+#define AVUTIL_MEM_H
+
+#include "attributes.h"
+#include "avutil.h"
+#include "publik.h"
+
+#if defined(__ICC) && __ICC < 1200 || defined(__SUNPRO_C) /* __ICC, not _ICC: the undefined name made the version test always true */
+    #define DECLARE_ALIGNED(n,t,v)      t __attribute__ ((aligned (n))) v
+    #define DECLARE_ASM_CONST(n,t,v)    const t __attribute__ ((aligned (n))) v
+#elif defined(__TI_COMPILER_VERSION__)
+    #define DECLARE_ALIGNED(n,t,v)                      \
+        AV_PRAGMA(DATA_ALIGN(v,n))                      \
+        t __attribute__((aligned(n))) v
+    #define DECLARE_ASM_CONST(n,t,v)                    \
+        AV_PRAGMA(DATA_ALIGN(v,n))                      \
+        static const t __attribute__((aligned(n))) v
+#elif defined(__GNUC__)
+    #define DECLARE_ALIGNED(n,t,v)      t __attribute__ ((aligned (n))) v
+    #define DECLARE_ASM_CONST(n,t,v)    static const t attribute_used __attribute__ ((aligned (n))) v
+#elif defined(_MSC_VER)
+    #define DECLARE_ALIGNED(n,t,v)      __declspec(align(n)) t v
+    #define DECLARE_ASM_CONST(n,t,v)    __declspec(align(n)) static const t v
+#else
+    #define DECLARE_ALIGNED(n,t,v)      t v /* unknown compiler: no alignment is requested */
+    #define DECLARE_ASM_CONST(n,t,v)    static const t v
+#endif
+
+#if AV_GCC_VERSION_AT_LEAST(3,1)
+    #define av_malloc_attrib __attribute__((__malloc__))
+#else
+    #define av_malloc_attrib
+#endif
+
+#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,3)
+    #define av_alloc_size(n) __attribute__((alloc_size(n)))
+#else
+    #define av_alloc_size(n)
+#endif
+
+/**
+ * Allocate a block of size bytes with alignment suitable for all
+ * memory accesses (including vectors if available on the CPU).
+ * @param size Size in bytes for the memory block to be allocated.
+ * @return Pointer to the allocated block, NULL if the block cannot
+ * be allocated.
+ * @see av_mallocz()
+ */
+PUBLIK void *av_malloc(unsigned int size) av_malloc_attrib av_alloc_size(1);
+
+/**
+ * Allocate or reallocate a block of memory.
+ * If ptr is NULL and size > 0, allocate a new block. If
+ * size is zero, free the memory block pointed to by ptr.
+ * @param size Size in bytes for the memory block to be allocated or
+ * reallocated.
+ * @param ptr Pointer to a memory block already allocated with
+ * av_malloc(z)() or av_realloc() or NULL.
+ * @return Pointer to a newly reallocated block or NULL if the block
+ * cannot be reallocated or the function is used to free the memory block.
+ * @see av_fast_realloc()
+ */
+void *av_realloc(void *ptr, unsigned int size) av_alloc_size(2);
+
+/**
+ * Free a memory block which has been allocated with av_malloc(z)() or
+ * av_realloc().
+ * @param ptr Pointer to the memory block which should be freed.
+ * @note ptr = NULL is explicitly allowed.
+ * @note It is recommended that you use av_freep() instead.
+ * @see av_freep()
+ */
+PUBLIK void av_free(void *ptr);
+
+/**
+ * Allocate a block of size bytes with alignment suitable for all
+ * memory accesses (including vectors if available on the CPU) and
+ * zero all the bytes of the block.
+ * @param size Size in bytes for the memory block to be allocated.
+ * @return Pointer to the allocated block, NULL if it cannot be allocated.
+ * @see av_malloc()
+ */
+void *av_mallocz(unsigned int size) av_malloc_attrib av_alloc_size(1);
+
+/**
+ * Duplicate the string s.
+ * @param s string to be duplicated
+ * @return Pointer to a newly allocated string containing a
+ * copy of s or NULL if the string cannot be allocated.
+ */
+char *av_strdup(const char *s) av_malloc_attrib;
+
+/**
+ * Free a memory block which has been allocated with av_malloc(z)() or
+ * av_realloc() and set the pointer pointing to it to NULL.
+ * @param ptr Pointer to the pointer to the memory block which should
+ * be freed.
+ * @see av_free()
+ */
+void av_freep(void *ptr);
+
+#endif /* AVUTIL_MEM_H */
+
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/rational.c b/plugins/supereq/ffmpeg_fft/libavutil/rational.c
new file mode 100644
index 00000000..3e8b885d
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/rational.c
@@ -0,0 +1,131 @@
+/*
+ * rational numbers
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * rational numbers
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#include <assert.h>
+//#include <math.h>
+#include <limits.h>
+
+#include "common.h"
+#include "mathematics.h"
+#include "rational.h"
+/* Reduce num/den to lowest terms with both parts limited to max; returns 1 if the result is exact, 0 if approximated. */
+int av_reduce(int *dst_num, int *dst_den, int64_t num, int64_t den, int64_t max){
+    AVRational a0={0,1}, a1={1,0}; /* previous and current approximation of num/den */
+    int sign= (num<0) ^ (den<0); /* 1 when exactly one input is negative */
+    int64_t gcd= av_gcd(FFABS(num), FFABS(den));
+    /* drop the signs and the common factor (skipped when gcd is 0) */
+    if(gcd){
+        num = FFABS(num)/gcd;
+        den = FFABS(den)/gcd;
+    }
+    if(num<=max && den<=max){ /* already within the limit: use it unchanged */
+        a1= (AVRational){num, den};
+        den=0; /* skips the loop and makes the final den==0 test report an exact result */
+    }
+    /* otherwise build successive approximations a2 = x*a1 + a0 until one exceeds max */
+    while(den){
+        uint64_t x      = num / den;
+        int64_t next_den= num - den*x; /* remainder of num / den */
+        int64_t a2n= x*a1.num + a0.num;
+        int64_t a2d= x*a1.den + a0.den;
+        /* a2 no longer fits: retry with the largest x that keeps both parts <= max */
+        if(a2n > max || a2d > max){
+            if(a1.num) x= (max - a0.num) / a1.num;
+            if(a1.den) x= FFMIN(x, (max - a0.den) / a1.den);
+            /* NOTE(review): this test presumably checks that the capped step is closer to num/den than a1 -- confirm */
+            if (den*(2*x*a1.den + a0.den) > num*a1.den)
+                a1 = (AVRational){x*a1.num + a0.num, x*a1.den + a0.den};
+            break; /* den is still non-zero here, so the function returns 0 */
+        }
+        /* a2 fits: shift the approximations and continue with (den, remainder) */
+        a0= a1;
+        a1= (AVRational){a2n, a2d};
+        num= den;
+        den= next_den;
+    }
+    assert(av_gcd(a1.num, a1.den) <= 1U);
+    /* store the result, restoring the sign on the numerator */
+    *dst_num = sign ? -a1.num : a1.num;
+    *dst_den = a1.den;
+
+    return den==0;
+}
+/* Product b*c: both cross products are formed in 64 bits, then av_reduce() brings them back within INT_MAX. */
+AVRational av_mul_q(AVRational b, AVRational c){
+    av_reduce(&b.num, &b.den, (int64_t)b.num * c.num, (int64_t)b.den * c.den, INT_MAX);
+    return b;
+}
+/* Quotient b/c, computed as b times the reciprocal of c (numerator and denominator exchanged). */
+AVRational av_div_q(AVRational b, AVRational c){
+    return av_mul_q(b, (AVRational){ .num = c.den, .den = c.num });
+}
+/* Sum b+c over the common denominator b.den*c.den, formed in 64 bits and reduced by av_reduce() to fit INT_MAX. */
+AVRational av_add_q(AVRational b, AVRational c){
+    av_reduce(&b.num, &b.den, (int64_t)b.num * c.den + (int64_t)c.num * b.den, (int64_t)b.den * c.den, INT_MAX);
+    return b;
+}
+/* Difference b-c, computed as b plus c with its numerator negated. */
+AVRational av_sub_q(AVRational b, AVRational c){
+    return av_add_q(b, (AVRational){ .num = -c.num, .den = c.den });
+}
+/* Convert d to a rational whose terms do not exceed max; a NaN input yields {0,0}. */
+AVRational av_d2q(double d, int max){
+    AVRational a;
+    int exponent;
+    int64_t den;
+#define LOG2  0.69314718055994530941723212145817656807550013436025
+    if (isnan(d)) return (AVRational){0,0}; /* must come first: converting a NaN-derived value to int is undefined */
+    exponent= FFMAX( (int)(log(fabs(d) + 1e-20)/LOG2), 0); /* binary exponent of |d|, never negative */
+    den= 1LL << (61 - exponent); /* NOTE(review): assumes |d| < 2^62; larger or infinite d would need a guard */
+    av_reduce(&a.num, &a.den, (int64_t)(d * den + 0.5), den, max);
+    return a;
+}
+
+/* Decide which of q1 and q2 lies nearer to q by comparing q with their midpoint; av_cmp_q(q2, q1) fixes the sign. */
+int av_nearer_q(AVRational q, AVRational q1, AVRational q2)
+{
+    /* mid_num/mid_den is (q1 + q2) / 2, assembled from 64-bit cross products */
+    const int64_t mid_den = 2 * (int64_t)q1.den * q2.den;
+    const int64_t mid_num = q1.num * (int64_t)q2.den + q2.num * (int64_t)q1.den;
+    /* the midpoint scaled by q.den, rounded up and then rounded down */
+    const int64_t scaled_hi = av_rescale_rnd(mid_num, q.den, mid_den, AV_ROUND_UP);
+    const int64_t scaled_lo = av_rescale_rnd(mid_num, q.den, mid_den, AV_ROUND_DOWN);
+    const int mid_above = scaled_hi > q.num;  /* rounded-up value above q.num: midpoint is above q */
+    const int mid_below = scaled_lo < q.num;  /* rounded-down value below q.num: midpoint is below q */
+
+    return (mid_above - mid_below) * av_cmp_q(q2, q1);
+}
+/* Walk q_list up to its zero-denominator terminator and return the index of the entry av_nearer_q() ranks nearest to q. */
+int av_find_nearest_q_idx(AVRational q, const AVRational* q_list)
+{
+    int best = 0, idx = 0;
+    while (q_list[idx].den) {
+        if (av_nearer_q(q, q_list[idx], q_list[best]) > 0) best = idx;
+        idx++;
+    }
+    return best;
+}
diff --git a/plugins/supereq/ffmpeg_fft/libavutil/rational.h b/plugins/supereq/ffmpeg_fft/libavutil/rational.h
new file mode 100644
index 00000000..cd0a945a
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libavutil/rational.h
@@ -0,0 +1,130 @@
+/*
+ * rational numbers
+ * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * rational numbers
+ * @author Michael Niedermayer <michaelni@gmx.at>
+ */
+
+#ifndef AVUTIL_RATIONAL_H
+#define AVUTIL_RATIONAL_H
+
+#include <stdint.h>
+#include "attributes.h"
+
+/**
+ * Rational number stored as an integer numerator/denominator pair.
+ */
+typedef struct AVRational{
+    int num; ///< numerator
+    int den; ///< denominator (a zero here ends the list scanned by av_find_nearest_q_idx())
+} AVRational;
+
+/**
+ * Compare two rationals.
+ * @param a first rational
+ * @param b second rational
+ * @return 0 if a==b, 1 if a>b and -1 if a<b
+ */
+static inline int av_cmp_q(AVRational a, AVRational b){
+    /* cross-multiply in 64 bits so the comparison needs no division */
+    const int64_t tmp= a.num * (int64_t)b.den - b.num * (int64_t)a.den;
+    /* sign of tmp as -1/0/1, without right-shifting a negative value (implementation-defined in C) */
+    return (tmp > 0) - (tmp < 0);
+}
+
+/**
+ * Convert a rational to a double.
+ * @param a rational to convert; its numerator is divided by its denominator
+ * @return a.num / a.den evaluated in double precision
+ */
+static inline double av_q2d(AVRational a){
+    return (double) a.num / a.den;
+}
+
+/**
+ * Reduce a fraction.
+ * This is useful for framerate calculations.
+ * @param dst_num destination numerator
+ * @param dst_den destination denominator
+ * @param num source numerator
+ * @param den source denominator
+ * @param max the maximum allowed for dst_num & dst_den
+ * @return 1 if exact, 0 otherwise
+ */
+int av_reduce(int *dst_num, int *dst_den, int64_t num, int64_t den, int64_t max);
+
+/**
+ * Multiply two rationals.
+ * @param b first rational
+ * @param c second rational
+ * @return b*c
+ */
+AVRational av_mul_q(AVRational b, AVRational c) av_const;
+
+/**
+ * Divide one rational by another.
+ * @param b first rational
+ * @param c second rational
+ * @return b/c
+ */
+AVRational av_div_q(AVRational b, AVRational c) av_const;
+
+/**
+ * Add two rationals.
+ * @param b first rational
+ * @param c second rational
+ * @return b+c
+ */
+AVRational av_add_q(AVRational b, AVRational c) av_const;
+
+/**
+ * Subtract one rational from another.
+ * @param b first rational
+ * @param c second rational
+ * @return b-c
+ */
+AVRational av_sub_q(AVRational b, AVRational c) av_const;
+
+/**
+ * Convert a double precision floating point number to a rational.
+ * @param d double to convert
+ * @param max the maximum allowed numerator and denominator
+ * @return (AVRational) d
+ */
+AVRational av_d2q(double d, int max) av_const;
+
+/**
+ * @return 1 if q1 is nearer to q than q2, -1 if q2 is nearer
+ * than q1, 0 if they have the same distance.
+ */
+int av_nearer_q(AVRational q, AVRational q1, AVRational q2);
+
+/**
+ * Find the nearest value in q_list to q.
+ * @param q_list an array of rationals terminated by {0, 0}
+ * @return the index of the nearest value found in the array
+ */
+int av_find_nearest_q_idx(AVRational q, const AVRational* q_list);
+
+#endif /* AVUTIL_RATIONAL_H */
+
diff --git a/plugins/supereq/ffmpeg_fft/libffmpeg_fft.ver b/plugins/supereq/ffmpeg_fft/libffmpeg_fft.ver
new file mode 100644
index 00000000..07b44318
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/libffmpeg_fft.ver
@@ -0,0 +1,4 @@
+LIBFFMPEG_FFT_52 {
+        global: *;
+};
+
diff --git a/plugins/supereq/ffmpeg_fft/publik.h b/plugins/supereq/ffmpeg_fft/publik.h
new file mode 100644
index 00000000..bb044756
--- /dev/null
+++ b/plugins/supereq/ffmpeg_fft/publik.h
@@ -0,0 +1,6 @@
+#ifndef PUBLIK_H_
+#define PUBLIK_H_
+/* PUBLIK tags a declaration with the compiler's "default" symbol-visibility attribute. */
+#define PUBLIK __attribute__ ((visibility ("default")))
+
+#endif /* PUBLIK_H_ */
diff --git a/plugins/supereq/nsfft-1.00/README b/plugins/supereq/nsfft-1.00/README
new file mode 100644
index 00000000..1ca873b1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/README
@@ -0,0 +1,15 @@
+
+NSFFT (Nonrestrictive SIMD FFT) is yet another FFT library for
+performing 1-dimensional fast Fourier transforms. NSFFT is a simple,
+small and portable library, and it is efficient since it can utilize
+SIMD instruction sets in modern processors. It performs multiple
+transforms simultaneously, and thus it is especially suitable for
+digital signal processing. It does not need much computation to
+make a good execution plan. This library is in the public domain, so
+you can incorporate it into your product without any
+obligation.
+
+Visit http://shibatch.sourceforge.net/ to get the latest version of
+this library.
+
+Contact : Naoki Shibata shibatch@users.sourceforge.net
diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.c b/plugins/supereq/nsfft-1.00/dft/DFT.c
new file mode 100644
index 00000000..d59e6ab8
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFT.c
@@ -0,0 +1,327 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <stdint.h>
+#include <sys/time.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+#include "DFTUndiff.h"
+
+int32_t getModeParamInt_purec_float(int32_t paramId);
+int32_t getModeParamInt_purec_double(int32_t paramId);
+int32_t getModeParamInt_purec_longdouble(int32_t paramId);
+int32_t getModeParamInt_sse_float(int32_t paramId);
+int32_t getModeParamInt_sse2_double(int32_t paramId);
+int32_t getModeParamInt_neon_float(int32_t paramId);
+int32_t getModeParamInt_avx_float(int32_t paramId);
+int32_t getModeParamInt_avx_double(int32_t paramId);
+int32_t getModeParamInt_altivec_float(int32_t paramId);
+
+char * getModeParamString_purec_float(int32_t paramId);
+char * getModeParamString_purec_double(int32_t paramId);
+char * getModeParamString_purec_longdouble(int32_t paramId);
+char * getModeParamString_sse_float(int32_t paramId);
+char * getModeParamString_sse2_double(int32_t paramId);
+char * getModeParamString_neon_float(int32_t paramId);
+char * getModeParamString_avx_float(int32_t paramId);
+char * getModeParamString_avx_double(int32_t paramId);
+char * getModeParamString_altivec_float(int32_t paramId);
+
+void *makePlan_purec_float(uint64_t n, uint64_t flags);
+void *makePlan_purec_double(uint64_t n, uint64_t flags);
+void *makePlan_purec_longdouble(uint64_t n, uint64_t flags);
+void *makePlan_sse_float(uint64_t n, uint64_t flags);
+void *makePlan_sse2_double(uint64_t n, uint64_t flags);
+void *makePlan_neon_float(uint64_t n, uint64_t flags);
+void *makePlan_avx_float(uint64_t n, uint64_t flags);
+void *makePlan_avx_double(uint64_t n, uint64_t flags);
+void *makePlan_altivec_float(uint64_t n, uint64_t flags);
+
+void *makePlanSub_purec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_purec_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_purec_longdouble(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_sse_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_sse2_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_neon_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_avx_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_avx_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+void *makePlanSub_altivec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags);
+
+void destroyPlan_purec_float(void *p);
+void destroyPlan_purec_double(void *p);
+void destroyPlan_purec_longdouble(void *p);
+void destroyPlan_sse_float(void *p);
+void destroyPlan_sse2_double(void *p);
+void destroyPlan_neon_float(void *p);
+void destroyPlan_avx_float(void *p);
+void destroyPlan_avx_double(void *p);
+void destroyPlan_altivec_float(void *p);
+
+void execute_purec_float(void *p, void *s, int32_t dir);
+void execute_purec_double(void *p, void *s, int32_t dir);
+void execute_purec_longdouble(void *p, void *s, int32_t dir);
+void execute_sse_float(void *p, void *s, int32_t dir);
+void execute_sse2_double(void *p, void *s, int32_t dir);
+void execute_neon_float(void *p, void *s, int32_t dir);
+void execute_avx_float(void *p, void *s, int32_t dir);
+void execute_avx_double(void *p, void *s, int32_t dir);
+void execute_altivec_float(void *p, void *s, int32_t dir);
+
+/* Create a plan through the makePlan_* routine matching mode, forwarding n and flags; NULL if mode is unknown or not compiled in. */
+void *DFT_init(int32_t mode, uint64_t n, uint64_t flags) {
+  switch (mode) {
+#ifdef ENABLE_PUREC_FLOAT
+  case 1: return makePlan_purec_float(n, flags);
+#endif
+#ifdef ENABLE_PUREC_DOUBLE
+  case 2: return makePlan_purec_double(n, flags);
+#endif
+#ifdef ENABLE_PUREC_LONGDOUBLE
+  case 3: return makePlan_purec_longdouble(n, flags);
+#endif
+#ifdef ENABLE_SSE_FLOAT
+  case 4: return makePlan_sse_float(n, flags);
+#endif
+#ifdef ENABLE_SSE2_DOUBLE
+  case 5: return makePlan_sse2_double(n, flags);
+#endif
+#ifdef ENABLE_NEON_FLOAT
+  case 6: return makePlan_neon_float(n, flags);
+#endif
+#ifdef ENABLE_AVX_FLOAT
+  case 7: return makePlan_avx_float(n, flags);
+#endif
+#ifdef ENABLE_AVX_DOUBLE
+  case 8: return makePlan_avx_double(n, flags);
+#endif
+#ifdef ENABLE_ALTIVEC_FLOAT
+  case 9: return makePlan_altivec_float(n, flags);
+#endif
+  default: break;
+  }
+  return NULL;
+}
+/* Hand the plan p to the destroyPlan_* routine matching mode; modes that are unknown or not compiled in are ignored. */
+void DFT_dispose(void *p, int32_t mode) {
+  switch (mode) {
+#ifdef ENABLE_PUREC_FLOAT
+  case 1: destroyPlan_purec_float(p); return;
+#endif
+#ifdef ENABLE_PUREC_DOUBLE
+  case 2: destroyPlan_purec_double(p); return;
+#endif
+#ifdef ENABLE_PUREC_LONGDOUBLE
+  case 3: destroyPlan_purec_longdouble(p); return;
+#endif
+#ifdef ENABLE_SSE_FLOAT
+  case 4: destroyPlan_sse_float(p); return;
+#endif
+#ifdef ENABLE_SSE2_DOUBLE
+  case 5: destroyPlan_sse2_double(p); return;
+#endif
+#ifdef ENABLE_NEON_FLOAT
+  case 6: destroyPlan_neon_float(p); return;
+#endif
+#ifdef ENABLE_AVX_FLOAT
+  case 7: destroyPlan_avx_float(p); return;
+#endif
+#ifdef ENABLE_AVX_DOUBLE
+  case 8: destroyPlan_avx_double(p); return;
+#endif
+#ifdef ENABLE_ALTIVEC_FLOAT
+  case 9: destroyPlan_altivec_float(p); return;
+#endif
+  default: return;
+  }
+}
+/* Run plan p on the data s in direction dir via the execute_* routine matching mode; other modes are a no-op. */
+void DFT_execute(void *p, int32_t mode, void *s, int32_t dir) {
+  switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+  case 1: execute_purec_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case 2: execute_purec_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case 3: execute_purec_longdouble(p, s, dir); break;
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case 4: execute_sse_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case 5: execute_sse2_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case 6: execute_neon_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case 7: execute_avx_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case 8: execute_avx_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case 9: execute_altivec_float(p, s, dir); break;
+#endif
+  default: break;   /* a void function must not "return expr;", so every case calls and then breaks */
+  }
+}
+
+#define FILE_FORMAT_VERSION 0
+
+/* Serialize the plan p2 to fp as "key : value" lines; returns 1 on success and 0 at the first failed write. */
+int32_t DFT_fwrite(void *p2, FILE *fp) {
+  DFTUndiff *plan = (DFTUndiff *)p2;
+  if (plan->magic != MAGIC_DFT) abort();
+  /* || stops at the first fprintf returning <= 0, so the remaining lines are not attempted */
+  if (fprintf(fp, "nsfft file format : %d\n", FILE_FORMAT_VERSION) <= 0 ||
+      fprintf(fp, "arch : %s\n", SIMDBase_getProcessorNameString()) <= 0 ||
+      fprintf(fp, "computation mode : %d\n", plan->mode) <= 0 ||
+      fprintf(fp, "length : %d\n", ((plan->flags & DFT_FLAG_REAL) != 0 || (plan->flags & DFT_FLAG_ALT_REAL) != 0)? plan->length * 2 : plan->length) <= 0 ||
+      fprintf(fp, "radix2 threshold : %d\n", plan->radix2thres) <= 0 ||
+      fprintf(fp, "transpose : %d\n", plan->flagTrans) <= 0 ||
+      fprintf(fp, "bit reversal : %d\n", plan->useCobra) <= 0 ||
+      fprintf(fp, "flags : %llx\n", (unsigned long long int)plan->flags) <= 0 ||
+      fprintf(fp, "%s\n", "end :") <= 0) return 0;
+  return 1;
+}
+
+/* Return the part of str1 that follows the prefix str2, or NULL when str1 does not begin with str2. */
+static char *startsWith(char *str1, const char *str2) {
+  const size_t prefixLen = strlen(str2);  /* measured once, used for both the compare and the skip */
+
+  if (strncmp(str1, str2, prefixLen) != 0) return NULL;
+  return str1 + prefixLen;
+}
+
+/* Record code in *errcode when the caller passed one; yields NULL so failures read as "return failWith(...)". */
+static DFT *failWith(int32_t *errcode, int32_t code) {
+  if (errcode != NULL) *errcode = code;
+  return NULL;
+}
+
+/*
+ * Read "key : value" lines from fp up to the "end :" line, then build a plan
+ * through the makePlanSub_* routine matching the stored computation mode.
+ * Returns the plan, or NULL with *errcode (when non-NULL) set to a DFT_ERROR_* code.
+ */
+DFT *DFT_fread(FILE *fp, int32_t *errcode) {
+  int length = -1, radix2thres = -1, flagTrans = -1, useCobra = -1;
+  int mode = -1, formatver = -1;
+  unsigned long long int flags = (1ULL << 63);
+
+  if (errcode != NULL) *errcode = DFT_ERROR_NOERROR;
+
+  for(;;) {
+    char buf[256], *q;
+    int *field = NULL;   /* destination of a decimal value found on this line, if any */
+
+    if (fgets(buf, 255, fp) == NULL) return failWith(errcode, DFT_ERROR_UNEXPECTED_EOF);
+
+    if ((q = startsWith(buf, "nsfft file format :")) != NULL) field = &formatver;
+    else if ((q = startsWith(buf, "computation mode :")) != NULL) field = &mode;
+    else if ((q = startsWith(buf, "length :")) != NULL) field = &length;
+    else if ((q = startsWith(buf, "radix2 threshold :")) != NULL) field = &radix2thres;
+    else if ((q = startsWith(buf, "transpose :")) != NULL) field = &flagTrans;   /* parsed but not passed on below */
+    else if ((q = startsWith(buf, "bit reversal :")) != NULL) field = &useCobra;
+    else if ((q = startsWith(buf, "flags :")) != NULL) {
+      /* the only field read as hexadecimal into a 64-bit value */
+      if (1 != sscanf(q, "%llx", &flags)) return failWith(errcode, DFT_ERROR_FILE_IO);
+    } else if ((q = startsWith(buf, "end :")) != NULL) {
+      break;
+    }
+
+    if (field != NULL && 1 != sscanf(q, "%d", field)) return failWith(errcode, DFT_ERROR_FILE_IO);
+  }
+
+  if (formatver > FILE_FORMAT_VERSION) return failWith(errcode, DFT_ERROR_FILE_VERSION);
+
+  switch(SIMDBase_detect(mode)) {
+  case 1:  break;
+  case 0:  return failWith(errcode, DFT_ERROR_MODE_NOT_AVAILABLE);
+  case -1: return failWith(errcode, DFT_ERROR_MODE_NOT_COMPILED_IN);
+  }
+
+  switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+  case 1: return makePlanSub_purec_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case 2: return makePlanSub_purec_double(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case 3: return makePlanSub_purec_longdouble(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case 4: return makePlanSub_sse_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case 5: return makePlanSub_sse2_double(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case 6: return makePlanSub_neon_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case 7: return makePlanSub_avx_float(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case 8: return makePlanSub_avx_double(length, radix2thres, useCobra, flags);
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case 9: return makePlanSub_altivec_float(length, radix2thres, useCobra, flags);
+#endif
+  }
+
+  return failWith(errcode, DFT_ERROR_UNKNOWN_MODE);
+}
+
+/* Look up one integer property of the plan p2, selected by a DFT_PARAMID_* constant; returns -1 for unknown ids. */
+int32_t DFT_getPlanParamInt(int32_t paramId, void *p2) {
+  DFTUndiff *plan = (DFTUndiff *)p2;
+  if (plan->magic != MAGIC_DFT) abort();
+
+  switch(paramId) {
+  case DFT_PARAMID_MODE:
+    return plan->mode;
+  case DFT_PARAMID_FFT_LENGTH:   /* both real-transform flags report twice the stored length */
+    return (plan->flags & (DFT_FLAG_REAL | DFT_FLAG_ALT_REAL)) != 0 ? plan->length * 2 : plan->length;
+  case DFT_PARAMID_IS_REAL_TRANSFORM:     return (plan->flags & DFT_FLAG_REAL) != 0;
+  case DFT_PARAMID_IS_ALT_REAL_TRANSFORM: return (plan->flags & DFT_FLAG_ALT_REAL) != 0;
+  case DFT_PARAMID_NO_BIT_REVERSAL:       return (plan->flags & DFT_FLAG_NO_BITREVERSAL) != 0;
+  case DFT_PARAMID_TEST_RUN:              return plan->flags & 3;
+  default:                                break;
+  }
+  return -1;
+}
+
+#if 0
+char *DFT_getPlanParamString(int32_t paramId, void *p2) {
+  dft_t *p = (dft_t *)p2;
+  if (p->magic != MAGIC_NSDFT) abort();
+
+  return NULL;
+}
+#endif
+
+/* Return floor(log2(q)) for q > 0; q == 0 wraps to 0xffffffff because of the final "- 1". */
+uint32_t DFT_ilog2(uint32_t q) {
+  static const uint32_t bitLen[16] = {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4};  /* significant bits of a 4-bit value */
+  const uint32_t base = (q & 0xffff0000) ? 16 : 0;   /* continue with the upper half when it is non-zero */
+  const uint32_t half = q >> base;
+  const uint32_t pair = half | (half >> 1);
+  const uint32_t smear = pair | (pair >> 2);         /* every set bit copied onto the three bits below it */
+  /* bits 4, 8 and 12 of smear tell which upper nibbles of half are non-zero; pack them into a 3-bit index */
+  const uint32_t groups = ((smear >> 4) & 1) | ((smear >> 7) & 2) | ((smear >> 10) & 4);
+  const uint32_t shift = bitLen[groups] * 4;         /* bit offset of the highest non-zero nibble */
+
+  return base + shift + bitLen[half >> shift] - 1;
+}
+/* Current time from gettimeofday() as a double: whole seconds plus the microseconds as the fractional part. */
+double DFT_timeofday(void) {
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  return now.tv_sec + now.tv_usec * 1e-6;
+}
diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.h b/plugins/supereq/nsfft-1.00/dft/DFT.h
new file mode 100644
index 00000000..facb701a
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFT.h
@@ -0,0 +1,56 @@
+#ifndef __DFT_H__
+#define __DFT_H__
+
+#include <stdio.h>
+#include <stdint.h>
+
+typedef void DFT;
+
+int32_t DFT_getParamInt(int32_t paramId);
+char *DFT_getParamString(int32_t paramId);
+
+int32_t DFT_getModeParamInt(int32_t paramId, int32_t mode);
+char *DFT_getModeParamString(int32_t paramId, int32_t mode);
+
+DFT *DFT_init(int32_t mode, uint64_t n, uint64_t flags);
+void DFT_dispose(DFT *p, int32_t mode);
+
+int32_t DFT_fwrite(DFT *p, FILE *fp);
+DFT *DFT_fread(FILE *fp, int32_t *errcode);
+
+int32_t DFT_getPlanParamInt(int32_t paramId, void *p);
+
+void DFT_execute(DFT *p, int32_t mode, void *s, int32_t dir);
+
+uint32_t DFT_ilog2(uint32_t q);
+double DFT_timeofday(void);
+
+#define DFT_FLAG_NO_TEST_RUN ( 0ULL << 0)
+#define DFT_FLAG_LIGHT_TEST_RUN ( 1ULL << 0)
+#define DFT_FLAG_HEAVY_TEST_RUN ( 2ULL << 0)
+#define DFT_FLAG_EXHAUSTIVE_TEST_RUN ( 3ULL << 0)
+
+#define DFT_FLAG_REAL (1ULL << 2)
+#define DFT_FLAG_ALT_REAL (1ULL << 3)
+#define DFT_FLAG_VERBOSE (1ULL << 4)
+#define DFT_FLAG_NO_BITREVERSAL (1ULL << 5)
+#define DFT_FLAG_FORCE_RECURSIVE (1ULL << 6)
+#define DFT_FLAG_FORCE_COBRA (1ULL << 7)
+
+#define DFT_PARAMID_TYPE ( 1 | ( 3 << 24 ))
+#define DFT_PARAMID_MODE ( 2 | ( 3 << 24 ))
+#define DFT_PARAMID_FFT_LENGTH ( 3 | ( 3 << 24 ))
+#define DFT_PARAMID_IS_REAL_TRANSFORM ( 4 | ( 3 << 24 ))
+#define DFT_PARAMID_IS_ALT_REAL_TRANSFORM ( 5 | ( 3 << 24 ))
+#define DFT_PARAMID_NO_BIT_REVERSAL ( 6 | ( 3 << 24 ))
+#define DFT_PARAMID_TEST_RUN ( 7 | ( 3 << 24 ))
+
+#define DFT_ERROR_NOERROR 0
+#define DFT_ERROR_FILE_VERSION 1
+#define DFT_ERROR_FILE_IO 2
+#define DFT_ERROR_UNEXPECTED_EOF 3
+#define DFT_ERROR_MODE_NOT_COMPILED_IN 4
+#define DFT_ERROR_MODE_NOT_AVAILABLE 5
+#define DFT_ERROR_UNKNOWN_MODE 6
+
+#endif
diff --git a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c
new file mode 100644
index 00000000..4985da33
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c
@@ -0,0 +1,1807 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+
+#include "SIMDBase.h"
+#include "SIMDBaseUndiff.h"
+#include "DFT.h"
+#include "DFTUndiff.h"
+
+// Math wrappers and constants used by the butterflies in this file.
+
+#define SIN(x) sin(x)
+#define COS(x) cos(x)
+// sqrt(2)/2 as a long double literal; the 8-element butterflies multiply by +SQRT2_2 and -SQRT2_2.
+#define SQRT2_2 .7071067811865475244008443621048490392848359376884740365883398689953L
+// Fallback value of pi (long double literal) for when M_PIl has not already been defined.
+#ifndef M_PIl
+#define M_PIl 3.141592653589793238462643383279502884197169399375105820974944592307L
+#endif
+
+// Fixed-size butterflies: each works in place on p->s, starting at index p->offset1.
+
+static inline void srBut2(DFTUndiff *p) { /* in-place sum/difference of pairs (0,2) and (1,3) counted from p->s[p->offset1] */
+  SIMDBase_VECT *data = p->s;
+  int32_t base = p->offset1;
+  SIMDBase_VECT *v = data + base;
+  SIMDBase_VECT sum, diff;
+  sum = SIMDBase_ADDm(&v[0], &v[2]); diff = SIMDBase_SUBm(&v[0], &v[2]);
+  SIMDBase_STOR(&v[0], sum); SIMDBase_STOR(&v[2], diff);
+  sum = SIMDBase_ADDm(&v[1], &v[3]); diff = SIMDBase_SUBm(&v[1], &v[3]);
+  SIMDBase_STOR(&v[1], sum); SIMDBase_STOR(&v[3], diff);
+}
+
+static inline void srButForward4(DFTUndiff *p) {
+  /* In-place forward butterfly over the 8 elements at p->s[p->offset1 ..]; all inputs are read before any store. */
+  SIMDBase_VECT *data = p->s;
+  int32_t base = p->offset1;
+  SIMDBase_VECT *v = data + base;
+  SIMDBase_VECT a_re, a_im, b_re, b_im, c_re, c_im, d_re, d_im;
+  a_re = SIMDBase_ADDm(&v[0], &v[4]); a_im = SIMDBase_ADDm(&v[1], &v[5]);
+  b_re = SIMDBase_ADDm(&v[2], &v[6]); b_im = SIMDBase_ADDm(&v[7], &v[3]);
+  c_re = SIMDBase_SUBm(&v[0], &v[4]); c_im = SIMDBase_SUBm(&v[1], &v[5]);
+  d_im = SIMDBase_SUBm(&v[2], &v[6]); d_re = SIMDBase_SUBm(&v[7], &v[3]); /* note the swap: d_re = v7 - v3, d_im = v2 - v6 */
+  SIMDBase_STOR(&v[0], SIMDBase_ADDi(a_re, b_re)); SIMDBase_STOR(&v[1], SIMDBase_ADDi(a_im, b_im));
+  SIMDBase_STOR(&v[2], SIMDBase_SUBi(a_re, b_re)); SIMDBase_STOR(&v[3], SIMDBase_SUBi(a_im, b_im));
+  SIMDBase_STOR(&v[4], SIMDBase_SUBi(c_re, d_re)); SIMDBase_STOR(&v[5], SIMDBase_SUBi(c_im, d_im));
+  SIMDBase_STOR(&v[6], SIMDBase_ADDi(c_re, d_re)); SIMDBase_STOR(&v[7], SIMDBase_ADDi(c_im, d_im));
+}
+
+static inline void srButBackward4(DFTUndiff *p) {
+  /* In-place backward butterfly over the 8 elements at p->s[p->offset1 ..]; all inputs are read before any store. */
+  SIMDBase_VECT *data = p->s;
+  int32_t base = p->offset1;
+  SIMDBase_VECT *v = data + base;
+  SIMDBase_VECT x0 = SIMDBase_LOAD(&v[0]), x1 = SIMDBase_LOAD(&v[1]), x2 = SIMDBase_LOAD(&v[2]), x3 = SIMDBase_LOAD(&v[3]);
+  SIMDBase_VECT e0 = SIMDBase_ADDi(x0, x2), e2 = SIMDBase_SUBi(x0, x2);
+  SIMDBase_VECT e1 = SIMDBase_ADDi(x1, x3), e3 = SIMDBase_SUBi(x1, x3);
+  SIMDBase_VECT u_re = SIMDBase_ADDm(&v[4], &v[6]), w_im = SIMDBase_SUBm(&v[4], &v[6]);
+  SIMDBase_VECT u_im = SIMDBase_ADDm(&v[7], &v[5]), w_re = SIMDBase_SUBm(&v[7], &v[5]);
+
+  /* Elements 4..7 receive the differences, elements 0..3 the sums. */
+  SIMDBase_STOR(&v[4], SIMDBase_SUBi(e0, u_re)); SIMDBase_STOR(&v[5], SIMDBase_SUBi(e1, u_im));
+  SIMDBase_STOR(&v[6], SIMDBase_SUBi(e2, w_re)); SIMDBase_STOR(&v[7], SIMDBase_SUBi(e3, w_im));
+  SIMDBase_STOR(&v[0], SIMDBase_ADDi(e0, u_re)); SIMDBase_STOR(&v[1], SIMDBase_ADDi(e1, u_im));
+  SIMDBase_STOR(&v[2], SIMDBase_ADDi(e2, w_re)); SIMDBase_STOR(&v[3], SIMDBase_ADDi(e3, w_im));
+}
+
+static inline void srButForward8(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  int32_t o = p->offset1;
+  SIMDBase_VECT t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i;
+  // In-place forward butterfly over the 16 elements s[o..o+15]; every input is loaded into a local before any store.
+  SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]);
+  SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]);
+  SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]);
+  SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]);
+  // Differences between element k and element k+8 (t3r has its operands swapped: sd - s5).
+  t2r = SIMDBase_SUBi(s0, s8); t2i = SIMDBase_SUBi(s1, s9);
+  t3r = SIMDBase_SUBi(sd, s5); t3i = SIMDBase_SUBi(s4, sc); 
+  // Sums of element k and element k+8; these stay in s0/s1/s4/s5 for the last stage.
+  s0 = SIMDBase_ADDi(s0, s8); s1 = SIMDBase_ADDi(s1, s9);
+  s4 = SIMDBase_ADDi(s4, sc); s5 = SIMDBase_ADDi(s5, sd);
+
+  s8 = SIMDBase_SUBi(t2r, t3r); s9 = SIMDBase_SUBi(t2i, t3i);
+  sc = SIMDBase_ADDi(t2r, t3r); sd = SIMDBase_ADDi(t2i, t3i);
+  // Same difference/sum pattern for elements 2,3,6,7 against 10,11,14,15.
+  t2r = SIMDBase_SUBi(s2, sa); t2i = SIMDBase_SUBi(s3, sb);
+  t3r = SIMDBase_SUBi(sf, s7); t3i = SIMDBase_SUBi(s6, se);
+
+  s2 = SIMDBase_ADDi(s2, sa); s3 = SIMDBase_ADDi(s3, sb);
+  s6 = SIMDBase_ADDi(s6, se); s7 = SIMDBase_ADDi(s7, sf);
+
+  t0r = SIMDBase_SUBi(t2r, t3r); t1r = SIMDBase_ADDi(t2r, t3r);
+  t0i = SIMDBase_SUBi(t2i, t3i); t1i = SIMDBase_ADDi(t2i, t3i);
+  // Scale by SQRT2_2 (sqrt(2)/2); only sf uses the negated constant.
+  sa = SIMDBase_MULi(SIMDBase_ADDi(t0r, t0i), SIMDBase_SET1( SQRT2_2));
+  sb = SIMDBase_MULi(SIMDBase_SUBi(t0i, t0r), SIMDBase_SET1( SQRT2_2));
+  se = SIMDBase_MULi(SIMDBase_SUBi(t1i, t1r), SIMDBase_SET1( SQRT2_2));
+  sf = SIMDBase_MULi(SIMDBase_ADDi(t1r, t1i), SIMDBase_SET1(-SQRT2_2));
+  // Write back elements 8..15.
+  SIMDBase_STOR(&s[o+ 8], SIMDBase_ADDi(s8, sa)); SIMDBase_STOR(&s[o+ 9], SIMDBase_ADDi(s9, sb));
+  SIMDBase_STOR(&s[o+10], SIMDBase_SUBi(s8, sa)); SIMDBase_STOR(&s[o+11], SIMDBase_SUBi(s9, sb));
+
+  SIMDBase_STOR(&s[o+12], SIMDBase_ADDi(sc, se)); SIMDBase_STOR(&s[o+13], SIMDBase_ADDi(sd, sf));
+  SIMDBase_STOR(&s[o+14], SIMDBase_SUBi(sc, se)); SIMDBase_STOR(&s[o+15], SIMDBase_SUBi(sd, sf));
+  // Final stage on the sums: the same arithmetic as srButForward4, applied to the locals s0..s7.
+  t0r = SIMDBase_ADDi(s0, s4); t2r = SIMDBase_SUBi(s0, s4);
+  t0i = SIMDBase_ADDi(s1, s5); t2i = SIMDBase_SUBi(s1, s5);
+
+  t1r = SIMDBase_ADDi(s2, s6); t3i = SIMDBase_SUBi(s2, s6);
+  t1i = SIMDBase_ADDi(s3, s7); t3r = SIMDBase_SUBi(s7, s3);
+  // Write back elements 0..7.
+  SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(t0r, t1r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(t0i, t1i));
+  SIMDBase_STOR(&s[o+2], SIMDBase_SUBi(t0r, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_SUBi(t0i, t1i));
+  SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(t2r, t3r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(t2i, t3i));
+  SIMDBase_STOR(&s[o+6], SIMDBase_ADDi(t2r, t3r)); SIMDBase_STOR(&s[o+7], SIMDBase_ADDi(t2i, t3i));
+}
+
+static void srButBackward8(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  int32_t o = p->offset1;
+  SIMDBase_VECT t0r, t0i, t1r, t1i;
+  // In-place backward butterfly over the 16 elements s[o..o+15]; all loads happen here, all stores at the end.
+  SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]);
+  SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]);
+  SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]);
+  SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]);
+  // First stage: sum/difference of pairs (8,10) (9,11) (12,14) (13,15) (0,2) (1,3); t0r/t0i are only scratch here.
+  t0r = SIMDBase_ADDi(s8, sa); t0i = SIMDBase_SUBi(s8, sa); s8 = t0r; sa = t0i;
+  t0r = SIMDBase_ADDi(s9, sb); t0i = SIMDBase_SUBi(s9, sb); s9 = t0r; sb = t0i;
+  t0r = SIMDBase_ADDi(sc, se); t0i = SIMDBase_SUBi(sc, se); sc = t0r; se = t0i;
+  t0r = SIMDBase_ADDi(sd, sf); t0i = SIMDBase_SUBi(sd, sf); sd = t0r; sf = t0i;
+  t0r = SIMDBase_ADDi(s0, s2); t0i = SIMDBase_SUBi(s0, s2); s0 = t0r; s2 = t0i;
+  t0r = SIMDBase_ADDi(s1, s3); t0i = SIMDBase_SUBi(s1, s3); s1 = t0r; s3 = t0i;
+  // Combine elements 4..7 with 0..3 (same arithmetic as srButBackward4, on locals).
+  t0r = SIMDBase_ADDi(s4, s6); t0i = SIMDBase_ADDi(s7, s5);
+  t1r = SIMDBase_SUBi(s7, s5); t1i = SIMDBase_SUBi(s4, s6);
+
+  s4 = SIMDBase_SUBi(s0, t0r); s5 = SIMDBase_SUBi(s1, t0i);
+  s6 = SIMDBase_SUBi(s2, t1r); s7 = SIMDBase_SUBi(s3, t1i);
+  s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i);
+  s2 = SIMDBase_ADDi(s2, t1r); s3 = SIMDBase_ADDi(s3, t1i);
+  // Fold s8, s9, sc, sd into s0, s1, s4, s5: differences go to 8,9,12,13 and sums stay in 0,1,4,5.
+  t0r = SIMDBase_ADDi(s8, sc); t0i = SIMDBase_ADDi(s9, sd);
+  t1r = SIMDBase_SUBi(sd, s9); t1i = SIMDBase_SUBi(s8, sc);
+
+  s8 = SIMDBase_SUBi(s0, t0r); s9 = SIMDBase_SUBi(s1, t0i);
+  sc = SIMDBase_SUBi(s4, t1r); sd = SIMDBase_SUBi(s5, t1i);
+  s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i);
+  s4 = SIMDBase_ADDi(s4, t1r); s5 = SIMDBase_ADDi(s5, t1i);
+  // Sum/difference of (sa, sb) and (se, sf), scaled by SQRT2_2; only t1r uses the negated constant.
+  t0r = SIMDBase_MULi(SIMDBase_SUBi(sa, sb), SIMDBase_SET1( SQRT2_2));
+  t0i = SIMDBase_MULi(SIMDBase_ADDi(sa, sb), SIMDBase_SET1( SQRT2_2));
+  t1r = SIMDBase_MULi(SIMDBase_ADDi(se, sf), SIMDBase_SET1(-SQRT2_2));
+  t1i = SIMDBase_MULi(SIMDBase_SUBi(se, sf), SIMDBase_SET1( SQRT2_2));
+
+  sa = t0r; sb = t0i; se = t1r; sf = t1i;
+
+  t0r = SIMDBase_ADDi(sa, se); t0i = SIMDBase_ADDi(sb, sf);
+  t1r = SIMDBase_SUBi(sf, sb); t1i = SIMDBase_SUBi(sa, se);
+  // Fold them into s2, s3, s6, s7: differences go to 10,11,14,15 and sums stay in 2,3,6,7.
+  sa = SIMDBase_SUBi(s2, t0r); sb = SIMDBase_SUBi(s3, t0i);
+  se = SIMDBase_SUBi(s6, t1r); sf = SIMDBase_SUBi(s7, t1i);
+  s2 = SIMDBase_ADDi(s2, t0r); s3 = SIMDBase_ADDi(s3, t0i);
+  s6 = SIMDBase_ADDi(s6, t1r); s7 = SIMDBase_ADDi(s7, t1i);
+  // Store all 16 results back to s[o..o+15].
+  SIMDBase_STOR(&s[o+ 0], s0); SIMDBase_STOR(&s[o+ 1], s1); SIMDBase_STOR(&s[o+ 2], s2); SIMDBase_STOR(&s[o+ 3], s3);
+  SIMDBase_STOR(&s[o+ 4], s4); SIMDBase_STOR(&s[o+ 5], s5); SIMDBase_STOR(&s[o+ 6], s6); SIMDBase_STOR(&s[o+ 7], s7);
+  SIMDBase_STOR(&s[o+ 8], s8); SIMDBase_STOR(&s[o+ 9], s9); SIMDBase_STOR(&s[o+10], sa); SIMDBase_STOR(&s[o+11], sb);
+  SIMDBase_STOR(&s[o+12], sc); SIMDBase_STOR(&s[o+13], sd); SIMDBase_STOR(&s[o+14], se); SIMDBase_STOR(&s[o+15], sf);
+}
+
+#if 0
+static inline void srButForwardSub(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+  // Disabled (#if 0) non-unrolled variant: each iteration handles 2 elements per row and 4 table entries.
+  int32_t i0 = p->offset1;
+  int32_t i1 = i0 + p->stride;
+  int32_t i2 = i1 + p->stride;
+  int32_t i3 = i2 + p->stride;
+  int32_t im = i1;
+  // Starting table index: p->offset2 masked with butlen*4-1 (NOTE(review): assumes butlen*4 is a power of two — confirm).
+  int32_t p0 = p->offset2 & (p->butlen*4-1);
+
+  while(i0 < im) {
+    SIMDBase_VECT t0r, t0i, t1r, t1i;
+    SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+    SIMDBase_VECT a0, a1, a2, a3;
+
+    s00 = SIMDBase_LOAD(&s[i0+0]), s01 = SIMDBase_LOAD(&s[i0+1]);
+    s10 = SIMDBase_LOAD(&s[i1+0]), s11 = SIMDBase_LOAD(&s[i1+1]);
+    s20 = SIMDBase_LOAD(&s[i2+0]), s21 = SIMDBase_LOAD(&s[i2+1]);
+    s30 = SIMDBase_LOAD(&s[i3+0]), s31 = SIMDBase_LOAD(&s[i3+1]);
+    // t0 and t1 are built from the same two terms; t0 subtracts the second term, t1 adds it.
+    t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+    t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+    
+    a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]);
+    // Rows i0 and i1 receive the plain sums.
+    SIMDBase_STOR(&s[i0  ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21));
+    SIMDBase_STOR(&s[i1  ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31));
+    // Rows i2 and i3 receive the complex products t0*(a0 + i*a1) and t1*(a2 + i*a3).
+#ifndef SIMDBase_FMADD_AVAILABLE
+    SIMDBase_STOR(&s[i2  ], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3  ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+    SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+    SIMDBase_STOR(&s[i2  ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3  ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+    SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+    // Advance by the 2 elements per row and 4 table entries consumed above.
+    i0 += 2; i1 += 2; i2 += 2; i3 += 2;
+    p0 += 4;
+  }
+}
+#endif
+
+#if 0
+static inline void srButBackwardSub(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+  // Disabled (#if 0) non-unrolled variant: each iteration handles 2 elements per row and 4 table entries.
+  int32_t i0 = p->offset1;
+  int32_t i1 = i0 + p->stride;
+  int32_t i2 = i1 + p->stride;
+  int32_t i3 = i2 + p->stride;
+  int32_t im = i1;
+  // Starting table index: p->offset2 masked with butlen*4-1 (NOTE(review): assumes butlen*4 is a power of two — confirm).
+  int32_t p0 = p->offset2 & (p->butlen*4-1);
+
+  while(i0 < im) {
+    SIMDBase_VECT t0r, t0i, t1r, t1i, u, v;
+    SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+    SIMDBase_VECT a0, a1, a2, a3;
+    // u = real part of (s20 + i*s21) * conj(a0 + i*a1).
+    s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]);
+    a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]);
+    u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+    // v = real part of (s30 + i*s31) * conj(a2 + i*a3).
+    s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]);
+    v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+    // u and v are then reused for the imaginary parts (u from row i3, v from row i2).
+    t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+    u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+    v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+    t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+    s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]);
+    s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]);
+    // Rows i2/i3 receive (row i0/i1 value - t), rows i0/i1 receive (value + t).
+    SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s00, t0r));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, t0i));
+    SIMDBase_STOR(&s[i3+0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+0], SIMDBase_ADDi(s10, t1r));
+    SIMDBase_STOR(&s[i3+1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, t1i));
+
+    i0 += 2; i1 += 2; i2 += 2; i3 += 2;
+    p0 += 4;
+  }
+}
+// Gives the non-unrolled body the srButBackwardSubUnrolled name while this #if 0 branch is the one enabled.
+static void srButBackwardSubUnrolled(DFTUndiff *p) {
+  srButBackwardSub(p);
+}
+#endif
+
+static inline void srButForwardSubUnrolled(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+  // In-place forward butterfly across four rows of s spaced p->stride apart, hand-unrolled 4x (8 elements per row per iteration).
+  int32_t i0 = p->offset1;
+  int32_t i1 = i0 + p->stride;
+  int32_t i2 = i1 + p->stride;
+  int32_t i3 = i2 + p->stride;
+  int32_t im = i1;  // loop ends once i0 reaches the start of row 1 (NOTE(review): presumably stride is a multiple of 8 — confirm)
+  // Starting table index: p->offset2 masked with butlen*4-1 (NOTE(review): assumes butlen*4 is a power of two — confirm).
+  int32_t p0 = p->offset2 & (p->butlen*4-1);
+
+  while(i0 < im) {
+    SIMDBase_VECT t0r, t0i, t1r, t1i;
+    SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+    SIMDBase_VECT a0, a1, a2, a3;
+
+    // Pair 0: elements +0/+1 of each row, table entries p0+0..p0+3.
+
+    s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]);
+    s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]);
+    s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]);
+    s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]);
+    // t0 and t1 are built from the same two terms; t0 subtracts the second term, t1 adds it.
+    t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+    t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+    
+    a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]);
+    // Rows 0 and 1 receive the plain sums.
+    SIMDBase_STOR(&s[i0  ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21));
+    SIMDBase_STOR(&s[i1  ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31));
+    // Rows 2 and 3 receive t0*(a0 + i*a1) and t1*(a2 + i*a3). NOTE(review): FMA branch matches only if FMSUBi(a,b,c) = c - a*b — confirm.
+#ifndef SIMDBase_FMADD_AVAILABLE
+    SIMDBase_STOR(&s[i2  ], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3  ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+    SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+    SIMDBase_STOR(&s[i2  ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3  ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+    SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+    // Pair 1: elements +2/+3 of each row, table entries p0+4..p0+7 (same arithmetic as pair 0).
+
+    s00 = SIMDBase_LOAD(&s[i0+2]); s01 = SIMDBase_LOAD(&s[i0+3]);
+    s10 = SIMDBase_LOAD(&s[i1+2]); s11 = SIMDBase_LOAD(&s[i1+3]);
+    s20 = SIMDBase_LOAD(&s[i2+2]); s21 = SIMDBase_LOAD(&s[i2+3]);
+    s30 = SIMDBase_LOAD(&s[i3+2]); s31 = SIMDBase_LOAD(&s[i3+3]);
+
+    t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+    t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+    
+    a0 = SIMDBase_LOAD1(&tbl[p0+4]); a1 = SIMDBase_LOAD1(&tbl[p0+5]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+6]); a3 = SIMDBase_LOAD1(&tbl[p0+7]);
+
+    SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s01, s21));
+    SIMDBase_STOR(&s[i1+2], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+3], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+2], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+    SIMDBase_STOR(&s[i3+3], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+    SIMDBase_STOR(&s[i2+2], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+2], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+    SIMDBase_STOR(&s[i3+3], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+    // Pair 2: elements +4/+5 of each row, table entries p0+8..p0+11 (same arithmetic as pair 0).
+
+    s00 = SIMDBase_LOAD(&s[i0+4]); s01 = SIMDBase_LOAD(&s[i0+5]);
+    s10 = SIMDBase_LOAD(&s[i1+4]); s11 = SIMDBase_LOAD(&s[i1+5]);
+    s20 = SIMDBase_LOAD(&s[i2+4]); s21 = SIMDBase_LOAD(&s[i2+5]);
+    s30 = SIMDBase_LOAD(&s[i3+4]); s31 = SIMDBase_LOAD(&s[i3+5]);
+
+    t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+    t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+    
+    a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]);
+
+    SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s01, s21));
+    SIMDBase_STOR(&s[i1+4], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+5], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+4], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+    SIMDBase_STOR(&s[i3+5], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+    SIMDBase_STOR(&s[i2+4], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+4], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+    SIMDBase_STOR(&s[i3+5], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+    // Pair 3: elements +6/+7 of each row, table entries p0+12..p0+15 (same arithmetic as pair 0).
+
+    s00 = SIMDBase_LOAD(&s[i0+6]); s01 = SIMDBase_LOAD(&s[i0+7]);
+    s10 = SIMDBase_LOAD(&s[i1+6]); s11 = SIMDBase_LOAD(&s[i1+7]);
+    s20 = SIMDBase_LOAD(&s[i2+6]); s21 = SIMDBase_LOAD(&s[i2+7]);
+    s30 = SIMDBase_LOAD(&s[i3+6]); s31 = SIMDBase_LOAD(&s[i3+7]);
+
+    t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+
+    t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11));
+    t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30));
+    
+    a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]);
+
+    SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s01, s21));
+    SIMDBase_STOR(&s[i1+6], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+7], SIMDBase_ADDi(s11, s31));
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1)));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+6], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3)));
+    SIMDBase_STOR(&s[i3+7], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2)));
+#else
+    SIMDBase_STOR(&s[i2+6], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0)));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0)));
+    SIMDBase_STOR(&s[i3+6], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2)));
+    SIMDBase_STOR(&s[i3+7], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2)));
+#endif
+
+    // Advance by the 8 elements per row and 16 table entries consumed in this iteration.
+
+    i0 += 8; i1 += 8; i2 += 8; i3 += 8;
+    p0 += 16;
+  }
+}
+
+#if 1
+static void srButBackwardSubUnrolled(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+  // In-place backward butterfly across four rows of s spaced p->stride apart, hand-unrolled 4x (8 elements per row per iteration).
+  int32_t i0 = p->offset1;
+  int32_t i1 = i0 + p->stride;
+  int32_t i2 = i1 + p->stride;
+  int32_t i3 = i2 + p->stride;
+  int32_t im = i1;  // loop ends once i0 reaches the start of row 1 (NOTE(review): presumably stride is a multiple of 8 — confirm)
+  // Starting table index: p->offset2 masked with butlen*4-1 (NOTE(review): assumes butlen*4 is a power of two — confirm).
+  int32_t p0 = p->offset2 & (p->butlen*4-1);
+
+  while(i0 < im) {
+    SIMDBase_VECT t0r, t0i, t1r, t1i, u, v;
+    SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31;
+    SIMDBase_VECT a0, a1, a2, a3;
+
+    // Pair 0: elements +0/+1 of each row, table entries p0+0..p0+3.
+    // u = real part of (s20 + i*s21) * conj(a0 + i*a1).
+    s20 = SIMDBase_LOAD(&s[i2+ 0]); s21 = SIMDBase_LOAD(&s[i2+ 1]);
+    a0 = SIMDBase_LOAD1(&tbl[p0+ 0]); a1 = SIMDBase_LOAD1(&tbl[p0+ 1]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+    u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+    // v = real part of (s30 + i*s31) * conj(a2 + i*a3).
+    s30 = SIMDBase_LOAD(&s[i3+ 0]); s31 = SIMDBase_LOAD(&s[i3+ 1]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+ 2]); a3 = SIMDBase_LOAD1(&tbl[p0+ 3]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+    v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+    t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+    // u, v are reused for the imaginary parts (u: row 3, v: row 2). NOTE(review): FMA branch matches only if FMSUBi(a,b,c) = c - a*b — confirm.
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+    v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+    u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+    v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+    t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+    s00 = SIMDBase_LOAD(&s[i0+ 0]); s01 = SIMDBase_LOAD(&s[i0+ 1]);
+    s10 = SIMDBase_LOAD(&s[i1+ 0]); s11 = SIMDBase_LOAD(&s[i1+ 1]);
+    // Rows 2/3 receive (row 0/1 value - t), rows 0/1 receive (value + t).
+    SIMDBase_STOR(&s[i2+ 0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 0], SIMDBase_ADDi(s00, t0r));
+    SIMDBase_STOR(&s[i2+ 1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 1], SIMDBase_ADDi(s01, t0i));
+    SIMDBase_STOR(&s[i3+ 0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 0], SIMDBase_ADDi(s10, t1r));
+    SIMDBase_STOR(&s[i3+ 1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 1], SIMDBase_ADDi(s11, t1i));
+
+    // Pair 1: elements +2/+3 of each row, table entries p0+4..p0+7 (same arithmetic as pair 0).
+
+    s20 = SIMDBase_LOAD(&s[i2+ 2]); s21 = SIMDBase_LOAD(&s[i2+ 3]);
+    a0 = SIMDBase_LOAD1(&tbl[p0+ 4]); a1 = SIMDBase_LOAD1(&tbl[p0+ 5]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+    u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+
+    s30 = SIMDBase_LOAD(&s[i3+ 2]); s31 = SIMDBase_LOAD(&s[i3+ 3]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+ 6]); a3 = SIMDBase_LOAD1(&tbl[p0+ 7]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+    v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+    t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+    v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+    u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+    v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+    t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+    s00 = SIMDBase_LOAD(&s[i0+ 2]); s01 = SIMDBase_LOAD(&s[i0+ 3]);
+    s10 = SIMDBase_LOAD(&s[i1+ 2]); s11 = SIMDBase_LOAD(&s[i1+ 3]);
+
+    SIMDBase_STOR(&s[i2+ 2], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 2], SIMDBase_ADDi(s00, t0r));
+    SIMDBase_STOR(&s[i2+ 3], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 3], SIMDBase_ADDi(s01, t0i));
+    SIMDBase_STOR(&s[i3+ 2], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 2], SIMDBase_ADDi(s10, t1r));
+    SIMDBase_STOR(&s[i3+ 3], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 3], SIMDBase_ADDi(s11, t1i));
+
+    // Pair 2: elements +4/+5 of each row, table entries p0+8..p0+11 (same arithmetic as pair 0).
+
+    s20 = SIMDBase_LOAD(&s[i2+ 4]); s21 = SIMDBase_LOAD(&s[i2+ 5]);
+    a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+    u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+
+    s30 = SIMDBase_LOAD(&s[i3+ 4]); s31 = SIMDBase_LOAD(&s[i3+ 5]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+    v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+    t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+    v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+    u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+    v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+    t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+    s00 = SIMDBase_LOAD(&s[i0+ 4]); s01 = SIMDBase_LOAD(&s[i0+ 5]);
+    s10 = SIMDBase_LOAD(&s[i1+ 4]); s11 = SIMDBase_LOAD(&s[i1+ 5]);
+
+    SIMDBase_STOR(&s[i2+ 4], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 4], SIMDBase_ADDi(s00, t0r));
+    SIMDBase_STOR(&s[i2+ 5], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 5], SIMDBase_ADDi(s01, t0i));
+    SIMDBase_STOR(&s[i3+ 4], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 4], SIMDBase_ADDi(s10, t1r));
+    SIMDBase_STOR(&s[i3+ 5], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 5], SIMDBase_ADDi(s11, t1i));
+
+    // Pair 3: elements +6/+7 of each row, table entries p0+12..p0+15 (same arithmetic as pair 0).
+
+    s20 = SIMDBase_LOAD(&s[i2+ 6]); s21 = SIMDBase_LOAD(&s[i2+ 7]);
+    a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1));
+#else
+    u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1));
+#endif
+
+    s30 = SIMDBase_LOAD(&s[i3+ 6]); s31 = SIMDBase_LOAD(&s[i3+ 7]);
+    a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]);
+#ifndef SIMDBase_FMADD_AVAILABLE
+    v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3));
+#else
+    v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3));
+#endif
+
+    t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v);
+
+#ifndef SIMDBase_FMADD_AVAILABLE
+    u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3));
+    v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1));
+#else
+    u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2));
+    v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0));
+#endif
+    t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v);
+
+    s00 = SIMDBase_LOAD(&s[i0+ 6]); s01 = SIMDBase_LOAD(&s[i0+ 7]);
+    s10 = SIMDBase_LOAD(&s[i1+ 6]); s11 = SIMDBase_LOAD(&s[i1+ 7]);
+
+    SIMDBase_STOR(&s[i2+ 6], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 6], SIMDBase_ADDi(s00, t0r));
+    SIMDBase_STOR(&s[i2+ 7], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 7], SIMDBase_ADDi(s01, t0i));
+    SIMDBase_STOR(&s[i3+ 6], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 6], SIMDBase_ADDi(s10, t1r));
+    SIMDBase_STOR(&s[i3+ 7], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 7], SIMDBase_ADDi(s11, t1i));
+
+    // Advance by the 8 elements per row and 16 table entries consumed in this iteration.
+
+    i0 += 8; i1 += 8; i2 += 8; i3 += 8;
+    p0 += 16;
+  }
+}
+#endif
+
+static void r2ButForwardSub(DFTUndiff *p) {
+  SIMDBase_VECT *s = p->s;
+  // In-place forward butterfly between the row at p->offset1 and the row 2*p->stride further on, in two mirrored passes.
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+  // NOTE(review): tbl[cp] / tbl[sp] presumably act as cosine / sine read from one quarter-wave table — confirm where ptTable is built.
+  int32_t i0 = p->offset1;
+  int32_t i2 = i0 + p->stride*2;
+  int32_t cp = 0, sp = p->butlen/4;
+  // Pass 1: cp walks up from 0, sp walks down from butlen/4; 4 pairs per iteration. NOTE(review): do/while runs once even if butlen/4 is 0 — presumably butlen is a positive multiple of 16.
+  do {
+    SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+    // Row i0 receives the sums; row i2 receives (t0r + i*t0i) * (t0 - i*t1), where t0r/t0i are the differences.
+    s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+    s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+0], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+    s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+    s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+2], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+    s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+    s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+4], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+    s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+    s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = SIMDBase_LOAD1(&tbl[sp-3]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+6], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1)));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)));
+
+    // Advance both rows by 8 elements and move the two table cursors 4 entries towards each other's start.
+
+    i0 += 8; i2 += 8; cp += 4; sp -= 4;
+  } while(sp > 0);
+  // Pass 2: the cursors walk back (cp down, sp up) and the multiplier becomes (-t0 - i*t1); row i0 still receives the sums.
+  do {
+    SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+
+    s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+    s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+    s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+    s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = SIMDBase_LOAD1(&tbl[sp+1]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+    s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+    s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+    s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+    s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]);
+    t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1));
+    t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3));
+    SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0)));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))));
+
+    // Advance both rows by 8 elements; cp returns towards 0, which ends the pass.
+
+    i0 += 8; i2 += 8; cp -= 4; sp += 4;
+  } while(cp > 0);
+}
+
+static void r2ButBackwardSub(DFTUndiff *p) { // radix-2 style butterfly stage (out = a +/- w*b), done in place on p->s
+  SIMDBase_VECT *s = p->s;
+  // Table for the current butterfly length; read at cp and sp below (presumably cosine/sine samples - confirm against the table setup).
+  SIMDBase_REAL *tbl = p->ptTable[p->log2butlen];
+  // i0 indexes operand a; i2 indexes operand b, p->stride*2 vector slots after it.
+  int i0 = p->offset1;
+  int i2 = i0 + p->stride*2;
+  // Table indices: loop 1 moves cp up and sp down from these start values, loop 2 moves them back.
+  int cp = 0, sp = p->butlen/4;
+  // Loop 1 runs until sp reaches 0. NOTE(review): do/while in steps of 4 presumably relies on butlen/4 being a positive multiple of 4 - confirm.
+  do {
+    SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+    // Four unrolled butterflies, reading each (even, odd) slot pair as (real, imaginary): w = (s1 + i*s3) * (t0 + i*t1); s[i0] <- a + w, s[i2] <- a - w with a = (s0, s2).
+    s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+    s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]);
+    t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+    t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+    s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]);
+    t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+    t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+    s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]);
+    t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+    t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+    s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+    t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = SIMDBase_LOAD1(&tbl[sp-3]);
+    t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1));
+    t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i));
+    // One iteration consumes 4 butterflies: 8 vector slots per operand and 4 table entries per index.
+    i0 += 8; i2 += 8; cp += 4; sp -= 4;
+  } while(sp > 0);
+  // Loop 2 runs until cp is back at 0: same butterfly, but the multiplier is (-t0 + i*t1) and the table indices walk the other way.
+  do {
+    SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1;
+
+    s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]);
+    s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]);
+    t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+    t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]);
+    s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = SIMDBase_LOAD1(&tbl[sp+1]);
+    t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+    t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]);
+    s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]);
+    t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+    t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i));
+
+    s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]);
+    s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]);
+    t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]);
+    t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)));
+    t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0));
+    SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r));
+    SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i));
+
+    i0 += 8; i2 += 8; cp -= 4; sp += 4;
+  } while(cp > 0);
+}
+
+static void srButForward16(DFTUndiff *p) {
+  // Fixed-size forward pass for 16: one unrolled butterfly stage, then srButForward4 twice and srButForward8 once.
+  const int32_t base = p->offset1;   // start offset on entry; every sub-pass offset is derived from it
+  enum { LEN = 16, LOG2LEN = 4 };
+
+  p->butlen     = LEN;
+  p->log2butlen = LOG2LEN;
+  p->stride     = LEN/2;
+  srButForwardSubUnrolled(p);
+
+  // Sub-passes in descending offset order: base+24, base+16, then base itself.
+  p->offset1 = base + LEN*6/4; srButForward4(p);
+  p->offset1 = base + LEN*4/4; srButForward4(p);
+  p->offset1 = base;           srButForward8(p);
+}
+
+static void srButBackward16(DFTUndiff *p) {
+  // Fixed-size backward pass for 16: srButBackward4 twice and srButBackward8 once, then one unrolled butterfly stage.
+  enum { LEN = 16, LOG2LEN = 4 };
+  const int32_t base = p->offset1;   // start offset on entry; every sub-pass offset is derived from it
+
+  p->offset1 = base + LEN*6/4; srButBackward4(p);
+  p->offset1 = base + LEN*4/4; srButBackward4(p);
+  p->offset1 = base;           srButBackward8(p);
+
+  // The butterfly parameters for this level are written only after the sub-passes have run.
+  p->butlen     = LEN;
+  p->log2butlen = LOG2LEN;
+  p->stride     = LEN/2;
+  srButBackwardSubUnrolled(p);
+}
+
+static void srButForward32(DFTUndiff *p) {
+  // Fixed-size forward pass for 32: one unrolled butterfly stage, then srButForward8 twice and srButForward16 once.
+  const int32_t base = p->offset1;   // start offset on entry; every sub-pass offset is derived from it
+  enum { LEN = 32, LOG2LEN = 5 };
+
+  p->butlen     = LEN;
+  p->log2butlen = LOG2LEN;
+  p->stride     = LEN/2;
+  srButForwardSubUnrolled(p);
+
+  // Sub-passes in descending offset order: base+48, base+32, then base itself.
+  p->offset1 = base + LEN*6/4; srButForward8 (p);
+  p->offset1 = base + LEN*4/4; srButForward8 (p);
+  p->offset1 = base;           srButForward16(p);
+}
+
+static void srButBackward32(DFTUndiff *p) {
+  // Fixed-size backward pass for 32: srButBackward8 twice and srButBackward16 once, then one unrolled butterfly stage.
+  enum { LEN = 32, LOG2LEN = 5 };
+  const int32_t base = p->offset1;   // start offset on entry; every sub-pass offset is derived from it
+
+  p->offset1 = base + LEN*6/4; srButBackward8 (p);
+  p->offset1 = base + LEN*4/4; srButBackward8 (p);
+  p->offset1 = base;           srButBackward16(p);
+
+  // The butterfly parameters for this level are written only after the sub-passes have run.
+  p->butlen     = LEN;
+  p->log2butlen = LOG2LEN;
+  p->stride     = LEN/2;
+  srButBackwardSubUnrolled(p);
+}
+
+//
+
+#if 1
+static inline void bitReversalUnit(SIMDBase_VECT *p, SIMDBase_VECT *q) {
+  // Swap the two-vector pair at p[0], p[1] with the pair at q[0], q[1].
+  // Every load is issued before the first store.
+  SIMDBase_VECT fromP0 = SIMDBase_LOAD(p), fromP1 = SIMDBase_LOAD(p+1);
+  SIMDBase_VECT fromQ0 = SIMDBase_LOAD(q), fromQ1 = SIMDBase_LOAD(q+1);
+
+  SIMDBase_STOR(q, fromP0); SIMDBase_STOR(q+1, fromP1);
+  SIMDBase_STOR(p, fromQ0); SIMDBase_STOR(p+1, fromQ1);
+}
+#else
+#define bitReversalUnit(p0, q0) {                    \
+  SIMDBase_VECT *px = (p0), *qx = (q0);              \
+  SIMDBase_VECT wx, xx, yx, zx;                      \
+                                                     \
+  wx = SIMDBase_LOAD(px); xx = SIMDBase_LOAD(px+1);  \
+  yx = SIMDBase_LOAD(qx); zx = SIMDBase_LOAD(qx+1);  \
+                                                     \
+  SIMDBase_STOR(qx, wx); SIMDBase_STOR(qx+1, xx);    \
+  SIMDBase_STOR(px, yx); SIMDBase_STOR(px+1, zx);    \
+}
+#endif
+
+static inline void bitReversal4s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  // Single swap: the pair sc*2 slots past s[o1*2] trades places with the pair sc*4 slots past s[o2*2].
+  const int step = sc*2*1;
+  SIMDBase_VECT *lhs = &s[o1*2] + step, *rhs = &s[o2*2] + step*2;
+  bitReversalUnit(lhs, rhs);
+}
+
+static inline void bitReversal8s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  // Two swaps, at slot offsets (unit, quad) and (unit+dbl, quad+dbl) from s[o1*2] and s[o2*2] respectively.
+  const int unit = sc*2*1, dbl = unit*2, quad = dbl*2;
+  SIMDBase_VECT *lhs = &s[o1*2] + unit, *rhs = &s[o2*2] + quad;
+  bitReversalUnit(lhs, rhs);
+  bitReversalUnit(lhs + dbl, rhs + dbl);
+}
+
+static inline void bitReversal8d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { // 8 swaps between the block at s[o1*2] and the block at s[o2*2]
+  SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2];
+  int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2; // bN = N*sc*2 vector slots
+  bitReversalUnit(p, q); p += b1; q += b4; // in units of b1, p visits 0,1,3,2,6,7,5,4 over the eight swaps ...
+  bitReversalUnit(p, q); p += b2; q += b2; // ... and q visits 0,4,6,2,3,7,5,1, i.e. the 3-bit reversal of p at each step
+  bitReversalUnit(p, q); p -= b1; q -= b4;
+  bitReversalUnit(p, q); p += b4; q += b1;
+  bitReversalUnit(p, q); p += b1; q += b4;
+  bitReversalUnit(p, q); p -= b2; q -= b2;
+  bitReversalUnit(p, q); p -= b1; q -= b4;
+  bitReversalUnit(p, q);
+}
+
+static inline void bitReversal16s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { // 6 swaps; bitReversalRecursive selects this variant when o1 == o2
+  SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2];
+  int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2; // bN = N*sc*2 vector slots
+  p += b1; q += b8; // (p, q) offsets in units of b1 for the six swaps: (1,8) (3,12) (2,4) (7,14) (5,10) (11,13)
+  bitReversalUnit(p, q); p += b2; q += b4; // each pair is an index and its 4-bit reversal, listed once
+  bitReversalUnit(p, q); p -= b1; q -= b8;
+  bitReversalUnit(p, q); p += b1 + b4; q += b2 + b8;
+  bitReversalUnit(p, q); p -= b2; q -= b4;
+  bitReversalUnit(p, q); p += b2 + b4; q += b1 + b2;
+  bitReversalUnit(p, q);
+}
+
+static inline void bitReversal16d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { // 16 swaps between the block at s[o1*2] and the block at s[o2*2]
+  SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2];
+  int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2; // bN = N*sc*2 vector slots
+  bitReversalUnit(p, q); p += b1; q += b8; // in units of b1, p visits 0,1,3,2,6,7,5,4,12,13,15,14,10,11,9,8 ...
+  bitReversalUnit(p, q); p += b2; q += b4; // ... and q is the 4-bit reversal of p at every step
+  bitReversalUnit(p, q); p -= b1; q -= b8;
+  bitReversalUnit(p, q); p += b4; q += b2;
+  bitReversalUnit(p, q); p += b1; q += b8;
+  bitReversalUnit(p, q); p -= b2; q -= b4;
+  bitReversalUnit(p, q); p -= b1; q -= b8;
+  bitReversalUnit(p, q); p += b8; q += b1;
+  bitReversalUnit(p, q); p += b1; q += b8;
+  bitReversalUnit(p, q); p += b2; q += b4;
+  bitReversalUnit(p, q); p -= b1; q -= b8;
+  bitReversalUnit(p, q); p -= b4; q -= b2;
+  bitReversalUnit(p, q); p += b1; q += b8;
+  bitReversalUnit(p, q); p -= b2; q -= b4;
+  bitReversalUnit(p, q); p -= b1; q -= b8;
+  bitReversalUnit(p, q);
+}
+
+static inline void bitReversal32s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { // 12 swaps; bitReversalRecursive selects this variant when o1 == o2
+  SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2];
+  int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2, b16 = b8*2; // bN = N*sc*2 vector slots
+  p += b1; q += b16; // (p, q) offsets in units of b1: (1,16) (3,24) (2,8) (6,12) (7,28) (5,20) ...
+  bitReversalUnit(p, q); p += b2; q += b8; // ... (13,22) (15,30) (11,26) (9,18) (23,29) (19,25)
+  bitReversalUnit(p, q); p -= b1; q -= b16; // each pair is an index and its 5-bit reversal, listed once
+  bitReversalUnit(p, q); p += b4; q += b4;
+  bitReversalUnit(p, q); p += b1; q += b16;
+  bitReversalUnit(p, q); p -= b2; q -= b8;
+  bitReversalUnit(p, q); p += b8; q += b2;
+  bitReversalUnit(p, q); p += b2; q += b8;
+  bitReversalUnit(p, q); p -= b4; q -= b4;
+  bitReversalUnit(p, q); p -= b2; q -= b8;
+  bitReversalUnit(p, q); p += b16 - b2; q += b1 + b2 + b8;
+  bitReversalUnit(p, q); p -= b4; q -= b4;
+  bitReversalUnit(p, q);
+}
+
+static void bitReversal32d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) {
+  // Built from four bitReversal8d calls at scale 2*sc; the offsets added to o1/o2 are 0, sc, 16*sc and 17*sc.
+  const int32_t sub = 2*sc, lo = sc*1, hi = sc*(32/2), hiLo = sc*(32/2+1);
+  bitReversal8d(s, sub, hi   + o1, lo   + o2);
+  bitReversal8d(s, sub,        o1,        o2);
+  bitReversal8d(s, sub, lo   + o1, hi   + o2);
+  bitReversal8d(s, sub, hiLo + o1, hiLo + o2);
+}
+
+static void bitReversalRecursive(SIMDBase_VECT *s, int32_t n, int32_t sc, int32_t o1, int32_t o2) {
+  const int sameBlock = (o1 == o2);
+
+  if (n < 64) {
+    // Leaf sizes use the unrolled kernels: "s" variants when o1 == o2, "d" variants otherwise.
+    // A size with no kernel in its branch (e.g. n == 4 with o1 != o2) performs no swaps.
+    if (sameBlock) {
+      if      (n ==  4) bitReversal4s (s,sc,o1,o2);
+      else if (n ==  8) bitReversal8s (s,sc,o1,o2);
+      else if (n == 16) bitReversal16s(s,sc,o1,o2);
+      else if (n == 32) bitReversal32s(s,sc,o1,o2);
+    } else {
+      if      (n ==  8) bitReversal8d (s,sc,o1,o2);
+      else if (n == 16) bitReversal16d(s,sc,o1,o2);
+      else if (n == 32) bitReversal32d(s,sc,o1,o2);
+    }
+    return;
+  }
+  // n >= 64: quarter-size sub-problems at doubled scale; the (sc*(n/2), sc) one is skipped when o1 == o2.
+  if (!sameBlock) bitReversalRecursive(s, n/4, 2*sc, sc*(n/2)+o1, sc*1+o2);
+  bitReversalRecursive(s, n/4, 2*sc, o1, o2);
+  bitReversalRecursive(s, n/4, 2*sc, sc*1+o1, sc*(n/2)+o2);
+  bitReversalRecursive(s, n/4, 2*sc, sc*(n/2+1)+o1, sc*(n/2+1)+o2);
+}
+
+//
+
+static int bitR(int a, int logN) { // returns the low logN bits of a in mirrored order (bit i -> bit logN-1-i); 0 when logN <= 0
+  int ret = 0;
+  int i; // masks are computed per iteration so that logN == 0 never evaluates 1<<(logN-1), a negative shift (undefined behavior)
+  for(i=0;i<logN;i++) {
+    if ((a & (1 << i)) != 0) ret |= 1 << (logN-1-i);
+  }
+  return ret;
+}
+
+static void bitReversalCobraInplace(DFTUndiff *p) { // in-place bit-reversal permutation of p->s, staged through the scratch buffer p->cobraT
+  SIMDBase_VECT *s = p->s;
+  int cobraQ = p->cobraQ;
+  SIMDBase_VECT *cobraT = p->cobraT;
+  int *cobraR = p->cobraR;
+  int logN = p->log2len;
+  // Slot indices are built as (a | b | c) << 1: a and c are cobraQ bits wide, b holds the logN-2*cobraQ middle bits, and the final bit selects one of two vector slots.
+  int b;
+  // One pass per value of the middle bits b.
+  for(b=0;b<(1 << (logN-2*cobraQ));b++) {
+    int a,c;
+    int b2 = bitR(b, logN-2*cobraQ);
+    // b2 is b with its bits mirrored; each (b, b2) pair is processed once, from the smaller index.
+    if (b2 < b) continue;
+    // Self-paired block: gather its rows into cobraT, then write them back with rows and columns exchanged.
+    if (b2 == b) {
+      for(a=0;a<(1 << cobraQ);a++) {
+	int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1);
+	// Row a of block b lands in row cobraR[a] of cobraT (cobraR is presumably the cobraQ-bit reversal table - confirm).
+	int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2;
+	// Straight copy, 32 vector slots per iteration (assumes the row length 2*(1 << cobraQ) is a multiple of 32 - TODO confirm).
+	while(a2c < a2cm) {
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	}
+      }
+      // Write-back: output row cobraR[c] of block b2 is read from column c of cobraT, stepping a2ci slots between elements.
+      for(c=0;c<(1 << cobraQ);c++) {
+	int c2 = cobraR[c];
+	int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1);
+
+	int a2c = c << 1;
+	int a2ci = 1 << (cobraQ+1);
+	int c2b2a2m = c2b2a2 + (1 << cobraQ)*2;
+
+	while(c2b2a2 < c2b2a2m) {
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci;
+	}
+      }
+    } else {
+      for(a=0;a<(1 << cobraQ);a++) {
+	int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2;
+	int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1);
+	// Gather block b into cobraT, the same copy as in the b2 == b branch.
+	while(a2c < a2cm) {
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	  SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++]));
+	}
+      }
+      // Exchange: block b2 receives the column-wise contents of cobraT while its previous values are parked in cobraT.
+      for(c=0;c<(1 << cobraQ);c++) {
+	int c2 = cobraR[c];
+	int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1);
+
+	int a2c = c << 1;
+	int a2ci = 1 << (cobraQ+1);
+	int c2b2a2m = c2b2a2 + (1 << cobraQ)*2;
+
+	while(c2b2a2 < c2b2a2m) {
+	  SIMDBase_VECT t0, t1, t2, t3, t4, t5, t6, t7;
+	  // Read 8 slots of s before they are overwritten, then swap them with 4 column elements (2 slots each) of cobraT; done 4 times per iteration.
+	  t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+	  t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+	  t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+	  t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t0);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t2);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t4);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t6);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+
+	  t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+	  t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+	  t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+	  t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t0);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t2);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t4);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t6);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+
+	  t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+	  t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+	  t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+	  t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t0);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t2);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t4);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t6);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+
+	  t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]);
+	  t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]);
+	  t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]);
+	  t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]);
+
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t0);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t2);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t4);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci;
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c  ])); SIMDBase_STOR(&cobraT[a2c  ], t6);
+	  SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci;
+	}
+      }
+      // Scatter: cobraT now holds the parked block b2 values; row cobraR[a] of cobraT is copied back to row a of block b.
+      for(a=0;a<(1 << cobraQ);a++) {
+	int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2;
+	int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1);
+
+	while(a2c < a2cm) {
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	  SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++]));
+	}
+      }
+    }
+  }
+}
+
+//
+
+static void srForwardMain2(DFTUndiff *p) { // recursive forward driver; the current sub-problem is described by p->offset1, p->butlen and p->log2butlen
+  int32_t o = p->offset1;
+  int32_t butlen = p->butlen;
+  int32_t log2butlen = p->log2butlen;
+  // At or above the plan's radix2thres: one r2ButForwardSub stage, then recurse on two half-length sub-problems (offset o+butlen first, then o).
+  if (butlen >= p->radix2thres) {
+    p->stride           = p->butlen/2;
+    r2ButForwardSub(p);
+
+    p->offset1          = o + butlen*4/4;
+    p->butlen           = butlen/2;
+    p->log2butlen       = log2butlen-1;
+    srForwardMain2(p);
+    // The recursive call overwrites offset1/butlen/log2butlen, so all three are assigned again before the next call.
+    p->offset1          = o;
+    p->butlen           = butlen/2;
+    p->log2butlen       = log2butlen-1;
+    srForwardMain2(p);
+
+    return;
+  }
+  // Split-radix step for butlen >= 256: one unrolled stage, then sub-problems of butlen/4 at o+butlen*6/4 and o+butlen, and of butlen/2 at o.
+  if (butlen >= 256) {
+    p->stride           = p->butlen/2;
+    srButForwardSubUnrolled(p);
+
+    p->offset1          = o + butlen*6/4;
+    p->butlen           = butlen/4;
+    p->log2butlen       = log2butlen-2;
+    srForwardMain2(p);
+
+    p->offset1          = o + butlen*4/4;
+    p->butlen           = butlen/4;
+    p->log2butlen       = log2butlen-2;
+    srForwardMain2(p);
+
+    p->offset1          = o;
+    p->butlen           = butlen/2;
+    p->log2butlen       = log2butlen-1;
+    srForwardMain2(p);
+
+    return;
+  }
+  // butlen == 128: the two butlen/4 sub-problems go straight to srButForward32; the butlen/2 one recurses.
+  if (butlen == 128) {
+    p->stride           = p->butlen/2;
+    srButForwardSubUnrolled(p);
+
+    p->offset1 = o + butlen*6/4;
+    srButForward32(p);
+
+    p->offset1 = o + butlen*4/4;
+    srButForward32(p);
+
+    p->offset1          = o;
+    p->butlen           = butlen/2;
+    p->log2butlen       = log2butlen-1;
+    srForwardMain2 (p);
+
+    return;
+  }
+
+  // butlen == 64
+  // No recursion at this size: srButForward16 twice, then srButForward32 at offset o.
+  p->stride = p->butlen/2;
+  srButForwardSubUnrolled(p);
+
+  p->offset1 = o + butlen*6/4;
+  srButForward16(p);
+
+  p->offset1 = o + butlen*4/4;
+  srButForward16(p);
+
+  p->offset1 = o;
+  srButForward32(p);
+}
+
+static void srBackwardMain2(DFTUndiff *p) { // recursive backward driver; the current sub-problem is described by p->offset1, p->butlen and p->log2butlen
+  int32_t o = p->offset1;
+  int32_t butlen = p->butlen;
+  int32_t log2butlen = p->log2butlen;
+  // Every branch solves the sub-problems first and runs this level's butterfly stage last.
+  if (butlen >= p->radix2thres) {
+    p->offset1          = o + butlen*4/4;
+    p->butlen           = butlen/2;
+    p->log2butlen       = log2butlen-1;
+    srBackwardMain2(p);
+
+    p->offset1          = o;
+    p->butlen           = butlen/2;
+    p->log2butlen       = log2butlen-1;
+    srBackwardMain2(p);
+    // Put this level's butlen/log2butlen back (the recursion changed them) before the radix-2 stage.
+    p->butlen           = butlen;
+    p->stride           = p->butlen/2;
+    p->log2butlen       = log2butlen;
+    r2ButBackwardSub(p);
+
+    return;
+  }
+  // Split-radix step for butlen >= 256: sub-problems of butlen/4 at o+butlen*6/4 and o+butlen, and of butlen/2 at o, then the unrolled stage.
+  if (butlen >= 256) {
+    p->offset1          = o + butlen*6/4;
+    p->butlen           = butlen/4;
+    p->log2butlen       = log2butlen-2;
+    srBackwardMain2(p);
+
+    p->offset1          = o + butlen*4/4;
+    p->butlen           = butlen/4;
+    p->log2butlen       = log2butlen-2;
+    srBackwardMain2(p);
+
+    p->offset1          = o;
+    p->butlen           = butlen/2;
+    p->log2butlen       = log2butlen-1;
+    srBackwardMain2(p);
+
+    p->butlen           = butlen;
+    p->stride           = p->butlen/2;
+    p->log2butlen       = log2butlen;
+    srButBackwardSubUnrolled(p);
+
+    return;
+  }
+  // butlen == 128: the two butlen/4 sub-problems go straight to srButBackward32; the butlen/2 one recurses.
+  if (butlen == 128) {
+    p->offset1 = o + butlen*6/4;
+    srButBackward32(p);
+
+    p->offset1 = o + butlen*4/4;
+    srButBackward32(p);
+
+    p->offset1          = o;
+    p->butlen           = butlen/2;
+    p->log2butlen       = log2butlen-1;
+    srBackwardMain2 (p);
+
+    p->butlen           = butlen;
+    p->stride           = p->butlen/2;
+    p->log2butlen       = log2butlen;
+    srButBackwardSubUnrolled(p);
+
+    return;
+  }
+
+  // butlen == 64
+  // No recursion at this size: srButBackward16 twice, srButBackward32 at offset o, then the unrolled stage.
+  p->offset1 = o + butlen*6/4;
+  srButBackward16(p);
+
+  p->offset1 = o + butlen*4/4;
+  srButBackward16(p);
+
+  p->offset1 = o;
+  srButBackward32(p);
+
+  p->butlen           = butlen;
+  p->stride           = p->butlen/2;
+  p->log2butlen       = log2butlen;
+  srButBackwardSubUnrolled(p);
+}
+
+static void srForwardMain(DFTUndiff *p) {
+  // Top-level forward butterfly pass for the whole plan.
+  //   length >= 64 : set up the full-length sub-problem and hand it to srForwardMain2
+  //   length <  64 : call the fixed-size routine for that length directly
+
+  if (p->length >= 64) {
+    // The sub-problem is the entire array: both offsets 0, butterfly length = plan length.
+    p->offset1 = p->offset2 = 0;
+    p->log2butlen = p->log2len;
+    p->butlen = p->length;
+
+    srForwardMain2(p);
+    return;
+  }
+
+  // Short plans. A length below 64 that is not listed here
+  // leaves the data untouched.
+  switch(p->length) {
+  case 32: srButForward32(p); return;
+  case 16: srButForward16(p); return;
+  case  8: srButForward8(p);  return;
+  case  4: srButForward4(p);  return;
+  case  2: srBut2(p);         return;
+  default:
+    return;
+  }
+}
+
+static void srBackwardMain(DFTUndiff *p) {
+  // Top-level backward butterfly pass for the whole plan.
+  //   length >= 64 : set up the full-length sub-problem and hand it to srBackwardMain2
+  //   length <  64 : call the fixed-size routine for that length directly
+
+  if (p->length >= 64) {
+    // The sub-problem is the entire array: both offsets 0, butterfly length = plan length.
+    p->offset1 = p->offset2 = 0;
+    p->log2butlen = p->log2len;
+    p->butlen = p->length;
+
+    srBackwardMain2(p);
+    return;
+  }
+
+  // Short plans. A length below 64 that is not listed here
+  // leaves the data untouched.
+  switch(p->length) {
+  case 32: srButBackward32(p); return;
+  case 16: srButBackward16(p); return;
+  case  8: srButBackward8(p);  return;
+  case  4: srButBackward4(p);  return;
+  case  2: srBut2(p);          return;
+  default:
+    return;
+  }
+}
+
+static void realSub0(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) { // in-place pass over s combining element k with element n/2-k; ts selects the p->rtTable row
+  SIMDBase_VECT tr, ti, ur, ui, mr, mi;
+  int32_t n = p->length*2;
+  int32_t k;
+  // Each element occupies two vector slots (2k, 2k+1). k runs over 1 .. n/4-1, so the element at k == n/4 is not visited here.
+  for(k=1;k<n/4;k++) {
+    SIMDBase_VECT s00 = SIMDBase_LOAD(&s[k*2+0]), s01 = SIMDBase_LOAD(&s[k*2+1]);
+    SIMDBase_VECT s10 = SIMDBase_LOAD(&s[(n/2-k)*2+0]), s11 = SIMDBase_LOAD(&s[(n/2-k)*2+1]);
+    // (mr, mi) = (tr + i*ti) * (ur + i*ui), with tr = s00 - s10, ti = s01 + s11 and (ur, ui) read from the table at 2k, 2k+1.
+    tr = SIMDBase_SUBi(s00, s10); ti = SIMDBase_ADDi(s01, s11);
+    ur = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+0]));
+    ui = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+1]));
+    mr = SIMDBase_SUBi(SIMDBase_MULi(tr, ur), SIMDBase_MULi(ti, ui));
+    mi = SIMDBase_ADDi(SIMDBase_MULi(tr, ui), SIMDBase_MULi(ti, ur));
+    SIMDBase_STOR(&s[k*2+0], SIMDBase_SUBi(s00, mr));
+    SIMDBase_STOR(&s[k*2+1], SIMDBase_SUBi(s01, mi));
+    SIMDBase_STOR(&s[(n/2-k)*2+0], SIMDBase_ADDi(s10, mr));
+    SIMDBase_STOR(&s[(n/2-k)*2+1], SIMDBase_SUBi(s11, mi));
+  }
+  // Slots 0 and 1 are handled separately: (s[0], s[1]) <- (s[0] + s[1], s[0] - s[1]).
+  tr = SIMDBase_LOAD(&s[0]); ti = SIMDBase_LOAD(&s[1]);
+  SIMDBase_STOR(&s[0], SIMDBase_ADDi(tr, ti));
+  SIMDBase_STOR(&s[1], SIMDBase_SUBi(tr, ti));
+}
+
+static void realSub1(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) { // in-place pass over s combining element k with element n/2-k; ts selects the p->rtTable row
+  SIMDBase_VECT tr, ti, ur, ui, mr, mi;
+  int32_t n = p->length*2;
+  int32_t k;
+  // Slots 0 and 1 come first: (s[0], s[1]) <- ((s[0] + s[1]) * 0.5, (s[0] - s[1]) * 0.5).
+  tr = SIMDBase_LOAD(&s[0]); ti = SIMDBase_LOAD(&s[1]);
+  SIMDBase_STOR(&s[0], SIMDBase_MULi(SIMDBase_ADDi(tr, ti), SIMDBase_SET1(0.5)));
+  SIMDBase_STOR(&s[1], SIMDBase_MULi(SIMDBase_SUBi(tr, ti), SIMDBase_SET1(0.5)));
+  // k runs over 1 .. n/4-1 (two vector slots per element), so the element at k == n/4 is not visited here.
+  for(k=1;k<n/4;k++) {
+    SIMDBase_VECT s00 = SIMDBase_LOAD(&s[k*2+0]), s01 = SIMDBase_LOAD(&s[k*2+1]);
+    SIMDBase_VECT s10 = SIMDBase_LOAD(&s[(n/2-k)*2+0]), s11 = SIMDBase_LOAD(&s[(n/2-k)*2+1]);
+    // (mr, mi) = (tr + i*ti) * (ur + i*ui); afterwards element k becomes (mr + s10, mi - s11) and element n/2-k becomes (s00 - mr, mi - s01).
+    tr = SIMDBase_SUBi(s00, s10); ti = SIMDBase_ADDi(s01, s11);
+    ur = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+0]));
+    ui = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+1]));
+    mr = SIMDBase_SUBi(SIMDBase_MULi(tr, ur), SIMDBase_MULi(ti, ui));
+    mi = SIMDBase_ADDi(SIMDBase_MULi(tr, ui), SIMDBase_MULi(ti, ur));
+    tr = SIMDBase_SUBi(s00, mr); ti = SIMDBase_SUBi(mi, s01);
+    SIMDBase_STOR(&s[k*2+0], SIMDBase_ADDi(mr, s10));
+    SIMDBase_STOR(&s[k*2+1], SIMDBase_SUBi(mi, s11));
+    SIMDBase_STOR(&s[(n/2-k)*2+0], tr);
+    SIMDBase_STOR(&s[(n/2-k)*2+1], ti);
+  }
+}
+
+/*
+ * Runs the plan's selected bit-reversal routine on p->s, unless the plan was
+ * created with DFT_FLAG_NO_BITREVERSAL.
+ */
+static void bitReverseUnlessDisabled(DFTUndiff *p) {
+  if ((p->flags & DFT_FLAG_NO_BITREVERSAL) != 0) return;
+
+  if (p->useCobra) {
+    bitReversalCobraInplace(p);
+  } else {
+    bitReversalRecursive(p->s, p->length, 1, 0, 0);
+  }
+}
+
+/*
+ * Executes a transform in place.
+ *
+ *   p2  - plan created by DFTUndiff_MAKEPLAN / DFTUndiff_MAKEPLANSUB; the
+ *         process aborts if its magic number is wrong.
+ *   s2  - data buffer, treated as an array of SIMDBase_VECT.
+ *   dir - -1 selects the forward path, any other value the backward path.
+ *
+ * Forward:  [realSub1 if ALT_REAL] -> butterflies -> bit reversal
+ *           -> [realSub0 and negate s[length+1] if REAL].
+ * Backward: [negate s[length+1] and realSub1 if REAL] -> bit reversal
+ *           -> butterflies -> [realSub0 if ALT_REAL].
+ */
+void DFTUndiff_EXECUTE(void *p2, void *s2, int32_t dir) {
+  DFTUndiff *plan = (DFTUndiff *)p2;
+  SIMDBase_VECT *data = (SIMDBase_VECT *)s2;
+
+  if (plan->magic != MAGIC_DFT) abort();
+
+  plan->s = data;
+
+  const int isReal = (plan->flags & DFT_FLAG_REAL) != 0;
+  const int isAltReal = (plan->flags & DFT_FLAG_ALT_REAL) != 0;
+
+  if (dir != -1) {
+    if (isReal) {
+      data[plan->length+1] = SIMDBase_NEGi(data[plan->length+1]);
+      realSub1(plan, data, 1);
+    }
+
+    bitReverseUnlessDisabled(plan);
+    srBackwardMain(plan);
+
+    if (isAltReal) {
+      realSub0(plan, data, 1);
+    }
+    return;
+  }
+
+  if (isAltReal) {
+    realSub1(plan, data, 0);
+  }
+
+  srForwardMain(plan);
+  bitReverseUnlessDisabled(plan);
+
+  if (isReal) {
+    realSub0(plan, data, 0);
+    data[plan->length+1] = SIMDBase_NEGi(data[plan->length+1]);
+  }
+}
+
+/*
+ * Releases a plan created by DFTUndiff_MAKEPLANSUB / DFTUndiff_MAKEPLAN.
+ * Aborts if p2 does not carry the plan magic number.
+ *
+ * Buffers that the plan builder obtained from SIMDBase_alignedMalloc (the
+ * trig table behind ptTable[0], cobraT, cobraR and both rtTable entries) are
+ * returned through SIMDBase_alignedFree so allocator and deallocator always
+ * match; the pointer arrays themselves come from malloc and use free().
+ */
+void DFTUndiff_DESTROYPLAN(void *p2) {
+  DFTUndiff *plan = (DFTUndiff *)p2;
+  if (plan->magic != MAGIC_DFT) abort();
+
+  // ptTable[0] is the start of the single aligned trig-table allocation.
+  SIMDBase_alignedFree(*(plan->ptTable));
+  free(plan->ptTable);
+
+  // The Cobra buffers stay NULL when the plan was built without them.
+  if (plan->cobraT != NULL) SIMDBase_alignedFree(plan->cobraT);
+  if (plan->cobraR != NULL) SIMDBase_alignedFree(plan->cobraR);
+
+  if (plan->rtTable != NULL) {
+    SIMDBase_alignedFree(plan->rtTable[0]);
+    SIMDBase_alignedFree(plan->rtTable[1]);
+    free(plan->rtTable);
+  }
+
+  // Invalidate the magic so a stale pointer is caught by the checks above.
+  plan->magic = 0;
+  free(plan);
+}
+
+/*
+ * Builds a plan without any calibration.
+ *
+ *   n           - transform length; halved internally when DFT_FLAG_REAL or
+ *                 DFT_FLAG_ALT_REAL is set.
+ *   radix2thres - butterfly sizes (1 << level) at or above this value get the
+ *                 cosine-only, cache-line padded table layout; smaller sizes
+ *                 get the interleaved cos/sin layout.
+ *   useCobra    - request the Cobra bit-reversal routine; forced to 0 when the
+ *                 Cobra buffers are not created.
+ *   flags       - DFT_FLAG_* bits, stored in the plan.
+ *
+ * Returns a plan owned by the caller (release with DFTUndiff_DESTROYPLAN).
+ * Aborts if the plan or one of its pointer tables cannot be allocated.
+ */
+DFTUndiff *DFTUndiff_MAKEPLANSUB(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags) {
+  int32_t i, j, k;
+
+  uint32_t linesize = SIMDBase_sizeOfCachelineInByte();
+  uint32_t cachesize = SIMDBase_sizeOfDataCacheInByte();
+
+  //
+
+  if ((flags & DFT_FLAG_REAL) != 0 || (flags & DFT_FLAG_ALT_REAL) != 0) n /= 2;
+
+  // calloc leaves cobraT, cobraR and rtTable NULL until they are set below.
+  DFTUndiff *d = calloc(1, sizeof(DFTUndiff));
+  if (d == NULL) abort();
+
+  d->magic = MAGIC_DFT;
+  d->mode = SIMDBase_MODE;
+  d->flags = flags;
+
+  d->radix2thres = radix2thres;
+  d->useCobra = useCobra;
+
+  d->length = (uint32_t) n;
+  d->log2len = DFT_ilog2((uint32_t) n);
+
+  // Trig tables: one contiguous buffer, with ptTable[j] pointing at level j.
+
+  SIMDBase_REAL *trigTable = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*n*2);
+  d->ptTable = malloc(sizeof(SIMDBase_REAL *) * (d->log2len+1));
+  if (d->ptTable == NULL) abort();
+
+  SIMDBase_REAL *p = trigTable, **pp = d->ptTable;
+
+  for(j=0;j<(int32_t)d->log2len+1;j++) {
+    *pp++ = p;
+
+    if ((1 << j) >= d->radix2thres) {
+      // Cosines only, for i = 0 .. 2^j/4 inclusive.
+      for(i=0;i<(1 << j)/4+1;i++) {
+        *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j));
+      }
+      // Advance p to the next multiple of one cache line's worth of elements.
+      const int32_t step = linesize / sizeof(SIMDBase_REAL);
+      p += (step - (p - trigTable) % step) % step;
+    } else {
+      // Interleaved cos/sin of the angle and of three times the angle.
+      for(i=0;i<(1 << j)/4;i++) {
+        *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j));
+        *p++ = (SIMDBase_REAL)SIN(-2*M_PIl*i/(1 << j));
+        *p++ = (SIMDBase_REAL)COS(-6*M_PIl*i/(1 << j));
+        *p++ = (SIMDBase_REAL)SIN(-6*M_PIl*i/(1 << j));
+      }
+    }
+  }
+
+  // Cobra tile size: raise cobraQ while 2^(2*cobraQ) still fits the cache budget.
+
+  int32_t cobraQ;
+
+  cobraQ = linesize / (sizeof(SIMDBase_VECT) * 2);
+
+  for(;;) {
+    if (1 << (cobraQ*2) >
+        (cachesize / (sizeof(SIMDBase_VECT) * 2)/2))
+      break;
+
+    cobraQ++;
+  }
+  cobraQ--;
+
+  d->cobraQ = cobraQ;
+
+  if (cobraQ >= 4 && d->log2len >= 2*cobraQ) {
+    SIMDBase_VECT *cobraT;
+    int32_t *cobraR;
+
+    if (d->log2len <= 2*cobraQ) cobraQ = d->log2len / 2;
+
+    cobraT = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*2 * (1 << (cobraQ*2)));
+    cobraR = (int32_t *)SIMDBase_alignedMalloc(sizeof(int32_t) * (1 << cobraQ));
+
+    for(i=0;i<(1 << cobraQ);i++) cobraR[i] = bitR(i, cobraQ);
+
+    d->cobraT = cobraT; d->cobraR = cobraR;
+  } else {
+    // No Cobra buffers: fall back to the recursive reverser.
+    d->useCobra = 0;
+  }
+
+  // Twiddles for the real-transform passes: rtTable[0] and rtTable[1] hold
+  // the same values for angles of opposite sign.
+
+  if ((d->flags & DFT_FLAG_REAL) != 0 || (d->flags & DFT_FLAG_ALT_REAL) != 0) {
+    int32_t m = n*2;
+
+    d->rtTable = malloc(sizeof(SIMDBase_REAL *)*2);
+    if (d->rtTable == NULL) abort();
+    d->rtTable[0] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2);
+    d->rtTable[1] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2);
+
+    for(k=0;k<m/4;k++) {
+      d->rtTable[0][k*2+0] = 0.5-0.5*SIN(-2*M_PIl*k/m);
+      d->rtTable[0][k*2+1] =     0.5*COS(-2*M_PIl*k/m);
+      d->rtTable[1][k*2+0] = 0.5-0.5*SIN( 2*M_PIl*k/m);
+      d->rtTable[1][k*2+1] =     0.5*COS( 2*M_PIl*k/m);
+    }
+  }
+
+  //
+
+  return d;
+}
+
+/* Calibration helper: one untimed Cobra bit reversal, then the time taken by reps more. */
+static double calibTimeCobra(DFTUndiff *plan, int32_t reps) {
+  int32_t j;
+
+  bitReversalCobraInplace(plan);
+
+  double tick = DFT_timeofday();
+
+  for(j=0;j<reps;j++) {
+    bitReversalCobraInplace(plan);
+  }
+
+  return DFT_timeofday() - tick;
+}
+
+/* Calibration helper: one untimed recursive bit reversal, then the time taken by reps more. */
+static double calibTimeRecursive(DFTUndiff *plan, int32_t reps) {
+  int32_t j;
+
+  bitReversalRecursive(plan->s, plan->length, 1, 0, 0);
+
+  double tick = DFT_timeofday();
+
+  for(j=0;j<reps;j++) {
+    bitReversalRecursive(plan->s, plan->length, 1, 0, 0);
+  }
+
+  return DFT_timeofday() - tick;
+}
+
+/*
+ * Creates a plan for an n-point transform, optionally calibrating it first.
+ *
+ * When n <= 256 or the low two flag bits are clear, the plan is built directly
+ * with a radix-2 threshold of n*2.
+ * NOTE(review): (flags & 3) presumably selects the "measure" style flags;
+ * confirm against the DFT_FLAG_* definitions.
+ *
+ * Otherwise the function busy-waits for 0.5 s to warm up, times both
+ * bit-reversal routines (if the Cobra buffers exist), then times a
+ * forward + backward round trip for every power-of-two threshold from 1024 up
+ * to n*2 and builds the final plan with the fastest one. DFT_FLAG_VERBOSE
+ * prints progress to stdout.
+ *
+ * Returns a plan that the caller releases with DFTUndiff_DESTROYPLAN.
+ */
+void *DFTUndiff_MAKEPLAN(uint64_t n, uint64_t flags) {
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\n--------------------------------\n");
+    printf("Making plan, mode = %s, dft length = %d\n", SIMDBase_NAME, (int)n);
+    printf("Processor : %s\n", SIMDBase_getProcessorNameString());
+    printf("Cache size (L2 + L3) : %d kbytes / thread\n", SIMDBase_sizeOfDataCacheInByte() / 1024);
+    printf("Cache Line Size : %d bytes\n", SIMDBase_sizeOfCachelineInByte());
+  }
+
+  if (n <= 256 || (flags & 3) == 0) {
+    return DFTUndiff_MAKEPLANSUB(n, n*2, (flags & DFT_FLAG_FORCE_COBRA) != 0, flags);
+  }
+
+  // Scratch buffer that the calibration transforms run on; freed before returning.
+  SIMDBase_REAL *s1 = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*n*2);
+
+  int32_t i, j, ts, tsbest, useCobra = 0;
+  double tick, tickmin;
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\nWarming up before calibration ...");
+    fflush(stdout);
+  }
+
+  // warming up
+  tick = DFT_timeofday();
+  while(DFT_timeofday() - tick < 0.5)
+    ;
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf(" done\n");
+  }
+
+  // Repetition count shrinks as the transform grows, with a floor of one.
+  int32_t ntimes = 20000000.0 / n / DFT_ilog2(n);
+  if (ntimes == 0) ntimes = 1;
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("nTimes = %d\n", ntimes);
+  }
+
+  //
+
+  DFTUndiff *plan = DFTUndiff_MAKEPLANSUB(n, n*2, 0, flags);
+
+  for(i=0;i<n*2*SIMDBase_VECTLEN;i++) {
+    s1[i] = 0;
+  }
+
+  plan->s = (SIMDBase_VECT *)s1;
+
+  if (plan->cobraT != NULL) {
+    double tcobra = 0, trecur = 0;
+
+    if (flags & DFT_FLAG_VERBOSE) {
+      printf("\nChecking which bit-reversal method is faster\n");
+    }
+
+    // Alternate the two routines and accumulate two measurements of each.
+    tcobra += calibTimeCobra(plan, ntimes*4);
+    trecur += calibTimeRecursive(plan, ntimes*4);
+    tcobra += calibTimeCobra(plan, ntimes*4);
+    trecur += calibTimeRecursive(plan, ntimes*4);
+
+    useCobra = tcobra < trecur;
+
+    // Explicit flags override the measurement; FORCE_COBRA wins if both are set.
+    if ((flags & DFT_FLAG_FORCE_RECURSIVE) != 0) useCobra = 0;
+    if ((flags & DFT_FLAG_FORCE_COBRA) != 0) useCobra = 1;
+
+    if (flags & DFT_FLAG_VERBOSE) {
+      printf("cobra : %g\n", tcobra);
+      printf("recur : %g\n", trecur);
+      if (useCobra) {
+        printf("will use Cobra\n");
+      } else {
+        printf("will use the recursive reverser\n");
+      }
+    }
+  }
+
+  DFTUndiff_DESTROYPLAN(plan);
+
+  //
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\nDetermining radix 2 threshold\n");
+  }
+
+  // Untimed run of the round-trip workload before measuring.
+  plan = DFTUndiff_MAKEPLANSUB(n, n*2, useCobra, flags);
+
+  for(j=0;j<ntimes;j++) {
+    DFTUndiff_EXECUTE(plan, s1, -1);
+    DFTUndiff_EXECUTE(plan, s1,  1);
+  }
+
+  DFTUndiff_DESTROYPLAN(plan);
+
+  tsbest = -1;
+  tickmin = 0;
+
+  for(ts = 1024;ts <= n*2;ts *= 2) {
+    plan = DFTUndiff_MAKEPLANSUB(n, ts, useCobra, flags);
+
+    tick = DFT_timeofday();
+
+    for(j=0;j<ntimes;j++) {
+      DFTUndiff_EXECUTE(plan, s1, -1);
+      DFTUndiff_EXECUTE(plan, s1,  1);
+    }
+
+    tick = DFT_timeofday() - tick;
+
+    DFTUndiff_DESTROYPLAN(plan);
+
+    if (flags & DFT_FLAG_VERBOSE) {
+      printf("%d : %g\n",ts, (double)tick);
+    }
+
+    // The first candidate always becomes the initial best, so it can win too.
+    if (tsbest == -1 || tick < tickmin) {
+      tickmin = tick;
+      tsbest = ts;
+    }
+  }
+
+  // No candidate was timed (n*2 < 1024): use the uncalibrated default.
+  if (tsbest == -1) tsbest = n*2;
+
+  // Calibration is over; release the scratch buffer.
+  SIMDBase_alignedFree(s1);
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("radix 2 threshold : %d\n\n", tsbest);
+
+    double t = tickmin / ntimes / 2;
+    double nf = 5 * n * log(n) / log(2) / (t * 1000000);
+
+    printf("nFlops = %d x %g\n", SIMDBase_VECTLEN, nf);
+  }
+
+  plan = DFTUndiff_MAKEPLANSUB(n, tsbest, useCobra, flags);
+
+  if (flags & DFT_FLAG_VERBOSE) {
+    printf("\nDone making plan\n--------------------------------\n");
+  }
+
+  return plan;
+}
diff --git a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h
new file mode 100644
index 00000000..d26b0d9b
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h
@@ -0,0 +1,114 @@
+#ifndef __DFTIMPL_H__
+#define __DFTIMPL_H__
+
+#include "SIMDBaseUndiff.h"
+
+#define MAGIC_DFT 0x18839f6d82bb02b6ULL
+
+/*
+ * Plan and working state for one transform length in one SIMD mode.
+ * Built by DFTUndiff_MAKEPLANSUB, used by DFTUndiff_EXECUTE and released by
+ * DFTUndiff_DESTROYPLAN.
+ */
+typedef struct {
+  // Holds MAGIC_DFT while the plan is valid; checked on entry, zeroed on destroy.
+  uint64_t magic;
+
+  // Data buffer of the transform currently being executed.
+  SIMDBase_VECT *s;
+  // Butterfly bookkeeping, reset by the top-level pass drivers.
+  uint32_t offset1, offset2;
+  uint32_t butlen, log2butlen;
+  // NOTE(review): not touched by the plan/execute code in DFTUndiff.c shown
+  // with this header; presumably used by the butterfly kernels -- confirm.
+  uint32_t stride;
+
+  // ptTable[j] points at the trig table for level j (0 .. log2len); all
+  // levels live in one allocation that starts at ptTable[0].
+  SIMDBase_REAL **ptTable;
+  // Complex transform length (already halved for the real modes) and its log2.
+  uint32_t length, log2len;
+
+  // radix2thres selects the trig-table layout per level; useCobra selects the
+  // Cobra bit reversal over the recursive one.
+  // NOTE(review): flagTrans is not referenced by the plan/execute code -- confirm use.
+  int32_t radix2thres, flagTrans, useCobra;
+
+  // Cobra bit-reversal parameters; cobraT/cobraR stay NULL when unused.
+  int32_t cobraQ;
+  SIMDBase_VECT *cobraT;
+  int32_t *cobraR;
+
+  // Two twiddle tables for the real-transform passes (index 0 and 1);
+  // NULL unless DFT_FLAG_REAL or DFT_FLAG_ALT_REAL is set.
+  SIMDBase_REAL **rtTable;
+
+  // DFT_FLAG_* bits given at plan creation, and the SIMDBase_MODE the plan was built for.
+  uint64_t flags;
+  int32_t mode;
+} DFTUndiff;
+
+/*
+ * Mode-specific symbol names.
+ *
+ * DFTUndiff.c is written against the generic DFTUndiff_* names; the first
+ * ENABLE_* macro that is defined below renames them to symbols carrying the
+ * mode suffix (e.g. execute_sse_float), so the same source can be compiled
+ * once per mode and all objects linked into one library. If no ENABLE_* macro
+ * is defined the generic names are left unmapped.
+ */
+#if defined(ENABLE_PUREC_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_float
+#define DFTUndiff_EXECUTE execute_purec_float
+#define DFTUndiff_MAKEPLAN makePlan_purec_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_purec_float
+
+#elif defined(ENABLE_PUREC_DOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_double
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_double
+#define DFTUndiff_EXECUTE execute_purec_double
+#define DFTUndiff_MAKEPLAN makePlan_purec_double
+#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_double
+#define DFTUndiff_DESTROYPLAN destroyPlan_purec_double
+
+#elif defined(ENABLE_PUREC_LONGDOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_longdouble
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_longdouble
+#define DFTUndiff_EXECUTE execute_purec_longdouble
+#define DFTUndiff_MAKEPLAN makePlan_purec_longdouble
+#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_longdouble
+#define DFTUndiff_DESTROYPLAN destroyPlan_purec_longdouble
+
+#elif defined(ENABLE_SSE_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse_float
+#define DFTUndiff_EXECUTE execute_sse_float
+#define DFTUndiff_MAKEPLAN makePlan_sse_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_sse_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_sse_float
+
+#elif defined(ENABLE_SSE2_DOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse2_double
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse2_double
+#define DFTUndiff_EXECUTE execute_sse2_double
+#define DFTUndiff_MAKEPLAN makePlan_sse2_double
+#define DFTUndiff_MAKEPLANSUB makePlanSub_sse2_double
+#define DFTUndiff_DESTROYPLAN destroyPlan_sse2_double
+
+#elif defined(ENABLE_NEON_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_neon_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_neon_float
+#define DFTUndiff_EXECUTE execute_neon_float
+#define DFTUndiff_MAKEPLAN makePlan_neon_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_neon_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_neon_float
+
+#elif defined(ENABLE_AVX_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_float
+#define DFTUndiff_EXECUTE execute_avx_float
+#define DFTUndiff_MAKEPLAN makePlan_avx_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_avx_float
+
+#elif defined(ENABLE_AVX_DOUBLE) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_double
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_double
+#define DFTUndiff_EXECUTE execute_avx_double
+#define DFTUndiff_MAKEPLAN makePlan_avx_double
+#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_double
+#define DFTUndiff_DESTROYPLAN destroyPlan_avx_double
+
+#elif defined(ENABLE_ALTIVEC_FLOAT) ////////////////////////////////////////////
+
+#define DFTUndiff_GETMODEPARAMINT getModeParamInt_altivec_float
+#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_altivec_float
+#define DFTUndiff_EXECUTE execute_altivec_float
+#define DFTUndiff_MAKEPLAN makePlan_altivec_float
+#define DFTUndiff_MAKEPLANSUB makePlanSub_altivec_float
+#define DFTUndiff_DESTROYPLAN destroyPlan_altivec_float
+
+#endif ////////////////////////////////////////////////////////////////////
+
+#endif
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile b/plugins/supereq/nsfft-1.00/dft/Makefile
new file mode 120000
index 00000000..fc484116
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile
@@ -0,0 +1 @@
+Makefile.x86
+\ No newline at end of file
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.altivec b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec
new file mode 100644
index 00000000..fe7fc993
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec
@@ -0,0 +1,26 @@
+# Builds libDFT.a with the AltiVec mode enabled (-maltivec -mabi=altivec).
+# DFTUndiff.c is compiled once per mode, selected by a single -DENABLE_*
+# macro (pure C float/double/long double and AltiVec float); DFT.c is
+# compiled with all of those macros defined.
+CC=gcc
+BASEOPT=-Wall  -I ../simd -maltivec -mabi=altivec
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTaltivecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_ALTIVEC_FLOAT DFTUndiff.c -c -o DFTaltivecfloat.o
+
+DFT.o : DFT.c DFT.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_ALTIVEC_FLOAT DFT.c -c -o DFT.o
+
+# The archive is removed first because "ar -q" appends without replacing members.
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.neon b/plugins/supereq/nsfft-1.00/dft/Makefile.neon
new file mode 100644
index 00000000..111a04ae
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.neon
@@ -0,0 +1,26 @@
+# Builds libDFT.a with the NEON mode enabled (-mfloat-abi=softfp; -mfpu=neon
+# is added only for the NEON object). DFTUndiff.c is compiled once per mode,
+# selected by a single -DENABLE_* macro; DFT.c is compiled with all of them.
+#
+# DFTUndiff.h includes SIMDBaseUndiff.h, so that header is listed as a
+# prerequisite of every DFTUndiff.c object, as in the other Makefiles.
+CC=gcc
+BASEOPT=-Wall -I ../simd -mfloat-abi=softfp
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTneonfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -mfpu=neon -DENABLE_NEON_FLOAT DFTUndiff.c -c -o DFTneonfloat.o
+
+DFT.o : DFT.c DFT.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_NEON_FLOAT DFT.c -c -o DFT.o
+
+# The archive is removed first because "ar -q" appends without replacing members.
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.purec b/plugins/supereq/nsfft-1.00/dft/Makefile.purec
new file mode 100644
index 00000000..2c8b04f1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.purec
@@ -0,0 +1,35 @@
+# Builds libDFT.a with only the pure C modes (float, double, long double).
+# Unlike the other Makefiles, this one also compiles SIMDBase.c and
+# SIMDBaseUndiff.c into the archive and refers to their sources and headers
+# without a directory prefix or -I flag.
+# NOTE(review): presumably meant for a flat layout where the simd sources sit
+# next to the dft sources -- confirm before using it from this directory.
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h SIMDBase.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h SIMDBase.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h SIMDBase.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c DFT.h SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+# SIMDBase.c is built with -O rather than the -O3 used everywhere else.
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE SIMDBase.c -c -o SIMDBase.o
+
+DFT.o : DFT.c DFT.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE DFT.c -c -o DFT.o
+
+# The archive is removed first because "ar -q" appends without replacing members.
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86 b/plugins/supereq/nsfft-1.00/dft/Makefile.x86
new file mode 100644
index 00000000..6ecbacec
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86
@@ -0,0 +1,29 @@
+# Builds libDFT.a for x86 with the SSE float and SSE2 double modes in
+# addition to the pure C modes. DFTUndiff.c is compiled once per mode,
+# selected by a single -DENABLE_* macro, with -msse / -msse2 added only for
+# the matching object; DFT.c is compiled with all of the macros defined.
+CC=gcc
+BASEOPT=-Wall -I ../simd
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o
+
+DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE DFTUndiff.c -c -o DFTsse2double.o
+
+DFT.o : DFT.c DFT.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE DFT.c -c -o DFT.o
+
+# The archive is removed first because "ar -q" appends without replacing members.
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o
+
+clean :
+	rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx
new file mode 100644
index 00000000..b38909cb
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx
@@ -0,0 +1,35 @@
+# Builds libDFT.a for x86 with the AVX float/double modes on top of the SSE,
+# SSE2 and pure C modes. DFTUndiff.c is compiled once per mode, selected by a
+# single -DENABLE_* macro, with -msse / -msse2 / -mavx added only for the
+# matching object; DFT.c is compiled with all of the macros defined.
+CC=gcc
+BASEOPT=-Wall -I ../simd
+OPT=$(BASEOPT) -O3
+
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o
+
+DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE DFTUndiff.c -c -o DFTsse2double.o
+
+DFTavxfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT DFTUndiff.c -c -o DFTavxfloat.o
+
+DFTavxdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE DFTUndiff.c -c -o DFTavxdouble.o
+
+DFT.o : DFT.c DFT.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE -DENABLE_AVX_FLOAT -DENABLE_AVX_DOUBLE DFT.c -c -o DFT.o
+
+# The archive is removed first because "ar -q" appends without replacing members.
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o
+
+clean :
+	rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c b/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c
new file mode 100644
index 00000000..78ff14dc
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c
@@ -0,0 +1,88 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <complex.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+
+#define THRES 1e-3
+
+/* Returns exp(-2*pi*i * kn / n), the forward DFT twiddle factor for index product kn. */
+double complex omega(double n, double kn) {
+  const double complex unitAngle = (-2 * M_PI * _Complex_I / n);
+
+  return cexp(unitAngle * kn);
+}
+
+/*
+ * Reference O(len^2) forward DFT: fs[k] = sum over n of ts[n] * omega(len, n*k).
+ *
+ *   ts  - input samples (len entries)
+ *   fs  - output spectrum (len entries, overwritten)
+ *   len - number of points
+ */
+void forward(double complex *ts, double complex *fs, int len) {
+  int k, n;
+
+  for(k=0;k<len;k++) {
+    fs[k] = 0;
+
+    for(n=0;n<len;n++) {
+      // Form the index product in double: n*k in int overflows for large len.
+      fs[k] += ts[n] * omega(len, (double)n * k);
+    }
+  }
+}
+
+/*
+ * Example: runs a 256-point complex forward transform in the best available
+ * mode on random data, compares every lane against the naive forward()
+ * reference with tolerance THRES, and prints "OK" or "NG".
+ * The process always exits with status 0.
+ */
+int main(int argc, char **argv) {
+  const int n = 256;
+
+  int mode = SIMDBase_chooseBestMode(TYPE);
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  int idx, lane;
+
+  DFT *p = DFT_init(mode, n, 0);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  double complex ts[veclen][n], fs[veclen][n];
+
+  // Fill each lane with random complex samples; sx interleaves the lanes, so
+  // the real/imaginary parts of sample idx sit at rows idx*2 and idx*2+1.
+  for(lane=0;lane<veclen;lane++) {
+    for(idx=0;idx<n;idx++) {
+      ts[lane][idx] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+      sx[(idx*2+0)*veclen+lane] = creal(ts[lane][idx]);
+      sx[(idx*2+1)*veclen+lane] = cimag(ts[lane][idx]);
+    }
+  }
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(lane=0;lane<veclen;lane++) {
+    forward(ts[lane], fs[lane], n);
+  }
+
+  int success = 1;
+
+  for(lane=0;lane<veclen;lane++) {
+    for(idx=0;idx<n;idx++) {
+      const int reBad = fabs(sx[(idx*2+0)*veclen+lane] - creal(fs[lane][idx])) > THRES;
+      const int imBad = fabs(sx[(idx*2+1)*veclen+lane] - cimag(fs[lane][idx])) > THRES;
+
+      if (reBad || imBad) success = 0;
+    }
+  }
+
+  printf("%s\n", success ? "OK" : "NG");
+
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  exit(0);
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c
new file mode 100644
index 00000000..42825ed9
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c
@@ -0,0 +1,317 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <time.h>
+#include <complex.h>
+
+#include <fftw3.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+#if 1
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+#else
+typedef double REAL;
+#define TYPE SIMDBase_TYPE_DOUBLE
+#endif
+
+#define THRES 1e-3
+
+// complex forward
+/*
+ * Complex forward check: transforms the same random input with nsfft and with
+ * FFTW (one FFTW plan per SIMD lane) and compares all n outputs of each lane.
+ *
+ *   n          - transform length
+ *   mode       - nsfft mode to test
+ *   veclen     - number of lanes per vector in that mode
+ *   sizeOfVect - size of one vector in bytes
+ *
+ * Returns 1 if every component agrees within THRES, 0 otherwise.
+ */
+int check_cf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+
+  // One plan and one in/out pair per lane: these arrays are indexed by lane,
+  // so they must hold veclen entries regardless of n.
+  fftw_plan w[veclen];
+  fftw_complex *in[veclen], *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    w[j] = fftw_plan_dft_1d(n, in[j], out[j], FFTW_FORWARD, FFTW_ESTIMATE);
+
+    for(i=0;i<n;i++) {
+      double re = random() / (double)RAND_MAX;
+      double im = random() / (double)RAND_MAX;
+      sx[(i*2+0)*veclen+j] = re;
+      sx[(i*2+1)*veclen+j] = im;
+      in[j][i] = re + im * _Complex_I;
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0;
+      if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0;
+    }
+  }
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// complex backward
+/*
+ * Complex backward check: transforms the same random input with nsfft
+ * (dir = 1) and with FFTW_BACKWARD (one FFTW plan per SIMD lane) and compares
+ * all n outputs of each lane.
+ *
+ *   n          - transform length
+ *   mode       - nsfft mode to test
+ *   veclen     - number of lanes per vector in that mode
+ *   sizeOfVect - size of one vector in bytes
+ *
+ * Returns 1 if every component agrees within THRES, 0 otherwise.
+ */
+int check_cb(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+
+  // One plan and one in/out pair per lane: these arrays are indexed by lane,
+  // so they must hold veclen entries regardless of n.
+  fftw_plan w[veclen];
+  fftw_complex *in[veclen], *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
+    w[j] = fftw_plan_dft_1d(n, in[j], out[j], FFTW_BACKWARD, FFTW_ESTIMATE);
+
+    for(i=0;i<n;i++) {
+      double re = random() / (double)RAND_MAX;
+      double im = random() / (double)RAND_MAX;
+      sx[(i*2+0)*veclen+j] = re;
+      sx[(i*2+1)*veclen+j] = im;
+      in[j][i] = re + im * _Complex_I;
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0;
+      if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0;
+    }
+  }
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// real forward
+/*
+ * Real forward check: transforms the same random real input with nsfft
+ * (DFT_FLAG_REAL, dir = -1) and with an FFTW r2c plan per SIMD lane.
+ *
+ * The nsfft output is compared as n/2 complex entries, where entry 0 carries
+ * the real parts of FFTW bins 0 and n/2 in its two slots.
+ *
+ *   n          - transform length
+ *   mode       - nsfft mode to test
+ *   veclen     - number of lanes per vector in that mode
+ *   sizeOfVect - size of one vector in bytes
+ *
+ * Returns 1 if every compared component agrees within THRES, 0 otherwise.
+ */
+int check_rf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+
+  // One plan and one in/out pair per lane: these arrays are indexed by lane,
+  // so they must hold veclen entries regardless of n.
+  fftw_plan w[veclen];
+
+  double *in[veclen];
+  fftw_complex *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (double *) fftw_malloc(sizeof(double) * n);
+    out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
+    w[j] = fftw_plan_dft_r2c_1d(n, in[j], out[j], FFTW_ESTIMATE);
+
+    for(i=0;i<n;i++) {
+      double re = random() / (double)RAND_MAX;
+      sx[i*veclen+j] = re;
+      in[j][i] = re;
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][0])) > THRES) success = 0;
+        if (fabs(sx[(i*2+1)*veclen+j] - creal(out[j][n/2])) > THRES) success = 0;
+      } else {
+        if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0;
+        if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0;
+      }
+    }
+  }
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+// real backward
+/*
+ * Real backward check: feeds the same random half-spectrum to nsfft
+ * (DFT_FLAG_REAL, dir = 1) and to an FFTW c2r plan per SIMD lane.
+ *
+ * The nsfft input packs the real parts of bins 0 and n/2 into the two slots
+ * of entry 0. The nsfft result is doubled before comparison with FFTW.
+ * NOTE(review): only the first n/2 of the n real outputs are compared --
+ * confirm whether that is intentional.
+ *
+ *   n          - transform length
+ *   mode       - nsfft mode to test
+ *   veclen     - number of lanes per vector in that mode
+ *   sizeOfVect - size of one vector in bytes
+ *
+ * Returns 1 if every compared sample agrees within THRES, 0 otherwise.
+ */
+int check_rb(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+
+  // One plan and one in/out pair per lane: these arrays are indexed by lane,
+  // so they must hold veclen entries regardless of n.
+  fftw_plan w[veclen];
+
+  fftw_complex *in[veclen];
+  double *out[veclen];
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
+    out[j] = (double *) fftw_malloc(sizeof(double) * n);
+    w[j] = fftw_plan_dft_c2r_1d(n, in[j], out[j], FFTW_ESTIMATE);
+
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        in[j][0  ] = (random() / (double)RAND_MAX);
+        in[j][n/2] = (random() / (double)RAND_MAX);
+      } else {
+        in[j][i  ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+      }
+    }
+
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+        sx[(2*0+0) * veclen + j] = creal(in[j][0  ]);
+        sx[(2*0+1) * veclen + j] = creal(in[j][n/2]);
+      } else {
+        sx[(2*i+0) * veclen + j] = creal(in[j][i]);
+        sx[(2*i+1) * veclen + j] = cimag(in[j][i]);
+      }
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    fftw_execute(w[j]);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if ((fabs(sx[i * veclen + j]*2 - out[j][i]) > THRES)) {
+        success = 0;
+      }
+    }
+  }
+
+  //
+
+  for(j=0;j<veclen;j++) {
+    fftw_destroy_plan(w[j]);
+    fftw_free(in[j]);
+    fftw_free(out[j]);
+  }
+
+  SIMDBase_alignedFree(sx);
+
+  DFT_dispose(p, mode);
+
+  //
+
+  return success;
+}
+
+/*
+ * Usage: <program> <log2n>
+ *
+ * Runs the four FFTW comparison checks on transforms of length 2^log2n in the
+ * best available mode and prints "OK" or "NG" for each. Exits with -1 on a
+ * usage error (wrong argument count, or log2n not an integer in [0, 30]) and
+ * with 0 otherwise.
+ */
+int main(int argc, char **argv) {
+  if (argc != 2) {
+    fprintf(stderr, "%s <log2n>\n", argv[0]);
+    exit(-1);
+  }
+
+  // Validate the exponent before shifting: 1 << x is undefined for negative
+  // x or x >= 31 with a 32-bit int. strtol saturates on overflow, so the
+  // range test also rejects out-of-range text.
+  char *end = NULL;
+  const long log2n = strtol(argv[1], &end, 10);
+
+  if (end == argv[1] || *end != '\0' || log2n < 0 || log2n > 30) {
+    fprintf(stderr, "%s <log2n>\n", argv[0]);
+    exit(-1);
+  }
+
+  const int n = 1 << (int)log2n;
+
+  srandom(time(NULL));
+
+  //
+
+  int mode = SIMDBase_chooseBestMode(TYPE);
+
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  printf("complex forward   : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("complex backward  : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real forward      : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+  printf("real backward     : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? "OK" : "NG");
+
+  exit(0);
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c
new file mode 100644
index 00000000..9d4bdaae
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c
@@ -0,0 +1,419 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <time.h>
+#include <complex.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+#if 1
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+#else
+typedef double REAL;
+#define TYPE SIMDBase_TYPE_DOUBLE
+#endif
+
+#define THRES 1e-3
+
+double complex omega(double n, double kn) {  // twiddle factor used by forward() and backward()
+  return cexp((-2 * M_PI * _Complex_I / n) * kn);  // exp(-2*pi*i*kn/n); a negative n gives the complex conjugate
+}
+
+// Naive O(len^2) forward DFT: fs[k] = sum over n of ts[n] * omega(len, n*k). ts is the input, fs the output.
+void forward(double complex *ts, double complex *fs, int len) {
+  int k, n;
+  for(k=0;k<len;k++) {
+    fs[k] = 0;
+    // n*k is formed in double: as an int product it overflows once len exceeds 46341.
+    for(n=0;n<len;n++) {
+      fs[k] += ts[n] * omega(len, (double)n * k);
+    }
+  }
+}
+
+// Naive O(len^2) unnormalized inverse DFT: ts[k] = sum over n of fs[n] * omega(-len, n*k). fs is the input, ts the output.
+void backward(double complex *fs, double complex *ts, int len) {
+  int k, n;
+  for(k=0;k<len;k++) {
+    ts[k] = 0;
+    // n*k is formed in double: as an int product it overflows once len exceeds 46341.
+    for(n=0;n<len;n++) {
+      ts[k] += fs[n] * omega(-len, (double)n * k);
+    }
+  }
+}
+
+// complex forward: checks DFT_execute(..., -1) on a complex plan against the naive forward() reference
+int check_cf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  // Random complex input; signal j keeps sample i at sx[(i*2+0)*veclen+j] (re) and sx[(i*2+1)*veclen+j] (im)
+
+  double complex ts[veclen][n], fs[veclen][n];
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+      sx[(i*2+0)*veclen+j] = creal(ts[j][i]);
+      sx[(i*2+1)*veclen+j] = cimag(ts[j][i]);
+    }
+  }
+
+  // The library transforms sx (results are read back from sx); the reference transforms each signal separately
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    forward(ts[j], fs[j], n);
+  }
+
+  // Compare every bin; written as !(err <= THRES) so that a NaN result counts as a failure
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (!(fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) <= THRES) ||
+	  !(fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) <= THRES)) {
+	success = 0;
+      }
+    }
+  }
+
+  // Release the buffer and the plan
+
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  // 1 if every bin was within THRES of the reference, otherwise 0
+
+  return success;
+}
+
+// complex backward: checks DFT_execute(..., 1) on a complex plan against the naive backward() reference
+int check_cb(int n, int mode, int veclen, int sizeOfVect) {
+  int i,j;
+
+  DFT *p = DFT_init(mode, n, 0);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  // Random complex spectrum; signal j keeps bin i at sx[(i*2+0)*veclen+j] (re) and sx[(i*2+1)*veclen+j] (im)
+
+  double complex fs[veclen][n], ts[veclen][n];
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      fs[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+
+      sx[(i*2+0)*veclen+j] = creal(fs[j][i]);
+      sx[(i*2+1)*veclen+j] = cimag(fs[j][i]);
+    }
+  }
+
+  // The library transforms sx (results are read back from sx); the reference transforms each signal separately
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    backward(fs[j], ts[j], n);
+  }
+
+  // Compare every sample; written as !(err <= THRES) so that a NaN result counts as a failure
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (!(fabs(sx[(i*2+0)*veclen+j] - creal(ts[j][i])) <= THRES) ||
+	  !(fabs(sx[(i*2+1)*veclen+j] - cimag(ts[j][i])) <= THRES)) {
+	success = 0;
+      }
+    }
+  }
+
+  // Release the buffer and the plan
+
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  // 1 if every sample was within THRES of the reference, otherwise 0
+
+  return success;
+}
+
+// real forward: checks DFT_execute(..., -1) on a DFT_FLAG_REAL plan against the naive forward() reference
+int check_rf(int n, int mode, int veclen, int sizeOfVect) {
+  int i,j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  // Random real input; signal j keeps sample i at sx[i*veclen+j]
+
+  double complex ts[veclen][n], fs[veclen][n];
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      ts[j][i] = (random() / (double)RAND_MAX);
+      sx[i*veclen+j] = creal(ts[j][i]);
+    }
+  }
+
+  // The library transforms sx (results are read back from sx); the reference transforms each signal separately
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    forward(ts[j], fs[j], n);
+  }
+
+  // Expected packing: slot 0 = Re F[0], slot 1 = Re F[n/2], slots 2i and 2i+1 = Re/Im F[i]; a NaN counts as a failure
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+	if (!(fabs(sx[(2*0+0) * veclen + j] - creal(fs[j][0  ])) <= THRES)) success = 0;
+	if (!(fabs(sx[(2*0+1) * veclen + j] - creal(fs[j][n/2])) <= THRES)) success = 0;
+      } else {
+	if (!(fabs(sx[(2*i+0) * veclen + j] - creal(fs[j][i])) <= THRES)) success = 0;
+	if (!(fabs(sx[(2*i+1) * veclen + j] - cimag(fs[j][i])) <= THRES)) success = 0;
+      }
+    }
+  }
+
+  // Release the buffer and the plan
+
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  // 1 if every checked value was within THRES of the reference, otherwise 0
+
+  return success;
+}
+
+// real backward: checks DFT_execute(..., 1) on a DFT_FLAG_REAL plan against the naive backward() reference
+int check_rb(int n, int mode, int veclen, int sizeOfVect) {
+  int i,j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  // Random spectrum with fs[n-i] = conj(fs[i]) and real fs[0], fs[n/2]
+
+  double complex fs[veclen][n], ts[veclen][n];
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+	fs[j][0  ] = (random() / (double)RAND_MAX);
+	fs[j][n/2] = (random() / (double)RAND_MAX);
+      } else {
+	fs[j][i  ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+	fs[j][n-i] = conj(fs[j][i]);
+      }
+    }
+  }
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+	sx[(2*0+0) * veclen + j] = creal(fs[j][0  ]);
+	sx[(2*0+1) * veclen + j] = creal(fs[j][n/2]);
+      } else {
+	sx[(2*i+0) * veclen + j] = creal(fs[j][i]);
+	sx[(2*i+1) * veclen + j] = cimag(fs[j][i]);
+      }
+    }
+  }
+
+  // Reference transform of the full spectrum, then the library transform of the packed half spectrum in sx
+
+  for(j=0;j<veclen;j++) {
+    backward(fs[j], ts[j], n);
+  }
+
+  DFT_execute(p, mode, sx, 1);
+
+  // The reference must be real and equal twice the library output; !(err <= THRES) also fails on NaN
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (!(fabs(cimag(ts[j][i])) <= THRES)) {
+	success = 0;
+      }
+
+      if (!(fabs(sx[i * veclen + j]*2 - creal(ts[j][i])) <= THRES)) {
+	success = 0;
+      }
+    }
+  }
+
+  // Release the buffer and the plan
+
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  // 1 if every sample passed both checks, otherwise 0
+
+  return success;
+}
+
+// alt real forward: checks DFT_execute(..., 1) on a DFT_FLAG_ALT_REAL plan against the naive backward() reference
+int check_arf(int n, int mode, int veclen, int sizeOfVect) {
+  int i,j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  // Random real input; signal j keeps sample i at sx[i*veclen+j]
+
+  double complex ts[veclen][n], fs[veclen][n];
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      ts[j][i] = (random() / (double)RAND_MAX);
+      sx[i*veclen+j] = creal(ts[j][i]);
+    }
+  }
+
+  // The library transforms sx (results are read back from sx); the reference transforms each signal separately
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    backward(ts[j], fs[j], n);
+  }
+
+  // Expected packing: slot 0 = Re F[0], slot 1 = Re F[n/2], slots 2i and 2i+1 = Re/Im F[i]; a NaN counts as a failure
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+	if (!(fabs(sx[(2*0+0) * veclen + j] - creal(fs[j][0  ])) <= THRES)) success = 0;
+	if (!(fabs(sx[(2*0+1) * veclen + j] - creal(fs[j][n/2])) <= THRES)) success = 0;
+      } else {
+	if (!(fabs(sx[(2*i+0) * veclen + j] - creal(fs[j][i])) <= THRES)) success = 0;
+	if (!(fabs(sx[(2*i+1) * veclen + j] - cimag(fs[j][i])) <= THRES)) success = 0;
+      }
+    }
+  }
+
+  // Release the buffer and the plan
+
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  // 1 if every checked value was within THRES of the reference, otherwise 0
+
+  return success;
+}
+
+// alt real backward: checks DFT_execute(..., -1) on a DFT_FLAG_ALT_REAL plan against the naive forward() reference
+int check_arb(int n, int mode, int veclen, int sizeOfVect) {
+  int i,j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+
+  // Random spectrum with fs[n-i] = conj(fs[i]) and real fs[0], fs[n/2]
+
+  double complex fs[veclen][n], ts[veclen][n];
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+	fs[j][0  ] = (random() / (double)RAND_MAX);
+	fs[j][n/2] = (random() / (double)RAND_MAX);
+      } else {
+	fs[j][i  ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+	fs[j][n-i] = conj(fs[j][i]);
+      }
+    }
+  }
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n/2;i++) {
+      if (i == 0) {
+	sx[(2*0+0) * veclen + j] = creal(fs[j][0  ]);
+	sx[(2*0+1) * veclen + j] = creal(fs[j][n/2]);
+      } else {
+	sx[(2*i+0) * veclen + j] = creal(fs[j][i]);
+	sx[(2*i+1) * veclen + j] = cimag(fs[j][i]);
+      }
+    }
+  }
+
+  // Reference transform of the full spectrum, then the library transform of the packed half spectrum in sx
+
+  for(j=0;j<veclen;j++) {
+    forward(fs[j], ts[j], n);
+  }
+
+  DFT_execute(p, mode, sx, -1);
+
+  // The reference must be real and equal twice the library output; !(err <= THRES) also fails on NaN
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (!(fabs(cimag(ts[j][i])) <= THRES)) {
+	success = 0;
+      }
+
+      if (!(fabs(sx[i * veclen + j]*2 - creal(ts[j][i])) <= THRES)) {
+	success = 0;
+      }
+    }
+  }
+
+  // Release the buffer and the plan
+
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  // 1 if every sample passed both checks, otherwise 0
+
+  return success;
+}
+
+// Prints "<label> : OK|NG" in the original column layout; returns 1 when the check failed, else 0.
+static int report_check(const char *label, int ok) {
+  printf("%-18s: %s\n", label, ok ? "OK" : "NG");
+  return !ok;
+}
+
+// Usage: <prog> <log2n>. Exits non-zero on bad arguments or when any check reports NG.
+int main(int argc, char **argv) {
+  char *end = NULL;
+  const long log2n = (argc == 2) ? strtol(argv[1], &end, 10) : -1;
+  if (argc != 2 || end == argv[1] || *end != '\0' || log2n < 0 || log2n > 30) {  // keeps 1 << log2n defined
+    fprintf(stderr, "%s <log2n>\n", argv[0]);
+    exit(-1);
+  }
+  const int n = 1 << (int)log2n;  // transform length
+  srandom(time(NULL));
+  int mode = SIMDBase_chooseBestMode(TYPE);
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+  int failed = report_check("complex forward", check_cf(n, mode, veclen, sizeOfVect));
+  failed += report_check("complex backward", check_cb(n, mode, veclen, sizeOfVect));
+  failed += report_check("real forward", check_rf(n, mode, veclen, sizeOfVect));
+  failed += report_check("real backward", check_rb(n, mode, veclen, sizeOfVect));
+  failed += report_check("alt real forward", check_arf(n, mode, veclen, sizeOfVect));
+  failed += report_check("alt real backward", check_arb(n, mode, veclen, sizeOfVect));
+  exit(failed ? 1 : 0);  // the exit status now reflects the printed results
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c
new file mode 100644
index 00000000..08c8315f
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c
@@ -0,0 +1,260 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <time.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+void cdft(int, int, double *, int *, double *);
+void rdft(int, int, double *, int *, double *);
+
+#if 1
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+#else
+typedef double REAL;
+#define TYPE SIMDBase_TYPE_DOUBLE
+#endif
+
+#define THRES 1e-3
+
+// complex forward: checks DFT_execute(..., -1) on a complex plan against cdft(n*2, -1, ...)
+int check_cf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+
+  int *ip = calloc(n, sizeof(int));
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n*2);
+
+  // Same random data in both buffers: sx interleaves the veclen signals element by element, sy stores them back to back
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n*2;i++) {
+      sx[i*veclen + j] = random() / (double)RAND_MAX;
+      sy[j*n*2 + i] = sx[i*veclen + j];
+    }
+  }
+
+  // Library transform of all signals at once, reference transform one signal at a time (ip and trigTable are shared)
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    cdft(n*2, -1, &sy[j*n*2], ip, trigTable);
+  }
+
+  // Compare every value; written as !(err <= THRES) so that a NaN result counts as a failure
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n*2;i++) {
+      if (!(fabs(sx[i*veclen+j] - sy[j*n*2 + i]) <= THRES)) success = 0;
+    }
+  }
+
+  // Release the buffers, the work areas and the plan
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(p, mode);
+
+  // 1 if every value was within THRES of the reference, otherwise 0
+
+  return success;
+}
+
+// complex backward: checks DFT_execute(..., 1) on a complex plan against cdft(n*2, 1, ...)
+int check_cb(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+
+  int *ip = calloc(n, sizeof(int));
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n*2);
+
+  // Same random data in both buffers: sx interleaves the veclen signals element by element, sy stores them back to back
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n*2;i++) {
+      sx[i*veclen + j] = random() / (double)RAND_MAX;
+      sy[j*n*2 + i] = sx[i*veclen + j];
+    }
+  }
+
+  // Library transform of all signals at once, reference transform one signal at a time (ip and trigTable are shared)
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    cdft(n*2, 1, &sy[j*n*2], ip, trigTable);
+  }
+
+  // Compare every value; written as !(err <= THRES) so that a NaN result counts as a failure
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n*2;i++) {
+      if (!(fabs(sx[i*veclen+j] - sy[j*n*2 + i]) <= THRES)) success = 0;
+    }
+  }
+
+  // Release the buffers, the work areas and the plan
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(p, mode);
+
+  // 1 if every value was within THRES of the reference, otherwise 0
+
+  return success;
+}
+
+// real forward: checks DFT_execute(..., -1) on a DFT_FLAG_ALT_REAL plan against rdft(n, -1, ...)
+int check_rf(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+
+  int *ip = calloc(n, sizeof(int));
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n);
+
+  // Same random data in both buffers: sx interleaves the veclen signals element by element, sy stores them back to back
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      sx[i*veclen + j] = random() / (double)RAND_MAX;
+      sy[j*n + i] = sx[i*veclen + j];
+    }
+  }
+
+  // Library transform of all signals at once, reference transform one signal at a time (ip and trigTable are shared)
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j<veclen;j++) {
+    rdft(n, -1, &sy[j*n], ip, trigTable);
+  }
+
+  // Compare every value; written as !(err <= THRES) so that a NaN result counts as a failure
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (!(fabs(sx[i*veclen+j] - sy[j*n + i]) <= THRES)) success = 0;
+    }
+  }
+
+  // Release the buffers, the work areas and the plan
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(p, mode);
+
+  // 1 if every value was within THRES of the reference, otherwise 0
+
+  return success;
+}
+
+// real backward: checks DFT_execute(..., 1) on a DFT_FLAG_ALT_REAL plan against rdft(n, 1, ...)
+int check_rb(int n, int mode, int veclen, int sizeOfVect) {
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL);
+
+  int *ip = calloc(n, sizeof(int));
+  double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2);
+
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n);
+  double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n);
+
+  // Same random data in both buffers: sx interleaves the veclen signals element by element, sy stores them back to back
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      sx[i*veclen + j] = random() / (double)RAND_MAX;
+      sy[j*n + i] = sx[i*veclen + j];
+    }
+  }
+
+  // Library transform of all signals at once, reference transform one signal at a time (ip and trigTable are shared)
+
+  DFT_execute(p, mode, sx, 1);
+
+  for(j=0;j<veclen;j++) {
+    rdft(n, 1, &sy[j*n], ip, trigTable);
+  }
+
+  // Compare every value; written as !(err <= THRES) so that a NaN result counts as a failure
+
+  int success = 1;
+
+  for(j=0;j<veclen;j++) {
+    for(i=0;i<n;i++) {
+      if (!(fabs(sx[i*veclen+j] - sy[j*n + i]) <= THRES)) success = 0;
+    }
+  }
+
+  // Release the buffers, the work areas and the plan
+
+  SIMDBase_alignedFree(sy);
+  SIMDBase_alignedFree(sx);
+  SIMDBase_alignedFree(trigTable);
+  free(ip);
+
+  DFT_dispose(p, mode);
+
+  // 1 if every value was within THRES of the reference, otherwise 0
+
+  return success;
+}
+
+// Prints "<label> : OK|NG" in the original column layout; returns 1 when the check failed, else 0.
+static int report_check(const char *label, int ok) {
+  printf("%-18s: %s\n", label, ok ? "OK" : "NG");
+  return !ok;
+}
+
+// Usage: <prog> <log2n>. Exits non-zero on bad arguments or when any check reports NG.
+int main(int argc, char **argv) {
+  char *end = NULL;
+  const long log2n = (argc == 2) ? strtol(argv[1], &end, 10) : -1;
+  if (argc != 2 || end == argv[1] || *end != '\0' || log2n < 0 || log2n > 30) {  // keeps 1 << log2n defined
+    fprintf(stderr, "%s <log2n>\n", argv[0]);
+    exit(-1);
+  }
+  const int n = 1 << (int)log2n;  // transform length
+  srandom(time(NULL));
+  int mode = SIMDBase_chooseBestMode(TYPE);
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+  int failed = report_check("complex forward", check_cf(n, mode, veclen, sizeOfVect));
+  failed += report_check("complex backward", check_cb(n, mode, veclen, sizeOfVect));
+  failed += report_check("real forward", check_rf(n, mode, veclen, sizeOfVect));
+  failed += report_check("real backward", check_rb(n, mode, veclen, sizeOfVect));
+  exit(failed ? 1 : 0);  // the exit status now reflects the printed results
+}
diff --git a/plugins/supereq/nsfft-1.00/dfttest/Makefile b/plugins/supereq/nsfft-1.00/dfttest/Makefile
new file mode 100644
index 00000000..924b8656
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/Makefile
@@ -0,0 +1,35 @@
+CC=gcc
+BASEOPT=-Wall -g -I ../simd -I ../dft -L../simd -L../dft
+OPT=$(BASEOPT) -O
+.PHONY : all clean
+all : DFTExample DFTTestNaive
+
+clean :
+	rm -f *~ *.o nsfftplan.*.txt *.log *.dat a.out DFTExample DFTTestNaive DFTTestOoura DFTTestFFTW pi_fft_mod pi_fft_mod.c
+# Sibling directories are built with a sub-make; unlike "cd dir; make", $(MAKE) -C stops if the directory is missing.
+../simd/libSIMD.a :
+	@$(MAKE) -C ../simd
+
+../dft/libDFT.a :
+	@$(MAKE) -C ../dft
+
+../ooura/fftsg.o :
+	@$(MAKE) -C ../ooura
+
+DFTExample : DFTExample.c ../simd/libSIMD.a ../dft/libDFT.a
+	$(CC) $(OPT) DFTExample.c -lDFT -lSIMD -lm -o DFTExample
+
+DFTTestNaive : DFTTestNaive.c ../simd/libSIMD.a ../dft/libDFT.a
+	$(CC) $(OPT) DFTTestNaive.c -lDFT -lSIMD -lm -o DFTTestNaive
+
+DFTTestOoura : DFTTestOoura.c ../ooura/fftsg.o ../simd/libSIMD.a ../dft/libDFT.a
+	$(CC) $(OPT) DFTTestOoura.c ../ooura/fftsg.o -lDFT -lSIMD -lm -o DFTTestOoura
+
+DFTTestFFTW : DFTTestFFTW.c ../simd/libSIMD.a ../dft/libDFT.a
+	$(CC) $(OPT) DFTTestFFTW.c -lDFT -lSIMD -lfftw3 -lm -o DFTTestFFTW
+
+pi_fft_mod.c : ../ooura/pi_fft.c pi_fft.c.patch
+	patch -o pi_fft_mod.c ../ooura/pi_fft.c pi_fft.c.patch
+# $(OPT) already carries the -I/-L flags; -lm goes last so the static libraries listed before it can resolve against it.
+pi_fft_mod : ../simd/libSIMD.a ../dft/libDFT.a pi_fft_mod.c
+	$(CC) $(OPT) pi_fft_mod.c -lDFT -lSIMD -lm -o pi_fft_mod
diff --git a/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch b/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch
new file mode 100644
index 00000000..c50133cc
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch
@@ -0,0 +1,131 @@
+--- pi_fft.c	2010-07-30 13:04:25.000000000 +0900
++++ pi_fft_mod.c	2010-07-31 20:50:11.000000000 +0900
+@@ -25,7 +25,75 @@
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <time.h>
++#include <sys/time.h>
++#include <unistd.h>
+ 
++/****/
++
++#include <stdint.h>
++#include "SIMDBase.h"
++#include "DFT.h"
++
++DFT* dft[64];
++
++void initdft(int n) {  /* fills dft[1..log2(n)]: loads plans cached in nsfftplan.<host>.txt, creates the rest, re-saves */
++  int i, err, logn = 31 - __builtin_clz(n), writeflag = 0;
++  char buf[20] = "", fn[256];  /* zero-filled: stays NUL-terminated even if the host name is truncated */
++  gethostname(buf, 19);
++  snprintf(fn, sizeof fn, "nsfftplan.%s.txt", buf);
++  FILE *fp = fopen(fn, "r");
++  if (fp != NULL) {
++    for(i=1;i<=logn;i++) {
++      dft[i] = DFT_fread(fp, &err);
++      if (err != DFT_ERROR_NOERROR) {
++	printf("error when reading plan %d : %d\n", i, err);
++	break;
++      }
++      if (DFT_getPlanParamInt(DFT_PARAMID_MODE, dft[i]) != SIMDBase_MODE_PUREC_DOUBLE ||
++	  DFT_getPlanParamInt(DFT_PARAMID_FFT_LENGTH, dft[i]) != (1 << i) ||
++	  DFT_getPlanParamInt(DFT_PARAMID_IS_ALT_REAL_TRANSFORM, dft[i]) != 1) {
++	fprintf(stderr, "plan not compatible : %d\n", i);
++	break;
++      }
++    }
++    if (i <= logn) dft[i] = NULL;  /* loop stopped early: forget the bad plan (not freed here) so it is rebuilt below */
++    fclose(fp);
++  }
++
++  for(i=1;i<=logn;i++) {
++    if (dft[i] == NULL) {
++      dft[i] = DFT_init(SIMDBase_MODE_PUREC_DOUBLE, 1 << i, DFT_FLAG_ALT_REAL | DFT_FLAG_LIGHT_TEST_RUN | DFT_FLAG_VERBOSE);
++      if (dft[i] == NULL) {
++	printf("dft[%d] == NULL\n", i);
++	exit(-1);
++      }
++      writeflag = 1;
++    }
++  }
++
++  if (writeflag) {
++    fp = fopen(fn, "w");
++    if (fp != NULL) {
++      for(i=1;i<=logn;i++) {
++	DFT_fwrite(dft[i], fp);
++      }
++      fclose(fp);
++    }
++  }
++}
++
++void rdft(int n, int isgn, double *a, int *ip, double *w) {  /* ip and w are ignored */
++  int logn = 31 - __builtin_clz(n);  /* floor(log2(n)): index of the plan stored by initdft() for this length */
++  DFT_execute(dft[logn], SIMDBase_MODE_PUREC_DOUBLE, a, isgn);
++}
++
++double timeofday(void) {  /* current gettimeofday() time as seconds in a double */
++  struct timeval tp;
++  gettimeofday(&tp, NULL);
++  return (double)tp.tv_sec+(1e-6)*tp.tv_usec;  /* whole seconds plus the microsecond part */
++}
++
++/****/
+ 
+ void mp_load_0(int n, int radix, int out[]);
+ void mp_load_1(int n, int radix, int out[]);
+@@ -67,7 +135,7 @@
+     double err, d_time, n_op;
+     int *a, *b, *c, *e, *i1, *i2, *ip;
+     double *d1, *d2, *d3, *w;
+-    time_t t_1, t_2;
++    double t_1, t_2;
+     FILE *f_log, *f_out;
+     
+     f_log = fopen("pi.log", "w");
+@@ -96,6 +164,8 @@
+         exit(1);
+     }
+     ip[0] = 0;
++
++    initdft(nfft);
+     /* ---- radix test ---- */
+     log10_radix = 1;
+     radix = 10;
+@@ -111,7 +181,7 @@
+     printf("calculating %d digits of PI...\n", log10_radix * (n - 2));
+     fprintf(f_log, "calculating %d digits of PI...\n", log10_radix * (n - 2));
+     /* ---- time check ---- */
+-    time(&t_1);
++    t_1 = timeofday();
+     /*
+      * ---- a formula based on the AGM (Arithmetic-Geometric Mean) ----
+      *   c = sqrt(0.125);
+@@ -216,10 +286,10 @@
+     mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w);
+     mp_idiv(n, radix, a, npow, a);
+     /* ---- time check ---- */
+-    time(&t_2);
++    t_2 = timeofday();
+     /* ---- output ---- */
+     f_out = fopen("pi_mod.dat", "w");
+-    printf("writing pi.dat...\n");
++    printf("writing pi_mod.dat...\n");
+     mp_fprintf(n - 1, log10_radix, a, f_out);
+     fclose(f_out);
+     free(d3);
+@@ -238,9 +308,9 @@
+     printf("floating point operation: %g op.\n", n_op);
+     fprintf(f_log, "floating point operation: %g op.\n", n_op);
+     /* ---- difftime ---- */
+-    d_time = difftime(t_2, t_1);
+-    printf("execution time: %g sec. (real time)\n", d_time);
+-    fprintf(f_log, "execution time: %g sec. (real time)\n", d_time);
++    d_time = t_2 - t_1;
++    printf("execution time: %.5g sec. (real time)\n", d_time);
++    fprintf(f_log, "execution time: %.5g sec. (real time)\n", d_time);
+     fclose(f_log);
+     return 0;
+ }
diff --git a/plugins/supereq/nsfft-1.00/doc/default.css b/plugins/supereq/nsfft-1.00/doc/default.css
new file mode 100644
index 00000000..09721163
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/doc/default.css
@@ -0,0 +1,34 @@
+body {margin-left: 1.5cm; padding-left: 0.1cm; margin-right: 1.5cm; padding-right: 0.1cm; margin-top: 2.5cm; padding-top: 0.5cm; margin-bottom: 1cm; padding-bottom: 1.0cm; border-top-style:solid; border-bottom-style:solid; }
+h1 {font-family: arial, sans-serif; font-weight: bold; font-style: italic; margin-top: 0.8cm; }
+h2 {font-family: arial, sans-serif; font-weight: bold; font-style: italic; margin-top: 0.8cm; }
+h3 {font-family: arial, sans-serif; font-weight: bold; margin-top: 1.2cm; margin-bottom: 0.8cm; }
+h4 {font-family: arial, sans-serif; font-weight: bold; margin-top: 1.2cm; margin-bottom: 0.8cm; }
+p {font-family: Georgia, "Times New Roman", times, serif; margin-top: 0.3cm; margin-left: 0.5cm; margin-bottom: 0.3cm;}
+p.dir {font-family: arial, sans-serif; margin-top: 0cm; margin-bottom: 0cm;}
+dl { margin-left: 0.5cm; }
+dt { font-weight: bold; }
+a:link {color: black;}
+a:visited {color: black;}
+ul.disc {list-style-type: disc; font-family: times, serif;}
+ul.circle {list-style-type: circle; font-family: times, serif;}
+ul.square {list-style-type: square; font-family: times, serif;}
+ul.none {list-style-type: none; font-family: times, serif;}
+pre.code { margin-top: 1.0cm; margin-bottom: 1.0cm; margin-left: 1.0cm; margin-right: 1.0cm; border:3px solid #c0c0c0; padding: 0.5cm; font-family: tahoma, sans-serif; font-weight: normal; background-color:#f8f8f8; }
+pre.command { margin-top: 1.0cm; margin-bottom: 1.0cm; margin-left: 1.5cm; margin-right: 0.0cm; border:0px; padding:0.0cm; font-family: tahoma, sans-serif; font-weight: bold; background-color:#f8fffc; }
+ol.level1 { font-family: arial, sans-serif; font-weight: bold;  font-style: italic; font-size:1.5em; }
+ol.level2 { font-family: "Times New Roman", serif; font-weight: normal; font-style: normal; font-size:0.85em; margin-top: 0.2cm; margin-bottom: 0.5cm; }
+table.figure { margin-left:auto; margin-right:auto; margin-top:1.0cm; margin-bottom:1.0cm; }
+/* Figure captions and generic table cells. */
+td.caption { font-family: arial, sans-serif; font-size: 75%; color: black; }
+td { font-family: times, serif; }
+/* "lt" line tables: the class suffix names the borders drawn (b = bottom, l = left, r = right); lt-hl is a 2px-high rule cell. */
+table.lt { border-collapse: collapse; border-style: none; }
+td.lt- { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-width: 1px; border-style: none; }
+td.lt-r { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-right-style: solid; border-width: 1px; border-color: black; }
+td.lt-l { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-left-style: solid; border-width: 1px; border-color: black; }
+td.lt-lr { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-right-style: solid; border-left-style: solid; border-width: 1px; border-color: black; }
+td.lt-b { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-width: 1px; border-color: black; }
+td.lt-hl { margin: 0px; border-style: none; border-bottom-style: solid; border-width: 1px; border-color: black; height: 2px; }
+td.lt-bl { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-left-style: solid; border-width: 1px; border-color: black; }
+td.lt-br { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-right-style: solid; border-width: 1px; border-color: black; }
+td.lt-blr { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-left-style: solid; border-right-style: solid; border-width: 1px; border-color: black; }
diff --git a/plugins/supereq/nsfft-1.00/doc/index.xhtml b/plugins/supereq/nsfft-1.00/doc/index.xhtml
new file mode 100644
index 00000000..8b7e2c97
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/doc/index.xhtml
@@ -0,0 +1,2016 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+<link rel="stylesheet" type="text/css" href="default.css"/>
+<title>NSFFT Reference Manual</title>
+</head>
+<body>
+<h1>NSFFT Reference Manual</h1>
+
+<h3>Introduction</h3>
+
+<p>
+This is a library for performing 1-dimensional discrete Fourier
+transforms. NSFFT is a simple, small and portable library, and it is
+efficient since it can utilize SIMD instruction sets in modern
+processors. It performs multiple transforms simultaneously, and thus
+it is especially suitable for digital signal processing. It does not
+need so much computation to make a good execution plan. This library
+is in the public domain, so you can incorporate this library into
+your product without any obligation.
+</p>
+
+<h3>API Reference</h3>
+
+<p>
+In this section, the API functions are explained.
+</p>
+
+<h4>Include files</h4>
+
+<p>
+You have to include two include files in dft directory.
+</p>
+
+<pre class="code">
+#include &lt;stdint.h&gt;
+#include "SIMDBase.h"
+#include "DFT.h"
+</pre>
+
+<h4>Data types</h4>
+
+<p>
+First, you have to choose a data type to represent an element in the
+input and output sequence of numbers. You can choose from the
+following three types.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-b" align="center">Data Type</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_TYPE_FLOAT</td>
+	  <td class="lt-" align="left">float type in C language</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_TYPE_DOUBLE</td>
+	  <td class="lt-" align="left">double type in C language</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">SIMDBase_TYPE_LONGDOUBLE</td>
+	  <td class="lt-b" align="left">long double type in C language</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 1 Data types</td>
+  </tr>
+</table>
+
+
+<h4>Computation modes</h4>
+
+<p>
+Next, a computation mode has to be chosen. You can choose from the
+following modes.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-br" align="center">Type</td>
+	  <td class="lt-br" align="center">Vector Length</td>
+	  <td class="lt-b" align="center">Computation Mode</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_PUREC_FLOAT</td>
+	  <td class="lt-r" align="center">float</td>
+	  <td class="lt-r" align="center">1</td>
+	  <td class="lt-" align="center">Scalar float</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_PUREC_DOUBLE</td>
+	  <td class="lt-r" align="center">double</td>
+	  <td class="lt-r" align="center">1</td>
+	  <td class="lt-" align="center">Scalar double</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_PUREC_LONGDOUBLE</td>
+	  <td class="lt-r" align="center">long double</td>
+	  <td class="lt-r" align="center">1</td>
+	  <td class="lt-" align="center">Scalar long double</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_SSE_FLOAT</td>
+	  <td class="lt-r" align="center">float</td>
+	  <td class="lt-r" align="center">4</td>
+	  <td class="lt-" align="center">x86 SSE</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_SSE2_DOUBLE</td>
+	  <td class="lt-r" align="center">double</td>
+	  <td class="lt-r" align="center">2</td>
+	  <td class="lt-" align="center">x86 SSE2</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_NEON_FLOAT</td>
+	  <td class="lt-r" align="center">float</td>
+	  <td class="lt-r" align="center">4</td>
+	  <td class="lt-" align="center">ARM NEON</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_AVX_FLOAT</td>
+	  <td class="lt-r" align="center">float</td>
+	  <td class="lt-r" align="center">8</td>
+	  <td class="lt-" align="center">x86 AVX (float)</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_MODE_AVX_DOUBLE</td>
+	  <td class="lt-r" align="center">double</td>
+	  <td class="lt-r" align="center">4</td>
+	  <td class="lt-" align="center">x86 AVX (double)</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">SIMDBase_MODE_ALTIVEC_FLOAT</td>
+	  <td class="lt-br" align="center">float</td>
+	  <td class="lt-br" align="center">4</td>
+	  <td class="lt-b" align="center">PowerPC Altivec</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 2 Computation modes</td>
+  </tr>
+</table>
+
+<p>
+The following function automatically checks the availability of each
+instruction set on your computer, and chooses the best computation
+mode.
+</p>
+
+<pre class="code">
+int32_t SIMDBase_chooseBestMode(int32_t type);
+</pre>
+
+<p>
+The return value is the best mode chosen by this routine.
+<i>type</i> is the data type you chose.
+</p>
+
+
+<h4>Retrieving parameters</h4>
+
+<p>
+You can make queries for any mode using the following function.
+</p>
+
+<pre class="code">
+int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode);
+</pre>
+
+<p>
+<i>mode</i> is the computation mode you chose. <i>paramId</i> is one
+of the following.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-b" align="center">Meaning</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_PARAMID_SIZE_OF_REAL</td>
+	  <td class="lt-" align="left">Size of an element in a vector in bytes</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_PARAMID_SIZE_OF_VECT</td>
+	  <td class="lt-" align="left">Size of the vector in bytes</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">SIMDBase_PARAMID_VECTOR_LEN</td>
+	  <td class="lt-" align="left">Number of elements in a vector</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">SIMDBase_PARAMID_MODE_AVAILABILITY</td>
+	  <td class="lt-b" align="left">Whether the given mode is available or not</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 3 Querying parameter for computation mode</td>
+  </tr>
+</table>
+
+<p>
+Here, a vector is a set of multiple primitive data elements (single or
+double precision FP number) which can be stored in one SIMD register,
+and can be processed by one SIMD instruction at the same time.
+</p>
+
+<p>
+You can get the mode name in string data type. In this
+case, <i>paramId</i> must be SIMDBase_PARAMID_MODE_NAME.
+</p>
+
+<pre class="code">
+char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode);
+</pre>
+
+<p>
+You should not modify the data returned by the above function.
+</p>
+
+
+<h4>Making and destroying execution plan</h4>
+
+<p>
+An execution plan can be made by the following function.
+</p>
+
+<pre class="code">
+DFT *DFT_init(int32_t mode, int32_t n, int32_t flags);
+</pre>
+
+<p>
+The return value is a pointer to a newly made plan.
+<i>mode</i> is the mode you chose above. <i>n</i> is the length of a
+transform. You can specify a bitwise OR of the following symbols
+as <i>flags</i>. You should not specify more than one flag regarding
+the test run. You should not specify DFT_FLAG_FORCE_RECURSIVE and
+DFT_FLAG_FORCE_COBRA at the same time. If neither DFT_FLAG_REAL nor
+DFT_FLAG_ALT_REAL is specified, an execution plan for complex
+transforms is made.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-b" align="center">Meaning</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_NO_TEST_RUN</td>
+	  <td class="lt-" align="left">Make execution plan without performing a test run</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_LIGHT_TEST_RUN</td>
+	  <td class="lt-" align="left">Perform small amount of test run to make an execution plan</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_HEAVY_TEST_RUN</td>
+	  <td class="lt-" align="left">Perform large amount of test run to make an execution plan</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_EXHAUSTIVE_TEST_RUN</td>
+	  <td class="lt-" align="left">Perform exhaustive search of parameters and find the optimal execution plan</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_REAL</td>
+	  <td class="lt-" align="left">Make an execution plan for a real transform</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_ALT_REAL</td>
+	  <td class="lt-" align="left">Make an execution plan for an alternative real transform</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_VERBOSE</td>
+	  <td class="lt-" align="left">Make some noise during making an execution plan</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_NOBITREVERSAL</td>
+	  <td class="lt-" align="left">Does not perform the bit-reversal operation during a transform</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_FLAG_FORCE_RECURSIVE</td>
+	  <td class="lt-" align="left">Force using the recursive bit-reversal routine. This routine is suited for small transforms.</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">DFT_FLAG_FORCE_COBRA</td>
+	  <td class="lt-b" align="left">Force using the Cobra bit-reversal routine. This routine is suited for large transforms.</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 4 Options for making execution plan</td>
+  </tr>
+</table>
+
+<p>
+You can destroy the plan you made by the following function.
+</p>
+
+<pre class="code">
+void DFT_dispose(DFT *p, int32_t mode);
+</pre>
+
+<p>
+<i>p</i> is a pointer to the execution plan. <i>mode</i> is the
+corresponding execution mode.
+</p>
+
+<p>
+You can retrieve parameters of a plan using the following function.
+</p>
+
+<pre class="code">
+int32_t DFT_getPlanParamInt(int32_t paramId, void *p);
+</pre>
+
+<p>
+<i>p</i> is a pointer to an execution plan. <i>paramId</i> is one
+of the following.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-b" align="center">Meaning</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_PARAMID_TYPE</td>
+	  <td class="lt-" align="left">Data type</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_PARAMID_MODE</td>
+	  <td class="lt-" align="left">Computation mode</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_PARAMID_FFT_LENGTH</td>
+	  <td class="lt-" align="left">Length of the transform</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_PARAMID_IS_REAL_TRANSFORM</td>
+	  <td class="lt-" align="left">Whether the plan is for real transforms</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_PARAMID_NO_BIT_REVERSAL</td>
+	  <td class="lt-" align="left">Whether the plan does not perform bit reversal operation</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">DFT_PARAMID_TEST_RUN</td>
+	  <td class="lt-b" align="left">How much test run is performed when making this plan</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 5 Querying parameter for execution plan</td>
+  </tr>
+</table>
+
+<h4>Writing and reading execution plan to/from file</h4>
+
+<p>
+You can write or read an execution plan to/from a file using the following functions.
+</p>
+
+<pre class="code">
+int32_t DFT_fwrite(DFT *p, FILE *fp);
+DFT *DFT_fread(FILE *fp, int32_t *errcode);
+</pre>
+
+<p>
+<i>p</i> is a pointer to a plan. <i>fp</i> is a file
+pointer. DFT_fwrite returns 1 if the plan is successfully written, and
+0 if an error occurs. DFT_fread returns the pointer to the read plan
+if the plan is successfully read, and NULL if an error occurs. If an
+error occurs, an error code is returned to a variable whose pointer is
+specified by <i>errcode</i>. The interpretation of error codes is
+given below.
+</p>
+
+<table class="figure">
+  <tr align="center">
+    <td>
+      <table class="lt">
+        <tr>
+          <td class="lt-hl"></td>
+          <td class="lt-hl"></td>
+        </tr>
+	<tr>
+	  <td class="lt-br" align="center">Symbol</td>
+	  <td class="lt-b" align="center">Meaning</td>
+	</tr>
+	<tr>
+	  <td class="lt-hl"></td>
+	  <td class="lt-hl"></td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_NOERROR</td>
+	  <td class="lt-" align="left">No error</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_FILE_VERSION</td>
+	  <td class="lt-" align="left">File format version mismatch</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_FILE_IO</td>
+	  <td class="lt-" align="left">I/O error</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_UNEXPECTED_EOF</td>
+	  <td class="lt-" align="left">Unexpected EOF</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_MODE_NOT_COMPILED_IN</td>
+	  <td class="lt-" align="left">Tried to read a plan with mode that is not compiled in</td>
+	</tr>
+	<tr>
+	  <td class="lt-r" align="left">DFT_ERROR_MODE_NOT_AVAILABLE</td>
+	  <td class="lt-" align="left">Tried to read a plan with mode that is not supported by hardware</td>
+	</tr>
+	<tr>
+	  <td class="lt-br" align="left">DFT_ERROR_UNKNOWN_MODE</td>
+	  <td class="lt-b" align="left">Tried to read a plan with mode that is unknown to the library</td>
+	</tr>
+      </table>
+    </td>
+  </tr>
+  <tr align="center">
+    <td class="caption">Table 6 Errors that may happen during file I/O</td>
+  </tr>
+</table>
+
+
+<h4>Allocating and freeing buffers for transforms</h4>
+
+<p>
+In order to allocate word-aligned buffers for storing data which is
+fed to the FFT routine, you have to use the following function.
+</p>
+
+<pre class="code">
+void *DFT_alignedMalloc(uint64_t size);
+</pre>
+
+<p>
+This function allocates <i>size</i> bytes of word-aligned memory and
+returns the pointer. In order to free this memory, you have to use the
+following function.
+</p>
+
+<pre class="code">
+void DFT_alignedFree(void *ptr);
+</pre>
+
+<p>
+<i>ptr</i> is the pointer returned from DFT_alignedMalloc function.
+</p>
+
+<h4>Executing transform</h4>
+
+<p>
+By the following function, the planned transform can be executed.
+</p>
+
+<pre class="code">
+void DFT_execute(DFT *p, int32_t mode, void *s, int32_t dir);
+</pre>
+
+<p>
+<i>p</i> is a pointer to the plan. <i>mode</i> is the computation
+mode. <i>s</i> is the pointer to the buffer in which the sequence of
+input values is stored. This pointer must be a pointer returned from
+DFT_alignedMalloc function.
+<i>dir</i> specifies the direction of transform.
+</p>
+
+<p>
+The forward and backward discrete Fourier transforms are defined by
+the following formulas (1) and (2), respectively.
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <msub><mi>X</mi><mi>k</mi></msub>
+	  <mo>=</mo>
+	  <munderover>
+	    <mo style="font-size:140%;">&Sum;</mo>
+	    <mrow><mi>n</mi><mo>=</mo><mn>0</mn></mrow>
+	    <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+	  </munderover>
+	  <msub><mi>x</mi><mi>n</mi></msub>
+	  <msup>
+	    <mi>e</mi>
+	    <mrow>
+	      <mo>-</mo>
+	      <mfrac>
+		<mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+		<mi>N</mi>
+	      </mfrac>
+	      <mi>k</mi><mi>n</mi>
+	    </mrow>
+	  </msup>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mi>k</mi>
+	  <mo>=</mo>
+	  <mn>0</mn>
+	  <mo>,</mo>
+	  <mo>&middot;</mo>
+	  <mo>&middot;</mo>
+	  <mo>&middot;</mo>
+	  <mo>,</mo>
+	  <mi>N</mi>
+	  <mo>-</mo>
+	  <mn>1</mn>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(1)</p>
+    </td>
+  </tr>
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <msub><mi>x</mi><mi>n</mi></msub>
+	  <mo>=</mo>
+	  <mfrac>
+	    <mn>1</mn>
+	    <mi>N</mi>
+	  </mfrac>
+	  <munderover>
+	    <mo style="font-size:140%;">&Sum;</mo>
+	    <mrow><mi>k</mi><mo>=</mo><mn>0</mn></mrow>
+	    <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+	  </munderover>
+	  <msub><mi>X</mi><mi>k</mi></msub>
+	  <msup>
+	    <mi>e</mi>
+	    <mrow>
+	      <mfrac>
+		<mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+		<mi>N</mi>
+	      </mfrac>
+	      <mi>k</mi><mi>n</mi>
+	    </mrow>
+	  </msup>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mi>n</mi>
+	  <mo>=</mo>
+	  <mn>0</mn>
+	  <mo>,</mo>
+	  <mo>&middot;</mo>
+	  <mo>&middot;</mo>
+	  <mo>&middot;</mo>
+	  <mo>,</mo>
+	  <mi>N</mi>
+	  <mo>-</mo>
+	  <mn>1</mn>
+
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(2)</p>
+    </td>
+  </tr>
+</table>
+
+<p>
+The complex forward and backward transforms perform the transforms
+defined by the following formulas (3) and (4), respectively. <i>V</i>
+is the vector length mentioned above. Again, calling DFT_execute once
+performs <i>V</i> forward or backward transforms at a time. Please
+note that (4) gives values multiplied by <i>N</i> compared to
+(2). Specifying -1 as the direction of transform performs the
+transform defined by (3). In this case, the input should be given as
+in (5), and the output is given as in (6).  Specifying 1 as the
+direction of transform performs the transform defined by (4), and in
+this case, the input should be given as in (6), and the output is
+given as in (5).
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <mo>=</mo>
+	  <munderover>
+	    <mo style="font-size:140%;">&Sum;</mo>
+	    <mrow><mi>n</mi><mo>=</mo><mn>0</mn></mrow>
+	    <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+	  </munderover>
+	  <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <msup>
+	    <mi>e</mi>
+	    <mrow>
+	      <mo>-</mo>
+	      <mfrac>
+		<mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+		<mi>N</mi>
+	      </mfrac>
+	      <mi>k</mi><mi>n</mi>
+	    </mrow>
+	  </msup>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>k</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(3)</p>
+    </td>
+  </tr>
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <mo>=</mo>
+	  <munderover>
+	    <mo style="font-size:140%;">&Sum;</mo>
+	    <mrow><mi>k</mi><mo>=</mo><mn>0</mn></mrow>
+	    <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow>
+	  </munderover>
+	  <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <msup>
+	    <mi>e</mi>
+	    <mrow>
+	      <mfrac>
+		<mrow><mn>2</mn><mi>&pi;</mi><mi>i</mi></mrow>
+		<mi>N</mi>
+	      </mfrac>
+	      <mi>k</mi><mi>n</mi>
+	    </mrow>
+	  </msup>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>n</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(4)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>n</mi>
+		    <mo>+</mo>
+		    <mn>0</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+
+		    <mo>=</mo>
+
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>n</mi>
+		    <mo>+</mo>
+		    <mn>1</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+
+		    <mo>=</mo>
+
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>n</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(5)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>k</mi>
+		    <mo>+</mo>
+		    <mn>0</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+
+		    <mo>=</mo>
+
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>k</mi>
+		    <mo>+</mo>
+		    <mn>1</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+
+		    <mo>=</mo>
+
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>k</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(6)</p>
+    </td>
+  </tr>
+</table>
+
+<p>
+The real forward transform performs the transform defined by (3) when
+the condition (7) is satisfied. In this case, the output satisfies
+(8). You should specify -1 as the direction of transform, and the
+input should be given as in (9), and the output is given as in (10).
+The real backward transform is the opposite of the real forward
+transform. The input should satisfy (8) and the output satisfies (7).
+You should specify 1 as the direction of transform, and the input
+should be given as in (10), and the output is given as in (11).
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mi>Im</mi>
+	  <mo>(</mo>
+	  <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <mo>)</mo>
+	  <mo>=</mo>
+	  <mn>0</mn>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>n</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(7)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>=</mo>
+		    <msubsup>
+		      <mi>X</mi>
+		      <mrow><mi>N</mi><mo>-</mo><mi>k</mi><mo>,</mo><mi>v</mi></mrow>
+		      <mo>*</mo>
+		    </msubsup>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>k</mi>
+		    <mo>=</mo>
+		    <mn>1</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>k</mi>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(8)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mrow>
+	    <mi>s</mi>
+	    <mo>[</mo>
+	    <mi>n</mi>
+	    <mi>V</mi>
+	    <mo>+</mo>
+	    <mi>v</mi>
+	    <mo>]</mo>
+
+	    <mo>=</mo>
+
+	    <mi>Re</mi>
+	    <mo>(</mo>
+	    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	    <mo>)</mo>
+	  </mrow>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>n</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(9)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>k</mi>
+		    <mo>+</mo>
+		    <mn>0</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>k</mi>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>N</mi><mo>/</mo><mn>2</mn><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>k</mi>
+		    <mo>+</mo>
+		    <mn>1</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>k</mi>
+		    <mo>=</mo>
+		    <mn>1</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(10)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mrow>
+	    <mn>2</mn>
+	    <mo> &nbsp; </mo>
+	    <mi>s</mi>
+	    <mo>[</mo>
+	    <mi>n</mi>
+	    <mi>V</mi>
+	    <mo>+</mo>
+	    <mi>v</mi>
+	    <mo>]</mo>
+
+	    <mo>=</mo>
+
+	    <mi>Re</mi>
+	    <mo>(</mo>
+	    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	    <mo>)</mo>
+	  </mrow>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>n</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(11)</p>
+    </td>
+  </tr>
+</table>
+
+<p>
+The alternative real transforms are defined by (12) to (16), similarly
+to the real transforms. The alternative transforms are handy if you
+are migrating from the FFT library made by Prof. Takuya Ooura.  You
+should specify 1 as the direction in order to perform a forward
+transform, and -1 when you perform a backward transform.
+</p>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mi>Im</mi>
+	  <mo>(</mo>
+	  <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	  <mo>)</mo>
+	  <mo>=</mo>
+	  <mn>0</mn>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>k</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(12)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>=</mo>
+		    <msubsup>
+		      <mi>x</mi>
+		      <mrow><mi>N</mi><mo>-</mo><mi>n</mi><mo>,</mo><mi>v</mi></mrow>
+		      <mo>*</mo>
+		    </msubsup>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>n</mi>
+		    <mo>=</mo>
+		    <mn>1</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>n</mi>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(13)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mrow>
+	    <mi>s</mi>
+	    <mo>[</mo>
+	    <mi>k</mi>
+	    <mi>V</mi>
+	    <mo>+</mo>
+	    <mi>v</mi>
+	    <mo>]</mo>
+
+	    <mo>=</mo>
+
+	    <mi>Re</mi>
+	    <mo>(</mo>
+	    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	    <mo>)</mo>
+	  </mrow>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>k</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(14)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mfenced open="{" close="">
+	    <mtable>
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>n</mi>
+		    <mo>+</mo>
+		    <mn>0</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>n</mi>
+		    <mo>=</mo>
+		    <mn>0</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Re</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>N</mi><mo>/</mo><mn>2</mn><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		</mtd>
+	      </mtr>
+
+	      <mtr>
+		<mtd>
+		  <mrow>
+		    <mi>s</mi>
+		    <mo>[</mo>
+		    <mo>(</mo>
+		    <mn>2</mn>
+		    <mi>n</mi>
+		    <mo>+</mo>
+		    <mn>1</mn>
+		    <mo>)</mo>
+		    <mi>V</mi>
+		    <mo>+</mo>
+		    <mi>v</mi>
+		    <mo>]</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mo>=</mo>
+		</mtd>
+
+		<mtd>
+		  <mrow>
+		    <mi>Im</mi>
+		    <mo>(</mo>
+		    <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub>
+		    <mo>)</mo>
+		  </mrow>
+		</mtd>
+
+		<mtd>
+		  <mrow style="font-size:100%;">
+		    <mi>n</mi>
+		    <mo>=</mo>
+		    <mn>1</mn>
+		    <mo>,</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>&middot;</mo>
+		    <mo>,</mo>
+		    <mfrac>
+		      <mi>N</mi>
+		      <mn>2</mn>
+		    </mfrac>
+		    <mo>-</mo>
+		    <mn>1</mn>
+		  </mrow>
+		</mtd>
+	      </mtr>
+
+	    </mtable>
+	  </mfenced>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(15)</p>
+    </td>
+  </tr>
+</table>
+
+<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;">
+  <tr>
+    <td align="center" style="width:100%;">
+      <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML">
+	<mrow>
+	  <mrow>
+	    <mn>2</mn>
+	    <mo> &nbsp; </mo>
+	    <mi>s</mi>
+	    <mo>[</mo>
+	    <mi>n</mi>
+	    <mi>V</mi>
+	    <mo>+</mo>
+	    <mi>v</mi>
+	    <mo>]</mo>
+
+	    <mo>=</mo>
+
+	    <mi>Re</mi>
+	    <mo>(</mo>
+	    <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub>
+	    <mo>)</mo>
+	  </mrow>
+
+	  <mo>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</mo>
+
+	  <mrow style="font-size:100%;">
+	    <mi>k</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>N</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+
+	    <mo>&nbsp;&nbsp;</mo>
+	    <mo>,</mo>
+	    <mo>&nbsp;&nbsp;</mo>
+
+	    <mi>v</mi>
+	    <mo>=</mo>
+	    <mn>0</mn>
+	    <mo>,</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>&middot;</mo>
+	    <mo>,</mo>
+	    <mi>V</mi>
+	    <mo>-</mo>
+	    <mn>1</mn>
+	  </mrow>
+	</mrow>
+      </math>
+    </td>
+    <td>
+      <p>(16)</p>
+    </td>
+  </tr>
+</table>
+
+
+<h3>Examples</h3>
+
+<p>
+Below is example code using the nsfft library.
+</p>
+
+<pre class="code">
+#include &lt;stdio.h&gt;
+#include &lt;stdlib.h&gt;
+#include &lt;math.h&gt;
+#include &lt;stdint.h&gt;
+#include &lt;complex.h&gt;
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+typedef float REAL;
+#define TYPE SIMDBase_TYPE_FLOAT
+
+#define THRES 1e-3
+
+double complex omega(double n, double kn) {
+  return cexp((-2 * M_PI * _Complex_I / n) * kn);
+}
+
+void forward(double complex *ts, double complex *fs, int len) {
+  int k, n;
+
+  for(k=0;k&lt;len;k++) {
+    fs[k] = 0;
+
+    for(n=0;n&lt;len;n++) {
+      fs[k] += ts[n] * omega(len, n*k);
+    }
+  }
+}
+
+int main(int argc, char **argv) {
+  const int n = 256;
+
+  int mode = SIMDBase_chooseBestMode(TYPE);
+  printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode));
+
+  int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+  int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+
+  //
+
+  int i, j;
+
+  DFT *p = DFT_init(mode, n, 0);
+  REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2);
+
+  //
+
+  double complex ts[veclen][n], fs[veclen][n];
+
+  for(j=0;j&lt;veclen;j++) {
+    for(i=0;i&lt;n;i++) {
+      ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I;
+      sx[(i*2+0)*veclen+j] = creal(ts[j][i]);
+      sx[(i*2+1)*veclen+j] = cimag(ts[j][i]);
+    }
+  }
+
+  //
+
+  DFT_execute(p, mode, sx, -1);
+
+  for(j=0;j&lt;veclen;j++) {
+    forward(ts[j], fs[j], n);
+  }
+
+  //
+
+  int success = 1;
+
+  for(j=0;j&lt;veclen;j++) {
+    for(i=0;i&lt;n;i++) {
+      if ((fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) &gt; THRES) ||
+	  (fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) &gt; THRES)) {
+	success = 0;
+      }
+    }
+  }
+
+  printf("%s\n", success ? "OK" : "NG");
+
+  //
+
+  SIMDBase_alignedFree(sx);
+  DFT_dispose(p, mode);
+
+  exit(0);
+}
+</pre>
+
+<p>
+You should put this code under a directory in the root directory of
+the library, and then you can compile this code with the following
+command.
+</p>
+
+<pre class="code">
+gcc -Wall -g -I ../simd -I ../dft -L../simd -L../dft -O DFTExample.c -lDFT -lSIMD -lm -o DFTExample
+</pre>
+
+<h3>Compilation</h3>
+
+<p>
+The nsfft source package includes a few makefiles for various
+architectures.  You should make symbolic links to makefiles suited for
+your computer under <i>dft</i> and <i>simd</i> directories.
+</p>
+
+</body>
+</html>
diff --git a/plugins/supereq/nsfft-1.00/doc/nsfft.pdf b/plugins/supereq/nsfft-1.00/doc/nsfft.pdf
new file mode 100644
index 00000000..ed4ad5db
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/doc/nsfft.pdf
diff --git a/plugins/supereq/nsfft-1.00/ooura/Makefile b/plugins/supereq/nsfft-1.00/ooura/Makefile
new file mode 100644
index 00000000..bad1679e
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/ooura/Makefile
@@ -0,0 +1,11 @@
+CC=gcc
+BASEOPT=-Wall -g
+OPT=$(BASEOPT) -O3
+.PHONY : all clean
+all : fftsg.o
+# Remove editor backups, object files and stray executables.
+clean :
+	rm -f *~ *.o a.out
+# Compile the FFT source to an object file with the $(OPT) flags.
+fftsg.o : fftsg.c
+	$(CC) $(OPT) -c fftsg.c
diff --git a/plugins/supereq/nsfft-1.00/ooura/README b/plugins/supereq/nsfft-1.00/ooura/README
new file mode 100644
index 00000000..d7ddefc2
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/ooura/README
@@ -0,0 +1,2 @@
+Please put fftsg.c and pi_fft.c, which are included in Prof. Takuya
+Ooura's FFT package, in this directory.
diff --git a/plugins/supereq/nsfft-1.00/ooura/fftsg.c b/plugins/supereq/nsfft-1.00/ooura/fftsg.c
new file mode 100644
index 00000000..43d75344
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/ooura/fftsg.c
@@ -0,0 +1,3314 @@
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :split-radix
+    data        :inplace
+    table       :use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, double *, int *, double *);
+    void rdft(int, int, double *, int *, double *);
+    void ddct(int, int, double *, int *, double *);
+    void ddst(int, int, double *, int *, double *);
+    void dfct(int, double *, double *, int *, double *);
+    void dfst(int, double *, double *, int *, double *);
+macro definitions
+    USE_CDFT_PTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=8192
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536
+    USE_CDFT_WINTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=32768
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft(2*n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft(2*n, -1, a, ip, w);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (double *)
+                        input data
+                            a[2*j] = Re(x[j]), 
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]), 
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            cdft(2*n, -1, a, ip, w);
+        is 
+            cdft(2*n, 1, a, ip, w);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + 
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + 
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[2*k+1] = I[k], 0<k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            rdft(n, 1, a, ip, w);
+        is 
+            rdft(n, -1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddct(n, -1, a, ip, w);
+        is 
+            a[0] *= 0.5;
+            ddct(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            ddst(n, -1, a, ip, w);
+        is 
+            a[0] *= 0.5;
+            ddst(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        ip[0] = 0; // first time only
+        dfct(n, a, t, ip, w);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+        t[0...n/2]     :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+        is 
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        ip[0] = 0; // first time only
+        dfst(n, a, t, ip, w);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+        t[0...n/2-1]   :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly, 
+                        length of ip >= 
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of 
+            dfst(n, a, t, ip, w);
+        is 
+            dfst(n, a, t, ip, w);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+Appendix :
+    The cos/sin table is recalculated when a larger table is required.
+    w[] and ip[] are compatible with all routines.
+*/
+
+
+void cdft(int n, int isgn, double *a, int *ip, double *w) /* in-place complex DFT of a[0...n-1] (n doubles = n/2 interleaved Re/Im pairs); definitions are in the file header */
+{
+    void makewt(int nw, int *ip, double *w);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void cftbsub(int n, double *a, int *ip, int nw, double *w);
+    int nw;
+    /* ip[0] holds the size of the table already built in w[]; it is rebuilt only when n needs more than 4*ip[0] */
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    if (isgn >= 0) { /* any non-negative isgn selects <case1> of the file header, a negative one <case2> */
+        cftfsub(n, a, ip, nw, w);
+    } else {
+        cftbsub(n, a, ip, nw, w);
+    }
+}
+
+
+void rdft(int n, int isgn, double *a, int *ip, double *w) /* in-place real DFT of a[0...n-1] (isgn >= 0) or its unscaled inverse (isgn < 0); data layout is in the file header */
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void cftbsub(int n, double *a, int *ip, int nw, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    int nw, nc;
+    double xi;
+    /* ip[0] and ip[1] hold the sizes of the two tables already built; each is rebuilt only when too small for n */
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 2)) {
+        nc = n >> 2;
+        makect(nc, ip, w + nw); /* the second table is stored directly after the first nw entries of w[] */
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            cftfsub(n, a, ip, nw, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, ip, nw, w);
+        }
+        xi = a[0] - a[1]; /* fold the first two values: a[0] becomes their sum, a[1] their difference */
+        a[0] += a[1];
+        a[1] = xi;
+    } else {
+        a[1] = 0.5 * (a[0] - a[1]); /* these two statements exactly invert the a[0]/a[1] folding of the forward branch */
+        a[0] -= a[1];
+        if (n > 4) { /* same stages as the forward branch, in reverse order and using the "b" variants */
+            rftbsub(n, a, nc, w + nw);
+            cftbsub(n, a, ip, nw, w);
+        } else if (n == 4) {
+            cftbsub(n, a, ip, nw, w);
+        }
+    }
+}
+
+
+void ddct(int n, int isgn, double *a, int *ip, double *w) /* in-place cosine transform of a[0...n-1]: DCT when isgn < 0, unscaled IDCT when isgn >= 0 (see file header) */
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void cftbsub(int n, double *a, int *ip, int nw, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    void dctsub(int n, double *a, int nc, double *c);
+    int j, nw, nc;
+    double xr;
+    /* ip[0] and ip[1] hold the sizes of the tables already built; rebuild whichever is too small for n */
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) { /* unlike rdft(), the second table here has n entries, not n/4 */
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {
+        xr = a[n - 1]; /* saved because the first loop iteration overwrites a[n-1] */
+        for (j = n - 2; j >= 2; j -= 2) { /* runs downward so a[j-1] is read before a later iteration overwrites it */
+            a[j + 1] = a[j] - a[j - 1];
+            a[j] += a[j - 1];
+        }
+        a[1] = a[0] - xr;
+        a[0] += xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            cftbsub(n, a, ip, nw, w);
+        } else if (n == 4) {
+            cftbsub(n, a, ip, nw, w);
+        }
+    }
+    dctsub(n, a, nc, w + nw); /* runs for both directions: after the backward passes above, before the forward passes below */
+    if (isgn >= 0) {
+        if (n > 4) {
+            cftfsub(n, a, ip, nw, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, ip, nw, w);
+        }
+        xr = a[0] - a[1]; /* held until the loop has finished reading a[n-1], then stored there */
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) { /* runs upward so a[j+1] is read before the next iteration overwrites it */
+            a[j - 1] = a[j] - a[j + 1];
+            a[j] += a[j + 1];
+        }
+        a[n - 1] = xr;
+    }
+}
+
+
+void ddst(int n, int isgn, double *a, int *ip, double *w) /* in-place sine transform of a[0...n-1]: DST when isgn < 0, unscaled IDST when isgn >= 0 (see file header) */
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void cftbsub(int n, double *a, int *ip, int nw, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void rftbsub(int n, double *a, int nc, double *c);
+    void dstsub(int n, double *a, int nc, double *c);
+    int j, nw, nc;
+    double xr;
+    /* ip[0] and ip[1] hold the sizes of the tables already built; rebuild whichever is too small for n */
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) { /* the second table has n entries and is stored after the first nw entries of w[] */
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {
+        xr = a[n - 1]; /* saved because the first loop iteration overwrites a[n-1] */
+        for (j = n - 2; j >= 2; j -= 2) { /* same pass as in ddct() but with the signs of the a[j] terms changed */
+            a[j + 1] = -a[j] - a[j - 1];
+            a[j] -= a[j - 1];
+        }
+        a[1] = a[0] + xr;
+        a[0] -= xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            cftbsub(n, a, ip, nw, w);
+        } else if (n == 4) {
+            cftbsub(n, a, ip, nw, w);
+        }
+    }
+    dstsub(n, a, nc, w + nw); /* runs for both directions: after the backward passes above, before the forward passes below */
+    if (isgn >= 0) {
+        if (n > 4) {
+            cftfsub(n, a, ip, nw, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, ip, nw, w);
+        }
+        xr = a[0] - a[1]; /* held until the loop has finished reading a[n-1], then stored there negated */
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) { /* runs upward so a[j+1] is read before the next iteration overwrites it */
+            a[j - 1] = -a[j] - a[j + 1];
+            a[j] -= a[j + 1];
+        }
+        a[n - 1] = -xr;
+    }
+}
+
+
+void dfct(int n, double *a, double *t, int *ip, double *w) /* in-place cosine transform of the n+1 values a[0...n], with t[0...n/2] as scratch (see file header) */
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void dctsub(int n, double *a, int nc, double *c);
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+    /* ip[0] and ip[1] hold the sizes of the tables already built; here they are sized n/8 and n/2 */
+    nw = ip[0];
+    if (n > (nw << 3)) {
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 1)) {
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    m = n >> 1; /* from here to the branch: combine a[0], a[m], a[n] into a[0], t[0], t[m] */
+    yi = a[m];
+    xi = a[0] + a[n];
+    a[0] -= a[n];
+    t[0] = xi - yi;
+    t[m] = xi + yi;
+    if (n > 2) {
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) { /* differences a[j]-a[n-j] stay in a[]; the sums are combined pairwise (j with m-j) into t[] */
+            k = m - j;
+            xr = a[j] - a[n - j];
+            xi = a[j] + a[n - j];
+            yr = a[k] - a[n - k];
+            yi = a[k] + a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi - yi;
+            t[k] = xi + yi;
+        }
+        t[mh] = a[mh] + a[n - mh];
+        a[mh] -= a[n - mh];
+        dctsub(m, a, nc, w + nw); /* transform the m difference values now held in a[0...m-1] */
+        if (m > 4) {
+            cftfsub(m, a, ip, nw, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, ip, nw, w);
+        }
+        a[n - 1] = a[0] - a[1]; /* spread the m results over the odd indices of a[] */
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {
+            a[2 * j + 1] = a[j] + a[j + 1];
+            a[2 * j - 1] = a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) { /* each pass transforms m values in t[], stores them in a[] at odd multiples of l, then halves m and doubles l */
+            dctsub(m, t, nc, w + nw);
+            if (m > 4) {
+                cftfsub(m, t, ip, nw, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, ip, nw, w);
+            }
+            a[n - l] = t[0] - t[1];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = t[j] - t[j + 1];
+                a[k + l] = t[j] + t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 0; j < mh; j++) { /* build the next, half-size input in t[0...m/2] from t[m...2m] */
+                k = m - j;
+                t[j] = t[m + k] - t[m + j];
+                t[k] = t[m + k] + t[m + j];
+            }
+            t[mh] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0]; /* the last three values left in t[] fill the remaining outputs */
+        a[n] = t[2] - t[1];
+        a[0] = t[2] + t[1];
+    } else { /* n <= 2: the outputs are just the sums and differences computed before the branch */
+        a[1] = a[0];
+        a[2] = t[0];
+        a[0] = t[1];
+    }
+}
+
+
+void dfst(int n, double *a, double *t, int *ip, double *w) /* in-place sine transform of a[1...n-1], with a[0] and t[0...n/2-1] as scratch (see file header) */
+{
+    void makewt(int nw, int *ip, double *w);
+    void makect(int nc, int *ip, double *c);
+    void cftfsub(int n, double *a, int *ip, int nw, double *w);
+    void rftfsub(int n, double *a, int nc, double *c);
+    void dstsub(int n, double *a, int nc, double *c);
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+    /* ip[0] and ip[1] hold the sizes of the tables already built; here they are sized n/8 and n/2 */
+    nw = ip[0];
+    if (n > (nw << 3)) {
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 1)) {
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    if (n > 2) {
+        m = n >> 1;
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) { /* sums a[j]+a[n-j] stay in a[]; the differences are combined pairwise (j with m-j) into t[] */
+            k = m - j;
+            xr = a[j] + a[n - j];
+            xi = a[j] - a[n - j];
+            yr = a[k] + a[n - k];
+            yi = a[k] - a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi + yi;
+            t[k] = xi - yi;
+        }
+        t[0] = a[mh] - a[n - mh];
+        a[mh] += a[n - mh];
+        a[0] = a[m]; /* a[0] carries no input, so it is used as a work slot for a[m] */
+        dstsub(m, a, nc, w + nw); /* transform the m values now held in a[0...m-1] */
+        if (m > 4) {
+            cftfsub(m, a, ip, nw, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, ip, nw, w);
+        }
+        a[n - 1] = a[1] - a[0]; /* spread the m results over the odd indices of a[] */
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {
+            a[2 * j + 1] = a[j] - a[j + 1];
+            a[2 * j - 1] = -a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) { /* each pass transforms m values in t[], stores them in a[] at odd multiples of l, then halves m and doubles l */
+            dstsub(m, t, nc, w + nw);
+            if (m > 4) {
+                cftfsub(m, t, ip, nw, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, ip, nw, w);
+            }
+            a[n - l] = t[1] - t[0];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = -t[j] - t[j + 1];
+                a[k + l] = t[j] - t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 1; j < mh; j++) { /* build the next, half-size input in t[] from t[m...2m-1] */
+                k = m - j;
+                t[j] = t[m + k] + t[m + j];
+                t[k] = t[m + k] - t[m + j];
+            }
+            t[0] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0]; /* the last value left in t[] is the final output still missing */
+    }
+    a[0] = 0; /* a[0] is not an output; it is always cleared on return */
+}
+
+
+/* -------- initializing routines -------- */
+
+
+#include <math.h>
+
+void makewt(int nw, int *ip, double *w) /* build the cos/sin table of size nw in w[] and record its size in ip[0] */
+{
+    void makeipt(int nw, int *ip);
+    int j, nwh, nw0, nw1;
+    double delta, wn4r, wk1r, wk1i, wk3r, wk3i;
+    /* ip[1], which the transform routines read as the size of the second table, is reset to 1 */
+    ip[0] = nw;
+    ip[1] = 1;
+    if (nw > 2) {
+        nwh = nw >> 1;
+        delta = atan(1.0) / nwh; /* angle step: (pi/4) / nwh */
+        wn4r = cos(delta * nwh); /* cos(pi/4) */
+        w[0] = 1;
+        w[1] = wn4r;
+        if (nwh == 4) {
+            w[2] = cos(delta * 2);
+            w[3] = sin(delta * 2);
+        } else if (nwh > 4) {
+            makeipt(nw, ip); /* the index table in ip[2...] is only built on this branch */
+            w[2] = 0.5 / cos(delta * 2);
+            w[3] = 0.5 / cos(delta * 6);
+            for (j = 4; j < nwh; j += 4) { /* groups of four: cos/sin of angle delta*j, then cos and negated sin of three times that angle */
+                w[j] = cos(delta * j);
+                w[j + 1] = sin(delta * j);
+                w[j + 2] = cos(3 * delta * j);
+                w[j + 3] = -sin(3 * delta * j);
+            }
+        }
+        nw0 = 0;
+        while (nwh > 2) { /* append successively halved tables; each is filled from the previous one (at nw0) without new trig calls */
+            nw1 = nw0 + nwh;
+            nwh >>= 1;
+            w[nw1] = 1;
+            w[nw1 + 1] = wn4r;
+            if (nwh == 4) {
+                wk1r = w[nw0 + 4];
+                wk1i = w[nw0 + 5];
+                w[nw1 + 2] = wk1r;
+                w[nw1 + 3] = wk1i;
+            } else if (nwh > 4) {
+                wk1r = w[nw0 + 4];
+                wk3r = w[nw0 + 6];
+                w[nw1 + 2] = 0.5 / wk1r;
+                w[nw1 + 3] = 0.5 / wk3r;
+                for (j = 4; j < nwh; j += 4) { /* entry group j of the new table is group 2*j of the previous one */
+                    wk1r = w[nw0 + 2 * j];
+                    wk1i = w[nw0 + 2 * j + 1];
+                    wk3r = w[nw0 + 2 * j + 2];
+                    wk3i = w[nw0 + 2 * j + 3];
+                    w[nw1 + j] = wk1r;
+                    w[nw1 + j + 1] = wk1i;
+                    w[nw1 + j + 2] = wk3r;
+                    w[nw1 + j + 3] = wk3i;
+                }
+            }
+            nw0 = nw1;
+        }
+    }
+}
+
+
+void makeipt(int nw, int *ip) /* fill ip[2...] with an index table; presumably bit-reversal offsets, since the file header calls ip the "work area for bit reversal" */
+{
+    int j, l, m, m2, p, q;
+    /* seed ip[2] and ip[3]; each pass below reads ip[m...2m-1] and writes ip[2m...4m-1], so the table doubles */
+    ip[2] = 0;
+    ip[3] = 16;
+    m = 2;
+    for (l = nw; l > 32; l >>= 2) { /* l is divided by 4 per pass; passes continue while l > 32 */
+        m2 = m << 1;
+        q = m2 << 3;
+        for (j = m; j < m2; j++) {
+            p = ip[j] << 2;
+            ip[m + j] = p;
+            ip[m2 + j] = p + q;
+        }
+        m = m2;
+    }
+}
+
+
+void makect(int nc, int *ip, double *c) /* build the second cos/sin table of size nc in c[] and record its size in ip[1] */
+{
+    int j, nch;
+    double delta;
+    /* for nc <= 1 only ip[1] is updated and c[] is left untouched */
+    ip[1] = nc;
+    if (nc > 1) {
+        nch = nc >> 1;
+        delta = atan(1.0) / nch; /* angle step: (pi/4) / nch */
+        c[0] = cos(delta * nch); /* cos(pi/4) */
+        c[nch] = 0.5 * c[0];
+        for (j = 1; j < nch; j++) { /* half-cosines fill c[] from the front, half-sines from the back */
+            c[j] = 0.5 * cos(delta * j);
+            c[nc - j] = 0.5 * sin(delta * j);
+        }
+    }
+}
+
+
+/* -------- child routines -------- */
+
+
+#ifdef USE_CDFT_PTHREADS /* POSIX-threads backend for the threaded complex-FFT path */
+#define USE_CDFT_THREADS /* generic switch tested with #ifdef inside cftfsub() and cftbsub() */
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 8192 /* transforms with n above this call cftrec4_th(); the file header requires >= 512 */
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 65536 /* presumably the size from which four threads are used; its use is outside this view (TODO confirm) */
+#endif
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t pthread_t
+#define cdft_thread_create(thp,func,argp) { /* start func(argp) in a new thread; on failure print to stderr and exit(1) */ \
+    if (pthread_create(thp, NULL, func, (void *) argp) != 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#define cdft_thread_wait(th) { /* join thread th; on failure print to stderr and exit(1) */ \
+    if (pthread_join(th, NULL) != 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#endif /* USE_CDFT_PTHREADS */
+
+
+#ifdef USE_CDFT_WINTHREADS /* Win32-threads backend for the threaded complex-FFT path */
+#define USE_CDFT_THREADS /* generic switch tested with #ifdef inside cftfsub() and cftbsub() */
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 32768 /* transforms with n above this call cftrec4_th(); the file header requires >= 512 */
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 524288 /* presumably the size from which four threads are used; its use is outside this view (TODO confirm) */
+#endif
+#include <windows.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t HANDLE
+#define cdft_thread_create(thp,func,argp) { /* start func(argp) with CreateThread; a zero handle prints to stderr and exit(1) */ \
+    DWORD thid; \
+    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \
+    if (*(thp) == 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#define cdft_thread_wait(th) { /* wait without timeout, then close the handle; neither return value is checked */ \
+    WaitForSingleObject(th, INFINITE); \
+    CloseHandle(th); \
+}
+#endif /* USE_CDFT_WINTHREADS */
+
+
+void cftfsub(int n, double *a, int *ip, int nw, double *w) /* forward complex-FFT driver: selects the kernel sequence for a[0...n-1] from the length n */
+{
+    void bitrv2(int n, int *ip, double *a);
+    void bitrv216(double *a);
+    void bitrv208(double *a);
+    void cftf1st(int n, double *a, double *w);
+    void cftrec4(int n, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftfx41(int n, double *a, int nw, double *w);
+    void cftf161(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftf040(double *a);
+    void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+    void cftrec4_th(int n, double *a, int nw, double *w);
+#endif /* USE_CDFT_THREADS */
+    /* n here counts doubles in a[]; lengths below 4 fall through every branch and leave a[] untouched */
+    if (n > 8) {
+        if (n > 32) {
+            cftf1st(n, a, &w[nw - (n >> 2)]); /* first pass uses the last n/4 entries of the first nw table entries */
+#ifdef USE_CDFT_THREADS
+            if (n > CDFT_THREADS_BEGIN_N) {
+                cftrec4_th(n, a, nw, w);
+            } else 
+#endif /* USE_CDFT_THREADS */
+            if (n > 512) { /* with threads enabled this if is the else-branch of the threshold test above */
+                cftrec4(n, a, nw, w);
+            } else if (n > 128) {
+                cftleaf(n, 1, a, nw, w);
+            } else {
+                cftfx41(n, a, nw, w);
+            }
+            bitrv2(n, ip, a); /* final reordering using the ip[] table; presumably the bit-reversal permutation */
+        } else if (n == 32) {
+            cftf161(a, &w[nw - 8]);
+            bitrv216(a);
+        } else { /* remaining lengths with 8 < n < 32, i.e. 16 for power-of-two input */
+            cftf081(a, w);
+            bitrv208(a);
+        }
+    } else if (n == 8) { /* the two smallest sizes use a single kernel with no table and no reordering call */
+        cftf040(a);
+    } else if (n == 4) {
+        cftx020(a);
+    }
+}
+/* cftbsub: size dispatcher mirroring cftfsub() -- same structure, but uses cftb1st, cftb040 and the bitrv2conj / bitrv216neg / bitrv208neg reorderings. */
+/* n: size used for dispatch (nothing happens for n < 4 or n in 5..7); ip: index table handed to bitrv2conj; nw: base offset into w (callees get &w[nw - n/4] or &w[nw - 8]). */
+void cftbsub(int n, double *a, int *ip, int nw, double *w)
+{
+    void bitrv2conj(int n, int *ip, double *a); /* callees are declared at block scope before use */
+    void bitrv216neg(double *a);
+    void bitrv208neg(double *a);
+    void cftb1st(int n, double *a, double *w);
+    void cftrec4(int n, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftfx41(int n, double *a, int nw, double *w);
+    void cftf161(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftb040(double *a);
+    void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+    void cftrec4_th(int n, double *a, int nw, double *w);
+#endif /* USE_CDFT_THREADS */
+    /* The "} else" inside the #ifdef below binds to the following "if (n > 512)"; without threads that chain simply starts at "if (n > 512)". */
+    if (n > 8) {
+        if (n > 32) {
+            cftb1st(n, a, &w[nw - (n >> 2)]); /* first stage uses the last n/4 entries below w[nw] */
+#ifdef USE_CDFT_THREADS
+            if (n > CDFT_THREADS_BEGIN_N) {
+                cftrec4_th(n, a, nw, w);
+            } else 
+#endif /* USE_CDFT_THREADS */
+            if (n > 512) {
+                cftrec4(n, a, nw, w);
+            } else if (n > 128) {
+                cftleaf(n, 1, a, nw, w);
+            } else {
+                cftfx41(n, a, nw, w);
+            }
+            bitrv2conj(n, ip, a); /* reordering that also flips the sign of the second element of each pair */
+        } else if (n == 32) {
+            cftf161(a, &w[nw - 8]); /* same kernel as in cftfsub(); only the reordering differs */
+            bitrv216neg(a);
+        } else { /* 8 < n < 32 */
+            cftf081(a, w); /* same kernel as in cftfsub(); only the reordering differs */
+            bitrv208neg(a);
+        }
+    } else if (n == 8) {
+        cftb040(a);
+    } else if (n == 4) {
+        cftx020(a);
+    }
+}
+/* bitrv2: in-place reordering of a[] -- exchanges pairs of adjacent doubles (a[i], a[i+1]) at offsets derived from the table entries ip[m .. 2m-1]. */
+/* Presumably the bit-reversal pass (per the name); called by cftfsub() for n > 32. ip[] is only read here and must be filled beforehand -- TODO confirm where. */
+void bitrv2(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, nh, nm;
+    double xr, xi, yr, yi;
+    /* m doubles for every two-bit shift needed to bring l = n/4 down to 8 or less; the final l selects the branch below */
+    m = 1;
+    for (l = n >> 2; l > 8; l >>= 2) {
+        m <<= 1;
+    }
+    nh = n >> 1;
+    nm = 4 * m;
+    if (l == 8) { /* 16 pair swaps for every j < k, then 6 swaps that depend on k alone */
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + 2 * ip[m + k];
+                k1 = 4 * k + 2 * ip[m + j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + 2 * ip[m + k]; /* swaps that depend on k alone start here */
+            j1 = k1 + 2;
+            k1 += nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 += 2 * nm;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 -= nm;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= 2;
+            k1 -= nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nh + 2;
+            k1 += nh + 2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= nh - nm;
+            k1 += 2 * nm - 2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+        }
+    } else { /* l != 8: ip[] entries are used unscaled; 8 pair swaps for every j < k, then 2 that depend on k alone */
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + ip[m + k];
+                k1 = 4 * k + ip[m + j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + ip[m + k]; /* swaps that depend on k alone start here */
+            j1 = k1 + 2;
+            k1 += nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 += nm;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+        }
+    }
+}
+/* bitrv2conj: same swap pattern as bitrv2(), but every swapped pair also has the sign of its second element flipped (xi = -a[j1 + 1], yi = -a[k1 + 1]). */
+/* The k-only sections additionally negate a[j1 - 1] and a[k1 + 3] in place. Called by cftbsub() for n > 32; ip[m .. 2m-1] is only read and must be filled beforehand. */
+void bitrv2conj(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, nh, nm;
+    double xr, xi, yr, yi;
+    /* m doubles for every two-bit shift needed to bring l = n/4 down to 8 or less; the final l selects the branch below */
+    m = 1;
+    for (l = n >> 2; l > 8; l >>= 2) {
+        m <<= 1;
+    }
+    nh = n >> 1;
+    nm = 4 * m;
+    if (l == 8) { /* 16 negating pair swaps for every j < k, then 6 that depend on k alone */
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + 2 * ip[m + k];
+                k1 = 4 * k + 2 * ip[m + j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + 2 * ip[m + k]; /* swaps that depend on k alone start here */
+            j1 = k1 + 2;
+            k1 += nh;
+            a[j1 - 1] = -a[j1 - 1]; /* element just before the pair is not swapped, only sign-flipped */
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3]; /* likewise, sign flip only */
+            j1 += nm;
+            k1 += 2 * nm;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 -= nm;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= 2;
+            k1 -= nh;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nh + 2;
+            k1 += nh + 2;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 -= nh - nm;
+            k1 += 2 * nm - 2;
+            a[j1 - 1] = -a[j1 - 1]; /* sign flip only */
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3]; /* sign flip only */
+        }
+    } else { /* l != 8: ip[] entries are used unscaled; 8 negating pair swaps for every j < k, then 2 that depend on k alone */
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + ip[m + k];
+                k1 = 4 * k + ip[m + j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += 2;
+                k1 += nh;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nh;
+                k1 -= 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + ip[m + k]; /* swaps that depend on k alone start here */
+            j1 = k1 + 2;
+            k1 += nh;
+            a[j1 - 1] = -a[j1 - 1]; /* element just before the pair is not swapped, only sign-flipped */
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3]; /* likewise, sign flip only */
+            j1 += nm;
+            k1 += nm;
+            a[j1 - 1] = -a[j1 - 1]; /* sign flip only */
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3]; /* sign flip only */
+        }
+    }
+}
+/* bitrv216: fixed in-place permutation of the 16 pairs (a[2i], a[2i+1]) in a[0..31]; pair i moves to the index given by reversing i's 4-bit pattern. */
+/* Pairs 0, 6, 9 and 15 (whose 4-bit patterns read the same reversed) are never touched. Called by cftfsub() when n == 32. */
+void bitrv216(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
+        x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i, 
+        x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i;
+    /* load every pair that moves (xKr/xKi = pair K) before any store, so no value is overwritten early */
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x7r = a[14];
+    x7i = a[15];
+    x8r = a[16];
+    x8i = a[17];
+    x10r = a[20];
+    x10i = a[21];
+    x11r = a[22];
+    x11i = a[23];
+    x12r = a[24];
+    x12i = a[25];
+    x13r = a[26];
+    x13i = a[27];
+    x14r = a[28];
+    x14i = a[29];
+    a[2] = x8r; /* stores: 1<->8, 2<->4, 3<->12, 5<->10, 7<->14, 11<->13 */
+    a[3] = x8i;
+    a[4] = x4r;
+    a[5] = x4i;
+    a[6] = x12r;
+    a[7] = x12i;
+    a[8] = x2r;
+    a[9] = x2i;
+    a[10] = x10r;
+    a[11] = x10i;
+    a[14] = x14r;
+    a[15] = x14i;
+    a[16] = x1r;
+    a[17] = x1i;
+    a[20] = x5r;
+    a[21] = x5i;
+    a[22] = x13r;
+    a[23] = x13i;
+    a[24] = x3r;
+    a[25] = x3i;
+    a[26] = x11r;
+    a[27] = x11i;
+    a[28] = x7r;
+    a[29] = x7i;
+}
+/* bitrv216neg: fixed in-place permutation of the 16 pairs (a[2i], a[2i+1]) in a[0..31]; new pair i = old pair rev4((16 - i) mod 16), rev4 = 4-bit reversal. */
+/* Values are copied unchanged (no sign flips here); pair 0 stays in place. Called by cftbsub() when n == 32. */
+void bitrv216neg(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
+        x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i, 
+        x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i, 
+        x13r, x13i, x14r, x14i, x15r, x15i;
+    /* load pairs 1..15 (xKr/xKi = pair K) before any store, so no value is overwritten early */
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x6r = a[12];
+    x6i = a[13];
+    x7r = a[14];
+    x7i = a[15];
+    x8r = a[16];
+    x8i = a[17];
+    x9r = a[18];
+    x9i = a[19];
+    x10r = a[20];
+    x10i = a[21];
+    x11r = a[22];
+    x11i = a[23];
+    x12r = a[24];
+    x12i = a[25];
+    x13r = a[26];
+    x13i = a[27];
+    x14r = a[28];
+    x14i = a[29];
+    x15r = a[30];
+    x15i = a[31];
+    a[2] = x15r; /* stores: slot 1..15 receives old pair 15,7,11,3,13,5,9,1,14,6,10,2,12,4,8 */
+    a[3] = x15i;
+    a[4] = x7r;
+    a[5] = x7i;
+    a[6] = x11r;
+    a[7] = x11i;
+    a[8] = x3r;
+    a[9] = x3i;
+    a[10] = x13r;
+    a[11] = x13i;
+    a[12] = x5r;
+    a[13] = x5i;
+    a[14] = x9r;
+    a[15] = x9i;
+    a[16] = x1r;
+    a[17] = x1i;
+    a[18] = x14r;
+    a[19] = x14i;
+    a[20] = x6r;
+    a[21] = x6i;
+    a[22] = x10r;
+    a[23] = x10i;
+    a[24] = x2r;
+    a[25] = x2i;
+    a[26] = x12r;
+    a[27] = x12i;
+    a[28] = x4r;
+    a[29] = x4i;
+    a[30] = x8r;
+    a[31] = x8i;
+}
+/* bitrv208: fixed in-place permutation of the 8 pairs (a[2i], a[2i+1]) in a[0..15]; swaps pair 1 with 4 and pair 3 with 6 (3-bit reversal of the pair index). */
+/* Pairs 0, 2, 5 and 7 are never touched. Called by cftfsub() for 8 < n < 32. */
+void bitrv208(double *a)
+{
+    double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i;
+    /* load the four pairs that move (xKr/xKi = pair K) before any store */
+    x1r = a[2];
+    x1i = a[3];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x6r = a[12];
+    x6i = a[13];
+    a[2] = x4r;
+    a[3] = x4i;
+    a[6] = x6r;
+    a[7] = x6i;
+    a[8] = x1r;
+    a[9] = x1i;
+    a[12] = x3r;
+    a[13] = x3i;
+}
+/* bitrv208neg: fixed in-place permutation of the 8 pairs (a[2i], a[2i+1]) in a[0..15]; new pair i = old pair rev3((8 - i) mod 8), rev3 = 3-bit reversal. */
+/* Values are copied unchanged (no sign flips here); pair 0 stays in place. Called by cftbsub() for 8 < n < 32. */
+void bitrv208neg(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, 
+        x5r, x5i, x6r, x6i, x7r, x7i;
+    /* load pairs 1..7 (xKr/xKi = pair K) before any store, so no value is overwritten early */
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x6r = a[12];
+    x6i = a[13];
+    x7r = a[14];
+    x7i = a[15];
+    a[2] = x7r; /* stores: slot 1..7 receives old pair 7,3,5,1,6,2,4 */
+    a[3] = x7i;
+    a[4] = x3r;
+    a[5] = x3i;
+    a[6] = x5r;
+    a[7] = x5i;
+    a[8] = x1r;
+    a[9] = x1i;
+    a[10] = x6r;
+    a[11] = x6i;
+    a[12] = x2r;
+    a[13] = x2i;
+    a[14] = x4r;
+    a[15] = x4i;
+}
+/* cftf1st: in-place butterfly stage -- combines the pairs (a[j], a[j+1]) found at offsets j, j+m, j+2m, j+3m (m = 2 * (n >> 3)); sums land in the first two blocks, weighted differences in the last two. */
+/* w: coefficient table; w[1], w[2], w[3] are read as wn4r, csc1, csc3 and w[4..] is consumed four entries per loop pass. Called by cftfsub() for n > 32 with w = &w[nw - n/4]. */
+void cftf1st(int n, double *a, double *w)
+{
+    int j, j0, j1, j2, j3, k, m, mh;
+    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, 
+        wd1r, wd1i, wd3r, wd3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
+    /* offset 0 first: this butterfly needs no coefficient multiplications */
+    mh = n >> 3;
+    m = 2 * mh;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2];
+    x0i = a[1] + a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = a[1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    a[j2] = x1r - x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+    wn4r = w[1];
+    csc1 = w[2];
+    csc3 = w[3];
+    wd1r = 1; /* wd1/wd3 start at (1, 0) and afterwards hold the most recent table entries */
+    wd1i = 0;
+    wd3r = 1;
+    wd3i = 0;
+    k = 0;
+    for (j = 2; j < mh - 2; j += 4) { /* each pass handles pairs j and j+2, then the mirrored pairs m-j and m-j-2 */
+        k += 4;
+        wk1r = csc1 * (wd1r + w[k]); /* wk1/wk3: scaled sum of the previous and current table entries, used for pair j */
+        wk1i = csc1 * (wd1i + w[k + 1]);
+        wk3r = csc3 * (wd3r + w[k + 2]);
+        wk3i = csc3 * (wd3i + w[k + 3]);
+        wd1r = w[k]; /* the current table entries themselves are used for pair j+2 */
+        wd1i = w[k + 1];
+        wd3r = w[k + 2];
+        wd3i = w[k + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] + a[j2];
+        x0i = a[j + 1] + a[j2 + 1];
+        x1r = a[j] - a[j2];
+        x1i = a[j + 1] - a[j2 + 1];
+        y0r = a[j + 2] + a[j2 + 2];
+        y0i = a[j + 3] + a[j2 + 3];
+        y1r = a[j + 2] - a[j2 + 2];
+        y1i = a[j + 3] - a[j2 + 3];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 + 2] + a[j3 + 2];
+        y2i = a[j1 + 3] + a[j3 + 3];
+        y3r = a[j1 + 2] - a[j3 + 2];
+        y3i = a[j1 + 3] - a[j3 + 3];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j + 2] = y0r + y2r;
+        a[j + 3] = y0i + y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        a[j1 + 2] = y0r - y2r;
+        a[j1 + 3] = y0i - y2i;
+        x0r = x1r - x3i; /* x0r/x0i are reused as scratch from here on */
+        x0i = x1i + x3r;
+        a[j2] = wk1r * x0r - wk1i * x0i;
+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i + y3r;
+        a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+        a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3r * x0r + wk3i * x0i;
+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i - y3r;
+        a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+        a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+        j0 = m - j; /* mirrored half: same coefficients with their two components exchanged */
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] + a[j2];
+        x0i = a[j0 + 1] + a[j2 + 1];
+        x1r = a[j0] - a[j2];
+        x1i = a[j0 + 1] - a[j2 + 1];
+        y0r = a[j0 - 2] + a[j2 - 2];
+        y0i = a[j0 - 1] + a[j2 - 1];
+        y1r = a[j0 - 2] - a[j2 - 2];
+        y1i = a[j0 - 1] - a[j2 - 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 - 2] + a[j3 - 2];
+        y2i = a[j1 - 1] + a[j3 - 1];
+        y3r = a[j1 - 2] - a[j3 - 2];
+        y3i = a[j1 - 1] - a[j3 - 1];
+        a[j0] = x0r + x2r;
+        a[j0 + 1] = x0i + x2i;
+        a[j0 - 2] = y0r + y2r;
+        a[j0 - 1] = y0i + y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        a[j1 - 2] = y0r - y2r;
+        a[j1 - 1] = y0i - y2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1i * x0r - wk1r * x0i;
+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i + y3r;
+        a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+        a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3i * x0r + wk3r * x0i;
+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i - y3r;
+        a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+        a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+    }
+    wk1r = csc1 * (wd1r + wn4r); /* final coefficients combine the last table entries with wn4r */
+    wk1i = csc1 * (wd1i + wn4r);
+    wk3r = csc3 * (wd3r - wn4r);
+    wk3i = csc3 * (wd3i - wn4r);
+    j0 = mh; /* remaining middle pairs: mh-2, mh and mh+2 */
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0 - 2] + a[j2 - 2];
+    x0i = a[j0 - 1] + a[j2 - 1];
+    x1r = a[j0 - 2] - a[j2 - 2];
+    x1i = a[j0 - 1] - a[j2 - 1];
+    x2r = a[j1 - 2] + a[j3 - 2];
+    x2i = a[j1 - 1] + a[j3 - 1];
+    x3r = a[j1 - 2] - a[j3 - 2];
+    x3i = a[j1 - 1] - a[j3 - 1];
+    a[j0 - 2] = x0r + x2r;
+    a[j0 - 1] = x0i + x2i;
+    a[j1 - 2] = x0r - x2r;
+    a[j1 - 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+    x0r = a[j0] + a[j2];
+    x0i = a[j0 + 1] + a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = a[j0 + 1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2] = wn4r * (x0r - x0i); /* pair mh is weighted by wn4r alone */
+    a[j2 + 1] = wn4r * (x0i + x0r);
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wn4r * (x0r + x0i);
+    a[j3 + 1] = -wn4r * (x0i - x0r);
+    x0r = a[j0 + 2] + a[j2 + 2];
+    x0i = a[j0 + 3] + a[j2 + 3];
+    x1r = a[j0 + 2] - a[j2 + 2];
+    x1i = a[j0 + 3] - a[j2 + 3];
+    x2r = a[j1 + 2] + a[j3 + 2];
+    x2i = a[j1 + 3] + a[j3 + 3];
+    x3r = a[j1 + 2] - a[j3 + 2];
+    x3i = a[j1 + 3] - a[j3 + 3];
+    a[j0 + 2] = x0r + x2r;
+    a[j0 + 3] = x0i + x2i;
+    a[j1 + 2] = x0r - x2r;
+    a[j1 + 3] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+
+void cftb1st(int n, double *a, double *w) /* In-place pass over a[] that combines four rows j, j+m, j+2m, j+3m (m = 2*(n>>3)); the odd-index ("i") inputs of rows j and j+2m enter negated. */
+{
+    int j, j0, j1, j2, j3, k, m, mh;
+    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, 
+        wd1r, wd1i, wd3r, wd3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
+    
+    mh = n >> 3; /* n / 8 */
+    m = 2 * mh; /* distance between the four rows */
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2]; /* pair at j = 0: combined without any factor from w[] */
+    x0i = -a[1] - a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = -a[1] + a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i - x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    a[j2] = x1r + x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r - x3i;
+    a[j3 + 1] = x1i - x3r;
+    wn4r = w[1]; /* w[1], w[2], w[3] are three scalar constants used below */
+    csc1 = w[2];
+    csc3 = w[3];
+    wd1r = 1; /* "previous table entry" starts as (1, 0) */
+    wd1i = 0;
+    wd3r = 1;
+    wd3i = 0;
+    k = 0;
+    for (j = 2; j < mh - 2; j += 4) { /* each iteration handles pairs j and j + 2 plus the mirrored pairs m - j and m - j - 2 */
+        k += 4;
+        wk1r = csc1 * (wd1r + w[k]); /* factor for pair j: scaled sum of the previous and the current table entry */
+        wk1i = csc1 * (wd1i + w[k + 1]);
+        wk3r = csc3 * (wd3r + w[k + 2]);
+        wk3i = csc3 * (wd3i + w[k + 3]);
+        wd1r = w[k]; /* current table entry, used unscaled for pair j + 2 and kept for the next iteration */
+        wd1i = w[k + 1];
+        wd3r = w[k + 2];
+        wd3i = w[k + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] + a[j2];
+        x0i = -a[j + 1] - a[j2 + 1];
+        x1r = a[j] - a[j2];
+        x1i = -a[j + 1] + a[j2 + 1];
+        y0r = a[j + 2] + a[j2 + 2];
+        y0i = -a[j + 3] - a[j2 + 3];
+        y1r = a[j + 2] - a[j2 + 2];
+        y1i = -a[j + 3] + a[j2 + 3];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 + 2] + a[j3 + 2];
+        y2i = a[j1 + 3] + a[j3 + 3];
+        y3r = a[j1 + 2] - a[j3 + 2];
+        y3i = a[j1 + 3] - a[j3 + 3];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i - x2i;
+        a[j + 2] = y0r + y2r;
+        a[j + 3] = y0i - y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i + x2i;
+        a[j1 + 2] = y0r - y2r;
+        a[j1 + 3] = y0i + y2i;
+        x0r = x1r + x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1r * x0r - wk1i * x0i;
+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i + y3r;
+        a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+        a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3r * x0r + wk3i * x0i;
+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i - y3r;
+        a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+        a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+        j0 = m - j; /* mirrored pairs: same factors, with their r and i members exchanged */
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] + a[j2];
+        x0i = -a[j0 + 1] - a[j2 + 1];
+        x1r = a[j0] - a[j2];
+        x1i = -a[j0 + 1] + a[j2 + 1];
+        y0r = a[j0 - 2] + a[j2 - 2];
+        y0i = -a[j0 - 1] - a[j2 - 1];
+        y1r = a[j0 - 2] - a[j2 - 2];
+        y1i = -a[j0 - 1] + a[j2 - 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 - 2] + a[j3 - 2];
+        y2i = a[j1 - 1] + a[j3 - 1];
+        y3r = a[j1 - 2] - a[j3 - 2];
+        y3i = a[j1 - 1] - a[j3 - 1];
+        a[j0] = x0r + x2r;
+        a[j0 + 1] = x0i - x2i;
+        a[j0 - 2] = y0r + y2r;
+        a[j0 - 1] = y0i - y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i + x2i;
+        a[j1 - 2] = y0r - y2r;
+        a[j1 - 1] = y0i + y2i;
+        x0r = x1r + x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1i * x0r - wk1r * x0i;
+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+        x0r = y1r + y3i;
+        x0i = y1i + y3r;
+        a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+        a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3i * x0r + wk3r * x0i;
+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+        x0r = y1r - y3i;
+        x0i = y1i - y3r;
+        a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+        a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+    }
+    wk1r = csc1 * (wd1r + wn4r); /* after the loop: the three remaining pairs at mh - 2, mh and mh + 2 */
+    wk1i = csc1 * (wd1i + wn4r);
+    wk3r = csc3 * (wd3r - wn4r);
+    wk3i = csc3 * (wd3i - wn4r);
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0 - 2] + a[j2 - 2]; /* pair at mh - 2 */
+    x0i = -a[j0 - 1] - a[j2 - 1];
+    x1r = a[j0 - 2] - a[j2 - 2];
+    x1i = -a[j0 - 1] + a[j2 - 1];
+    x2r = a[j1 - 2] + a[j3 - 2];
+    x2i = a[j1 - 1] + a[j3 - 1];
+    x3r = a[j1 - 2] - a[j3 - 2];
+    x3i = a[j1 - 1] - a[j3 - 1];
+    a[j0 - 2] = x0r + x2r;
+    a[j0 - 1] = x0i - x2i;
+    a[j1 - 2] = x0r - x2r;
+    a[j1 - 1] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+    x0r = a[j0] + a[j2]; /* pair at mh: only wn4r is needed */
+    x0i = -a[j0 + 1] - a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = -a[j0 + 1] + a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i - x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2] = wn4r * (x0r - x0i);
+    a[j2 + 1] = wn4r * (x0i + x0r);
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wn4r * (x0r + x0i);
+    a[j3 + 1] = -wn4r * (x0i - x0r);
+    x0r = a[j0 + 2] + a[j2 + 2]; /* pair at mh + 2: same factors as mh - 2 with r and i exchanged */
+    x0i = -a[j0 + 3] - a[j2 + 3];
+    x1r = a[j0 + 2] - a[j2 + 2];
+    x1i = -a[j0 + 3] + a[j2 + 3];
+    x2r = a[j1 + 2] + a[j3 + 2];
+    x2i = a[j1 + 3] + a[j3 + 3];
+    x3r = a[j1 + 2] - a[j3 + 2];
+    x3i = a[j1 + 3] - a[j3 + 3];
+    a[j0 + 2] = x0r + x2r;
+    a[j0 + 3] = x0i - x2i;
+    a[j1 + 2] = x0r - x2r;
+    a[j1 + 3] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+
+#ifdef USE_CDFT_THREADS
+struct cdft_arg_st { /* argument bundle handed to the cftrec1_th / cftrec2_th thread entry points */
+    int n0; /* n as received by cftrec4_th (length before splitting) */
+    int n; /* length of the slice given to this thread */
+    double *a; /* first element of this thread's slice */
+    int nw; /* nw as received by cftrec4_th, passed through unchanged */
+    double *w; /* w as received by cftrec4_th, passed through unchanged */
+};
+typedef struct cdft_arg_st cdft_arg_t;
+
+
+void cftrec4_th(int n, double *a, int nw, double *w) /* Splits a[0..n-1] into 2 slices (4 when n > CDFT_4THREADS_BEGIN_N), starts one thread per slice and waits for all of them. */
+{
+    void *cftrec1_th(void *p);
+    void *cftrec2_th(void *p);
+    int i, idiv4, m, nthread;
+    cdft_thread_t th[4];
+    cdft_arg_t ag[4];
+    
+    nthread = 2;
+    idiv4 = 0; /* index of the one slice that is handled by cftrec2_th */
+    m = n >> 1; /* slice length for two threads */
+    if (n > CDFT_4THREADS_BEGIN_N) {
+        nthread = 4;
+        idiv4 = 1;
+        m >>= 1; /* slice length for four threads */
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].n0 = n;
+        ag[i].n = m;
+        ag[i].a = &a[i * m]; /* slices are consecutive and do not overlap */
+        ag[i].nw = nw;
+        ag[i].w = w;
+        if (i != idiv4) {
+            cdft_thread_create(&th[i], cftrec1_th, &ag[i]);
+        } else {
+            cdft_thread_create(&th[i], cftrec2_th, &ag[i]);
+        }
+    }
+    for (i = 0; i < nthread; i++) { /* ag[] and th[] live on this stack frame, so every thread is waited for before returning */
+        cdft_thread_wait(th[i]);
+    }
+}
+
+
+void *cftrec1_th(void *p) /* Thread entry point: p points to a cdft_arg_t; processes that slice using cftmdl1, cfttree and cftleaf. Always returns a null pointer. */
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl1(int n, double *a, double *w);
+    int isplt, j, k, m, n, n0, nw;
+    double *a, *w;
+    
+    n0 = ((cdft_arg_t *) p)->n0;
+    n = ((cdft_arg_t *) p)->n;
+    a = ((cdft_arg_t *) p)->a;
+    nw = ((cdft_arg_t *) p)->nw;
+    w = ((cdft_arg_t *) p)->w;
+    m = n0; /* block size starts from the unsplit length n0, not the slice length n */
+    while (m > 512) { /* quarter the block size until it is at most 512, applying cftmdl1 to the last m elements of the slice each time */
+        m >>= 2;
+        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
+    }
+    cftleaf(m, 1, &a[n - m], nw, w);
+    k = 0;
+    for (j = n - m; j > 0; j -= m) { /* walk the remaining blocks of m elements from the end of the slice toward its start */
+        k++;
+        isplt = cfttree(m, j, k, a, nw, w);
+        cftleaf(m, isplt, &a[j - m], nw, w);
+    }
+    return (void *) 0;
+}
+
+
+void *cftrec2_th(void *p) /* Thread entry point: p points to a cdft_arg_t; processes that slice using cftmdl2, cfttree and cftleaf. Always returns a null pointer. */
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl2(int n, double *a, double *w);
+    int isplt, j, k, m, n, n0, nw;
+    double *a, *w;
+    
+    n0 = ((cdft_arg_t *) p)->n0;
+    n = ((cdft_arg_t *) p)->n;
+    a = ((cdft_arg_t *) p)->a;
+    nw = ((cdft_arg_t *) p)->nw;
+    w = ((cdft_arg_t *) p)->w;
+    k = 1;
+    m = n0; /* block size starts from the unsplit length n0, not the slice length n */
+    while (m > 512) { /* quarter the block size until it is at most 512; k is multiplied by 4 for every level */
+        m >>= 2;
+        k <<= 2;
+        cftmdl2(m, &a[n - m], &w[nw - m]);
+    }
+    cftleaf(m, 0, &a[n - m], nw, w);
+    k >>= 1; /* starting value of the block counter passed to cfttree below */
+    for (j = n - m; j > 0; j -= m) { /* walk the remaining blocks of m elements from the end of the slice toward its start */
+        k++;
+        isplt = cfttree(m, j, k, a, nw, w);
+        cftleaf(m, isplt, &a[j - m], nw, w);
+    }
+    return (void *) 0;
+}
+#endif /* USE_CDFT_THREADS */
+
+
+void cftrec4(int n, double *a, int nw, double *w) /* Processes a[0..n-1] in blocks of at most 512 elements, using cftmdl1 for the size reduction and cfttree + cftleaf per block. */
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl1(int n, double *a, double *w);
+    int isplt, j, k, m;
+    
+    m = n;
+    while (m > 512) { /* quarter the block size until it is at most 512, applying cftmdl1 to the last m elements each time */
+        m >>= 2;
+        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
+    }
+    cftleaf(m, 1, &a[n - m], nw, w);
+    k = 0;
+    for (j = n - m; j > 0; j -= m) { /* remaining blocks, from the end of a[] toward its start; k counts the blocks */
+        k++;
+        isplt = cfttree(m, j, k, a, nw, w);
+        cftleaf(m, isplt, &a[j - m], nw, w);
+    }
+}
+
+
+int cfttree(int n, int j, int k, double *a, int nw, double *w) /* Applies the cftmdl1/cftmdl2 passes selected by the block counter k to the data ending at a[j]; returns 1 when cftmdl1 was selected, 0 when cftmdl2 was. */
+{
+    void cftmdl1(int n, double *a, double *w);
+    void cftmdl2(int n, double *a, double *w);
+    int i, isplt, m;
+    
+    if ((k & 3) != 0) { /* k is not a multiple of 4: one pass of size n */
+        isplt = k & 1;
+        if (isplt != 0) {
+            cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]);
+        } else {
+            cftmdl2(n, &a[j - n], &w[nw - n]);
+        }
+    } else {
+        m = n;
+        for (i = k; (i & 3) == 0; i >>= 2) { /* m = n * 4 for every factor of 4 in k; callers in this file pass k >= 1, so the loop ends */
+            m <<= 2;
+        }
+        isplt = i & 1; /* low bit of k after its factors of 4 were removed */
+        if (isplt != 0) {
+            while (m > 128) { /* passes of size m, m/4, ... for as long as the size exceeds 128 */
+                cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]);
+                m >>= 2;
+            }
+        } else {
+            while (m > 128) {
+                cftmdl2(m, &a[j - m], &w[nw - m]);
+                m >>= 2;
+            }
+        }
+    }
+    return isplt;
+}
+
+
+void cftleaf(int n, int isplt, double *a, int nw, double *w) /* Runs a fixed sequence of sub-transforms on one block: a[0..511] when n == 512, otherwise a[0..255]; isplt picks the variant used for the last quarter. */
+{
+    void cftmdl1(int n, double *a, double *w);
+    void cftmdl2(int n, double *a, double *w);
+    void cftf161(double *a, double *w);
+    void cftf162(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftf082(double *a, double *w);
+    
+    if (n == 512) { /* four quarters of 128 elements, each finished by four 32-element cftf161/cftf162 calls */
+        cftmdl1(128, a, &w[nw - 64]);
+        cftf161(a, &w[nw - 8]);
+        cftf162(&a[32], &w[nw - 32]);
+        cftf161(&a[64], &w[nw - 8]);
+        cftf161(&a[96], &w[nw - 8]);
+        cftmdl2(128, &a[128], &w[nw - 128]);
+        cftf161(&a[128], &w[nw - 8]);
+        cftf162(&a[160], &w[nw - 32]);
+        cftf161(&a[192], &w[nw - 8]);
+        cftf162(&a[224], &w[nw - 32]);
+        cftmdl1(128, &a[256], &w[nw - 64]);
+        cftf161(&a[256], &w[nw - 8]);
+        cftf162(&a[288], &w[nw - 32]);
+        cftf161(&a[320], &w[nw - 8]);
+        cftf161(&a[352], &w[nw - 8]);
+        if (isplt != 0) { /* last quarter: isplt selects the cftmdl1/cftf161 or the cftmdl2/cftf162 variant */
+            cftmdl1(128, &a[384], &w[nw - 64]);
+            cftf161(&a[480], &w[nw - 8]);
+        } else {
+            cftmdl2(128, &a[384], &w[nw - 128]);
+            cftf162(&a[480], &w[nw - 32]);
+        }
+        cftf161(&a[384], &w[nw - 8]);
+        cftf162(&a[416], &w[nw - 32]);
+        cftf161(&a[448], &w[nw - 8]);
+    } else { /* same layout at half the size: quarters of 64 elements, 16-element cftf081/cftf082 calls */
+        cftmdl1(64, a, &w[nw - 32]);
+        cftf081(a, &w[nw - 8]);
+        cftf082(&a[16], &w[nw - 8]);
+        cftf081(&a[32], &w[nw - 8]);
+        cftf081(&a[48], &w[nw - 8]);
+        cftmdl2(64, &a[64], &w[nw - 64]);
+        cftf081(&a[64], &w[nw - 8]);
+        cftf082(&a[80], &w[nw - 8]);
+        cftf081(&a[96], &w[nw - 8]);
+        cftf082(&a[112], &w[nw - 8]);
+        cftmdl1(64, &a[128], &w[nw - 32]);
+        cftf081(&a[128], &w[nw - 8]);
+        cftf082(&a[144], &w[nw - 8]);
+        cftf081(&a[160], &w[nw - 8]);
+        cftf081(&a[176], &w[nw - 8]);
+        if (isplt != 0) { /* last quarter: isplt selects the cftmdl1/cftf081 or the cftmdl2/cftf082 variant */
+            cftmdl1(64, &a[192], &w[nw - 32]);
+            cftf081(&a[240], &w[nw - 8]);
+        } else {
+            cftmdl2(64, &a[192], &w[nw - 64]);
+            cftf082(&a[240], &w[nw - 8]);
+        }
+        cftf081(&a[192], &w[nw - 8]);
+        cftf082(&a[208], &w[nw - 8]);
+        cftf081(&a[224], &w[nw - 8]);
+    }
+}
+
+
+void cftmdl1(int n, double *a, double *w) /* In-place pass over a[] that combines four rows j, j+m, j+2m, j+3m (m = 2*(n>>3)), multiplying rows 2 and 3 by factors read from w[]. */
+{
+    int j, j0, j1, j2, j3, k, m, mh;
+    double wn4r, wk1r, wk1i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    mh = n >> 3; /* n / 8 */
+    m = 2 * mh; /* distance between the four rows */
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2]; /* pair at j = 0: combined without any factor from w[] */
+    x0i = a[1] + a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = a[1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    a[j2] = x1r - x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+    wn4r = w[1]; /* scalar used for the pair at mh after the loop */
+    k = 0;
+    for (j = 2; j < mh; j += 2) { /* each iteration handles pair j and the mirrored pair m - j */
+        k += 4; /* four table values per iteration: w[k..k+3] */
+        wk1r = w[k];
+        wk1i = w[k + 1];
+        wk3r = w[k + 2];
+        wk3i = w[k + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] + a[j2];
+        x0i = a[j + 1] + a[j2 + 1];
+        x1r = a[j] - a[j2];
+        x1i = a[j + 1] - a[j2 + 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1r * x0r - wk1i * x0i;
+        a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3r * x0r + wk3i * x0i;
+        a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+        j0 = m - j; /* mirrored pair: same factors, with their r and i members exchanged */
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] + a[j2];
+        x0i = a[j0 + 1] + a[j2 + 1];
+        x1r = a[j0] - a[j2];
+        x1i = a[j0 + 1] - a[j2 + 1];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        a[j0] = x0r + x2r;
+        a[j0 + 1] = x0i + x2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i - x2i;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j2] = wk1i * x0r - wk1r * x0i;
+        a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j3] = wk3i * x0r + wk3r * x0i;
+        a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+    }
+    j0 = mh; /* remaining middle pair at mh: only wn4r is needed */
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0] + a[j2];
+    x0i = a[j0 + 1] + a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = a[j0 + 1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2] = wn4r * (x0r - x0i);
+    a[j2 + 1] = wn4r * (x0i + x0r);
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wn4r * (x0r + x0i);
+    a[j3 + 1] = -wn4r * (x0i - x0r);
+}
+
+
+void cftmdl2(int n, double *a, double *w) /* In-place pass over a[] that combines four rows j, j+m, j+2m, j+3m (m = 2*(n>>3)); rows 0/2 and 1/3 are first cross-combined (r with i), then every row is multiplied by a factor from w[]. */
+{
+    int j, j0, j1, j2, j3, k, kr, m, mh;
+    double wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i;
+    
+    mh = n >> 3; /* n / 8 */
+    m = 2 * mh; /* distance between the four rows */
+    wn4r = w[1];
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] - a[j2 + 1]; /* pair at j = 0: only wn4r is needed */
+    x0i = a[1] + a[j2];
+    x1r = a[0] + a[j2 + 1];
+    x1i = a[1] - a[j2];
+    x2r = a[j1] - a[j3 + 1];
+    x2i = a[j1 + 1] + a[j3];
+    x3r = a[j1] + a[j3 + 1];
+    x3i = a[j1 + 1] - a[j3];
+    y0r = wn4r * (x2r - x2i);
+    y0i = wn4r * (x2i + x2r);
+    a[0] = x0r + y0r;
+    a[1] = x0i + y0i;
+    a[j1] = x0r - y0r;
+    a[j1 + 1] = x0i - y0i;
+    y0r = wn4r * (x3r - x3i);
+    y0i = wn4r * (x3i + x3r);
+    a[j2] = x1r - y0i;
+    a[j2 + 1] = x1i + y0r;
+    a[j3] = x1r + y0i;
+    a[j3 + 1] = x1i - y0r;
+    k = 0; /* forward index into w[] */
+    kr = 2 * m; /* backward index into w[] */
+    for (j = 2; j < mh; j += 2) { /* each iteration handles pair j and the mirrored pair m - j */
+        k += 4;
+        wk1r = w[k];
+        wk1i = w[k + 1];
+        wk3r = w[k + 2];
+        wk3i = w[k + 3];
+        kr -= 4;
+        wd1i = w[kr]; /* note the order: the i member is read first from the backward position */
+        wd1r = w[kr + 1];
+        wd3i = w[kr + 2];
+        wd3r = w[kr + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] - a[j2 + 1];
+        x0i = a[j + 1] + a[j2];
+        x1r = a[j] + a[j2 + 1];
+        x1i = a[j + 1] - a[j2];
+        x2r = a[j1] - a[j3 + 1];
+        x2i = a[j1 + 1] + a[j3];
+        x3r = a[j1] + a[j3 + 1];
+        x3i = a[j1 + 1] - a[j3];
+        y0r = wk1r * x0r - wk1i * x0i;
+        y0i = wk1r * x0i + wk1i * x0r;
+        y2r = wd1r * x2r - wd1i * x2i;
+        y2i = wd1r * x2i + wd1i * x2r;
+        a[j] = y0r + y2r;
+        a[j + 1] = y0i + y2i;
+        a[j1] = y0r - y2r;
+        a[j1 + 1] = y0i - y2i;
+        y0r = wk3r * x1r + wk3i * x1i;
+        y0i = wk3r * x1i - wk3i * x1r;
+        y2r = wd3r * x3r + wd3i * x3i;
+        y2i = wd3r * x3i - wd3i * x3r;
+        a[j2] = y0r + y2r;
+        a[j2 + 1] = y0i + y2i;
+        a[j3] = y0r - y2r;
+        a[j3 + 1] = y0i - y2i;
+        j0 = m - j; /* mirrored pair: the wk and wd factors trade places and their r/i members are exchanged */
+        j1 = j0 + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j0] - a[j2 + 1];
+        x0i = a[j0 + 1] + a[j2];
+        x1r = a[j0] + a[j2 + 1];
+        x1i = a[j0 + 1] - a[j2];
+        x2r = a[j1] - a[j3 + 1];
+        x2i = a[j1 + 1] + a[j3];
+        x3r = a[j1] + a[j3 + 1];
+        x3i = a[j1 + 1] - a[j3];
+        y0r = wd1i * x0r - wd1r * x0i;
+        y0i = wd1i * x0i + wd1r * x0r;
+        y2r = wk1i * x2r - wk1r * x2i;
+        y2i = wk1i * x2i + wk1r * x2r;
+        a[j0] = y0r + y2r;
+        a[j0 + 1] = y0i + y2i;
+        a[j1] = y0r - y2r;
+        a[j1 + 1] = y0i - y2i;
+        y0r = wd3i * x1r + wd3r * x1i;
+        y0i = wd3i * x1i - wd3r * x1r;
+        y2r = wk3i * x3r + wk3r * x3i;
+        y2i = wk3i * x3i - wk3r * x3r;
+        a[j2] = y0r + y2r;
+        a[j2 + 1] = y0i + y2i;
+        a[j3] = y0r - y2r;
+        a[j3 + 1] = y0i - y2i;
+    }
+    wk1r = w[m]; /* remaining middle pair at mh uses the single factor w[m], w[m + 1] */
+    wk1i = w[m + 1];
+    j0 = mh;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0] - a[j2 + 1];
+    x0i = a[j0 + 1] + a[j2];
+    x1r = a[j0] + a[j2 + 1];
+    x1i = a[j0 + 1] - a[j2];
+    x2r = a[j1] - a[j3 + 1];
+    x2i = a[j1 + 1] + a[j3];
+    x3r = a[j1] + a[j3 + 1];
+    x3i = a[j1 + 1] - a[j3];
+    y0r = wk1r * x0r - wk1i * x0i;
+    y0i = wk1r * x0i + wk1i * x0r;
+    y2r = wk1i * x2r - wk1r * x2i;
+    y2i = wk1i * x2i + wk1r * x2r;
+    a[j0] = y0r + y2r;
+    a[j0 + 1] = y0i + y2i;
+    a[j1] = y0r - y2r;
+    a[j1 + 1] = y0i - y2i;
+    y0r = wk1i * x1r - wk1r * x1i;
+    y0i = wk1i * x1i + wk1r * x1r;
+    y2r = wk1r * x3r - wk1i * x3i;
+    y2i = wk1r * x3i + wk1i * x3r;
+    a[j2] = y0r - y2r;
+    a[j2 + 1] = y0i - y2i;
+    a[j3] = y0r + y2r;
+    a[j3 + 1] = y0i + y2i;
+}
+
+
+void cftfx41(int n, double *a, int nw, double *w) /* Finishes a block with four fixed-size sub-transforms: 32-element ones when n == 128, 16-element ones otherwise. */
+{
+    void cftf161(double *a, double *w);
+    void cftf162(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftf082(double *a, double *w);
+    
+    if (n == 128) { /* a[0..127] in chunks of 32; only the second chunk uses the cftf162 variant */
+        cftf161(a, &w[nw - 8]);
+        cftf162(&a[32], &w[nw - 32]);
+        cftf161(&a[64], &w[nw - 8]);
+        cftf161(&a[96], &w[nw - 8]);
+    } else { /* a[0..63] in chunks of 16; only the second chunk uses the cftf082 variant */
+        cftf081(a, &w[nw - 8]);
+        cftf082(&a[16], &w[nw - 8]);
+        cftf081(&a[32], &w[nw - 8]);
+        cftf081(&a[48], &w[nw - 8]);
+    }
+}
+
+
+void cftf161(double *a, double *w) /* Fully unrolled in-place transform of a[0..31] (16 r/i pairs, going by the variable names); reads w[1], w[2] and w[3]. */
+{
+    double wn4r, wk1r, wk1i, 
+        x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, 
+        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, 
+        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+    
+    wn4r = w[1];
+    wk1r = w[2];
+    wk1i = w[3];
+    x0r = a[0] + a[16]; /* first stage: every input is read into y0..y15 before any element of a[] is overwritten */
+    x0i = a[1] + a[17];
+    x1r = a[0] - a[16];
+    x1i = a[1] - a[17];
+    x2r = a[8] + a[24];
+    x2i = a[9] + a[25];
+    x3r = a[8] - a[24];
+    x3i = a[9] - a[25];
+    y0r = x0r + x2r;
+    y0i = x0i + x2i;
+    y4r = x0r - x2r;
+    y4i = x0i - x2i;
+    y8r = x1r - x3i;
+    y8i = x1i + x3r;
+    y12r = x1r + x3i;
+    y12i = x1i - x3r;
+    x0r = a[2] + a[18];
+    x0i = a[3] + a[19];
+    x1r = a[2] - a[18];
+    x1i = a[3] - a[19];
+    x2r = a[10] + a[26];
+    x2i = a[11] + a[27];
+    x3r = a[10] - a[26];
+    x3i = a[11] - a[27];
+    y1r = x0r + x2r;
+    y1i = x0i + x2i;
+    y5r = x0r - x2r;
+    y5i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y9r = wk1r * x0r - wk1i * x0i;
+    y9i = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y13r = wk1i * x0r - wk1r * x0i;
+    y13i = wk1i * x0i + wk1r * x0r;
+    x0r = a[4] + a[20];
+    x0i = a[5] + a[21];
+    x1r = a[4] - a[20];
+    x1i = a[5] - a[21];
+    x2r = a[12] + a[28];
+    x2i = a[13] + a[29];
+    x3r = a[12] - a[28];
+    x3i = a[13] - a[29];
+    y2r = x0r + x2r;
+    y2i = x0i + x2i;
+    y6r = x0r - x2r;
+    y6i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y10r = wn4r * (x0r - x0i);
+    y10i = wn4r * (x0i + x0r);
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y14r = wn4r * (x0r + x0i);
+    y14i = wn4r * (x0i - x0r);
+    x0r = a[6] + a[22];
+    x0i = a[7] + a[23];
+    x1r = a[6] - a[22];
+    x1i = a[7] - a[23];
+    x2r = a[14] + a[30];
+    x2i = a[15] + a[31];
+    x3r = a[14] - a[30];
+    x3i = a[15] - a[31];
+    y3r = x0r + x2r;
+    y3i = x0i + x2i;
+    y7r = x0r - x2r;
+    y7i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    y11r = wk1i * x0r - wk1r * x0i;
+    y11i = wk1i * x0i + wk1r * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    y15r = wk1r * x0r - wk1i * x0i;
+    y15i = wk1r * x0i + wk1i * x0r;
+    x0r = y12r - y14r; /* second stage: y12..y15 -> a[24..31] */
+    x0i = y12i - y14i;
+    x1r = y12r + y14r;
+    x1i = y12i + y14i;
+    x2r = y13r - y15r;
+    x2i = y13i - y15i;
+    x3r = y13r + y15r;
+    x3i = y13i + y15i;
+    a[24] = x0r + x2r;
+    a[25] = x0i + x2i;
+    a[26] = x0r - x2r;
+    a[27] = x0i - x2i;
+    a[28] = x1r - x3i;
+    a[29] = x1i + x3r;
+    a[30] = x1r + x3i;
+    a[31] = x1i - x3r;
+    x0r = y8r + y10r; /* y8..y11 -> a[16..23] */
+    x0i = y8i + y10i;
+    x1r = y8r - y10r;
+    x1i = y8i - y10i;
+    x2r = y9r + y11r;
+    x2i = y9i + y11i;
+    x3r = y9r - y11r;
+    x3i = y9i - y11i;
+    a[16] = x0r + x2r;
+    a[17] = x0i + x2i;
+    a[18] = x0r - x2r;
+    a[19] = x0i - x2i;
+    a[20] = x1r - x3i;
+    a[21] = x1i + x3r;
+    a[22] = x1r + x3i;
+    a[23] = x1i - x3r;
+    x0r = y5r - y7i; /* y4..y7 -> a[8..15] */
+    x0i = y5i + y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    x0r = y5r + y7i;
+    x0i = y5i - y7r;
+    x3r = wn4r * (x0r - x0i);
+    x3i = wn4r * (x0i + x0r);
+    x0r = y4r - y6i;
+    x0i = y4i + y6r;
+    x1r = y4r + y6i;
+    x1i = y4i - y6r;
+    a[8] = x0r + x2r;
+    a[9] = x0i + x2i;
+    a[10] = x0r - x2r;
+    a[11] = x0i - x2i;
+    a[12] = x1r - x3i;
+    a[13] = x1i + x3r;
+    a[14] = x1r + x3i;
+    a[15] = x1i - x3r;
+    x0r = y0r + y2r; /* y0..y3 -> a[0..7] */
+    x0i = y0i + y2i;
+    x1r = y0r - y2r;
+    x1i = y0i - y2i;
+    x2r = y1r + y3r;
+    x2i = y1i + y3i;
+    x3r = y1r - y3r;
+    x3i = y1i - y3i;
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x0r - x2r;
+    a[3] = x0i - x2i;
+    a[4] = x1r - x3i;
+    a[5] = x1i + x3r;
+    a[6] = x1r + x3i;
+    a[7] = x1i - x3r;
+}
+
+
+void cftf162(double *a, double *w) /* Fully unrolled in-place transform of a[0..31] (16 r/i pairs, going by the variable names); reads w[1] and w[4..9]. */
+{
+    double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, 
+        x0r, x0i, x1r, x1i, x2r, x2i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, 
+        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, 
+        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i;
+    
+    wn4r = w[1];
+    wk1r = w[4];
+    wk1i = w[5];
+    wk3r = w[6];
+    wk3i = -w[7]; /* the only table value that is negated on load */
+    wk2r = w[8];
+    wk2i = w[9];
+    x1r = a[0] - a[17]; /* first stage: every input is read into y0..y15 before any element of a[] is overwritten */
+    x1i = a[1] + a[16];
+    x0r = a[8] - a[25];
+    x0i = a[9] + a[24];
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    y0r = x1r + x2r;
+    y0i = x1i + x2i;
+    y4r = x1r - x2r;
+    y4i = x1i - x2i;
+    x1r = a[0] + a[17];
+    x1i = a[1] - a[16];
+    x0r = a[8] + a[25];
+    x0i = a[9] - a[24];
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    y8r = x1r - x2i;
+    y8i = x1i + x2r;
+    y12r = x1r + x2i;
+    y12i = x1i - x2r;
+    x0r = a[2] - a[19];
+    x0i = a[3] + a[18];
+    x1r = wk1r * x0r - wk1i * x0i;
+    x1i = wk1r * x0i + wk1i * x0r;
+    x0r = a[10] - a[27];
+    x0i = a[11] + a[26];
+    x2r = wk3i * x0r - wk3r * x0i;
+    x2i = wk3i * x0i + wk3r * x0r;
+    y1r = x1r + x2r;
+    y1i = x1i + x2i;
+    y5r = x1r - x2r;
+    y5i = x1i - x2i;
+    x0r = a[2] + a[19];
+    x0i = a[3] - a[18];
+    x1r = wk3r * x0r - wk3i * x0i;
+    x1i = wk3r * x0i + wk3i * x0r;
+    x0r = a[10] + a[27];
+    x0i = a[11] - a[26];
+    x2r = wk1r * x0r + wk1i * x0i;
+    x2i = wk1r * x0i - wk1i * x0r;
+    y9r = x1r - x2r;
+    y9i = x1i - x2i;
+    y13r = x1r + x2r;
+    y13i = x1i + x2i;
+    x0r = a[4] - a[21];
+    x0i = a[5] + a[20];
+    x1r = wk2r * x0r - wk2i * x0i;
+    x1i = wk2r * x0i + wk2i * x0r;
+    x0r = a[12] - a[29];
+    x0i = a[13] + a[28];
+    x2r = wk2i * x0r - wk2r * x0i;
+    x2i = wk2i * x0i + wk2r * x0r;
+    y2r = x1r + x2r;
+    y2i = x1i + x2i;
+    y6r = x1r - x2r;
+    y6i = x1i - x2i;
+    x0r = a[4] + a[21];
+    x0i = a[5] - a[20];
+    x1r = wk2i * x0r - wk2r * x0i;
+    x1i = wk2i * x0i + wk2r * x0r;
+    x0r = a[12] + a[29];
+    x0i = a[13] - a[28];
+    x2r = wk2r * x0r - wk2i * x0i;
+    x2i = wk2r * x0i + wk2i * x0r;
+    y10r = x1r - x2r;
+    y10i = x1i - x2i;
+    y14r = x1r + x2r;
+    y14i = x1i + x2i;
+    x0r = a[6] - a[23];
+    x0i = a[7] + a[22];
+    x1r = wk3r * x0r - wk3i * x0i;
+    x1i = wk3r * x0i + wk3i * x0r;
+    x0r = a[14] - a[31];
+    x0i = a[15] + a[30];
+    x2r = wk1i * x0r - wk1r * x0i;
+    x2i = wk1i * x0i + wk1r * x0r;
+    y3r = x1r + x2r;
+    y3i = x1i + x2i;
+    y7r = x1r - x2r;
+    y7i = x1i - x2i;
+    x0r = a[6] + a[23];
+    x0i = a[7] - a[22];
+    x1r = wk1i * x0r + wk1r * x0i;
+    x1i = wk1i * x0i - wk1r * x0r;
+    x0r = a[14] + a[31];
+    x0i = a[15] - a[30];
+    x2r = wk3i * x0r - wk3r * x0i;
+    x2i = wk3i * x0i + wk3r * x0r;
+    y11r = x1r + x2r;
+    y11i = x1i + x2i;
+    y15r = x1r - x2r;
+    y15i = x1i - x2i;
+    x1r = y0r + y2r; /* second stage: y0..y3 -> a[0..7] */
+    x1i = y0i + y2i;
+    x2r = y1r + y3r;
+    x2i = y1i + y3i;
+    a[0] = x1r + x2r;
+    a[1] = x1i + x2i;
+    a[2] = x1r - x2r;
+    a[3] = x1i - x2i;
+    x1r = y0r - y2r;
+    x1i = y0i - y2i;
+    x2r = y1r - y3r;
+    x2i = y1i - y3i;
+    a[4] = x1r - x2i;
+    a[5] = x1i + x2r;
+    a[6] = x1r + x2i;
+    a[7] = x1i - x2r;
+    x1r = y4r - y6i; /* y4..y7 -> a[8..15] */
+    x1i = y4i + y6r;
+    x0r = y5r - y7i;
+    x0i = y5i + y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[8] = x1r + x2r;
+    a[9] = x1i + x2i;
+    a[10] = x1r - x2r;
+    a[11] = x1i - x2i;
+    x1r = y4r + y6i;
+    x1i = y4i - y6r;
+    x0r = y5r + y7i;
+    x0i = y5i - y7r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[12] = x1r - x2i;
+    a[13] = x1i + x2r;
+    a[14] = x1r + x2i;
+    a[15] = x1i - x2r;
+    x1r = y8r + y10r; /* y8..y11 -> a[16..23] */
+    x1i = y8i + y10i;
+    x2r = y9r - y11r;
+    x2i = y9i - y11i;
+    a[16] = x1r + x2r;
+    a[17] = x1i + x2i;
+    a[18] = x1r - x2r;
+    a[19] = x1i - x2i;
+    x1r = y8r - y10r;
+    x1i = y8i - y10i;
+    x2r = y9r + y11r;
+    x2i = y9i + y11i;
+    a[20] = x1r - x2i;
+    a[21] = x1i + x2r;
+    a[22] = x1r + x2i;
+    a[23] = x1i - x2r;
+    x1r = y12r - y14i; /* y12..y15 -> a[24..31] */
+    x1i = y12i + y14r;
+    x0r = y13r + y15i;
+    x0i = y13i - y15r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[24] = x1r + x2r;
+    a[25] = x1i + x2i;
+    a[26] = x1r - x2r;
+    a[27] = x1i - x2i;
+    x1r = y12r + y14i;
+    x1i = y12i - y14r;
+    x0r = y13r - y15i;
+    x0i = y13i + y15r;
+    x2r = wn4r * (x0r - x0i);
+    x2i = wn4r * (x0i + x0r);
+    a[28] = x1r - x2i;
+    a[29] = x1i + x2r;
+    a[30] = x1r + x2i;
+    a[31] = x1i - x2r;
+}
+
+
+void cftf081(double *a, double *w) /* Fully unrolled in-place transform of a[0..15] (8 r/i pairs, going by the variable names); the only table value read is w[1]. */
+{
+    double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    wn4r = w[1];
+    x0r = a[0] + a[8]; /* first stage: pairs 0, 2, 4, 6 (a[0], a[4], a[8], a[12]) -> y0..y3 */
+    x0i = a[1] + a[9];
+    x1r = a[0] - a[8];
+    x1i = a[1] - a[9];
+    x2r = a[4] + a[12];
+    x2i = a[5] + a[13];
+    x3r = a[4] - a[12];
+    x3i = a[5] - a[13];
+    y0r = x0r + x2r;
+    y0i = x0i + x2i;
+    y2r = x0r - x2r;
+    y2i = x0i - x2i;
+    y1r = x1r - x3i;
+    y1i = x1i + x3r;
+    y3r = x1r + x3i;
+    y3i = x1i - x3r;
+    x0r = a[2] + a[10]; /* first stage: pairs 1, 3, 5, 7 (a[2], a[6], a[10], a[14]) -> y4..y7 */
+    x0i = a[3] + a[11];
+    x1r = a[2] - a[10];
+    x1i = a[3] - a[11];
+    x2r = a[6] + a[14];
+    x2i = a[7] + a[15];
+    x3r = a[6] - a[14];
+    x3i = a[7] - a[15];
+    y4r = x0r + x2r;
+    y4i = x0i + x2i;
+    y6r = x0r - x2r;
+    y6i = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    x2r = x1r + x3i;
+    x2i = x1i - x3r;
+    y5r = wn4r * (x0r - x0i);
+    y5i = wn4r * (x0r + x0i);
+    y7r = wn4r * (x2r - x2i);
+    y7i = wn4r * (x2r + x2i);
+    a[8] = y1r + y5r; /* second stage: all inputs are already held in y0..y7, so a[] can be overwritten in any order */
+    a[9] = y1i + y5i;
+    a[10] = y1r - y5r;
+    a[11] = y1i - y5i;
+    a[12] = y3r - y7i;
+    a[13] = y3i + y7r;
+    a[14] = y3r + y7i;
+    a[15] = y3i - y7r;
+    a[0] = y0r + y4r;
+    a[1] = y0i + y4i;
+    a[2] = y0r - y4r;
+    a[3] = y0i - y4i;
+    a[4] = y2r - y6i;
+    a[5] = y2i + y6r;
+    a[6] = y2r + y6i;
+    a[7] = y2i - y6r;
+}
+
+
+void cftf082(double *a, double *w) /* Fully unrolled in-place transform of a[0..15] (8 r/i pairs, going by the variable names); reads w[1], w[2] and w[3]. */
+{
+    double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, 
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, 
+        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+    
+    wn4r = w[1];
+    wk1r = w[2];
+    wk1i = w[3];
+    y0r = a[0] - a[9]; /* first stage: each element of the lower half a[0..7] is cross-combined (r with i) with one from the upper half a[8..15] */
+    y0i = a[1] + a[8];
+    y1r = a[0] + a[9];
+    y1i = a[1] - a[8];
+    x0r = a[4] - a[13];
+    x0i = a[5] + a[12];
+    y2r = wn4r * (x0r - x0i);
+    y2i = wn4r * (x0i + x0r);
+    x0r = a[4] + a[13];
+    x0i = a[5] - a[12];
+    y3r = wn4r * (x0r - x0i);
+    y3i = wn4r * (x0i + x0r);
+    x0r = a[2] - a[11];
+    x0i = a[3] + a[10];
+    y4r = wk1r * x0r - wk1i * x0i;
+    y4i = wk1r * x0i + wk1i * x0r;
+    x0r = a[2] + a[11];
+    x0i = a[3] - a[10];
+    y5r = wk1i * x0r - wk1r * x0i;
+    y5i = wk1i * x0i + wk1r * x0r;
+    x0r = a[6] - a[15];
+    x0i = a[7] + a[14];
+    y6r = wk1i * x0r - wk1r * x0i;
+    y6i = wk1i * x0i + wk1r * x0r;
+    x0r = a[6] + a[15];
+    x0i = a[7] - a[14];
+    y7r = wk1r * x0r - wk1i * x0i;
+    y7i = wk1r * x0i + wk1i * x0r;
+    x0r = y0r + y2r; /* second stage: all inputs are already held in y0..y7, so a[] can be overwritten */
+    x0i = y0i + y2i;
+    x1r = y4r + y6r;
+    x1i = y4i + y6i;
+    a[0] = x0r + x1r;
+    a[1] = x0i + x1i;
+    a[2] = x0r - x1r;
+    a[3] = x0i - x1i;
+    x0r = y0r - y2r;
+    x0i = y0i - y2i;
+    x1r = y4r - y6r;
+    x1i = y4i - y6i;
+    a[4] = x0r - x1i;
+    a[5] = x0i + x1r;
+    a[6] = x0r + x1i;
+    a[7] = x0i - x1r;
+    x0r = y1r - y3i;
+    x0i = y1i + y3r;
+    x1r = y5r - y7r;
+    x1i = y5i - y7i;
+    a[8] = x0r + x1r;
+    a[9] = x0i + x1i;
+    a[10] = x0r - x1r;
+    a[11] = x0i - x1i;
+    x0r = y1r + y3i;
+    x0i = y1i - y3r;
+    x1r = y5r + y7r;
+    x1i = y5i + y7i;
+    a[12] = x0r - x1i;
+    a[13] = x0i + x1r;
+    a[14] = x0r + x1i;
+    a[15] = x0i - x1r;
+}
+
+
+void cftf040(double *a) /* In-place combination of the four r/i pairs in a[0..7]; uses no table. */
+{
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    x0r = a[0] + a[4]; /* sums and differences of pairs 0/2 and 1/3 */
+    x0i = a[1] + a[5];
+    x1r = a[0] - a[4];
+    x1i = a[1] - a[5];
+    x2r = a[2] + a[6];
+    x2i = a[3] + a[7];
+    x3r = a[2] - a[6];
+    x3i = a[3] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x1r - x3i; /* pair 1 is x1 + (-x3i, x3r) */
+    a[3] = x1i + x3r;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[6] = x1r + x3i; /* pair 3 is x1 - (-x3i, x3r) */
+    a[7] = x1i - x3r;
+}
+
+
+void cftb040(double *a) /* In-place combination of the four r/i pairs in a[0..7]; same as cftf040 except that the x3 terms enter pairs 1 and 3 with the opposite sign. */
+{
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    
+    x0r = a[0] + a[4]; /* sums and differences of pairs 0/2 and 1/3 */
+    x0i = a[1] + a[5];
+    x1r = a[0] - a[4];
+    x1i = a[1] - a[5];
+    x2r = a[2] + a[6];
+    x2i = a[3] + a[7];
+    x3r = a[2] - a[6];
+    x3i = a[3] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[2] = x1r + x3i; /* pair 1 is x1 + (x3i, -x3r) */
+    a[3] = x1i - x3r;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[6] = x1r - x3i; /* pair 3 is x1 - (x3i, -x3r) */
+    a[7] = x1i + x3r;
+}
+
+
+void cftx020(double *a) /* Replaces the two pairs (a[0], a[1]) and (a[2], a[3]) by their sum and their difference, in place. */
+{
+    double x0r, x0i;
+    
+    x0r = a[0] - a[2]; /* difference is computed first because a[0] and a[1] are overwritten next */
+    x0i = a[1] - a[3];
+    a[0] += a[2];
+    a[1] += a[3];
+    a[2] = x0r;
+    a[3] = x0i;
+}
+
+
+void rftfsub(int n, double *a, int nc, double *c) /* In-place adjustment of a[]: pair j (a[j], a[j+1]) is combined with pair n - j using factors from the table c[0..nc]. */
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr, xi, yr, yi;
+    
+    m = n >> 1;
+    ks = 2 * nc / m; /* step through c[] per pair; divides by m, so n must be at least 2 */
+    kk = 0;
+    for (j = 2; j < m; j += 2) { /* pairs 0 and m are not touched here */
+        k = n - j; /* mirrored pair */
+        kk += ks;
+        wkr = 0.5 - c[nc - kk]; /* c[] is read from both ends */
+        wki = c[kk];
+        xr = a[j] - a[k];
+        xi = a[j + 1] + a[k + 1];
+        yr = wkr * xr - wki * xi;
+        yi = wkr * xi + wki * xr;
+        a[j] -= yr;
+        a[j + 1] -= yi;
+        a[k] += yr;
+        a[k + 1] -= yi;
+    }
+}
+
+
+void rftbsub(int n, double *a, int nc, double *c) /* Same pairing and update as rftfsub, but yr and yi are formed with the opposite sign on the wki terms. */
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr, xi, yr, yi;
+    
+    m = n >> 1;
+    ks = 2 * nc / m; /* step through c[] per pair; divides by m, so n must be at least 2 */
+    kk = 0;
+    for (j = 2; j < m; j += 2) { /* pairs 0 and m are not touched here */
+        k = n - j; /* mirrored pair */
+        kk += ks;
+        wkr = 0.5 - c[nc - kk]; /* c[] is read from both ends */
+        wki = c[kk];
+        xr = a[j] - a[k];
+        xi = a[j + 1] + a[k + 1];
+        yr = wkr * xr + wki * xi; /* sign of the wki term differs from rftfsub */
+        yi = wkr * xi - wki * xr;
+        a[j] -= yr;
+        a[j + 1] -= yi;
+        a[k] += yr;
+        a[k + 1] -= yi;
+    }
+}
+
+
+void dctsub(int n, double *a, int nc, double *c) /* In-place 2x2 mixing of a[j] with a[n - j] for 0 < j < n/2 using factors from c[0..nc]; a[n/2] is scaled by c[0]. */
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr;
+    
+    m = n >> 1;
+    ks = nc / n; /* step through c[] per element; divides by n, so n must be nonzero */
+    kk = 0;
+    for (j = 1; j < m; j++) {
+        k = n - j; /* mirrored element */
+        kk += ks;
+        wkr = c[kk] - c[nc - kk]; /* c[] is read from both ends */
+        wki = c[kk] + c[nc - kk];
+        xr = wki * a[j] - wkr * a[k]; /* saved in xr because a[k] is still needed on the next line */
+        a[j] = wkr * a[j] + wki * a[k];
+        a[k] = xr;
+    }
+    a[m] *= c[0];
+}
+
+
+void dstsub(int n, double *a, int nc, double *c) /* Same as dctsub with the roles of a[j] and a[n - j] exchanged; a[n/2] is scaled by c[0]. */
+{
+    int j, k, kk, ks, m;
+    double wkr, wki, xr;
+    
+    m = n >> 1;
+    ks = nc / n; /* step through c[] per element; divides by n, so n must be nonzero */
+    kk = 0;
+    for (j = 1; j < m; j++) {
+        k = n - j; /* mirrored element */
+        kk += ks;
+        wkr = c[kk] - c[nc - kk]; /* c[] is read from both ends */
+        wki = c[kk] + c[nc - kk];
+        xr = wki * a[k] - wkr * a[j]; /* saved in xr because a[j] is still needed on the next line */
+        a[k] = wkr * a[k] + wki * a[j];
+        a[j] = xr;
+    }
+    a[m] *= c[0];
+}
+
diff --git a/plugins/supereq/nsfft-1.00/ooura/pi_fft.c b/plugins/supereq/nsfft-1.00/ooura/pi_fft.c
new file mode 100644
index 00000000..c9a76bf8
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/ooura/pi_fft.c
@@ -0,0 +1,1616 @@
+/*
+---- calculation of PI(= 3.14159...) using FFT ----
+    by T.Ooura, ver. LG1.1.2-MP1.5a Sep. 2001.
+
+This is a test program to estimate the performance of
+the FFT routines: fft*g.c.
+
+Example compilation:
+    GNU      : gcc -O6 -ffast-math pi_fft.c fftsg.c -lm -o pi_fftsg
+    SUN      : cc -fast -xO5 pi_fft.c fft8g.c -lm -o pi_fft8g
+    Microsoft: cl /O2 /G6 pi_fft.c fft4g.c /Fepi_fft4g.exe
+    ...
+    etc.
+*/
+
+/* Please check the following macros before compiling (they may be overridden from the compiler command line) */
+#ifndef DBL_ERROR_MARGIN
+#define DBL_ERROR_MARGIN 0.3  /* must be < 0.5; main() enlarges the radix while 100 * err stays below this margin */
+#endif
+
+
+#include <math.h>
+#include <limits.h>
+#include <float.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+/* Forward declarations of the multiple-precision (mp_*) routines defined after main(). */
+void mp_load_0(int n, int radix, int out[]);
+void mp_load_1(int n, int radix, int out[]);
+void mp_copy(int n, int radix, int in[], int out[]);
+void mp_round(int n, int radix, int m, int inout[]);
+int mp_cmp(int n, int radix, int in1[], int in2[]);
+void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+void mp_imul(int n, int radix, int in1[], int in2, int out[]);
+int mp_idiv(int n, int radix, int in1[], int in2, int out[]);
+void mp_idiv_2(int n, int radix, int in[], int out[]);
+double mp_mul_radix_test(int n, int radix, int nfft, 
+        double tmpfft[], int ip[], double w[]);
+void mp_mul(int n, int radix, int in1[], int in2[], int out[], 
+        int tmp[], int nfft, double tmp1fft[], double tmp2fft[], 
+        double tmp3fft[], int ip[], double w[]);
+void mp_squ(int n, int radix, int in[], int out[], int tmp[], 
+        int nfft, double tmp1fft[], double tmp2fft[], 
+        int ip[], double w[]);
+void mp_mulh(int n, int radix, int in1[], int in2[], int out[], 
+        int nfft, double in1fft[], double outfft[], 
+        int ip[], double w[]);
+void mp_squh(int n, int radix, int in[], int out[], 
+        int nfft, double inoutfft[], int ip[], double w[]);
+int mp_inv(int n, int radix, int in[], int out[], 
+        int tmp1[], int tmp2[], int nfft, 
+        double tmp1fft[], double tmp2fft[], int ip[], double w[]);
+int mp_sqrt(int n, int radix, int in[], int out[], 
+        int tmp1[], int tmp2[], int nfft, 
+        double tmp1fft[], double tmp2fft[], int ip[], double w[]);
+void mp_sprintf(int n, int log10_radix, int in[], char out[]);
+void mp_sscanf(int n, int log10_radix, char in[], int out[]);
+void mp_fprintf(int n, int log10_radix, int in[], FILE *fout);
+
+/* main: benchmark driver - reads an FFT length from stdin, computes PI with the AGM iteration described below, writes pi.dat and logs to pi.log. */
+int main()
+{
+    int nfft, log2_nfft, radix, log10_radix, n, npow, nprc;
+    double err, d_time, n_op;
+    int *a, *b, *c, *e, *i1, *i2, *ip;
+    double *d1, *d2, *d3, *w;
+    time_t t_1, t_2;
+    FILE *f_log, *f_out;
+    /* NOTE(review): the fopen() and scanf() results are never checked, and of all the malloc() calls only d3 is tested for NULL. */
+    f_log = fopen("pi.log", "w");
+    printf("PI calculation to estimate the FFT benchmarks\n");
+    fprintf(f_log, "PI calculation to estimate the FFT benchmarks\n");
+    printf("length of FFT =?\n");
+    scanf("%d", &nfft);
+    
+    printf("initializing...\n");
+    for (log2_nfft = 1; (1 << log2_nfft) < nfft; log2_nfft++);
+    nfft = 1 << log2_nfft;  /* round nfft up to a power of two (minimum 2) */
+    n = nfft + 2;  /* digit count handed to every mp_* routine */
+    ip = (int *) malloc((3 + (int) sqrt(0.5 * nfft)) * sizeof(int));
+    w = (double *) malloc(nfft / 2 * sizeof(double));
+    a = (int *) malloc((n + 2) * sizeof(int));  /* n digits plus the sign and exponent slots */
+    b = (int *) malloc((n + 2) * sizeof(int));
+    c = (int *) malloc((n + 2) * sizeof(int));
+    e = (int *) malloc((n + 2) * sizeof(int));
+    i1 = (int *) malloc((n + 2) * sizeof(int));
+    i2 = (int *) malloc((n + 2) * sizeof(int));
+    d1 = (double *) malloc((nfft + 2) * sizeof(double));  /* FFT work arrays, indexed 0..nfft+1 */
+    d2 = (double *) malloc((nfft + 2) * sizeof(double));
+    d3 = (double *) malloc((nfft + 2) * sizeof(double));
+    if (d3 == NULL) {
+        printf("Allocation Failure!\n");
+        exit(1);
+    }
+    ip[0] = 0;  /* NOTE(review): presumably tells rdft() that its ip/w tables are not initialised yet - confirm */
+    /* ---- radix test ---- */
+    log10_radix = 1;
+    radix = 10;
+    err = mp_mul_radix_test(n, radix, nfft, d1, ip, w);
+    err += DBL_EPSILON * (n * radix * radix / 4);
+    while (100 * err < DBL_ERROR_MARGIN && radix <= INT_MAX / 20) {  /* one more decimal digit per radix digit while the scaled error allows */
+        err *= 100;
+        log10_radix++;
+        radix *= 10;
+    }
+    printf("nfft= %d\nradix= %d\nerror_margin= %g\n", nfft, radix, err);
+    fprintf(f_log, "nfft= %d\nradix= %d\nerror_margin= %g\n", nfft, radix, err);
+    printf("calculating %d digits of PI...\n", log10_radix * (n - 2));
+    fprintf(f_log, "calculating %d digits of PI...\n", log10_radix * (n - 2));
+    /* ---- time check ---- */
+    time(&t_1);
+    /*
+     * ---- a formula based on the AGM (Arithmetic-Geometric Mean) ----
+     *   c = sqrt(0.125);
+     *   a = 1 + 3 * c;
+     *   b = sqrt(a);
+     *   e = b - 0.625;
+     *   b = 2 * b;
+     *   c = e - c;
+     *   a = a + e;
+     *   npow = 4;
+     *   do {
+     *       npow = 2 * npow;
+     *       e = (a + b) / 2;
+     *       b = sqrt(a * b);
+     *       e = e - b;
+     *       b = 2 * b;
+     *       c = c - e;
+     *       a = e + b;
+     *   } while (e > SQRT_SQRT_EPSILON);
+     *   e = e * e / 4;
+     *   a = a + b;
+     *   pi = (a * a - e - e / 2) / (a * c - e) / npow;
+     * ---- modification ----
+     *   This is a modified version of Gauss-Legendre formula
+     *   (by T.Ooura). It is faster than original version.
+     * ---- reference ----
+     *   1. E.Salamin, 
+     *      Computation of PI Using Arithmetic-Geometric Mean, 
+     *      Mathematics of Computation, Vol.30 1976.
+     *   2. R.P.Brent, 
+     *      Fast Multiple-Precision Evaluation of Elementary Functions, 
+     *      J. ACM 23 1976.
+     *   3. D.Takahasi, Y.Kanada, 
+     *      Calculation of PI to 51.5 Billion Decimal Digits on 
+     *      Distributed Memory Parallel Processors, 
+     *      Transactions of Information Processing Society of Japan, 
+     *      Vol.39 No.7 1998.
+     *   4. T.Ooura, 
+     *      Improvement of the PI Calculation Algorithm and 
+     *      Implementation of Fast Multiple-Precision Computation, 
+     *      Information Processing Society of Japan SIG Notes, 
+     *      98-HPC-74, 1998.
+     */
+    /* ---- c = sqrt(0.125) ---- */
+    mp_sscanf(n, log10_radix, "0.125", a);
+    mp_sqrt(n, radix, a, c, i1, i2, nfft, d1, d2, ip, w);
+    /* ---- a = 1 + 3 * c ---- */
+    mp_imul(n, radix, c, 3, e);
+    mp_sscanf(n, log10_radix, "1", a);
+    mp_add(n, radix, a, e, a);
+    /* ---- b = sqrt(a) ---- */
+    mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2, ip, w);
+    /* ---- e = b - 0.625 ---- */
+    mp_sscanf(n, log10_radix, "0.625", e);
+    mp_sub(n, radix, b, e, e);
+    /* ---- b = 2 * b ---- */
+    mp_add(n, radix, b, b, b);
+    /* ---- c = e - c ---- */
+    mp_sub(n, radix, e, c, c);
+    /* ---- a = a + e ---- */
+    mp_add(n, radix, a, e, a);
+    printf("AGM iteration\n");
+    fprintf(f_log, "AGM iteration\n");
+    npow = 4;
+    do {
+        npow *= 2;
+        /* ---- e = (a + b) / 2 ---- */
+        mp_add(n, radix, a, b, e);
+        mp_idiv_2(n, radix, e, e);
+        /* ---- b = sqrt(a * b) ---- */
+        mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w);
+        mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2, ip, w);
+        /* ---- e = e - b ---- */
+        mp_sub(n, radix, e, b, e);
+        /* ---- b = 2 * b ---- */
+        mp_add(n, radix, b, b, b);
+        /* ---- c = c - e ---- */
+        mp_sub(n, radix, c, e, c);
+        /* ---- a = e + b ---- */
+        mp_add(n, radix, e, b, a);
+        /* ---- convergence check ---- */
+        nprc = -e[1];  /* negated exponent of the correction term e */
+        if (e[0] == 0) {  /* e is exactly zero */
+            nprc = n;
+        }
+        printf("precision= %d\n", 4 * nprc * log10_radix);
+        fprintf(f_log, "precision= %d\n", 4 * nprc * log10_radix);
+    } while (4 * nprc <= n);  /* iterate until the reported precision (4 * nprc radix digits) exceeds n digits */
+    /* ---- e = e * e / 4 (half precision) ---- */
+    mp_idiv_2(n, radix, e, e);
+    mp_squh(n, radix, e, e, nfft, d1, ip, w);
+    /* ---- a = a + b ---- */
+    mp_add(n, radix, a, b, a);
+    /* ---- a = (a * a - e - e / 2) / (a * c - e) / npow ---- */
+    mp_mul(n, radix, a, c, c, i1, nfft, d1, d2, d3, ip, w);
+    mp_sub(n, radix, c, e, c);
+    mp_inv(n, radix, c, b, i1, i2, nfft, d1, d2, ip, w);
+    mp_squ(n, radix, a, a, i1, nfft, d1, d2, ip, w);
+    mp_sub(n, radix, a, e, a);
+    mp_idiv_2(n, radix, e, e);
+    mp_sub(n, radix, a, e, a);
+    mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w);
+    mp_idiv(n, radix, a, npow, a);
+    /* ---- time check ---- */
+    time(&t_2);
+    /* ---- output ---- */
+    f_out = fopen("pi.dat", "w");
+    printf("writing pi.dat...\n");
+    mp_fprintf(n - 1, log10_radix, a, f_out);
+    fclose(f_out);
+    free(d3);
+    free(d2);
+    free(d1);
+    free(i2);
+    free(i1);
+    free(e);
+    free(c);
+    free(b);
+    free(a);
+    free(w);
+    free(ip);
+    /* ---- benchmark ---- */
+    n_op = 50.0 * nfft * log2_nfft * log2_nfft;  /* operation-count estimate that is printed and logged */
+    printf("floating point operation: %g op.\n", n_op);
+    fprintf(f_log, "floating point operation: %g op.\n", n_op);
+    /* ---- difftime ---- */
+    d_time = difftime(t_2, t_1);
+    printf("execution time: %g sec. (real time)\n", d_time);
+    fprintf(f_log, "execution time: %g sec. (real time)\n", d_time);
+    fclose(f_log);
+    return 0;
+}
+
+
+/* -------- multiple precision routines -------- */
+
+
+#include <math.h>
+#include <float.h>
+#include <stdio.h>
+
+/* ---- floating point format ----
+    data := data[0] * pow(radix, data[1]) * 
+            (data[2] + data[3]/radix + data[4]/radix/radix + ...), 
+    data[0]       : sign (1;data>0, -1;data<0, 0;data==0)
+    data[1]       : exponent (0;data==0)
+    data[2...n+1] : digits
+   ---- function prototypes ----
+    void mp_load_0(int n, int radix, int out[]);
+    void mp_load_1(int n, int radix, int out[]);
+    void mp_copy(int n, int radix, int in[], int out[]);
+    void mp_round(int n, int radix, int m, int inout[]);
+    int mp_cmp(int n, int radix, int in1[], int in2[]);
+    void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_imul(int n, int radix, int in1[], int in2, int out[]);
+    int mp_idiv(int n, int radix, int in1[], int in2, int out[]);
+    void mp_idiv_2(int n, int radix, int in[], int out[]);
+    double mp_mul_radix_test(int n, int radix, int nfft, 
+            double tmpfft[], int ip[], double w[]);
+    void mp_mul(int n, int radix, int in1[], int in2[], int out[], 
+            int tmp[], int nfft, double tmp1fft[], double tmp2fft[], 
+            double tmp3fft[], int ip[], double w[]);
+    void mp_squ(int n, int radix, int in[], int out[], int tmp[], 
+            int nfft, double tmp1fft[], double tmp2fft[], 
+            int ip[], double w[]);
+    void mp_mulh(int n, int radix, int in1[], int in2[], int out[], 
+            int nfft, double in1fft[], double outfft[], 
+            int ip[], double w[]);
+    void mp_squh(int n, int radix, int in[], int out[], 
+            int nfft, double inoutfft[], int ip[], double w[]);
+    int mp_inv(int n, int radix, int in[], int out[], 
+            int tmp1[], int tmp2[], int nfft, 
+            double tmp1fft[], double tmp2fft[], int ip[], double w[]);
+    int mp_sqrt(int n, int radix, int in[], int out[], 
+            int tmp1[], int tmp2[], int nfft, 
+            double tmp1fft[], double tmp2fft[], int ip[], double w[]);
+    void mp_sprintf(int n, int log10_radix, int in[], char out[]);
+    void mp_sscanf(int n, int log10_radix, char in[], int out[]);
+    void mp_fprintf(int n, int log10_radix, int in[], FILE *fout);
+   ----
+*/
+
+
+/* -------- mp_load routines -------- */
+
+/* mp_load_0: sets out to zero by clearing sign, exponent and all n digits (out[0..n+1]); radix is unused. */
+void mp_load_0(int n, int radix, int out[])
+{
+    int j;
+    /* A sign of 0 together with exponent 0 is how the format comment above encodes the value zero. */
+    for (j = 0; j <= n + 1; j++) {
+        out[j] = 0;
+    }
+}
+
+/* mp_load_1: sets out to the value 1 (sign +1, exponent 0, leading digit 1, remaining digits 0); radix is unused. */
+void mp_load_1(int n, int radix, int out[])
+{
+    int j;
+    /* out must provide n + 2 slots: indices 0..n+1 are written. */
+    out[0] = 1;  /* sign */
+    out[1] = 0;  /* exponent */
+    out[2] = 1;  /* most significant digit */
+    for (j = 3; j <= n + 1; j++) {
+        out[j] = 0;
+    }
+}
+
+/* mp_copy: copies sign, exponent and n digits (elements 0..n+1) from in to out; radix is unused. */
+void mp_copy(int n, int radix, int in[], int out[])
+{
+    int j;
+    /* Ascending element-by-element copy. */
+    for (j = 0; j <= n + 1; j++) {
+        out[j] = in[j];
+    }
+}
+
+/* mp_round: rounds inout to its m leading digits (half-up on the next digit) and zeroes the discarded digits; does nothing unless m < n. */
+void mp_round(int n, int radix, int m, int inout[])
+{
+    int j, x;
+    /* Digits live in inout[2..n+1], so the first discarded digit is inout[m + 2]. */
+    if (m < n) {
+        for (j = n + 1; j > m + 2; j--) {
+            inout[j] = 0;
+        }
+        x = 2 * inout[m + 2];  /* doubled so that x >= radix tests "digit >= radix / 2" */
+        inout[m + 2] = 0;
+        if (x >= radix) {
+            for (j = m + 1; j >= 2; j--) {  /* add 1 to the kept digits, rippling the carry upward */
+                x = inout[j] + 1;
+                if (x < radix) {
+                    inout[j] = x;
+                    break;
+                }
+                inout[j] = 0;
+            }
+            if (x >= radix) {  /* carry ran off the top digit: result is 1 followed by zeros, one exponent higher */
+                inout[2] = 1;
+                inout[1]++;
+            }
+        }
+    }
+}
+
+
+/* -------- mp_add routines -------- */
+
+/* mp_cmp: three-way comparison of two signed numbers; returns 1 if in1 > in2, -1 if in1 < in2, 0 if equal. radix is unused. */
+int mp_cmp(int n, int radix, int in1[], int in2[])
+{
+    int mp_unsgn_cmp(int n, int in1[], int in2[]);
+    /* Different signs decide immediately; otherwise compare exponent + digits and flip the answer for negatives. */
+    if (in1[0] > in2[0]) {
+        return 1;
+    } else if (in1[0] < in2[0]) {
+        return -1;
+    }
+    return in1[0] * mp_unsgn_cmp(n, &in1[1], &in2[1]);
+}
+
+/* mp_add: out = in1 + in2 for signed n-digit numbers ([0]=sign, [1]=exponent, [2..n+1]=digits); main() calls it with out aliasing an input. */
+void mp_add(int n, int radix, int in1[], int in2[], int out[])
+{
+    int mp_unsgn_cmp(int n, int in1[], int in2[]);
+    int mp_unexp_add(int n, int radix, int expdif, 
+            int in1[], int in2[], int out[]);
+    int mp_unexp_sub(int n, int radix, int expdif, 
+            int in1[], int in2[], int out[]);
+    int outsgn, outexp, expdif;
+    /* Operands of equal sign (or a zero operand) have their magnitudes added; opposite signs subtract the smaller magnitude from the larger. */
+    expdif = in1[1] - in2[1];
+    outexp = in1[1];  /* start from the larger of the two exponents */
+    if (expdif < 0) {
+        outexp = in2[1];
+    }
+    outsgn = in1[0] * in2[0];  /* > 0 same sign, < 0 opposite signs, 0 when at least one operand is zero */
+    if (outsgn >= 0) {
+        if (outsgn > 0) {
+            outsgn = in1[0];
+        } else {
+            outsgn = in1[0] + in2[0];  /* sign of the non-zero operand, or 0 if both are zero */
+            outexp = in1[1] + in2[1];  /* a zero operand has exponent 0 (see format comment), so this keeps the other exponent */
+            expdif = 0;
+        }
+        if (expdif >= 0) {
+            outexp += mp_unexp_add(n, radix, expdif, 
+                    &in1[2], &in2[2], &out[2]);
+        } else {
+            outexp += mp_unexp_add(n, radix, -expdif, 
+                    &in2[2], &in1[2], &out[2]);
+        }
+    } else {
+        outsgn = mp_unsgn_cmp(n, &in1[1], &in2[1]);  /* which magnitude is larger */
+        if (outsgn >= 0) {
+            expdif = mp_unexp_sub(n, radix, expdif, 
+                    &in1[2], &in2[2], &out[2]);
+        } else {
+            expdif = mp_unexp_sub(n, radix, -expdif, 
+                    &in2[2], &in1[2], &out[2]);
+        }
+        outexp -= expdif;  /* expdif now holds the number of cancelled leading digits */
+        outsgn *= in1[0];
+        if (expdif == n) {  /* every digit cancelled: the result is zero */
+            outsgn = 0;
+        }
+    }
+    if (outsgn == 0) {
+        outexp = 0;
+    }
+    out[0] = outsgn;
+    out[1] = outexp;
+}
+
+/* mp_sub: out = in1 - in2 for signed n-digit numbers ([0]=sign, [1]=exponent, [2..n+1]=digits); main() calls it with out aliasing an input. */
+void mp_sub(int n, int radix, int in1[], int in2[], int out[])
+{
+    int mp_unsgn_cmp(int n, int in1[], int in2[]);
+    int mp_unexp_add(int n, int radix, int expdif, 
+            int in1[], int in2[], int out[]);
+    int mp_unexp_sub(int n, int radix, int expdif, 
+            int in1[], int in2[], int out[]);
+    int outsgn, outexp, expdif;
+    /* Mirror image of mp_add: opposite signs (or a zero operand) add magnitudes, equal signs subtract the smaller magnitude from the larger. */
+    expdif = in1[1] - in2[1];
+    outexp = in1[1];  /* start from the larger of the two exponents */
+    if (expdif < 0) {
+        outexp = in2[1];
+    }
+    outsgn = in1[0] * in2[0];  /* < 0 opposite signs, > 0 same sign, 0 when at least one operand is zero */
+    if (outsgn <= 0) {
+        if (outsgn < 0) {
+            outsgn = in1[0];
+        } else {
+            outsgn = in1[0] - in2[0];  /* in1's sign, or the negated sign of in2 when in1 is zero */
+            outexp = in1[1] + in2[1];  /* a zero operand has exponent 0 (see format comment), so this keeps the other exponent */
+            expdif = 0;
+        }
+        if (expdif >= 0) {
+            outexp += mp_unexp_add(n, radix, expdif, 
+                    &in1[2], &in2[2], &out[2]);
+        } else {
+            outexp += mp_unexp_add(n, radix, -expdif, 
+                    &in2[2], &in1[2], &out[2]);
+        }
+    } else {
+        outsgn = mp_unsgn_cmp(n, &in1[1], &in2[1]);  /* which magnitude is larger */
+        if (outsgn >= 0) {
+            expdif = mp_unexp_sub(n, radix, expdif, 
+                    &in1[2], &in2[2], &out[2]);
+        } else {
+            expdif = mp_unexp_sub(n, radix, -expdif, 
+                    &in2[2], &in1[2], &out[2]);
+        }
+        outexp -= expdif;  /* expdif now holds the number of cancelled leading digits */
+        outsgn *= in1[0];
+        if (expdif == n) {  /* every digit cancelled: the result is zero */
+            outsgn = 0;
+        }
+    }
+    if (outsgn == 0) {
+        outexp = 0;
+    }
+    out[0] = outsgn;
+    out[1] = outexp;
+}
+
+
+/* -------- mp_add child routines -------- */
+
+/* mp_unsgn_cmp: compares in1[0..n] with in2[0..n] element by element and returns the sign (1, 0, -1) of the first difference. */
+int mp_unsgn_cmp(int n, int in1[], int in2[])
+{
+    int j, cmp;
+    /* Callers pass &x[1], so element 0 is the exponent and elements 1..n are the digits. */
+    cmp = 0;
+    for (j = 0; j <= n && cmp == 0; j++) {
+        cmp = in1[j] - in2[j];
+    }
+    if (cmp > 0) {
+        cmp = 1;
+    } else if (cmp < 0) {
+        cmp = -1;
+    }
+    return cmp;
+}
+
+/* mp_unexp_add: digit-wise out = in1 + (in2 shifted right by expdif digits); returns 1 if the sum gained a leading digit (caller raises the exponent), else 0. */
+int mp_unexp_add(int n, int radix, int expdif, 
+        int in1[], int in2[], int out[])
+{
+    int j, x, carry;
+    /* carry is kept as 0 or -1 so that (radix & carry) yields 0 or radix without a branch. */
+    carry = 0;
+    if (expdif == 0 && in1[0] + in2[0] >= radix) {  /* top digits overflow for sure: write the sum already shifted one place right */
+        x = in1[n - 1] + in2[n - 1];  /* the lowest digit is dropped; only its carry is kept */
+        carry = x >= radix ? -1 : 0;
+        for (j = n - 1; j > 0; j--) {
+            x = in1[j - 1] + in2[j - 1] - carry;
+            carry = x >= radix ? -1 : 0;
+            out[j] = x - (radix & carry);
+        }
+        out[0] = -carry;
+    } else {
+        if (expdif > n) {  /* in2 lies entirely below the digits of in1 */
+            expdif = n;
+        }
+        for (j = n - 1; j >= expdif; j--) {  /* overlapping digits */
+            x = in1[j] + in2[j - expdif] - carry;
+            carry = x >= radix ? -1 : 0;
+            out[j] = x - (radix & carry);
+        }
+        for (j = expdif - 1; j >= 0; j--) {  /* leading digits of in1 only receive the carry */
+            x = in1[j] - carry;
+            carry = x >= radix ? -1 : 0;
+            out[j] = x - (radix & carry);
+        }
+        if (carry != 0) {  /* overflow: shift right one digit (the last digit is discarded) and store the leading 1 */
+            for (j = n - 1; j > 0; j--) {
+                out[j] = out[j - 1];
+            }
+            out[0] = -carry;
+        }
+    }
+    return -carry;
+}
+
+/* mp_unexp_sub: digit-wise out = in1 - (in2 shifted right by expdif digits), left-normalised; returns the number of leading zero digits found (n if all digits are zero). */
+int mp_unexp_sub(int n, int radix, int expdif, 
+        int in1[], int in2[], int out[])
+{
+    int j, x, borrow, ncancel;
+    /* borrow is kept as 0 or -1 so that (radix & borrow) yields 0 or radix; mp_add/mp_sub pass the larger magnitude as in1. */
+    if (expdif > n) {  /* in2 lies entirely below the digits of in1 */
+        expdif = n;
+    }
+    borrow = 0;
+    for (j = n - 1; j >= expdif; j--) {  /* overlapping digits */
+        x = in1[j] - in2[j - expdif] + borrow;
+        borrow = x < 0 ? -1 : 0;
+        out[j] = x + (radix & borrow);
+    }
+    for (j = expdif - 1; j >= 0; j--) {  /* leading digits of in1 only absorb the borrow */
+        x = in1[j] + borrow;
+        borrow = x < 0 ? -1 : 0;
+        out[j] = x + (radix & borrow);
+    }
+    ncancel = 0;
+    for (j = 0; j < n && out[j] == 0; j++) {  /* count leading zero digits */
+        ncancel = j + 1;
+    }
+    if (ncancel > 0 && ncancel < n) {  /* normalise: shift left and pad the tail with zeros */
+        for (j = 0; j < n - ncancel; j++) {
+            out[j] = out[j + ncancel];
+        }
+        for (j = n - ncancel; j < n; j++) {
+            out[j] = 0;
+        }
+    }
+    return ncancel;
+}
+
+
+/* -------- mp_imul routines -------- */
+
+/* mp_imul: out = in1 * in2, where in2 is a plain int; the sign is handled here and the magnitude by mp_unsgn_imul. */
+void mp_imul(int n, int radix, int in1[], int in2, int out[])
+{
+    void mp_unsgn_imul(int n, double dradix, int in1[], double din2, 
+            int out[]);
+    /* radix and in2 are converted to double by the mp_unsgn_imul prototype above. */
+    if (in2 > 0) {
+        out[0] = in1[0];
+    } else if (in2 < 0) {
+        out[0] = -in1[0];
+        in2 = -in2;  /* multiply by the absolute value */
+    } else {
+        out[0] = 0;
+    }
+    mp_unsgn_imul(n, radix, &in1[1], in2, &out[1]);
+    if (out[0] == 0) {  /* zero result carries exponent 0 */
+        out[1] = 0;
+    }
+}
+
+/* mp_idiv: out = in1 / in2, where in2 is a plain int; returns -1 (out untouched) when in2 == 0, otherwise 0. */
+int mp_idiv(int n, int radix, int in1[], int in2, int out[])
+{
+    void mp_load_0(int n, int radix, int out[]);
+    void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, 
+            int out[]);
+    /* The sign is handled here; the magnitude is divided by mp_unsgn_idiv. */
+    if (in2 == 0) {
+        return -1;
+    }
+    if (in2 > 0) {
+        out[0] = in1[0];
+    } else {
+        out[0] = -in1[0];
+        in2 = -in2;  /* divide by the absolute value */
+    }
+    if (in1[0] == 0) {  /* zero dividend: store zero and skip the digit loop */
+        mp_load_0(n, radix, out);
+        return 0;
+    }
+    mp_unsgn_idiv(n, radix, &in1[1], in2, &out[1]);
+    return 0;
+}
+
+/* mp_idiv_2: out = in / 2, computed digit by digit from the most significant end; main() calls it with out aliasing in. */
+void mp_idiv_2(int n, int radix, int in[], int out[])
+{
+    int j, ix, carry, shift;
+    /* carry is 0 or -1 so that (radix & carry) adds either nothing or one full radix to the next digit. */
+    out[0] = in[0];
+    shift = 0;
+    if (in[2] == 1) {  /* halving a leading 1 would leave a leading 0, so renormalise by one digit */
+        shift = 1;
+    }
+    out[1] = in[1] - shift;
+    carry = -shift;  /* -1 pre-loads the dropped leading 1 (worth radix) into the first digit */
+    for (j = 2; j <= n + 1 - shift; j++) {
+        ix = in[j + shift] + (radix & carry);
+        carry = -(ix & 1);  /* an odd value leaves a remainder for the next digit */
+        out[j] = ix >> 1;
+    }
+    if (shift > 0) {
+        out[n + 1] = (radix & carry) >> 1;  /* last digit comes from the final remainder only */
+    }
+}
+
+
+/* -------- mp_imul child routines -------- */
+
+/* mp_unsgn_imul: multiplies the magnitude in1 (in1[0]=exponent, in1[1..n]=digits) by din2 into out, doing the digit products in double arithmetic. */
+void mp_unsgn_imul(int n, double dradix, int in1[], double din2, 
+        int out[])
+{
+    int j, carry, shift;
+    double x, d1_radix;
+    /* NOTE(review): the + 0.5 terms bias values before the (int) truncation, presumably to guard against floating-point rounding - confirm. */
+    d1_radix = 1.0 / dradix;
+    carry = 0;
+    for (j = n; j >= 1; j--) {  /* least significant digit first */
+        x = din2 * in1[j] + carry + 0.5;
+        carry = (int) (d1_radix * x);
+        out[j] = (int) (x - dradix * carry);
+    }
+    shift = 0;
+    x = carry + 0.5;
+    while (x > 1) {  /* count how many radix digits the final carry occupies */
+        x *= d1_radix;
+        shift++;
+    }
+    out[0] = in1[0] + shift;  /* exponent grows by that many digits */
+    if (shift > 0) {
+        while (shift > n) {  /* carry wider than the whole number: drop its lowest digits */
+            carry = (int) (d1_radix * carry + 0.5);
+            shift--;
+        }
+        for (j = n; j >= shift + 1; j--) {  /* move digits right; the lowest ones fall off */
+            out[j] = out[j - shift];
+        }
+        for (j = shift; j >= 1; j--) {  /* fill the vacated leading digits from carry */
+            x = carry + 0.5;
+            carry = (int) (d1_radix * x);
+            out[j] = (int) (x - dradix * carry);
+        }
+    }
+}
+
+/* mp_unsgn_idiv: long division of the magnitude in1 (in1[0]=exponent, in1[1..n]=digits) by din2 into out, digit by digit in double arithmetic. */
+void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, 
+        int out[])
+{
+    int j, ix, carry, shift;
+    double x, d1_in2;
+    /* The caller mp_idiv() filters out a zero in1 first; with all-zero digits the do/while below would never reach din2. */
+    d1_in2 = 1.0 / din2;
+    shift = 0;
+    x = 0;
+    do {  /* gather leading digits until the partial value reaches din2 */
+        shift++;
+        x *= dradix;
+        if (shift <= n) {
+            x += in1[shift];
+        }
+    } while (x < din2 - 0.5);
+    x += 0.5;
+    ix = (int) (d1_in2 * x);
+    carry = (int) (x - din2 * ix);  /* remainder carried into the next digit */
+    out[1] = ix;
+    shift--;
+    out[0] = in1[0] - shift;  /* exponent drops by the number of extra digits consumed */
+    if (shift >= n) {
+        shift = n - 1;
+    }
+    for (j = 2; j <= n - shift; j++) {  /* quotient digits that still have input digits to bring down */
+        x = in1[j + shift] + dradix * carry + 0.5;
+        ix = (int) (d1_in2 * x);
+        carry = (int) (x - din2 * ix);
+        out[j] = ix;
+    }
+    for (j = n - shift + 1; j <= n; j++) {  /* past the end of in1: keep dividing the remainder */
+        x = dradix * carry + 0.5;
+        ix = (int) (d1_in2 * x);
+        carry = (int) (x - din2 * ix);
+        out[j] = ix;
+    }
+}
+
+
+/* -------- mp_mul routines -------- */
+
+/* mp_mul_radix_test: squares a fixed test pattern through rdft() and returns twice the value reported by mp_mul_d2i_test(); main() uses it as its error estimate when choosing the radix. */
+double mp_mul_radix_test(int n, int radix, int nfft, 
+        double tmpfft[], int ip[], double w[])
+{
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_csqu(int nfft, double dinout[]);
+    double mp_mul_d2i_test(int radix, int nfft, double din[]);
+    int j, ndata, radix_2;
+    /* tmpfft must hold nfft + 2 elements: indices 0..nfft+1 are written here. */
+    ndata = (nfft >> 1) + 1;
+    if (ndata > n) {
+        ndata = n;
+    }
+    tmpfft[nfft + 1] = radix - 1;
+    for (j = nfft; j > ndata; j--) {  /* upper part of the buffer is zero padding */
+        tmpfft[j] = 0;
+    }
+    radix_2 = (radix + 1) / 2;
+    for (j = ndata; j > 2; j--) {  /* data digits are all set to about radix / 2 */
+        tmpfft[j] = radix_2;
+    }
+    tmpfft[2] = radix;
+    tmpfft[1] = radix - 1;
+    tmpfft[0] = 0;
+    rdft(nfft, 1, &tmpfft[1], ip, w);  /* forward transform of elements 1..nfft */
+    mp_mul_csqu(nfft, tmpfft);
+    rdft(nfft, -1, &tmpfft[1], ip, w);  /* inverse transform */
+    return 2 * mp_mul_d2i_test(radix, nfft, tmpfft);
+}
+
+/* mp_mul: out = in1 * in2 via rdft()-based convolution, assembled from upper*upper plus the two upper*lower cross products; tmp and tmp1fft..tmp3fft are scratch buffers. */
+void mp_mul(int n, int radix, int in1[], int in2[], int out[], 
+        int tmp[], int nfft, double tmp1fft[], double tmp2fft[], 
+        double tmp3fft[], int ip[], double w[])
+{
+    void mp_copy(int n, int radix, int in[], int out[]);
+    void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+            int in[], double dout[]);
+    void mp_mul_cmul(int nfft, double din[], double dinout[]);
+    void mp_mul_cmuladd(int nfft, double din1[], double din2[], 
+            double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    int n_h, shift;
+    /* shift = digit offset where the "lower" part starts: at least nfft/2 + 1, advanced past positions where both inputs have a zero digit. */
+    shift = (nfft >> 1) + 1;
+    while (n > shift) {
+        if (in1[shift + 2] + in2[shift + 2] != 0) {
+            break;
+        }
+        shift++;
+    }
+    n_h = n / 2 + 1;  /* digit count kept for the cross-product term */
+    if (n_h < n - shift) {
+        n_h = n - shift;
+    }
+    /* ---- tmp3fft = (upper) in1 * (lower) in2 ---- */
+    mp_mul_i2d(n, radix, nfft, 0, in1, tmp1fft);
+    rdft(nfft, 1, &tmp1fft[1], ip, w);
+    mp_mul_i2d(n, radix, nfft, shift, in2, tmp3fft);
+    rdft(nfft, 1, &tmp3fft[1], ip, w);
+    mp_mul_cmul(nfft, tmp1fft, tmp3fft);
+    /* ---- tmp = (upper) in1 * (upper) in2 ---- */
+    mp_mul_i2d(n, radix, nfft, 0, in2, tmp2fft);
+    rdft(nfft, 1, &tmp2fft[1], ip, w);
+    mp_mul_cmul(nfft, tmp2fft, tmp1fft);
+    rdft(nfft, -1, &tmp1fft[1], ip, w);
+    mp_mul_d2i(n, radix, nfft, tmp1fft, tmp);
+    /* ---- tmp3fft += (upper) in2 * (lower) in1 ---- */
+    mp_mul_i2d(n, radix, nfft, shift, in1, tmp1fft);
+    rdft(nfft, 1, &tmp1fft[1], ip, w);
+    mp_mul_cmuladd(nfft, tmp1fft, tmp2fft, tmp3fft);
+    /* ---- out = tmp + tmp3fft ---- */
+    rdft(nfft, -1, &tmp3fft[1], ip, w);
+    mp_mul_d2i(n_h, radix, nfft, tmp3fft, out);  /* out is first written here, after the last read of in1/in2 (main() passes out == in1 or in2) */
+    if (out[0] != 0) {
+        mp_add(n, radix, out, tmp, out);
+    } else {  /* cross term is zero: the upper*upper product is the answer */
+        mp_copy(n, radix, tmp, out);
+    }
+}
+
+/* mp_squ: out = in * in via rdft()-based convolution, assembled as (upper)^2 + 2 * upper * lower; tmp, tmp1fft and tmp2fft are scratch buffers. */
+void mp_squ(int n, int radix, int in[], int out[], int tmp[], 
+        int nfft, double tmp1fft[], double tmp2fft[], 
+        int ip[], double w[])
+{
+    void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+            int in[], double dout[]);
+    void mp_mul_cmul(int nfft, double din[], double dinout[]);
+    void mp_mul_csqu(int nfft, double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    int n_h, shift;
+    /* shift = digit offset where the "lower" part starts: at least nfft/2 + 1, advanced past zero digits of in. */
+    shift = (nfft >> 1) + 1;
+    while (n > shift) {
+        if (in[shift + 2] != 0) {
+            break;
+        }
+        shift++;
+    }
+    n_h = n / 2 + 1;  /* digit count kept for the cross-product term */
+    if (n_h < n - shift) {
+        n_h = n - shift;
+    }
+    /* ---- tmp = (upper) in * (lower) in ---- */
+    mp_mul_i2d(n, radix, nfft, 0, in, tmp1fft);
+    rdft(nfft, 1, &tmp1fft[1], ip, w);
+    mp_mul_i2d(n, radix, nfft, shift, in, tmp2fft);
+    rdft(nfft, 1, &tmp2fft[1], ip, w);
+    mp_mul_cmul(nfft, tmp1fft, tmp2fft);
+    rdft(nfft, -1, &tmp2fft[1], ip, w);
+    mp_mul_d2i(n_h, radix, nfft, tmp2fft, tmp);
+    /* ---- out = 2 * tmp + ((upper) in)^2 ---- */
+    mp_mul_csqu(nfft, tmp1fft);
+    rdft(nfft, -1, &tmp1fft[1], ip, w);
+    mp_mul_d2i(n, radix, nfft, tmp1fft, out);  /* out is first written here, after the last read of in (main() passes out == in) */
+    if (tmp[0] != 0) {  /* skip the correction when the cross term is zero */
+        mp_add(n_h, radix, tmp, tmp, tmp);
+        mp_add(n, radix, out, tmp, out);
+    }
+}
+
+/* mp_mulh: out = in1 * in2 using a single convolution of the leading digits only (no lower-part correction as in mp_mul); in1fft is left holding the transform of in1. */
+void mp_mulh(int n, int radix, int in1[], int in2[], int out[], 
+        int nfft, double in1fft[], double outfft[], int ip[], double w[])
+{
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+            int in[], double dout[]);
+    void mp_mul_cmul(int nfft, double din[], double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    /* mp_mul_i2d loads at most nfft/2 + 1 digits per operand, which is what limits the precision of the product. */
+    mp_mul_i2d(n, radix, nfft, 0, in1, in1fft);
+    rdft(nfft, 1, &in1fft[1], ip, w);
+    mp_mul_i2d(n, radix, nfft, 0, in2, outfft);
+    rdft(nfft, 1, &outfft[1], ip, w);
+    mp_mul_cmul(nfft, in1fft, outfft);  /* outfft *= in1fft; in1fft itself is not modified */
+    rdft(nfft, -1, &outfft[1], ip, w);
+    mp_mul_d2i(n, radix, nfft, outfft, out);
+}
+
+/* mp_mulh_use_in1fft: multiplies an already-transformed operand in1fft by the digits of in2 starting at digit offset shift, writing an n_h-digit result to out. */
+void mp_mulh_use_in1fft(int n, int radix, double in1fft[], 
+        int shift, int in2[], int out[], int nfft, double outfft[], 
+        int ip[], double w[])
+{
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+            int in[], double dout[]);
+    void mp_mul_cmul(int nfft, double din[], double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    int n_h;
+    /* in1fft is only read, so the caller can reuse the same transform for several products. */
+    while (n > shift) {  /* advance shift past zero digits of in2 */
+        if (in2[shift + 2] != 0) {
+            break;
+        }
+        shift++;
+    }
+    n_h = n / 2 + 1;  /* digit count requested from mp_mul_d2i */
+    if (n_h < n - shift) {
+        n_h = n - shift;
+    }
+    mp_mul_i2d(n, radix, nfft, shift, in2, outfft);
+    rdft(nfft, 1, &outfft[1], ip, w);
+    mp_mul_cmul(nfft, in1fft, outfft);
+    rdft(nfft, -1, &outfft[1], ip, w);
+    mp_mul_d2i(n_h, radix, nfft, outfft, out);
+}
+
+/* mp_squh: out = in * in using a single convolution of the leading digits only (main() labels this "half precision"); inoutfft is scratch. */
+void mp_squh(int n, int radix, int in[], int out[], 
+        int nfft, double inoutfft[], int ip[], double w[])
+{
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+            int in[], double dout[]);
+    void mp_mul_csqu(int nfft, double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    /* in is fully loaded into inoutfft before out is written, and main() calls this with out == in. */
+    mp_mul_i2d(n, radix, nfft, 0, in, inoutfft);
+    rdft(nfft, 1, &inoutfft[1], ip, w);
+    mp_mul_csqu(nfft, inoutfft);
+    rdft(nfft, -1, &inoutfft[1], ip, w);
+    mp_mul_d2i(n, radix, nfft, inoutfft, out);
+}
+
+/* mp_squh_use_in1fft: like mp_squh, but starts from an operand that the caller has already loaded and forward-transformed into inoutfft. */
+void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], 
+        int nfft, int ip[], double w[])
+{
+    void rdft(int n, int isgn, double *a, int *ip, double *w);
+    void mp_mul_csqu(int nfft, double dinout[]);
+    void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]);
+    /* inoutfft is overwritten: squared in place, inverse-transformed, then adjusted by mp_mul_d2i. */
+    mp_mul_csqu(nfft, inoutfft);
+    rdft(nfft, -1, &inoutfft[1], ip, w);
+    mp_mul_d2i(n, radix, nfft, inoutfft, out);
+}
+
+
+/* -------- mp_mul child routines -------- */
+
+/* mp_mul_i2d: loads up to nfft/2 + 1 digits of in[], starting shift digits in, into dout[1..] as doubles; dout[0] gets exponent - shift and dout[nfft+1] the signed top digit. */
+void mp_mul_i2d(int n, int radix, int nfft, int shift, 
+        int in[], double dout[])
+{
+    int j, x, carry, ndata, radix_2, topdgt;
+    /* dout must hold nfft + 2 elements; the unused slots dout[ndata+1..nfft] are zeroed. */
+    ndata = 0;
+    topdgt = 0;
+    if (n > shift) {  /* otherwise no digits remain and only zeros are loaded */
+        topdgt = in[shift + 2];
+        ndata = (nfft >> 1) + 1;
+        if (ndata > n - shift) {
+            ndata = n - shift;
+        }
+    }
+    dout[nfft + 1] = in[0] * topdgt;  /* sign times leading digit */
+    for (j = nfft; j > ndata; j--) {
+        dout[j] = 0;
+    }
+    /* ---- abs(dout[j]) <= radix/2 (to keep FFT precision) ---- */
+    if (ndata > 1) {
+        radix_2 = radix / 2;
+        carry = 0;
+        for (j = ndata + 1; j > 3; j--) {
+            x = in[j + shift] - carry;
+            carry = x >= radix_2 ? -1 : 0;  /* large digit: store digit - radix and add 1 to the next higher digit */
+            dout[j - 1] = x - (radix & carry);
+        }
+        dout[2] = in[shift + 3] - carry;  /* second digit takes the last carry but is not itself reduced */
+    }
+    dout[1] = topdgt;
+    dout[0] = in[1] - shift;
+}
+
+/* mp_mul_cmul: dinout *= din in the transformed layout - slot 0 is added, slots 1, 2 and nfft+1 are multiplied as reals, slots (j, j+1) for odd j >= 3 as complex pairs. */
+void mp_mul_cmul(int nfft, double din[], double dinout[])
+{
+    int j;
+    double xr, xi, yr, yi;
+    /* Slot 0 carries the exponent written by mp_mul_i2d, so exponents add while the spectra multiply. */
+    dinout[0] += din[0];
+    dinout[1] *= din[1];
+    dinout[2] *= din[2];
+    for (j = 3; j < nfft; j += 2) {
+        xr = din[j];
+        xi = din[j + 1];
+        yr = dinout[j];
+        yi = dinout[j + 1];
+        dinout[j] = xr * yr - xi * yi;  /* real part of (xr + i xi)(yr + i yi) */
+        dinout[j + 1] = xr * yi + xi * yr;  /* imaginary part */
+    }
+    dinout[nfft + 1] *= din[nfft + 1];  /* product of the signed top digits */
+}
+
+/* mp_mul_cmuladd: dinout += din1 * din2 in the same slot layout as mp_mul_cmul; slot 0 of dinout is left untouched. */
+void mp_mul_cmuladd(int nfft, double din1[], double din2[], 
+        double dinout[])
+{
+    int j;
+    double xr, xi, yr, yi;
+    /* Slots 1, 2 and nfft+1 are treated as reals, slots (j, j+1) for odd j >= 3 as complex pairs. */
+    dinout[1] += din1[1] * din2[1];
+    dinout[2] += din1[2] * din2[2];
+    for (j = 3; j < nfft; j += 2) {
+        xr = din1[j];
+        xi = din1[j + 1];
+        yr = din2[j];
+        yi = din2[j + 1];
+        dinout[j] += xr * yr - xi * yi;  /* real part of the complex product */
+        dinout[j + 1] += xr * yi + xi * yr;  /* imaginary part */
+    }
+    dinout[nfft + 1] += din1[nfft + 1] * din2[nfft + 1];
+}
+
+/* mp_mul_csqu: squares dinout in place in the same slot layout as mp_mul_cmul; slot 0 is doubled instead of squared. */
+void mp_mul_csqu(int nfft, double dinout[])
+{
+    int j;
+    double xr, xi;
+    /* Slot 0 carries the exponent written by mp_mul_i2d, hence the doubling. */
+    dinout[0] *= 2;
+    dinout[1] *= dinout[1];
+    dinout[2] *= dinout[2];
+    for (j = 3; j < nfft; j += 2) {
+        xr = dinout[j];
+        xi = dinout[j + 1];
+        dinout[j] = xr * xr - xi * xi;  /* real part of (xr + i xi)^2 */
+        dinout[j + 1] = 2 * xr * xi;  /* imaginary part */
+    }
+    dinout[nfft + 1] *= dinout[nfft + 1];
+}
+
+/* mp_mul_d2i: converts the inverse-transformed product din[] back into an n-digit number in out (sign, exponent, digits), scaling by 2/nfft and propagating carries; din[1] and din[nfft+1] are overwritten. */
+void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[])
+{
+    int j, carry, carry1, carry2, shift, ndata;
+    double x, scale, d1_radix, d1_radix2, pow_radix, topdgt;
+    /* NOTE(review): the "((int) x) - 1" steps look like a floor that stays correct for slightly negative x, with the + 0.5 terms as rounding bias - confirm before touching the carry chain. */
+    scale = 2.0 / nfft;
+    d1_radix = 1.0 / radix;
+    d1_radix2 = d1_radix * d1_radix;
+    topdgt = din[nfft + 1];  /* product of the signed top digits; NOTE(review): presumably not touched by rdft(nfft, ..., &din[1]) - confirm */
+    x = topdgt < 0 ? -topdgt : topdgt;
+    shift = x + 0.5 >= radix ? 1 : 0;  /* top-digit product already needs two digits */
+    /* ---- correction of cyclic convolution of din[1] ---- */
+    x *= nfft * 0.5;
+    din[nfft + 1] = din[1] - x;
+    din[1] = x;
+    /* ---- output of digits ---- */
+    ndata = n;
+    if (n > nfft + 1 + shift) {  /* more digits requested than the transform provides: zero the tail */
+        ndata = nfft + 1 + shift;
+        for (j = n + 1; j > ndata + 1; j--) {
+            out[j] = 0;
+        }
+    }
+    x = 0;
+    pow_radix = 1;
+    for (j = ndata + 1 - shift; j <= nfft + 1; j++) {  /* fold the terms below the last kept digit into an initial carry */
+        x += pow_radix * din[j];
+        pow_radix *= d1_radix;
+        if (pow_radix < DBL_EPSILON) {  /* remaining terms are too small to matter */
+            break;
+        }
+    }
+    x = d1_radix2 * (scale * x + 0.5);
+    carry2 = ((int) x) - 1;
+    carry = (int) (radix * (x - carry2) + 0.5);
+    for (j = ndata; j > 1; j--) {  /* least significant digit first; carry and carry2 pass on the two higher-order parts of each term */
+        x = d1_radix2 * (scale * din[j - shift] + carry + 0.5);
+        carry = carry2;
+        carry2 = ((int) x) - 1;
+        x = radix * (x - carry2);
+        carry1 = (int) x;
+        out[j + 1] = (int) (radix * (x - carry1));
+        carry += carry1;
+    }
+    x = carry + ((double) radix) * carry2 + 0.5;
+    if (shift == 0) {
+        x += scale * din[1];
+    }
+    carry = (int) (d1_radix * x);
+    out[2] = (int) (x - ((double) radix) * carry);
+    if (carry > 0) {  /* leading digit overflowed: shift digits right by one and store the carry on top */
+        for (j = n + 1; j > 2; j--) {
+            out[j] = out[j - 1];
+        }
+        out[2] = carry;
+        shift++;
+    }
+    /* ---- output of exp, sgn ---- */
+    x = din[0] + shift + 0.5;
+    shift = ((int) x) - 1;
+    out[1] = shift + ((int) (x - shift));
+    out[0] = topdgt > 0.5 ? 1 : -1;  /* sign follows the sign of the top-digit product */
+    if (out[2] == 0) {  /* zero leading digit: report the result as zero */
+        out[0] = 0;
+        out[1] = 0;
+    }
+}
+
+/* Dry run of the digit split over din[nfft + 1 .. 2] without storing digits; returns err, the largest |x - 0.5 - (int) x| seen. Overwrites din[nfft + 1]. */
+double mp_mul_d2i_test(int radix, int nfft, double din[])
+{
+    int j, carry, carry1, carry2;
+    double x, scale, d1_radix, d1_radix2, err;
+    /* same scale factor and reciprocal powers of radix as mp_mul_d2i computes */
+    scale = 2.0 / nfft;
+    d1_radix = 1.0 / radix;
+    d1_radix2 = d1_radix * d1_radix;
+    /* ---- correction of cyclic convolution of din[1] ---- */
+    x = din[nfft + 1] * nfft * 0.5;
+    if (x < 0) {
+        x = -x;
+    }
+    din[nfft + 1] = din[1] - x;
+    /* ---- check of digits ---- */
+    err = 0;
+    carry = 0;
+    carry2 = 0;
+    for (j = nfft + 1; j > 1; j--) {
+        x = d1_radix2 * (scale * din[j] + carry + 0.5);
+        carry = carry2;
+        carry2 = ((int) x) - 1;
+        x = radix * (x - carry2);
+        carry1 = (int) x;
+        x = radix * (x - carry1);    /* digit value before truncation, still including the 0.5 offset */
+        carry += carry1;
+        x = x - 0.5 - ((int) x);    /* residual after removing the integer digit and the 0.5 offset */
+        if (x > err) {    /* keep the largest absolute residual */
+            err = x;
+        } else if (-x > err) {
+            err = -x;
+        }
+    }
+    return err;
+}
+
+
+/* -------- mp_inv routines -------- */
+
+/* out = 1 / in by repeated mp_inv_newton steps on growing FFT sizes. Returns -1 when in[0] == 0, otherwise 0. tmp1, tmp2, tmp1fft, tmp2fft, ip and w are work areas passed through to mp_inv_newton. */
+int mp_inv(int n, int radix, int in[], int out[], 
+        int tmp1[], int tmp2[], int nfft, 
+        double tmp1fft[], double tmp2fft[], int ip[], double w[])
+{
+    int mp_get_nfft_init(int radix, int nfft_max);
+    void mp_inv_init(int n, int radix, int in[], int out[]);
+    int mp_inv_newton(int n, int radix, int in[], int inout[], 
+            int tmp1[], int tmp2[], int nfft, double tmp1fft[], 
+            double tmp2fft[], int ip[], double w[]);
+    int n_nwt, nfft_nwt, thr, prc;
+    /* in[0] is the sign word; 0 means the input is zero and has no inverse */
+    if (in[0] == 0) {
+        return -1;
+    }
+    nfft_nwt = mp_get_nfft_init(radix, nfft);    /* starting FFT size */
+    n_nwt = nfft_nwt + 2;
+    if (n_nwt > n) {
+        n_nwt = n;
+    }
+    mp_inv_init(n_nwt, radix, in, out);    /* initial estimate of the reciprocal */
+    thr = 8;
+    do {
+        n_nwt = nfft_nwt + 2;    /* working length for this pass, capped at n */
+        if (n_nwt > n) {
+            n_nwt = n;
+        }
+        prc = mp_inv_newton(n_nwt, radix, in, out, 
+                tmp1, tmp2, nfft_nwt, tmp1fft, tmp2fft, ip, w);
+        if (thr * nfft_nwt >= nfft) {    /* within a factor thr of the final size */
+            thr = 0;    /* for positive nfft this makes the test above false on every later pass */
+            if (2 * prc <= n_nwt - 2) {
+                nfft_nwt >>= 1;    /* halve now, doubled below: repeat this size */
+            }
+        } else {
+            if (3 * prc < n_nwt - 2) {
+                nfft_nwt >>= 1;    /* precision too low: repeat this size */
+            }
+        }
+        nfft_nwt <<= 1;
+    } while (nfft_nwt <= nfft);
+    return 0;
+}
+
+/* out = sqrt(in) by repeated mp_sqrt_newton steps on growing FFT sizes. Returns -1 when in[0] < 0; when in[0] == 0 it stores zero with mp_load_0 and returns 0; otherwise returns 0 after iterating. tmp1 is passed to the helpers as the reciprocal estimate (out_rev / inout_rev). */
+int mp_sqrt(int n, int radix, int in[], int out[], 
+        int tmp1[], int tmp2[], int nfft, 
+        double tmp1fft[], double tmp2fft[], int ip[], double w[])
+{
+    void mp_load_0(int n, int radix, int out[]);
+    int mp_get_nfft_init(int radix, int nfft_max);
+    void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[]);
+    int mp_sqrt_newton(int n, int radix, int in[], int inout[], 
+            int inout_rev[], int tmp[], int nfft, double tmp1fft[], 
+            double tmp2fft[], int ip[], double w[], int *n_tmp1fft);
+    int n_nwt, nfft_nwt, thr, prc, n_tmp1fft;
+    /* in[0] is the sign word: negative input is rejected, zero is answered directly */
+    if (in[0] < 0) {
+        return -1;
+    } else if (in[0] == 0) {
+        mp_load_0(n, radix, out);
+        return 0;
+    }
+    nfft_nwt = mp_get_nfft_init(radix, nfft);    /* starting FFT size */
+    n_nwt = nfft_nwt + 2;
+    if (n_nwt > n) {
+        n_nwt = n;
+    }
+    mp_sqrt_init(n_nwt, radix, in, out, tmp1);    /* initial estimates: out and tmp1 */
+    n_tmp1fft = 0;    /* no transform cached in tmp1fft yet */
+    thr = 8;
+    do {
+        n_nwt = nfft_nwt + 2;    /* working length for this pass, capped at n */
+        if (n_nwt > n) {
+            n_nwt = n;
+        }
+        prc = mp_sqrt_newton(n_nwt, radix, in, out, 
+                tmp1, tmp2, nfft_nwt, tmp1fft, tmp2fft, 
+                ip, w, &n_tmp1fft);
+        if (thr * nfft_nwt >= nfft) {    /* within a factor thr of the final size */
+            thr = 0;    /* for positive nfft this makes the test above false on every later pass */
+            if (2 * prc <= n_nwt - 2) {
+                nfft_nwt >>= 1;    /* halve now, doubled below: repeat this size */
+            }
+        } else {
+            if (3 * prc < n_nwt - 2) {
+                nfft_nwt >>= 1;    /* precision too low: repeat this size */
+            }
+        }
+        nfft_nwt <<= 1;
+    } while (nfft_nwt <= nfft);
+    return 0;
+}
+
+
+/* -------- mp_inv child routines -------- */
+
+/* Doubles a size (starting at 1) while squaring radix, until DBL_EPSILON * r reaches 1 or the size reaches nfft_max. */
+int mp_get_nfft_init(int radix, int nfft_max)
+{
+    double power = radix;
+    int size = 1;
+    for (;;) {
+        power *= power;
+        size <<= 1;
+        if (!(DBL_EPSILON * power < 1) || !(size < nfft_max)) {
+            break;
+        }
+    }
+    return size;
+}
+
+/* First estimate of 1 / in: sign copied from in, mantissa computed in double and scaled up until it is at least 1. */
+void mp_inv_init(int n, int radix, int in[], int out[])
+{
+    void mp_unexp_d2mp(int n, int radix, double din, int out[]);
+    double mp_unexp_mp2d(int n, int radix, int in[]);
+    double recip;
+    int expo;
+    
+    out[0] = in[0];
+    expo = -in[1];
+    recip = 1.0 / mp_unexp_mp2d(n, radix, &in[2]);
+    /* every multiplication by radix is paid for with one exponent step */
+    for (; recip < 1; expo--) {
+        recip *= radix;
+    }
+    out[1] = expo;
+    mp_unexp_d2mp(n, radix, recip, &out[2]);
+}
+
+/* First estimates for the square root, computed in double: out receives sqrt(in), out_rev receives the reciprocal of that estimate. Both are stored with sign word 1. */
+void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[])
+{
+    void mp_unexp_d2mp(int n, int radix, double din, int out[]);
+    double mp_unexp_mp2d(int n, int radix, int in[]);
+    int outexp;
+    double din;
+    /* in[1] is the exponent, in[2..] the digits handed to mp_unexp_mp2d */
+    out[0] = 1;
+    out_rev[0] = 1;
+    outexp = in[1];
+    din = mp_unexp_mp2d(n, radix, &in[2]);
+    if (outexp % 2 != 0) {    /* make the exponent even so it can be halved exactly */
+        din *= radix;
+        outexp--;
+    }
+    outexp /= 2;
+    din = sqrt(din);
+    if (din < 1) {    /* keep the leading digit of the mantissa non-zero */
+        din *= radix;
+        outexp--;
+    }
+    out[1] = outexp;
+    mp_unexp_d2mp(n, radix, din, &out[2]);
+    outexp = -outexp;    /* from here on: the reciprocal of the estimate just stored */
+    din = 1.0 / din;
+    while (din < 1) {
+        din *= radix;
+        outexp--;
+    }
+    out_rev[1] = outexp;
+    mp_unexp_d2mp(n, radix, din, &out_rev[2]);
+}
+
+/* Writes n base-radix digits of din (integer part first) to out[0..n-1]; a digit that comes out >= radix is clamped to radix - 1. */
+void mp_unexp_d2mp(int n, int radix, double din, int out[])
+{
+    int digit, idx = 0;
+    
+    while (idx < n) {
+        digit = (int) din;
+        if (digit >= radix) {
+            digit = radix - 1;
+            din = radix;
+        }
+        out[idx++] = digit;
+        din = radix * (din - digit);
+    }
+}
+
+/* Evaluates in[0] + in[1] / radix + ... + in[n-1] / radix^(n-1) by Horner's rule, starting from the last digit. */
+double mp_unexp_mp2d(int n, int radix, int in[])
+{
+    const double inv_radix = 1.0 / radix;
+    double acc = 0;
+    int idx = n;
+    
+    /* idx walks n - 1 down to 0 */
+    while (--idx >= 0) {
+        acc = inv_radix * acc + in[idx];
+    }
+    return acc;
+}
+
+/* One Newton step for the reciprocal: inout += inout * (1 - inout * in). Returns prc = -tmp2[1], the negated exponent of the residual 1 - inout * in, or nfft + 1 when that residual is zero. */
+int mp_inv_newton(int n, int radix, int in[], int inout[], 
+        int tmp1[], int tmp2[], int nfft, double tmp1fft[], 
+        double tmp2fft[], int ip[], double w[])
+{
+    void mp_load_1(int n, int radix, int out[]);
+    void mp_round(int n, int radix, int m, int inout[]);
+    void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_mulh(int n, int radix, int in1[], int in2[], int out[], 
+            int nfft, double in1fft[], double outfft[], 
+            int ip[], double w[]);
+    void mp_mulh_use_in1fft(int n, int radix, double in1fft[], 
+            int shift, int in2[], int out[], int nfft, double outfft[], 
+            int ip[], double w[]);
+    int n_h, shift, prc;
+    /* shift: offset handed to mp_round and to the lower-part multiply; n_h: length used for the half-precision steps */
+    shift = (nfft >> 1) + 1;
+    n_h = n / 2 + 1;
+    if (n_h < n - shift) {
+        n_h = n - shift;
+    }
+    /* ---- tmp1 = inout * (upper) in (half to normal precision) ---- */
+    mp_round(n, radix, shift, inout);
+    mp_mulh(n, radix, inout, in, tmp1, 
+            nfft, tmp1fft, tmp2fft, ip, w);    /* tmp1fft is passed as in1fft and reused by the two calls below */
+    /* ---- tmp2 = 1 - tmp1 ---- */
+    mp_load_1(n, radix, tmp2);
+    mp_sub(n, radix, tmp2, tmp1, tmp2);
+    /* ---- tmp2 -= inout * (lower) in (half precision) ---- */
+    mp_mulh_use_in1fft(n, radix, tmp1fft, shift, in, tmp1, 
+            nfft, tmp2fft, ip, w);
+    mp_sub(n_h, radix, tmp2, tmp1, tmp2);
+    /* ---- get precision ---- */
+    prc = -tmp2[1];
+    if (tmp2[0] == 0) {    /* residual is exactly zero */
+        prc = nfft + 1;
+    }
+    /* ---- tmp2 *= inout (half precision) ---- */
+    mp_mulh_use_in1fft(n_h, radix, tmp1fft, 0, tmp2, tmp2, 
+            nfft, tmp2fft, ip, w);
+    /* ---- inout += tmp2 ---- */
+    if (tmp2[0] != 0) {
+        mp_add(n, radix, inout, tmp2, inout);
+    }
+    return prc;
+}
+
+/* One Newton step for the square root: refines inout (estimate of sqrt(in)) and inout_rev (estimate of its reciprocal). Returns prc, the exponent gap between in and the residual in - inout^2, or nfft + 1 when that residual is zero. */
+int mp_sqrt_newton(int n, int radix, int in[], int inout[], 
+        int inout_rev[], int tmp[], int nfft, double tmp1fft[], 
+        double tmp2fft[], int ip[], double w[], int *n_tmp1fft)
+{
+    void mp_round(int n, int radix, int m, int inout[]);
+    void mp_add(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_sub(int n, int radix, int in1[], int in2[], int out[]);
+    void mp_idiv_2(int n, int radix, int in[], int out[]);
+    void mp_mulh(int n, int radix, int in1[], int in2[], int out[], 
+            int nfft, double in1fft[], double outfft[], 
+            int ip[], double w[]);
+    void mp_squh(int n, int radix, int in[], int out[], 
+            int nfft, double inoutfft[], int ip[], double w[]);
+    void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], 
+            int nfft, int ip[], double w[]);
+    int n_h, nfft_h, shift, prc;
+    /* *n_tmp1fft holds the FFT size stored by the previous call (0 from mp_sqrt before the first call) */
+    nfft_h = nfft >> 1;
+    shift = nfft_h + 1;
+    if (nfft_h < 2) {
+        nfft_h = 2;
+    }
+    n_h = n / 2 + 1;
+    if (n_h < n - shift) {
+        n_h = n - shift;
+    }
+    /* ---- tmp = inout_rev^2 (1/4 to half precision) ---- */
+    mp_round(n_h, radix, (nfft_h >> 1) + 1, inout_rev);
+    if (*n_tmp1fft != nfft_h) {    /* tmp1fft was not filled at this size: square from the digits */
+        mp_squh(n_h, radix, inout_rev, tmp, 
+                nfft_h, tmp1fft, ip, w);
+    } else {    /* sizes match: reuse the contents of tmp1fft */
+        mp_squh_use_in1fft(n_h, radix, tmp1fft, tmp, 
+                nfft_h, ip, w);
+    }
+    /* ---- tmp = inout_rev - inout * tmp (half precision) ---- */
+    mp_round(n, radix, shift, inout);
+    mp_mulh(n_h, radix, inout, tmp, tmp, 
+            nfft, tmp1fft, tmp2fft, ip, w);
+    mp_sub(n_h, radix, inout_rev, tmp, tmp);
+    /* ---- inout_rev += tmp ---- */
+    mp_add(n_h, radix, inout_rev, tmp, inout_rev);
+    /* ---- tmp = in - inout^2 (half to normal precision) ---- */
+    mp_squh_use_in1fft(n, radix, tmp1fft, tmp, 
+            nfft, ip, w);
+    mp_sub(n, radix, in, tmp, tmp);
+    /* ---- get precision ---- */
+    prc = in[1] - tmp[1];
+    if (in[2] > tmp[2]) {    /* leading digit of in exceeds that of the residual: count one more */
+        prc++;
+    }
+    if (tmp[0] == 0) {    /* residual is exactly zero */
+        prc = nfft + 1;
+    }
+    /* ---- tmp = tmp * inout_rev / 2 (half precision) ---- */
+    mp_round(n_h, radix, shift, inout_rev);
+    mp_mulh(n_h, radix, inout_rev, tmp, tmp, 
+            nfft, tmp1fft, tmp2fft, ip, w);
+    *n_tmp1fft = nfft;    /* record the size used for tmp1fft so the next call can test for reuse */
+    mp_idiv_2(n_h, radix, tmp, tmp);
+    /* ---- inout += tmp ---- */
+    if (tmp[0] != 0) {
+        mp_add(n, radix, inout, tmp, inout);
+    }
+    return prc;
+}
+
+
+/* -------- mp_io routines -------- */
+
+/* Formats in[] as "[-]d.ddd...e<exp>" into out; the final sprintf NUL-terminates it. out is not bounds-checked: it must hold up to n * log10_radix digits plus sign, point, 'e' and the exponent. */
+void mp_sprintf(int n, int log10_radix, int in[], char out[])
+{
+    int j, k, x, y, outexp, shift;
+    /* in[0] = sign, in[1] = exponent in radix units, in[2..n+1] = digits of log10_radix decimals each */
+    if (in[0] < 0) {
+        *out++ = '-';
+    }
+    x = in[2];
+    shift = log10_radix;
+    for (k = log10_radix; k > 0; k--) {    /* decimals of the leading word go to out[1..log10_radix], last one first */
+        y = x % 10;
+        x /= 10;
+        out[k] = '0' + y;
+        if (y != 0) {
+            shift = k;    /* ends up at the most significant non-zero decimal */
+        }
+    }
+    out[0] = out[shift];
+    out[1] = '.';
+    for (k = 1; k <= log10_radix - shift; k++) {    /* move the remaining decimals of the leading word behind the point */
+        out[k + 1] = out[k + shift];
+    }
+    outexp = log10_radix - shift;
+    out += outexp + 2;
+    for (j = 3; j <= n + 1; j++) {    /* every further word contributes exactly log10_radix decimals */
+        x = in[j];
+        for (k = log10_radix - 1; k >= 0; k--) {
+            y = x % 10;
+            x /= 10;
+            out[k] = '0' + y;
+        }
+        out += log10_radix;
+    }
+    *out++ = 'e';
+    outexp += log10_radix * in[1];    /* decimal exponent of the first printed digit */
+    sprintf(out, "%d", outexp);
+}
+
+/* Parses decimal text in[] (optional sign, digits, optional '.', optional exponent after e/E/d/D, blanks tolerated) into out[0] = sign, out[1] = exponent in radix units, out[2..n+1] = words of log10_radix decimals. */
+void mp_sscanf(int n, int log10_radix, char in[], int out[])
+{
+    char *s;
+    int j, x, outexp, outexp_mod;
+    /* skip leading blanks, then consume an optional sign */
+    while (*in == ' ') {
+        in++;
+    }
+    out[0] = 1;
+    if (*in == '-') {
+        out[0] = -1;
+        in++;
+    } else if (*in == '+') {
+        in++;
+    }
+    while (*in == ' ' || *in == '0') {    /* drop leading zeros */
+        in++;
+    }
+    outexp = 0;
+    for (s = in; *s != '\0'; s++) {    /* look ahead for an explicit exponent */
+        if (*s == 'e' || *s == 'E' || *s == 'd' || *s == 'D') {
+            if (sscanf(++s, "%d", &outexp) != 1) {
+                outexp = 0;
+            }
+            break;
+        }
+    }
+    if (*in == '.') {    /* value below 1: each zero right after the point lowers the exponent */
+        do {
+            outexp--;
+            while (*++in == ' ');
+        } while (*in == '0' && *in != '\0');
+    } else if (*in != '\0') {    /* each integer digit after the first raises the exponent */
+        s = in;
+        while (*++s == ' ');
+        while (*s >= '0' && *s <= '9' && *s != '\0') {
+            outexp++;
+            while (*++s == ' ');
+        }
+    }
+    x = outexp / log10_radix;    /* split the decimal exponent into radix units and a non-negative remainder */
+    outexp_mod = outexp - log10_radix * x;
+    if (outexp_mod < 0) {
+        x--;
+        outexp_mod += log10_radix;
+    }
+    out[1] = x;
+    x = 0;
+    j = 2;
+    for (s = in; *s != '\0'; s++) {    /* pack the decimals into words; the first word takes outexp_mod + 1 of them */
+        if (*s == '.' || *s == ' ') {
+            continue;
+        }
+        if (*s < '0' || *s > '9') {
+            break;
+        }
+        x = 10 * x + (*s - '0');
+        if (--outexp_mod < 0) {
+            if (j > n + 1) {    /* out[] is full: ignore the remaining decimals */
+                break;
+            }
+            out[j++] = x;
+            x = 0;
+            outexp_mod = log10_radix - 1;
+        }
+    }
+    while (outexp_mod-- >= 0) {    /* pad the partially filled word with zero decimals */
+        x *= 10;
+    }
+    while (j <= n + 1) {    /* store that word, then zero-fill the rest */
+        out[j++] = x;
+        x = 0;
+    }
+    if (out[2] == 0) {    /* leading word zero: store the value as zero */
+        out[0] = 0;
+        out[1] = 0;
+    }
+}
+
+/* Writes in[] to fout as "[-]d.ddd...e<exp>" one character at a time with putc; the local out[] is only scratch space for one word of decimals and for the exponent text. */
+void mp_fprintf(int n, int log10_radix, int in[], FILE *fout)
+{
+    int j, k, x, y, outexp, shift;
+    char out[256];
+    /* in[0] = sign, in[1] = exponent in radix units, in[2..n+1] = digits of log10_radix decimals each */
+    if (in[0] < 0) {
+        putc('-', fout);
+    }
+    x = in[2];
+    shift = log10_radix;
+    for (k = log10_radix; k > 0; k--) {    /* decimals of the leading word go to out[1..log10_radix], last one first */
+        y = x % 10;
+        x /= 10;
+        out[k] = '0' + y;
+        if (y != 0) {
+            shift = k;    /* ends up at the most significant non-zero decimal */
+        }
+    }
+    putc(out[shift], fout);
+    putc('.', fout);
+    for (k = 1; k <= log10_radix - shift; k++) {    /* remaining decimals of the leading word */
+        putc(out[k + shift], fout);
+    }
+    outexp = log10_radix - shift;
+    for (j = 3; j <= n + 1; j++) {    /* every further word contributes exactly log10_radix decimals */
+        x = in[j];
+        for (k = log10_radix - 1; k >= 0; k--) {
+            y = x % 10;
+            x /= 10;
+            out[k] = '0' + y;
+        }
+        for (k = 0; k < log10_radix; k++) {
+            putc(out[k], fout);
+        }
+    }
+    putc('e', fout);
+    outexp += log10_radix * in[1];    /* decimal exponent of the first printed digit */
+    sprintf(out, "%d", outexp);
+    for (k = 0; out[k] != '\0'; k++) {
+        putc(out[k], fout);
+    }
+}
+
+
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile b/plugins/supereq/nsfft-1.00/simd/Makefile
new file mode 120000
index 00000000..fc484116
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile
@@ -0,0 +1 @@
+Makefile.x86
+\ No newline at end of file
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.altivec b/plugins/supereq/nsfft-1.00/simd/Makefile.altivec
new file mode 100644
index 00000000..eeaed6a1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.altivec
@@ -0,0 +1,26 @@
+CC=gcc
+BASEOPT=-Wall -maltivec -mabi=altivec
+OPT=$(BASEOPT) -O3
+# Default goal: libSIMD.a, archived from SIMDBase.o plus the pure-C (float, double, long double) and AltiVec float objects.
+all : libSIMD.a
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+SIMDBaseUndiff_altivecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_ALTIVEC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_altivecfloat.o
+
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_ALTIVEC_FLOAT SIMDBase.c -c -o SIMDBase.o
+
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_altivecfloat.o
+	rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_altivecfloat.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.neon b/plugins/supereq/nsfft-1.00/simd/Makefile.neon
new file mode 100644
index 00000000..ace704f1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.neon
@@ -0,0 +1,26 @@
+CC=gcc
+BASEOPT=-Wall -mfloat-abi=softfp
+OPT=$(BASEOPT) -O3
+# Default goal: libSIMD.a, archived from SIMDBase.o plus the pure-C objects and the NEON float object (the only one built with -mfpu=neon).
+all : libSIMD.a
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+SIMDBaseUndiff_neonfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -mfpu=neon -DENABLE_NEON_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_neonfloat.o
+
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_NEON_FLOAT SIMDBase.c -c -o SIMDBase.o
+
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_neonfloat.o
+	rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_neonfloat.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.purec b/plugins/supereq/nsfft-1.00/simd/Makefile.purec
new file mode 100644
index 00000000..2c8b04f1
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.purec
@@ -0,0 +1,35 @@
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+# Default goal: libDFT.a, archived from the pure-C DFT objects, DFT.o, SIMDBase.o and the pure-C SIMDBaseUndiff objects.
+all : libDFT.a
+
+DFTpurecfloat.o : DFTUndiff.c DFT.h SIMDBase.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o
+
+DFTpurecdouble.o : DFTUndiff.c DFT.h SIMDBase.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o
+
+DFTpureclongdouble.o : DFTUndiff.c DFT.h SIMDBase.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c DFT.h SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE SIMDBase.c -c -o SIMDBase.o
+
+DFT.o : DFT.c DFT.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE DFT.c -c -o DFT.o
+
+libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o
+	rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o
+
+clean :
+	rm -f *~ *.o *.s *.a
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.x86 b/plugins/supereq/nsfft-1.00/simd/Makefile.x86
new file mode 100644
index 00000000..02f49610
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.x86
@@ -0,0 +1,35 @@
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+# Default goal: libSIMD.a with the pure-C, SSE float and SSE2 double objects; the AVX rules below are not added to the archive.
+all : libSIMD.a
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBase_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_purecdouble.o
+
+SIMDBase_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBase_pureclongdouble.o
+
+SIMDBase_ssefloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse -DENABLE_SSE_FLOAT SIMDBaseUndiff.c -c -o SIMDBase_ssefloat.o
+
+SIMDBase_sse2double.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_sse2double.o
+
+SIMDBase_avxfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT SIMDBaseUndiff.c -c -o SIMDBase_avxfloat.o
+
+SIMDBase_avxdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_avxdouble.o
+
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE SIMDBase.c -c -o SIMDBase.o
+
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBase_purecdouble.o SIMDBase_pureclongdouble.o SIMDBase_ssefloat.o SIMDBase_sse2double.o
+	rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBase_purecdouble.o SIMDBase_pureclongdouble.o SIMDBase_ssefloat.o SIMDBase_sse2double.o
+
+clean :
+	rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx b/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx
new file mode 100644
index 00000000..d9d27a2e
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx
@@ -0,0 +1,35 @@
+CC=gcc
+BASEOPT=-Wall
+OPT=$(BASEOPT) -O3
+# Default goal: libSIMD.a with the pure-C, SSE float, SSE2 double and AVX (float and double) objects.
+all : libSIMD.a
+
+SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o
+
+SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o
+
+SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o
+
+SIMDBaseUndiff_ssefloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse -DENABLE_SSE_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_ssefloat.o
+
+SIMDBaseUndiff_sse2double.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_sse2double.o
+
+SIMDBaseUndiff_avxfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_avxfloat.o
+
+SIMDBaseUndiff_avxdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h
+	$(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_avxdouble.o
+
+SIMDBase.o : SIMDBase.c SIMDBase.h
+	$(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE -DENABLE_AVX_FLOAT -DENABLE_AVX_DOUBLE SIMDBase.c -c -o SIMDBase.o
+
+libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_ssefloat.o SIMDBaseUndiff_sse2double.o SIMDBaseUndiff_avxfloat.o SIMDBaseUndiff_avxdouble.o
+	rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_ssefloat.o SIMDBaseUndiff_sse2double.o SIMDBaseUndiff_avxfloat.o SIMDBaseUndiff_avxdouble.o
+
+clean :
+	rm -f *~ *.o *.s *.a a.out
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBase.c b/plugins/supereq/nsfft-1.00/simd/SIMDBase.c
new file mode 100644
index 00000000..eb51ee10
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBase.c
@@ -0,0 +1,454 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <string.h>
+
+#include "SIMDBase.h"
+
+void detect_purec_float(void);
+void detect_purec_double(void);
+void detect_purec_longdouble(void);
+void detect_sse_float(void);
+void detect_sse2_double(void);
+void detect_neon_float(void);
+void detect_avx_float(void);
+void detect_avx_double(void);
+void detect_altivec_float(void);
+
+int32_t getModeParamInt_purec_float(int32_t paramId);
+int32_t getModeParamInt_purec_double(int32_t paramId);
+int32_t getModeParamInt_purec_longdouble(int32_t paramId);
+int32_t getModeParamInt_sse_float(int32_t paramId);
+int32_t getModeParamInt_sse2_double(int32_t paramId);
+int32_t getModeParamInt_neon_float(int32_t paramId);
+int32_t getModeParamInt_avx_float(int32_t paramId);
+int32_t getModeParamInt_avx_double(int32_t paramId);
+int32_t getModeParamInt_altivec_float(int32_t paramId);
+
+char * getModeParamString_purec_float(int32_t paramId);
+char * getModeParamString_purec_double(int32_t paramId);
+char * getModeParamString_purec_longdouble(int32_t paramId);
+char * getModeParamString_sse_float(int32_t paramId);
+char * getModeParamString_sse2_double(int32_t paramId);
+char * getModeParamString_neon_float(int32_t paramId);
+char * getModeParamString_avx_float(int32_t paramId);
+char * getModeParamString_avx_double(int32_t paramId);
+char * getModeParamString_altivec_float(int32_t paramId);
+
+uint8_t detectBuffer[256];
+char SIMDBase_processorNameString[256];
+/* Returns the position in str1 just past the prefix str2, or NULL when str1 does not begin with str2. */
+static char *startsWith(char *str1, char *str2) {
+  size_t prefixLen = strlen(str2);
+  if (strncmp(str1, str2, prefixLen) != 0) {
+    return NULL;
+  }
+  return str1 + prefixLen;
+}
+/* Returns the text after ": " on the first /proc/cpuinfo line that starts with entry, or NULL. Only the first 100 lines are read; the result points into SIMDBase_processorNameString. Non-Linux builds always return NULL. */
+#if defined(__linux__)
+static char *tryReadingProcCpuinfo(char *entry) {
+  int i;
+
+  FILE *fp = fopen("/proc/cpuinfo", "r");
+  if (fp == NULL) return NULL;
+
+  for(i=0;i<100;i++) {
+    char *q;
+    memset(SIMDBase_processorNameString, 0, sizeof(SIMDBase_processorNameString));
+    if (fgets(SIMDBase_processorNameString, 255, fp) == NULL) break;
+    /* memset() is declared by <string.h>, which this file includes; the legacy bzero() lives in <strings.h> */
+    if ((q = startsWith(SIMDBase_processorNameString, entry)) != NULL) {
+      int j;
+      fclose(fp);
+      /* turn the newline kept by fgets into a blank, then look for the ':' separator */
+      for(j=0;j<256;j++) {
+        if (SIMDBase_processorNameString[j] == '\n') SIMDBase_processorNameString[j] = ' ';
+      }
+      while(*q != '\0' && *q != ':' && q - SIMDBase_processorNameString < 200) q++;
+      if (q - SIMDBase_processorNameString >= 200) return NULL;
+      if (*q == ':' && *(q+1) == ' ') return q + 2;
+      return NULL;
+    }
+  }
+  /* entry not found within the lines read */
+  fclose(fp);
+  return NULL;
+}
+#else
+static char *tryReadingProcCpuinfo(char *entry) { return NULL; }
+#endif
+/* i386 CPUID wrapper: executes CPUID with EAX = eax and ECX = ecx and stores the resulting EAX, EBX, ECX, EDX in out[0..3]. */
+#if defined(__i386__)
+static void SIMDBase_x86cpuid(uint32_t out[4], uint32_t eax, uint32_t ecx) {
+  uint32_t a, b, c, d;
+  /*
+   * All results are returned through register operands.  Storing them
+   * through "=m" operands after pushing registers is unsafe: a memory
+   * operand may be addressed relative to %esp, and every push moves
+   * %esp, so such a store can land 16 bytes away from its variable.
+   *
+   * %ebx may be reserved by the compiler (it holds the GOT pointer in
+   * PIC code), so it is parked in %esi while CPUID runs and swapped
+   * back afterwards; the EBX result therefore comes out in %esi.
+   */
+  __asm__ __volatile__("movl %%ebx, %%esi   \n\t"
+                       "cpuid               \n\t"
+                       "xchgl %%ebx, %%esi  \n\t"
+                       : "=a"(a), "=S"(b), "=c"(c), "=d"(d)
+                       : "0"(eax), "2"(ecx)
+                       : "cc");
+  out[0] = a; out[1] = b; out[2] = c; out[3] = d;
+}
+#endif
+/* x86-64 CPUID wrapper: executes CPUID with EAX = eax and ECX = ecx and stores the resulting EAX, EBX, ECX, EDX in out[0..3]. */
+#if defined(__x86_64__)
+static void SIMDBase_x86cpuid(uint32_t out[4], uint32_t eax, uint32_t ecx) {
+  uint32_t a, b, c, d;
+  __asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
+  out[0] = a; out[1] = b; out[2] = c; out[3] = d;
+}
+#endif
+/* Fills *p with the cache line size and, per cache level (index = level - 1), the size and associativity read from CPUID; levels 1 and 2 get fallback values when nothing was reported. */
+#if defined(__i386__) || defined(__x86_64__)
+static void getCacheParam(CacheParam *p) {
+  static int l2assoc[] = {0,1,2,0,4,0,8,0,16,0,32,48,64,96,128,-1};
+  int32_t i;
+  uint32_t out[4];
+  /* start from "nothing known" for all eight slots */
+  for(i=0;i<8;i++) {
+    p->size[i] = p->assoc[i] = 0;
+  }
+  /* CPUID leaf 4, sub-leaf 0: a non-zero type field means the leaf enumerates caches */
+  SIMDBase_x86cpuid(out, 4, 0);
+  /* otherwise fall back to the extended leaves 0x80000005 / 0x80000006 / 0x80000008 */
+  if ((out[0] & 0xf) != 0) {
+    p->linesize = ((out[1] >> 0) & 2047)+1;
+    for(i=0;i<8;i++) {
+      SIMDBase_x86cpuid(out, 4, i);
+      if ((out[0] & 0xf) == 0) break;
+      int level = (out[0] >> 5) & 0x7;
+      int type  = (out[0] >> 0) & 0xf;
+      int assoc = ((out[1] >> 22) & 1023)+1;
+      int part  = ((out[1] >> 12) & 1023)+1;
+      int lsize = ((out[1] >> 0) & 2047)+1;
+      int nsets = ((out[2] >> 0))+1;
+      int nthre = ((out[0] >> 14) & 1023)+1;
+      /* only types 1 and 3 are recorded; level 0 is skipped because level - 1 would index before the arrays */
+      if ((type != 1 && type != 3) || level < 1) continue;
+      p->assoc[level-1] = assoc;
+      p->size[level-1] = (uint64_t)assoc * part * lsize * nsets / nthre;
+    }
+  } else {
+    SIMDBase_x86cpuid(out, 0x80000008U, 0);
+    int ncores = (out[2] & 0xff) + 1;
+    /* leaf 0x80000005: first-level cache, size divided by ncores */
+    SIMDBase_x86cpuid(out, 0x80000005U, 0);
+    p->linesize = out[2] & 255;
+    p->size[0] = (out[2] >> 24) * 1024 / ncores;
+    p->assoc[0] = (out[2] >> 16) & 0xff;
+    /* leaf 0x80000006: second and third level; associativity is decoded through l2assoc[] */
+    SIMDBase_x86cpuid(out, 0x80000006U, 0);
+    p->size[1] = (out[2] >> 16) * 1024 / ncores;
+    p->assoc[1] = l2assoc[(out[2] >> 12) & 0xf];
+    p->size[2] = (out[3] >> 18) * 512 * 1024 / ncores;
+    p->assoc[2] = l2assoc[(out[3] >> 12) & 0xf];
+  }
+  /* fallback when no level-1 size was found: 16 KiB, 4-way */
+  if (p->size[0] == 0) {
+    p->size[0] = 16 * 1024;
+    p->assoc[0] = 4;
+  }
+  /* fallback when no level-2 size was found: 256 KiB, 4-way */
+  if (p->size[1] == 0) {
+    p->size[1] = 256 * 1024;
+    p->assoc[1] = 4;
+  }
+}
+/* x86 build: assembles "<12-byte vendor id> <48-byte brand string>\n" from CPUID in SIMDBase_processorNameString and returns that buffer. */
+char *SIMDBase_getProcessorNameString() {
+  union {
+    uint32_t info[4];
+    uint8_t str[16];
+  } u;
+  int i,j;
+  char *p;
+
+  p = SIMDBase_processorNameString;
+  /* leaf 0: the vendor id is spread over EBX, EDX, ECX, in that order */
+  SIMDBase_x86cpuid(u.info, 0, 0);
+
+  for(i=0;i<4;i++) *p++ = u.str[i+4];
+  for(i=0;i<4;i++) *p++ = u.str[i+12];
+  for(i=0;i<4;i++) *p++ = u.str[i+8];
+
+  *p++ = ' ';
+  /* leaves 0x80000002..0x80000004 each supply 16 bytes of the brand string */
+  for(i=0;i<3;i++) {
+    SIMDBase_x86cpuid(u.info, i + 0x80000002, 0);
+
+    for(j=0;j<16;j++) {
+      *p++ = u.str[j];
+    }
+  }
+
+  *p++ = '\n';
+  *p = '\0';  /* terminate explicitly instead of relying on the buffer still being zero-filled */
+  return SIMDBase_processorNameString;
+}
+#else
+char *SIMDBase_getProcessorNameString() { /* non-x86 build: name taken from /proc/cpuinfo when possible, else a fixed fallback */
+  char *p = "Unknown"; /* returned as is when neither __powerpc__ nor __arm__ is defined */
+#if defined(__powerpc__)
+  if ((p = tryReadingProcCpuinfo("cpu")) == NULL) p = "PowerPC";
+#elif defined(__arm__)
+  if ((p = tryReadingProcCpuinfo("Processor")) == NULL) p = "ARM";
+#endif
+  /* p is either a string literal or a pointer returned by tryReadingProcCpuinfo */
+  return p;
+}
+#endif
+/* Cache line size in bytes: taken from getCacheParam on x86, otherwise a fixed 64. */
+int32_t SIMDBase_sizeOfCachelineInByte() {
+#if !(defined(__i386__) || defined(__x86_64__))
+  return 64;
+#else
+  CacheParam cache;
+  getCacheParam(&cache);
+  return cache.linesize;
+#endif
+}
+/* Data cache size in bytes: L2 + L3 as filled in by getCacheParam on x86, otherwise a fixed 256 KiB. */
+int32_t SIMDBase_sizeOfDataCacheInByte() {
+#if !(defined(__i386__) || defined(__x86_64__))
+  return 256 * 1024;
+#else
+  CacheParam cache;
+  getCacheParam(&cache);
+  return cache.size[1] + cache.size[2]; /* slots 1 and 2 hold L2 and L3 */
+#endif
+}
+/* Jump target for the SIGILL probe. sigsetjmp(..., 1) / siglongjmp restore the signal mask, so SIGILL is not left blocked after a failed probe. */
+static sigjmp_buf sigjmp;
+/* SIGILL handler used only while probing: abandons the faulting instruction and resumes in SIMDBase_detect. */
+static void sighandler(int signum) {
+  siglongjmp(sigjmp, 1);
+}
+/* Reports whether mode paramId is usable: 1 = yes, 0 = compiled in but rejected by this CPU, -1 = not compiled in or unknown id. */
+int32_t SIMDBase_detect(int32_t paramId) {
+#if defined(__i386__) || defined(__x86_64__)
+  uint32_t reg[4];
+#endif
+  /* NOTE(review): the AVX cases test only the CPUID feature bit; OS support for the YMM state is not checked here. */
+  switch(paramId) {
+  case SIMDBase_MODE_PUREC_FLOAT:
+#if defined(ENABLE_PUREC_FLOAT)
+    return 1;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_PUREC_DOUBLE:
+#if defined(ENABLE_PUREC_DOUBLE)
+    return 1;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_PUREC_LONGDOUBLE:
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+    return 1;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_SSE_FLOAT:
+#if defined(ENABLE_SSE_FLOAT)
+    SIMDBase_x86cpuid(reg, 1, 0);
+    return (reg[3] & (1 << 25)) != 0;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_SSE2_DOUBLE:
+#if defined(ENABLE_SSE2_DOUBLE)
+    SIMDBase_x86cpuid(reg, 1, 0);
+    return (reg[3] & (1 << 26)) != 0;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_AVX_FLOAT:
+#if defined(ENABLE_AVX_FLOAT)
+    SIMDBase_x86cpuid(reg, 1, 0);
+    return (reg[2] & (1 << 28)) != 0;
+#else
+    return -1;
+#endif
+  case SIMDBase_MODE_AVX_DOUBLE:
+#if defined(ENABLE_AVX_DOUBLE)
+    SIMDBase_x86cpuid(reg, 1, 0);
+    return (reg[2] & (1 << 28)) != 0;
+#else
+    return -1;
+#endif
+  default:
+    break;
+  }
+  /* remaining modes are probed by running a detect_* routine and catching SIGILL */
+  signal(SIGILL, sighandler);
+  /* a non-zero return from sigsetjmp means the handler fired, i.e. the probe raised SIGILL */
+  if (sigsetjmp(sigjmp, 1) == 0) {
+    switch(paramId) {
+#if defined(ENABLE_NEON_FLOAT)
+    case SIMDBase_MODE_NEON_FLOAT:
+      detect_neon_float();
+      break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+    case SIMDBase_MODE_ALTIVEC_FLOAT:
+      detect_altivec_float();
+      break;
+#endif
+    default:
+      signal(SIGILL, SIG_DFL);
+      return -1;
+    }
+    signal(SIGILL, SIG_DFL);
+    return 1;
+  } else {
+    signal(SIGILL, SIG_DFL);
+    return 0;
+  }
+}
+
+int32_t SIMDBase_chooseBestMode(int32_t typeId) {
+  /* Candidate modes per element type, listed in the order they are tried. */
+  static const int32_t floatModes[] = {
+    SIMDBase_MODE_AVX_FLOAT, SIMDBase_MODE_SSE_FLOAT, SIMDBase_MODE_NEON_FLOAT,
+    SIMDBase_MODE_ALTIVEC_FLOAT, SIMDBase_MODE_PUREC_FLOAT
+  };
+  static const int32_t doubleModes[] = {
+    SIMDBase_MODE_AVX_DOUBLE, SIMDBase_MODE_SSE2_DOUBLE, SIMDBase_MODE_PUREC_DOUBLE
+  };
+  static const int32_t longDoubleModes[] = { SIMDBase_MODE_PUREC_LONGDOUBLE };
+  const int32_t *candidates = NULL;
+  int count = 0, k;
+
+  if (typeId == SIMDBase_TYPE_FLOAT) {
+    candidates = floatModes;
+    count = (int)(sizeof(floatModes) / sizeof(floatModes[0]));
+  } else if (typeId == SIMDBase_TYPE_DOUBLE) {
+    candidates = doubleModes;
+    count = (int)(sizeof(doubleModes) / sizeof(doubleModes[0]));
+  } else if (typeId == SIMDBase_TYPE_LONGDOUBLE) {
+    candidates = longDoubleModes;
+    count = (int)(sizeof(longDoubleModes) / sizeof(longDoubleModes[0]));
+  }
+  /* HALF, EXTENDED, QUAD and unknown ids have no candidates and end up as SIMDBase_MODE_NONE. */
+
+  for (k = 0; k < count; k++) {
+    if (SIMDBase_detect(candidates[k]) == 1) return candidates[k];
+  }
+
+  return SIMDBase_MODE_NONE;
+}
+
+int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode) { /* forwards to the backend compiled for mode */
+  switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+  case SIMDBase_MODE_PUREC_FLOAT: return getModeParamInt_purec_float(paramId);
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case SIMDBase_MODE_PUREC_DOUBLE: return getModeParamInt_purec_double(paramId);
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case SIMDBase_MODE_PUREC_LONGDOUBLE: return getModeParamInt_purec_longdouble(paramId);
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case SIMDBase_MODE_SSE_FLOAT: return getModeParamInt_sse_float(paramId);
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case SIMDBase_MODE_SSE2_DOUBLE: return getModeParamInt_sse2_double(paramId);
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case SIMDBase_MODE_NEON_FLOAT: return getModeParamInt_neon_float(paramId);
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case SIMDBase_MODE_AVX_FLOAT: return getModeParamInt_avx_float(paramId);
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case SIMDBase_MODE_AVX_DOUBLE: return getModeParamInt_avx_double(paramId);
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case SIMDBase_MODE_ALTIVEC_FLOAT: return getModeParamInt_altivec_float(paramId);
+#endif
+  }
+  /* mode is unknown or its backend was not compiled in */
+  return -1;
+}
+
+char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode) { /* forwards to the backend compiled for mode */
+  switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+  case SIMDBase_MODE_PUREC_FLOAT: return getModeParamString_purec_float(paramId);
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+  case SIMDBase_MODE_PUREC_DOUBLE: return getModeParamString_purec_double(paramId);
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+  case SIMDBase_MODE_PUREC_LONGDOUBLE: return getModeParamString_purec_longdouble(paramId);
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+  case SIMDBase_MODE_SSE_FLOAT: return getModeParamString_sse_float(paramId);
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+  case SIMDBase_MODE_SSE2_DOUBLE: return getModeParamString_sse2_double(paramId);
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+  case SIMDBase_MODE_NEON_FLOAT: return getModeParamString_neon_float(paramId);
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+  case SIMDBase_MODE_AVX_FLOAT: return getModeParamString_avx_float(paramId);
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+  case SIMDBase_MODE_AVX_DOUBLE: return getModeParamString_avx_double(paramId);
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+  case SIMDBase_MODE_ALTIVEC_FLOAT: return getModeParamString_altivec_float(paramId);
+#endif
+  }
+  /* mode is unknown or its backend was not compiled in */
+  return NULL;
+}
+
+#ifdef ANDROID /* shim for C libraries without posix_memalign: returns 0 on success, -1 on failure */
+int posix_memalign (void **memptr, size_t alignment, size_t size) {
+    extern void *memalign (size_t, size_t); /* from bionic's <malloc.h>; honours the alignment, result is free()-able */
+    return (*memptr = memalign (alignment, size)) != NULL ? 0 : -1;
+}
+#endif
+
+void *SIMDBase_alignedMalloc(uint64_t size) { /* size bytes aligned to the cache line size */
+  void *block;
+  if (posix_memalign(&block, SIMDBase_sizeOfCachelineInByte(), size) == 0) return block;
+  abort(); /* allocation failure is fatal */
+}
+
+void SIMDBase_alignedFree(void *ptr) { /* releases ptr by forwarding it to free() */
+  free(ptr);
+}
+
+int32_t SIMDBase_getParamInt(int32_t paramId) { /* mode-independent integer parameters */
+  if (paramId == SIMDBase_PARAMID_MODE_MAX) {
+    /* one past the highest mode id */
+    return SIMDBase_LAST_MODE + 1;
+  }
+
+  return -1; /* any other parameter id */
+}
+
+int32_t SIMDBase_getTypeParamInt(int32_t paramId, int32_t typeId) {
+  /* No per-type parameters are implemented: every query reports -1. */
+  (void)paramId;
+  (void)typeId;
+  return -1;
+}
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBase.h b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h
new file mode 100644
index 00000000..10cdeb81
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h
@@ -0,0 +1,53 @@
+#ifndef _SIMDBase_H_
+#define _SIMDBase_H_
+
+#include <stdint.h>
+/* Element-type ids (typeId arguments); each carries the tag 1 << 24. */
+#define SIMDBase_TYPE_FLOAT ( 1 | ( 1 << 24 ))
+#define SIMDBase_TYPE_DOUBLE ( 2 | ( 1 << 24 ))
+#define SIMDBase_TYPE_LONGDOUBLE ( 3 | ( 1 << 24 ))
+#define SIMDBase_TYPE_EXTENDED ( 4 | ( 1 << 24 ))
+#define SIMDBase_TYPE_QUAD ( 5 | ( 1 << 24 ))
+#define SIMDBase_TYPE_HALF ( 6 | ( 1 << 24 ))
+/* Mode ids, one per backend; SIMDBase_MODE_NONE is what SIMDBase_chooseBestMode returns when nothing is usable. */
+#define SIMDBase_MODE_NONE 0
+#define SIMDBase_MODE_PUREC_FLOAT 1
+#define SIMDBase_MODE_PUREC_DOUBLE 2
+#define SIMDBase_MODE_PUREC_LONGDOUBLE 3
+#define SIMDBase_MODE_SSE_FLOAT 4
+#define SIMDBase_MODE_SSE2_DOUBLE 5
+#define SIMDBase_MODE_NEON_FLOAT 6
+#define SIMDBase_MODE_AVX_FLOAT 7
+#define SIMDBase_MODE_AVX_DOUBLE 8
+#define SIMDBase_MODE_ALTIVEC_FLOAT 9
+
+#define SIMDBase_LAST_MODE SIMDBase_MODE_ALTIVEC_FLOAT
+/* Parameter ids (paramId arguments of the get*Param* functions); each carries the tag 2 << 24. */
+#define SIMDBase_PARAMID_MODE_MAX ( 1 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_TYPE_AVAILABILITY ( 2 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_SIZE_OF_REAL ( 3 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_SIZE_OF_VECT ( 4 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_VECTOR_LEN ( 5 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_MODE_AVAILABILITY ( 6 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_MODE_NAME ( 7 | ( 2 << 24 ))
+
+//
+
+typedef struct { /* cache description filled in by getCacheParam() */
+  uint32_t linesize; /* value returned by SIMDBase_sizeOfCachelineInByte() */
+  uint32_t size[8], assoc[8]; /* per-level values; [1] and [2] are treated as L2 and L3 sizes, so [0] is presumably L1; assoc presumably holds associativity -- confirm */
+} CacheParam;
+
+void *SIMDBase_alignedMalloc(uint64_t size); /* cache-line aligned allocation; aborts on failure */
+void SIMDBase_alignedFree(void *ptr); /* frees a block obtained from SIMDBase_alignedMalloc */
+int32_t SIMDBase_sizeOfCachelineInByte(void); /* (void): a real prototype, so stray arguments are diagnosed */
+int32_t SIMDBase_sizeOfDataCacheInByte(void); /* L2 + L3 on x86, a fixed value elsewhere */
+int32_t SIMDBase_chooseBestMode(int32_t typeId); /* first usable SIMDBase_MODE_* for the type, or SIMDBase_MODE_NONE */
+char *SIMDBase_getProcessorNameString(void);
+int32_t SIMDBase_detect(int32_t paramId); /* paramId is a SIMDBase_MODE_* id: 1 usable, 0 unsupported by the CPU, -1 not built or unknown */
+int32_t SIMDBase_getParamInt(int32_t paramId); /* -1 for unknown ids */
+int32_t SIMDBase_getTypeParamInt(int32_t paramId, int32_t typeId); /* currently always -1 */
+int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode); /* -1 if the mode is unknown or not built */
+char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode); /* NULL if the mode is unknown or not built */
+
+#endif
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c
new file mode 100644
index 00000000..257a5ff0
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "SIMDBase.h"
+#include "SIMDBaseUndiff.h"
+
+void SIMDBaseUndiff_DETECT() { /* one vector load/load/add/store on detectBuffer, using this backend's primitives */
+  extern uint8_t detectBuffer[256];
+  SIMDBase_VECT *lhs = (SIMDBase_VECT *)(detectBuffer + 0);
+  SIMDBase_VECT *rhs = (SIMDBase_VECT *)(detectBuffer + 64);
+  SIMDBase_VECT *dst = (SIMDBase_VECT *)(detectBuffer + 128);
+  SIMDBase_STOR(dst, SIMDBase_ADDm(lhs, rhs));
+}
+
+int32_t SIMDBaseUndiff_GETMODEPARAMINT(int32_t paramId) { /* integer parameters of this backend; -1 for unknown ids */
+  switch(paramId) {
+  case SIMDBase_PARAMID_SIZE_OF_REAL:
+    return sizeof(SIMDBase_REAL);
+  case SIMDBase_PARAMID_SIZE_OF_VECT:
+    return sizeof(SIMDBase_VECT);
+  case SIMDBase_PARAMID_VECTOR_LEN:
+    return SIMDBase_VECTLEN;
+  case SIMDBase_PARAMID_MODE_AVAILABILITY:
+    return SIMDBase_detect(SIMDBase_MODE); /* detect() expects this backend's mode id, not the parameter id */
+  }
+
+  return -1;
+}
+
+char * SIMDBaseUndiff_GETMODEPARAMSTRING(int32_t paramId) {
+  /* The backend name is the only string parameter. */
+  if (paramId == SIMDBase_PARAMID_MODE_NAME) {
+    return SIMDBase_NAME;
+  }
+
+  return NULL;
+}
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h
new file mode 100644
index 00000000..1af849a8
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h
@@ -0,0 +1,231 @@
+#ifndef _SIMDBaseUndiff_H_
+#define _SIMDBaseUndiff_H_
+
+#if defined(ENABLE_PUREC_FLOAT) ////////////////////////////////////////////
+/* Scalar float backend: a "vector" is a single float (SIMDBase_VECTLEN 1). */
+typedef float SIMDBase_REAL;
+typedef float SIMDBase_VECT;
+
+#define SIMDBase_MODE 1
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 1
+#define SIMDBase_NAME "Pure C float"
+#define SIMDBaseUndiff_DETECT detect_purec_float
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_float
+/* Primitives: load/store, broadcast (SET1, LOAD1) and element-wise add/sub/mul/negate; plain scalar operations here. */
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; }
+
+#elif defined(ENABLE_PUREC_DOUBLE) ////////////////////////////////////////////
+/* Scalar double backend: a "vector" is a single double (SIMDBase_VECTLEN 1). */
+typedef double SIMDBase_REAL;
+typedef double SIMDBase_VECT;
+
+#define SIMDBase_MODE 2
+#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE
+#define SIMDBase_VECTLEN 1
+#define SIMDBase_NAME "Pure C double"
+#define SIMDBaseUndiff_DETECT detect_purec_double
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_double
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_double
+/* Primitives: load/store, broadcast (SET1, LOAD1) and element-wise add/sub/mul/negate; plain scalar operations here. */
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; }
+
+#elif defined(ENABLE_PUREC_LONGDOUBLE) ////////////////////////////////////////////
+/* Scalar long double backend: a "vector" is a single long double (SIMDBase_VECTLEN 1). */
+typedef long double SIMDBase_REAL;
+typedef long double SIMDBase_VECT;
+
+#define SIMDBase_MODE 3
+#define SIMDBase_TYPE SIMDBase_TYPE_LONGDOUBLE
+#define SIMDBase_VECTLEN 1
+#define SIMDBase_NAME "Pure C long double"
+#define SIMDBaseUndiff_DETECT detect_purec_longdouble
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_longdouble
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_longdouble
+/* Primitives: load/store, broadcast (SET1, LOAD1) and element-wise add/sub/mul/negate; plain scalar operations here. */
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; }
+
+#elif defined(ENABLE_SSE_FLOAT) ////////////////////////////////////////////
+/* x86 SSE backend: a vector is one __m128 holding 4 floats (SIMDBase_VECTLEN 4). */
+#include <xmmintrin.h>
+
+typedef float SIMDBase_REAL;
+typedef __m128 SIMDBase_VECT;
+
+#define SIMDBase_MODE 4
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 4
+#define SIMDBase_NAME "x86 SSE float"
+#define SIMDBaseUndiff_DETECT detect_sse_float
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_sse_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_sse_float
+/* Primitives: LOAD/STOR use the aligned intrinsics (_mm_load_ps/_mm_store_ps); NEGi flips the sign bits by XOR with -0.0f. */
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm_load_ps((float *)p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm_store_ps((float *)p, u); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm_set1_ps(f); }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm_load1_ps(p); }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_add_ps(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_sub_ps(u, v); }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_mul_ps(u, v); }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm_xor_ps(u, _mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f)); }
+
+#elif defined(ENABLE_SSE2_DOUBLE) ////////////////////////////////////////////
+/* x86 SSE2 backend: a vector is one __m128d holding 2 doubles (SIMDBase_VECTLEN 2). */
+#include <emmintrin.h>
+
+typedef double SIMDBase_REAL;
+typedef __m128d SIMDBase_VECT;
+
+#define SIMDBase_MODE 5
+#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE
+#define SIMDBase_VECTLEN 2
+#define SIMDBase_NAME "x86 SSE2 double"
+#define SIMDBaseUndiff_DETECT detect_sse2_double
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_sse2_double
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_sse2_double
+/* Primitives: LOAD/STOR use the aligned intrinsics (_mm_load_pd/_mm_store_pd); NEGi flips the sign bits by XOR with -0.0. */
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm_load_pd((double *)p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm_store_pd((double *)p, u); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm_set1_pd(f); }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm_load1_pd(p); }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_add_pd(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_sub_pd(u, v); }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_mul_pd(u, v); }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm_xor_pd(u, _mm_set_pd(-0.0, -0.0)); }
+
+#elif defined(ENABLE_NEON_FLOAT) ////////////////////////////////////////////
+/* ARM NEON backend: a vector is one float32x4_t holding 4 floats (SIMDBase_VECTLEN 4). */
+#include <arm_neon.h>
+
+typedef float32_t SIMDBase_REAL;
+typedef float32x4_t SIMDBase_VECT;
+
+#define SIMDBase_MODE 6
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 4
+#define SIMDBase_NAME "ARM NEON float"
+#define SIMDBaseUndiff_DETECT detect_neon_float
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_neon_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_neon_float
+/* Primitives: NEGi flips the sign bits by XOR with 0x80000000 on the integer view of the vector. */
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return vld1q_f32((float32_t *)p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { vst1q_f32((float32_t *)p, u); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return vdupq_n_f32(f); }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return vdupq_n_f32(*p); }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return vaddq_f32(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return vsubq_f32(u, v); }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return vmulq_f32(u, v); }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { 
+  return vreinterpretq_f32_u32( veorq_u32(vreinterpretq_u32_f32(u), vdupq_n_u32(0x80000000U)));
+}
+
+#define SIMDBase_FMADD_AVAILABLE
+
+static inline SIMDBase_VECT SIMDBase_FMADDi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vmlaq_f32(w, u, v); } // w + u * v
+static inline SIMDBase_VECT SIMDBase_FMSUBi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vmlsq_f32(w, u, v); } // w - u * v
+
+#elif defined(ENABLE_AVX_FLOAT) ////////////////////////////////////////////
+/* x86 AVX backend: a vector is one __m256 holding 8 floats (SIMDBase_VECTLEN 8). */
+#include <immintrin.h>
+
+typedef float SIMDBase_REAL;
+typedef __m256 SIMDBase_VECT;
+
+#define SIMDBase_MODE 7
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 8
+#define SIMDBase_NAME "x86 AVX float"
+#define SIMDBaseUndiff_DETECT detect_avx_float
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_avx_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_avx_float
+/* Primitives: LOAD/STOR use the aligned intrinsics (_mm256_load_ps/_mm256_store_ps); NEGi flips the sign bits by XOR with -0.0f. */
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm256_load_ps((float *)p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm256_store_ps((float *)p, u); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm256_set1_ps(f); }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm256_set1_ps(*p); }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_add_ps(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_sub_ps(u, v); }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_mul_ps(u, v); }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm256_xor_ps(u, _mm256_set1_ps(-0.0f)); }
+
+#elif defined(ENABLE_AVX_DOUBLE) ////////////////////////////////////////////
+/* x86 AVX backend: a vector is one __m256d holding 4 doubles (SIMDBase_VECTLEN 4). */
+#include <immintrin.h>
+
+typedef double SIMDBase_REAL;
+typedef __m256d SIMDBase_VECT;
+
+#define SIMDBase_MODE 8
+#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE
+#define SIMDBase_VECTLEN 4
+#define SIMDBase_NAME "x86 AVX double"
+#define SIMDBaseUndiff_DETECT detect_avx_double
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_avx_double
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_avx_double
+/* Primitives: LOAD/STOR use the aligned intrinsics (_mm256_load_pd/_mm256_store_pd); NEGi flips the sign bits by XOR with -0.0. */
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm256_load_pd((double *)p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm256_store_pd((double *)p, u); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm256_set1_pd(f); }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm256_set1_pd(*p); }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_add_pd(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_sub_pd(u, v); }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_mul_pd(u, v); }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm256_xor_pd(u, _mm256_set1_pd(-0.0)); }
+
+#elif defined(ENABLE_ALTIVEC_FLOAT) ////////////////////////////////////////////
+/* PowerPC AltiVec backend: a vector is one `vector float` holding 4 floats (SIMDBase_VECTLEN 4). */
+#include <altivec.h>
+
+typedef float SIMDBase_REAL;
+typedef vector float SIMDBase_VECT;
+
+#define SIMDBase_MODE 9
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 4
+#define SIMDBase_NAME "PowerPC AltiVec float"
+#define SIMDBaseUndiff_DETECT detect_altivec_float
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_altivec_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_altivec_float
+/* Primitives: MULi is written as vec_madd(u, v, 0); NEGi flips the sign bits by XOR with -0.0f. */
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return vec_ld(0, p); }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { vec_st(u, 0, p); }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return (vector float){f, f, f, f}; }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return (vector float){*p, *p, *p, *p}; }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_add(u, v); }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_sub(u, v); }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_madd(u, v, (vector float){0, 0, 0, 0}); }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return vec_xor(u, (vector float){-0.0f, -0.0f, -0.0f, -0.0f}); }
+
+#define SIMDBase_FMADD_AVAILABLE
+
+static inline SIMDBase_VECT SIMDBase_FMADDi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vec_madd(u, v, w); } // w + u * v
+static inline SIMDBase_VECT SIMDBase_FMSUBi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vec_nmsub(u, v, w); } // w - u * v
+
+#endif ////////////////////////////////////////////////////////////////////
+/* Memory-operand forms shared by all backends: load both vectors through SIMDBase_LOAD, then add or subtract them. */
+static inline SIMDBase_VECT SIMDBase_ADDm(SIMDBase_VECT *p, SIMDBase_VECT *q) { return SIMDBase_ADDi(SIMDBase_LOAD(p), SIMDBase_LOAD(q)); }
+static inline SIMDBase_VECT SIMDBase_SUBm(SIMDBase_VECT *p, SIMDBase_VECT *q) { return SIMDBase_SUBi(SIMDBase_LOAD(p), SIMDBase_LOAD(q)); }
+
+#endif
diff --git a/plugins/supereq/paramlist.hpp b/plugins/supereq/paramlist.hpp
index 0c513b78..9c5b09c4 100644
--- a/plugins/supereq/paramlist.hpp
+++ b/plugins/supereq/paramlist.hpp
@@ -1,4 +1,22 @@
-//#include <iostream.h>
+/*
+    DeaDBeeF - ultimate music player for GNU/Linux systems with X11
+    Copyright (C) 2009-2011 Alexey Yakovenko <waker@users.sourceforge.net>
+    Original SuperEQ code (C) Naoki Shibata <shibatch@users.sf.net>
+
+    This program is free software; you can redistribute it and/or
+    modify it under the terms of the GNU General Public License
+    as published by the Free Software Foundation; either version 2
+    of the License, or (at your option) any later version.
+    
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+*/
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -7,12 +25,10 @@ class paramlistelm {
 public:
 	class paramlistelm *next;
 
-	char left,right;
 	float lower,upper,gain,gain2;
 	int sortindex;
 
 	paramlistelm(void) {
-		left = right = 1;
 		lower = upper = gain = 0;
 		next = NULL;
 	};
@@ -21,13 +37,6 @@ public:
 		delete next;
 		next = NULL;
 	};
-
-	char *getString(void) {
-		static char str[64];
-		sprintf(str,"%gHz to %gHz, %gdB %c%c",
-			(double)lower,(double)upper,(double)gain,left?'L':' ',right?'R':' ');
-		return str;
-	}
 };
 
 class paramlist {
@@ -52,8 +61,6 @@ public:
 		for(p=&elm,q=src.elm;q != NULL;q = q->next,p = &(*p)->next)
 		{
 			*p = new paramlistelm;
-			(*p)->left  = q->left;
-			(*p)->right = q->right;
 			(*p)->lower = q->lower;
 			(*p)->upper = q->upper;
 			(*p)->gain  = q->gain;
diff --git a/plugins/supereq/shibatch_rdft.c b/plugins/supereq/shibatch_rdft.c
new file mode 100644
index 00000000..db453eb8
--- /dev/null
+++ b/plugins/supereq/shibatch_rdft.c
@@ -0,0 +1,71 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdint.h>
+
+#include "SIMDBase.h"
+#include "DFT.h"
+
+#define TYPE SIMDBase_TYPE_FLOAT
+
+void rfft(int n,int isign,float *x) {
+    // Transforms the 2^n floats in x in place; isign is handed to DFT_execute unchanged.
+    // Calling with n == 0 releases the cached plan and scratch buffer instead of transforming.
+    static DFT *p = NULL;
+    static float *buf = NULL;
+    static int ipsize = 0;
+    static int mode = 0;
+    static int veclen = 0;
+    if (n == 0) {
+        if (buf) {
+            SIMDBase_alignedFree (buf);
+            buf = NULL;
+        }
+        if (p) {
+            DFT_dispose(p, mode);
+            p = NULL;
+        }
+        ipsize = 0; // forget the cached length, otherwise the next call with the same length would use the freed plan/buffer
+        return;
+    }
+    n = 1<<n;
+    if (n != ipsize) {
+        ipsize = n;
+
+        if (buf) {
+            SIMDBase_alignedFree (buf);
+            buf = NULL;
+        }
+
+        if (p) {
+            DFT_dispose(p, mode);
+            p = NULL;
+        }
+
+        buf = SIMDBase_alignedMalloc (n * sizeof (float));
+
+        mode = SIMDBase_chooseBestMode(TYPE);
+        veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode);
+        int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode);
+        printf ("n: %d, veclen: %d, sizeOfVect: %d\n", n, veclen, sizeOfVect);
+        p = DFT_init(mode, n/veclen, DFT_FLAG_REAL);
+    }
+
+    // store in simd order
+    int asize = n / veclen;
+    int i, j;
+    for(j=0;j<veclen;j++) {
+        for (i = 0; i < asize; i++) {
+            buf[i * veclen + j] = x[j * asize + i];
+        }
+    }
+
+    DFT_execute(p, mode, buf, isign);
+
+    // copy the result back into the caller's element order
+    for(j=0;j<veclen;j++) {
+        for (i = 0; i < asize; i++) {
+            x[j * asize + i] = buf[i * veclen + j];
+        }
+    }
+}
diff --git a/plugins/supereq/supereq.c b/plugins/supereq/supereq.c
index af4000fd..a773b4ef 100644
--- a/plugins/supereq/supereq.c
+++ b/plugins/supereq/supereq.c
@@ -1,6 +1,6 @@
 /*
     DeaDBeeF - ultimate music player for GNU/Linux systems with X11
-    Copyright (C) 2009-2010 Alexey Yakovenko <waker@users.sourceforge.net>
+    Copyright (C) 2009-2011 Alexey Yakovenko <waker@users.sourceforge.net>
 
     This program is free software; you can redistribute it and/or
     modify it under the terms of the GNU General Public License
@@ -18,217 +18,301 @@
 */
 #include <stdio.h>
 #include <string.h>
+#include <stdlib.h>
+#include <math.h>
 #include "../../deadbeef.h"
-#include "supereq.h"
+#include "Equ.h"
 
 static DB_functions_t *deadbeef;
-static DB_supereq_dsp_t plugin;
-
-void *paramlist_alloc (void);
-void paramlist_free (void *);
-void equ_makeTable(float *lbc,float *rbc,void *param,float fs);
-int equ_modifySamples(char *buf,int nsamples,int nch,int bps);
-void equ_clearbuf(int bps,int srate);
-void equ_init(int wb);
-void equ_quit(void);
-
-void supereq_reset (void);
-
-static float last_srate = 0;
-static int last_nch = 0, last_bps = 0;
-static float lbands[18] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
-static float rbands[18] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
-static float preamp = 1;
-static void *paramsroot;
-
-static int params_changed = 0;
-static intptr_t tid = 0;
-static uintptr_t mutex = 0;
-static int enabled = 0;
-
-static int
-supereq_on_configchanged (DB_event_t *ev, uintptr_t data) {
-    int e = deadbeef->conf_get_int ("supereq.enable", 0);
-    if (e != enabled) {
-        if (e) {
-            supereq_reset ();
-        }
-        enabled = e;
-    }
-    
-    return 0;
-}
+static DB_dsp_t plugin;
+
+typedef struct { // per-instance state of the SuperEQ DSP plugin
+    ddb_dsp_context_t ctx; // base DSP context; the plugin casts between ddb_dsp_context_t* and this struct
+    float last_srate; // samplerate last seen by supereq_process
+    int last_nch; // channel count last seen by supereq_process
+    float bands[18]; // per-band gains as linear amplitude factors (set_param converts from dB)
+    float preamp; // linear gain multiplied into every band by recalc_table
+    void *paramsroot; // parameter list from paramlist_alloc; replaced on every recalc_table
+    int params_changed; // raised by the band/preamp setters, consumed by supereq_process
+    uintptr_t mutex; // taken around bands/preamp/paramsroot updates and buffer clears
+    SuperEqState state; // equalizer engine state handed to the equ_* functions
+    int enabled; // last observed ctx.enabled, used to detect enable transitions
+} ddb_supereq_ctx_t;
+
+void supereq_reset (ddb_dsp_context_t *ctx);
 
 void
-recalc_table (void) {
+recalc_table (ddb_supereq_ctx_t *eq) { // rebuilds the EQ table from eq->bands scaled by eq->preamp
     void *params = paramlist_alloc ();
 
-    deadbeef->mutex_lock (mutex);
-    float lbands_copy[18];
-    float rbands_copy[18];
-    float srate = last_srate;
-    memcpy (lbands_copy, lbands, sizeof (lbands));
-    memcpy (rbands_copy, rbands, sizeof (rbands));
+    deadbeef->mutex_lock (eq->mutex); // snapshot the settings under the lock
+    float bands_copy[18];
+    float srate = eq->last_srate;
+    memcpy (bands_copy, eq->bands, sizeof (eq->bands));
     for (int i = 0; i < 18; i++) {
-        lbands_copy[i] *= preamp;
-        rbands_copy[i] *= preamp;
+        bands_copy[i] *= eq->preamp; // preamp is folded into each band gain
     }
-    deadbeef->mutex_unlock (mutex);
+    deadbeef->mutex_unlock (eq->mutex);
 
-    equ_makeTable (lbands_copy, rbands_copy, params, srate);
+    equ_makeTable (&eq->state, bands_copy, params, srate); // built from the snapshot, outside the lock
 
-    deadbeef->mutex_lock (mutex);
-    paramlist_free (paramsroot);
-    paramsroot = params;
-    deadbeef->mutex_unlock (mutex);
+    deadbeef->mutex_lock (eq->mutex);
+    paramlist_free (eq->paramsroot); // drop the previous parameter list
+    eq->paramsroot = params; // keep the new list for the next rebuild or close
+    deadbeef->mutex_unlock (eq->mutex);
 }
 
 int
 supereq_plugin_start (void) {
-    enabled = deadbeef->conf_get_int ("supereq.enable", 0);
-    // load bands from config
-    preamp = deadbeef->conf_get_float ("eq.preamp", 1);
-    for (int i = 0; i < 18; i++) {
-        char key[100];
-        snprintf (key, sizeof (key), "eq.band%d", i);
-        lbands[i] = rbands[i] = deadbeef->conf_get_float (key, 1);
-    }
-
-    equ_init (14);
-    paramsroot = paramlist_alloc ();
-    last_srate = 44100;
-    last_nch = 2;
-    last_bps = 16;
-    mutex = deadbeef->mutex_create ();
-    recalc_table ();
-    equ_clearbuf (last_bps,last_srate);
-    deadbeef->ev_subscribe (DB_PLUGIN (&plugin), DB_EV_CONFIGCHANGED, DB_CALLBACK (supereq_on_configchanged), 0);
     return 0;
 }
 
 int
 supereq_plugin_stop (void) {
-    deadbeef->ev_unsubscribe (DB_PLUGIN (&plugin), DB_EV_CONFIGCHANGED, DB_CALLBACK (supereq_on_configchanged), 0);
-    if (tid) {
-        deadbeef->thread_join (tid);
-        tid = 0;
-    }
-    if (mutex) {
-        deadbeef->mutex_free (mutex);
-        mutex = 0;
-    }
-    equ_quit ();
-    paramlist_free (paramsroot);
     return 0;
 }
 
-void
-supereq_regen_table_thread (void *param) {
-    recalc_table ();
-    tid = 0;
-}
-
 int
-supereq_process_int16 (int16_t *samples, int nsamples, int nch, int bps, int srate) {
-	if ((nch != 1 && nch != 2) || (bps != 8 && bps != 16 && bps != 24)) return nsamples;
-    if (params_changed && !tid) {
-        tid = deadbeef->thread_start (supereq_regen_table_thread, NULL);
-        params_changed = 0;
+supereq_process (ddb_dsp_context_t *ctx, float *samples, int frames, int maxframes, ddb_waveformat_t *fmt, float *r) { // equalizes `frames` frames in place; returns frames
+    ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx;
+    if (supereq->enabled != ctx->enabled) {
+        if (ctx->enabled && !supereq->enabled) {
+            supereq_reset (ctx); // clear stale buffers when switching from disabled to enabled
+        }
+        supereq->enabled = ctx->enabled;
+
+// this causes a glitch on 1st track
+//        DB_playItem_t *it = deadbeef->streamer_get_playing_track ();
+//        if (it) {
+//            float playpos = deadbeef->streamer_get_playpos ();
+//            deadbeef->streamer_seek (playpos);
+//            deadbeef->pl_item_unref (it);
+//        }
     }
-	if (last_srate != srate) {
-        deadbeef->mutex_lock (mutex);
-        //equ_makeTable (lbands, rbands, paramsroot, srate);
-		last_srate = srate;
-		last_nch = nch;
-		last_bps = bps;
-        recalc_table ();
-        deadbeef->mutex_unlock (mutex);
-		equ_clearbuf(bps,srate);
+    if (supereq->params_changed) {
+        supereq->params_changed = 0; // clear first: a setter firing during the rebuild re-raises the flag instead of being lost
+        recalc_table (supereq);
     }
-	else if (last_nch != nch || last_bps != bps) {
-        deadbeef->mutex_lock (mutex);
-		last_nch = nch;
-		last_bps = bps;
-        deadbeef->mutex_unlock (mutex);
-		equ_clearbuf(bps,srate);
+	if (supereq->last_srate != fmt->samplerate || supereq->last_nch != fmt->channels) { // format changed: reinitialize the engine
+        deadbeef->mutex_lock (supereq->mutex); // NOTE(review): recalc_table locks the same mutex again — confirm mutex_create returns a recursive mutex
+		supereq->last_srate = fmt->samplerate;
+		supereq->last_nch = fmt->channels;
+        equ_init (&supereq->state, 10, fmt->channels);
+        recalc_table (supereq);
+		equ_clearbuf(&supereq->state);
+        deadbeef->mutex_unlock (supereq->mutex);
     }
-	equ_modifySamples(samples,nsamples,nch,bps);
-	return nsamples;
+	equ_modifySamples_float(&supereq->state, (char *)samples,frames,fmt->channels);
+	return frames;
 }
 
 float
-supereq_get_band (int band) {
-    return lbands[band];
+supereq_get_band (ddb_dsp_context_t *ctx, int band) { // returns the stored linear gain of `band`; the index is not range-checked
+    ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx;
+    return supereq->bands[band]; // read without taking supereq->mutex
 }
 
 void
-supereq_set_band (int band, float value) {
-    deadbeef->mutex_lock (mutex);
-    lbands[band] = rbands[band] = value;
-    deadbeef->mutex_unlock (mutex);
-    params_changed = 1;
-    char key[100];
-    snprintf (key, sizeof (key), "eq.band%d", band);
-    deadbeef->conf_set_float (key, value);
+supereq_set_band (ddb_dsp_context_t *ctx, int band, float value) { // stores a linear gain for `band`; the index is not range-checked
+    ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx;
+    deadbeef->mutex_lock (supereq->mutex);
+    supereq->bands[band] = value;
+    deadbeef->mutex_unlock (supereq->mutex);
+    supereq->params_changed = 1; // written outside the lock; supereq_process rebuilds the table when it sees this
 }
 
 float
-supereq_get_preamp (void) {
-    return preamp;
+supereq_get_preamp (ddb_dsp_context_t *ctx) { // returns the stored linear preamp gain
+    ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx;
+    return supereq->preamp; // read without taking supereq->mutex
 }
 
 void
-supereq_set_preamp (float value) {
-    deadbeef->mutex_lock (mutex);
-    preamp = value;
-    deadbeef->mutex_unlock (mutex);
-    params_changed = 1;
-    deadbeef->conf_set_float ("eq.preamp", value);
+supereq_set_preamp (ddb_dsp_context_t *ctx, float value) { // stores a linear preamp gain
+    ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx;
+    deadbeef->mutex_lock (supereq->mutex);
+    supereq->preamp = value;
+    deadbeef->mutex_unlock (supereq->mutex);
+    supereq->params_changed = 1; // written outside the lock; supereq_process rebuilds the table when it sees this
 }
 
 void
-supereq_reset (void) {
-    deadbeef->mutex_lock (mutex);
-    equ_clearbuf(last_bps,last_srate);
-    deadbeef->mutex_unlock (mutex);
+supereq_reset (ddb_dsp_context_t *ctx) { // clears the equalizer engine buffers of this instance
+    ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx;
+    deadbeef->mutex_lock (supereq->mutex);
+    equ_clearbuf(&supereq->state); // done under the instance mutex
+    deadbeef->mutex_unlock (supereq->mutex);
+}
+
+int
+supereq_num_params (void) { // number of parameters exposed by the plugin
+    return 19; // preamp plus 18 bands, matching the bandnames table
+}
+
+static const char *bandnames[] = { // display names indexed by parameter number: 0 is the preamp, 1..18 are the bands
+    "Preamp",
+    "55 Hz",
+    "77 Hz",
+    "110 Hz",
+    "156 Hz",
+    "220 Hz",
+    "311 Hz",
+    "440 Hz",
+    "622 Hz",
+    "880 Hz",
+    "1.2 kHz",
+    "1.8 kHz",
+    "2.5 kHz",
+    "3.5 kHz",
+    "5 kHz",
+    "7 kHz",
+    "10 kHz",
+    "14 kHz",
+    "20 kHz"
+};
+
+const char *
+supereq_get_param_name (int p) { // display name of parameter p, or NULL when p is outside the bandnames table
+    return (p >= 0 && p < (int)(sizeof (bandnames) / sizeof (bandnames[0]))) ? bandnames[p] : NULL;
+}
+
+
+static inline float
+db_to_amp (float dB) { // converts decibels to a linear amplitude factor, i.e. 10^(dB/20)
+    const float ln10=2.3025850929940002f;
+    return exp(ln10*dB/20.f); // exp(ln(10) * dB / 20) == 10^(dB/20)
+}
+
+static inline float
+amp_to_db (float amp) { // converts a linear amplitude factor to decibels
+    return 20*log10 (amp); // inverse of db_to_amp
 }
 
 void
-supereq_enable (int e) {
-    if (e != enabled) {
-        deadbeef->conf_set_int ("supereq.enable", e);
-        if (e && !enabled) {
-            supereq_reset ();
-        }
-        enabled = e;
+supereq_set_param (ddb_dsp_context_t *ctx, int p, const char *val) { // sets parameter p from a decimal dB string
+    switch (p) {
+    case 0: // preamp
+        supereq_set_preamp (ctx, db_to_amp (atof (val)));
+        break;
+    case 1 ... 18: // GNU case-range extension; parameters 1..18 map to bands 0..17
+        supereq_set_band (ctx, p-1, db_to_amp (atof (val)));
+        break;
+    default: // unknown index: report it and change nothing
+        fprintf (stderr, "supereq_set_param: invalid param index (%d)\n", p);
     }
 }
 
-int
-supereq_enabled (void) {
-    return enabled;
-}
-
-static DB_supereq_dsp_t plugin = {
-    .dsp.plugin.api_vmajor = DB_API_VERSION_MAJOR,
-    .dsp.plugin.api_vminor = DB_API_VERSION_MINOR,
-    .dsp.plugin.type = DB_PLUGIN_DSP,
-    .dsp.plugin.id = "supereq",
-    .dsp.plugin.name = "SuperEQ",
-    .dsp.plugin.descr = "equalizer plugin using SuperEQ library by Naoki Shibata",
-    .dsp.plugin.author = "Alexey Yakovenko",
-    .dsp.plugin.email = "waker@users.sourceforge.net",
-    .dsp.plugin.website = "http://deadbeef.sf.net",
-    .dsp.plugin.start = supereq_plugin_start,
-    .dsp.plugin.stop = supereq_plugin_stop,
-    .dsp.process_int16 = supereq_process_int16,
-    .dsp.reset = supereq_reset,
-    .dsp.enable = supereq_enable,
-    .dsp.enabled = supereq_enabled,
-    .get_band = supereq_get_band,
-    .set_band = supereq_set_band,
-    .get_preamp = supereq_get_preamp,
-    .set_preamp = supereq_set_preamp,
+void
+supereq_get_param (ddb_dsp_context_t *ctx, int p, char *v, int sz) { // writes parameter p, in dB, as text into v (at most sz bytes)
+    switch (p) {
+    case 0: // preamp
+        snprintf (v, sz, "%f", amp_to_db (supereq_get_preamp (ctx)));
+        break;
+    case 1 ... 18: // GNU case-range extension; parameters 1..18 map to bands 0..17
+        snprintf (v, sz, "%f", amp_to_db (supereq_get_band (ctx, p-1)));
+        break;
+    default: // unknown index: report it; v is left unmodified
+        fprintf (stderr, "supereq_get_param: invalid param index (%d)\n", p);
+    }
+}
+
+
+ddb_dsp_context_t*
+supereq_open (void) { // creates a flat-EQ instance (44100 Hz, 2 channels); returns NULL if the allocation fails
+    ddb_supereq_ctx_t *supereq = malloc (sizeof (ddb_supereq_ctx_t));
+    if (!supereq) return NULL; // out of memory: nothing has been set up yet, so there is nothing to undo
+    DDB_INIT_DSP_CONTEXT (supereq,ddb_supereq_ctx_t,&plugin);
+
+    equ_init (&supereq->state, 10, 2);
+    supereq->paramsroot = paramlist_alloc ();
+    supereq->last_srate = 44100;
+    supereq->last_nch = 2;
+    supereq->mutex = deadbeef->mutex_create ();
+    supereq->preamp = 1; // unity gain
+    for (int i = 0; i < 18; i++) {
+        supereq->bands[i] = 1; // unity gain on every band
+    }
+    recalc_table (supereq); // locks the mutex and reads bands/preamp, so those must be set before this call
+    equ_clearbuf (&supereq->state);
+    return (ddb_dsp_context_t*)supereq;
+}
+
+void
+supereq_close (ddb_dsp_context_t *ctx) { // releases everything owned by an instance created by supereq_open
+    ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx;
+    if (supereq->mutex) {
+        deadbeef->mutex_free (supereq->mutex);
+        supereq->mutex = 0;
+    }
+    equ_quit (&supereq->state); // shut down the equalizer engine state
+    paramlist_free (supereq->paramsroot); // free the list installed by the last recalc_table
+    free (ctx); // ctx is the malloc'ed ddb_supereq_ctx_t itself
+}
+
+static const char settings_dlg[] = // dialog description exported as plugin.configdialog; trailing numbers 0..18 presumably are parameter indices — TODO confirm against the dialog parser
+    "property \"\" hbox[19] hmg fill expand border=0 spacing=8 height=200;\n"
+        "property \"Preamp\" vscale[20,-20,1] vert 0 0;\n"
+        "property \"55 Hz\" vscale[20,-20,1] vert 1 0;\n"
+        "property \"77 Hz\" vscale[20,-20,1] vert 2 0;\n"
+        "property \"110 Hz\" vscale[20,-20,1] vert 3 0;\n"
+        "property \"156 Hz\" vscale[20,-20,1] vert 4 0;\n"
+        "property \"220 Hz\" vscale[20,-20,1] vert 5 0;\n"
+        "property \"311 Hz\" vscale[20,-20,1] vert 6 0;\n"
+        "property \"440 Hz\" vscale[20,-20,1] vert 7 0;\n"
+        "property \"622 Hz\" vscale[20,-20,1] vert 8 0;\n"
+        "property \"880 Hz\" vscale[20,-20,1] vert 9 0;\n"
+        "property \"1.2 kHz\" vscale[20,-20,1] vert 10 0;\n"
+        "property \"1.8 kHz\" vscale[20,-20,1] vert 11 0;\n"
+        "property \"2.5 kHz\" vscale[20,-20,1] vert 12 0;\n"
+        "property \"3.5 kHz\" vscale[20,-20,1] vert 13 0;\n"
+        "property \"5 kHz\" vscale[20,-20,1] vert 14 0;\n"
+        "property \"7 kHz\" vscale[20,-20,1] vert 15 0;\n"
+        "property \"10 kHz\" vscale[20,-20,1] vert 16 0;\n"
+        "property \"14 kHz\" vscale[20,-20,1] vert 17 0;\n"
+        "property \"20 kHz\" vscale[20,-20,1] vert 18 0;\n"
+;
+
+static DB_dsp_t plugin = { // plugin descriptor wiring the supereq_* callbacks into the DSP interface
+    .plugin.api_vmajor = DB_API_VERSION_MAJOR,
+    .plugin.api_vminor = DB_API_VERSION_MINOR,
+    .plugin.version_major = 1,
+    .plugin.version_minor = 0,
+    .plugin.type = DB_PLUGIN_DSP,
+    .plugin.id = "supereq",
+    .plugin.name = "SuperEQ",
+    .plugin.descr = "equalizer plugin using SuperEQ library",
+    .plugin.copyright = 
+        "Copyright (C) 2009-2011 Alexey Yakovenko <waker@users.sourceforge.net>\n"
+        "\n"
+        "Uses supereq library by Naoki Shibata, http://shibatch.sourceforge.net\n"
+        "Uses FFT library by Takuya Ooura, http://www.kurims.kyoto-u.ac.jp/~ooura/\n"
+        "\n"
+        "This program is free software; you can redistribute it and/or\n"
+        "modify it under the terms of the GNU General Public License\n"
+        "as published by the Free Software Foundation; either version 2\n"
+        "of the License, or (at your option) any later version.\n"
+        "\n"
+        "This program is distributed in the hope that it will be useful,\n"
+        "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
+        "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
+        "GNU General Public License for more details.\n"
+        "\n"
+        "You should have received a copy of the GNU General Public License\n"
+        "along with this program; if not, write to the Free Software\n"
+        "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.\n"
+    ,
+    .plugin.website = "http://deadbeef.sf.net",
+    .plugin.start = supereq_plugin_start, // both start and stop just return 0; state is per instance
+    .plugin.stop = supereq_plugin_stop,
+    .open = supereq_open, // allocates a ddb_supereq_ctx_t
+    .close = supereq_close, // frees what supereq_open allocated
+    .process = supereq_process,
+    .reset = supereq_reset,
+    .num_params = supereq_num_params,
+    .get_param_name = supereq_get_param_name,
+    .set_param = supereq_set_param, // values are exchanged as dB strings
+    .get_param = supereq_get_param,
+    .configdialog = settings_dlg,
 };
 
 DB_plugin_t *
diff --git a/plugins/supereq/supereq.h b/plugins/supereq/supereq.h
deleted file mode 100644
index 32298ef1..00000000
--- a/plugins/supereq/supereq.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
-    DeaDBeeF - ultimate music player for GNU/Linux systems with X11
-    Copyright (C) 2009-2010 Alexey Yakovenko <waker@users.sourceforge.net>
-
-    This program is free software; you can redistribute it and/or
-    modify it under the terms of the GNU General Public License
-    as published by the Free Software Foundation; either version 2
-    of the License, or (at your option) any later version.
-    
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-    
-    You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-*/
-
-#ifndef __SUPEREQ_H
-#define __SUPEREQ_H
-
-typedef struct DB_supereq_dsp_s {
-    DB_dsp_t dsp;
-    float (*get_band) (int band);
-    void (*set_band) (int band, float value);
-    float (*get_preamp) (void);
-    void (*set_preamp) (float value);
-} DB_supereq_dsp_t;
-
-#endif