diff options
Diffstat (limited to 'plugins/supereq')
73 files changed, 17440 insertions, 623 deletions
diff --git a/plugins/supereq/Equ.cpp b/plugins/supereq/Equ.cpp index f53b99d1..0aff4f8a 100644 --- a/plugins/supereq/Equ.cpp +++ b/plugins/supereq/Equ.cpp @@ -1,37 +1,92 @@ +/*
+ DeaDBeeF - ultimate music player for GNU/Linux systems with X11
+ Copyright (C) 2009-2011 Alexey Yakovenko <waker@users.sourceforge.net>
+ Original SuperEQ code (C) Naoki Shibata <shibatch@users.sf.net>
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License
+ as published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include "paramlist.hpp"
+#include "Equ.h"
+
+int _Unwind_Resume_or_Rethrow;
+int _Unwind_RaiseException;
+int _Unwind_GetLanguageSpecificData;
+int _Unwind_Resume;
+int _Unwind_DeleteException;
+int _Unwind_GetTextRelBase;
+int _Unwind_SetIP;
+int _Unwind_GetDataRelBase;
+int _Unwind_GetRegionStart;
+int _Unwind_SetGR;
+int _Unwind_GetIPInfo;
+
+#ifdef USE_OOURA
+extern "C" void rdft(int, int, REAL *, int *, REAL *);
+void rfft(int n,int isign,REAL *x)
+{
+ static int ipsize = 0,wsize=0;
+ static int *ip = NULL;
+ static REAL *w = NULL;
+ int newipsize,newwsize;
+ if (n == 0) {
+ free(ip); ip = NULL; ipsize = 0;
+ free(w); w = NULL; wsize = 0;
+ return;
+ }
-typedef float REAL;
-void rfft(int n,int isign,REAL x[]);
+ n = 1 << n;
-#define M 15
-#define PI 3.1415926535897932384626433832795
+ newipsize = 2+sqrt(n/2);
+ if (newipsize > ipsize) {
+ ipsize = newipsize;
+ ip = (int *)realloc(ip,sizeof(int)*ipsize);
+ ip[0] = 0;
+ }
-#define RINT(x) ((x) >= 0 ? ((int)((x) + 0.5)) : ((int)((x) - 0.5)))
+ newwsize = n/2;
+ if (newwsize > wsize) {
+ wsize = newwsize;
+ w = (REAL *)realloc(w,sizeof(REAL)*wsize);
+ }
-#define DITHERLEN 65536
+ rdft(n,isign,x,ip,w);
+}
+#elif defined(USE_FFMPEG) || defined(USE_SHIBATCH)
+extern "C" void rfft(int n,int isign,REAL *x);
+#endif
-// play -c 2 -r 44100 -fs -sw
+#if defined(USE_SHIBATCH)
+extern "C" {
+#include "SIMDBase.h"
+}
+#endif
+
+
+#define PI 3.1415926535897932384626433832795
+
+#define DITHERLEN 65536
+#define M 15
static REAL fact[M+1];
static REAL aa = 96;
-static REAL iza;
-static REAL *lires,*lires1,*lires2,*rires,*rires1,*rires2,*irest;
-static REAL *fsamples;
-static REAL *ditherbuf;
-static int ditherptr = 0;
-static volatile int chg_ires,cur_ires;
-static int winlen,winlenbit,tabsize,nbufsamples;
-static short *inbuf;
-static REAL *outbuf;
-static int maxamp;
-int enable = 1, dither = 0;
-
-#define NCH 2
+static REAL iza = 0;
#define NBANDS 17
static REAL bands[] = {
@@ -62,49 +117,75 @@ static REAL izero(REAL x) return ret;
}
-extern "C" void equ_init(int wb)
+void *equ_malloc (int size) {
+#ifdef USE_SHIBATCH
+ return SIMDBase_alignedMalloc (size);
+#else
+ return malloc (size);
+#endif
+}
+
+void equ_free (void *mem) {
+#ifdef USE_SHIBATCH
+ SIMDBase_alignedFree (mem);
+#else
+ free (mem);
+#endif
+}
+
+extern "C" void equ_init(SuperEqState *state, int wb, int channels)
{
int i,j;
- if (lires1 != NULL) free(lires1);
- if (lires2 != NULL) free(lires2);
- if (rires1 != NULL) free(rires1);
- if (rires2 != NULL) free(rires2);
- if (irest != NULL) free(irest);
- if (fsamples != NULL) free(fsamples);
- if (inbuf != NULL) free(inbuf);
- if (outbuf != NULL) free(outbuf);
- if (ditherbuf != NULL) free(ditherbuf);
-
- winlen = (1 << (wb-1))-1;
- winlenbit = wb;
- tabsize = 1 << wb;
-
- lires1 = (REAL *)malloc(sizeof(REAL)*tabsize);
- lires2 = (REAL *)malloc(sizeof(REAL)*tabsize);
- rires1 = (REAL *)malloc(sizeof(REAL)*tabsize);
- rires2 = (REAL *)malloc(sizeof(REAL)*tabsize);
- irest = (REAL *)malloc(sizeof(REAL)*tabsize);
- fsamples = (REAL *)malloc(sizeof(REAL)*tabsize);
- inbuf = (short *)calloc(winlen*NCH,sizeof(int));
- outbuf = (REAL *)calloc(tabsize*NCH,sizeof(REAL));
- ditherbuf = (REAL *)malloc(sizeof(REAL)*DITHERLEN);
-
- lires = lires1;
- rires = rires1;
- cur_ires = 1;
- chg_ires = 1;
+ if (state->lires1 != NULL) free(state->lires1);
+ if (state->lires2 != NULL) free(state->lires2);
+ if (state->irest != NULL) free(state->irest);
+ if (state->fsamples != NULL) free(state->fsamples);
+ if (state->finbuf != NULL) free(state->finbuf);
+ if (state->outbuf != NULL) free(state->outbuf);
+ if (state->ditherbuf != NULL) free(state->ditherbuf);
+
+
+ memset (state, 0, sizeof (SuperEqState));
+ state->channels = channels;
+ state->enable = 1;
+
+ state->winlen = (1 << (wb-1))-1;
+ state->winlenbit = wb;
+ state->tabsize = 1 << wb;
+ state->fft_bits = wb;
+
+ state->lires1 = (REAL *)equ_malloc(sizeof(REAL)*state->tabsize * state->channels);
+ state->lires2 = (REAL *)equ_malloc(sizeof(REAL)*state->tabsize * state->channels);
+ state->irest = (REAL *)equ_malloc(sizeof(REAL)*state->tabsize);
+ state->fsamples = (REAL *)equ_malloc(sizeof(REAL)*state->tabsize);
+ state->finbuf = (REAL *)equ_malloc(state->winlen*state->channels*sizeof(REAL));
+ state->outbuf = (REAL *)equ_malloc(state->tabsize*state->channels*sizeof(REAL));
+ state->ditherbuf = (REAL *)equ_malloc(sizeof(REAL)*DITHERLEN);
+
+ memset (state->lires1, 0, sizeof(REAL)*state->tabsize * state->channels);
+ memset (state->lires2, 0, sizeof(REAL)*state->tabsize * state->channels);
+ memset (state->irest, 0, sizeof(REAL)*state->tabsize);
+ memset (state->fsamples, 0, sizeof(REAL)*state->tabsize);
+ memset (state->finbuf, 0, state->winlen*state->channels*sizeof(REAL));
+ memset (state->outbuf, 0, state->tabsize*state->channels*sizeof(REAL));
+ memset (state->ditherbuf, 0, sizeof(REAL)*DITHERLEN);
+
+ state->lires = state->lires1;
+ state->cur_ires = 1;
+ state->chg_ires = 1;
for(i=0;i<DITHERLEN;i++)
- ditherbuf[i] = (float(rand())/RAND_MAX-0.5);
-
- for(i=0;i<=M;i++)
- {
- fact[i] = 1;
- for(j=1;j<=i;j++) fact[i] *= j;
- }
-
- iza = izero(alpha(aa));
+ state->ditherbuf[i] = (float(rand())/RAND_MAX-0.5);
+
+ if (fact[0] < 1) {
+ for(i=0;i<=M;i++)
+ {
+ fact[i] = 1;
+ for(j=1;j<=i;j++) fact[i] *= j;
+ }
+ iza = izero(alpha(aa));
+ }
}
// -(N-1)/2 <= n <= (N-1)/2
@@ -168,7 +249,6 @@ void process_param(REAL *bc,paramlist *param,paramlist ¶m2,REAL fs,int ch) for(e = param->elm;e != NULL;e = e->next)
{
- if ((ch == 0 && !e->left) || (ch == 1 && !e->right)) continue;
if (e->lower >= e->upper) continue;
for(p=param2.elm;p != NULL;p = p->next)
@@ -231,414 +311,164 @@ void process_param(REAL *bc,paramlist *param,paramlist ¶m2,REAL fs,int ch) }
}
-extern "C" void equ_makeTable(REAL *lbc,REAL *rbc,paramlist *param,REAL fs)
+extern "C" void equ_makeTable(SuperEqState *state, REAL *lbc,void *_param,REAL fs)
{
- int i,cires = cur_ires;
+ paramlist *param = (paramlist *)_param;
+ int i,cires = state->cur_ires;
REAL *nires;
if (fs <= 0) return;
paramlist param2;
- // L
-
- process_param(lbc,param,param2,fs,0);
-
- for(i=0;i<winlen;i++)
- irest[i] = hn(i-winlen/2,param2,fs)*win(i-winlen/2,winlen);
-
- for(;i<tabsize;i++)
- irest[i] = 0;
+ for (int ch = 0; ch < state->channels; ch++) {
+ process_param(lbc,param,param2,fs,ch);
- rfft(tabsize,1,irest);
+ for(i=0;i<state->winlen;i++)
+ state->irest[i] = hn(i-state->winlen/2,param2,fs)*win(i-state->winlen/2,state->winlen);
- nires = cires == 1 ? lires2 : lires1;
+ for(;i<state->tabsize;i++)
+ state->irest[i] = 0;
- for(i=0;i<tabsize;i++)
- nires[i] = irest[i];
+ rfft(state->fft_bits,1,state->irest);
- process_param(rbc,param,param2,fs,1);
-
- // R
-
- for(i=0;i<winlen;i++)
- irest[i] = hn(i-winlen/2,param2,fs)*win(i-winlen/2,winlen);
+ nires = cires == 1 ? state->lires2 : state->lires1;
+ nires += ch * state->tabsize;
- for(;i<tabsize;i++)
- irest[i] = 0;
-
- rfft(tabsize,1,irest);
-
- nires = cires == 1 ? rires2 : rires1;
-
- for(i=0;i<tabsize;i++)
- nires[i] = irest[i];
-
- //
-
- chg_ires = cires == 1 ? 2 : 1;
+ for(i=0;i<state->tabsize;i++)
+ nires[i] = state->irest[i];
+ }
+ state->chg_ires = cires == 1 ? 2 : 1;
}
-extern "C" void equ_quit(void)
+extern "C" void equ_quit(SuperEqState *state)
{
- free(lires1);
- free(lires2);
- free(rires1);
- free(rires2);
- free(irest);
- free(fsamples);
- free(inbuf);
- free(outbuf);
- free(ditherbuf);
-
- lires1 = NULL;
- lires2 = NULL;
- rires1 = NULL;
- rires2 = NULL;
- irest = NULL;
- fsamples = NULL;
- inbuf = NULL;
- outbuf = NULL;
+ equ_free(state->lires1);
+ equ_free(state->lires2);
+ equ_free(state->irest);
+ equ_free(state->fsamples);
+ equ_free(state->finbuf);
+ equ_free(state->outbuf);
+ equ_free(state->ditherbuf);
+
+ state->lires1 = NULL;
+ state->lires2 = NULL;
+ state->irest = NULL;
+ state->fsamples = NULL;
+ state->finbuf = NULL;
+ state->outbuf = NULL;
rfft(0,0,NULL);
}
-extern "C" void equ_clearbuf(int bps,int srate)
+extern "C" void equ_clearbuf(SuperEqState *state)
{
int i;
- nbufsamples = 0;
- for(i=0;i<tabsize*NCH;i++) outbuf[i] = 0;
+ state->nbufsamples = 0;
+ for(i=0;i<state->tabsize*state->channels;i++) state->outbuf[i] = 0;
}
-extern "C" int equ_modifySamples(char *buf,int nsamples,int nch,int bps)
+extern "C" int equ_modifySamples_float (SuperEqState *state, char *buf,int nsamples,int nch)
{
int i,p,ch;
REAL *ires;
- int amax = (1 << (bps-1))-1;
- int amin = -(1 << (bps-1));
+ float amax = 1.0f;
+ float amin = -1.0f;
static float hm1 = 0, hm2 = 0;
- if (chg_ires) {
- cur_ires = chg_ires;
- lires = cur_ires == 1 ? lires1 : lires2;
- rires = cur_ires == 1 ? rires1 : rires2;
- chg_ires = 0;
+ if (state->chg_ires) {
+ state->cur_ires = state->chg_ires;
+ state->lires = state->cur_ires == 1 ? state->lires1 : state->lires2;
+ state->chg_ires = 0;
}
p = 0;
- while(nbufsamples+nsamples >= winlen)
+ while(state->nbufsamples+nsamples >= state->winlen)
{
- switch(bps)
- {
- case 8:
- for(i=0;i<(winlen-nbufsamples)*nch;i++)
- {
- inbuf[nbufsamples*nch+i] = ((unsigned char *)buf)[i+p*nch] - 0x80;
- float s = outbuf[nbufsamples*nch+i];
- if (dither) {
- float u;
- s -= hm1;
- u = s;
- s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
- if (s < amin) s = amin;
- if (amax < s) s = amax;
- s = RINT(s);
- hm1 = s - u;
- ((unsigned char *)buf)[i+p*nch] = s + 0x80;
- } else {
- if (s < amin) s = amin;
- if (amax < s) s = amax;
- ((unsigned char *)buf)[i+p*nch] = RINT(s) + 0x80;
- }
- }
- for(i=winlen*nch;i<tabsize*nch;i++)
- outbuf[i-winlen*nch] = outbuf[i];
-
- break;
-
- case 16:
- for(i=0;i<(winlen-nbufsamples)*nch;i++)
+ for(i=0;i<(state->winlen-state->nbufsamples)*nch;i++)
{
- inbuf[nbufsamples*nch+i] = ((short *)buf)[i+p*nch];
- float s = outbuf[nbufsamples*nch+i];
- if (dither) {
- float u;
- s -= hm1;
- u = s;
- s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
- if (s < amin) s = amin;
- if (amax < s) s = amax;
- s = RINT(s);
- hm1 = s - u;
- ((short *)buf)[i+p*nch] = s;
- } else {
- if (s < amin) s = amin;
- if (amax < s) s = amax;
- ((short *)buf)[i+p*nch] = RINT(s);
- }
- }
- for(i=winlen*nch;i<tabsize*nch;i++)
- outbuf[i-winlen*nch] = outbuf[i];
-
- break;
-
- case 24:
- for(i=0;i<(winlen-nbufsamples)*nch;i++)
- {
- ((int *)inbuf)[nbufsamples*nch+i] =
- (((unsigned char *)buf)[(i+p*nch)*3 ] ) +
- (((unsigned char *)buf)[(i+p*nch)*3+1] << 8) +
- ((( signed char *)buf)[(i+p*nch)*3+2] << 16) ;
-
- float s = outbuf[nbufsamples*nch+i];
+ state->finbuf[state->nbufsamples*nch+i] = ((float *)buf)[i+p*nch];
+ float s = state->outbuf[state->nbufsamples*nch+i];
//if (dither) s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
if (s < amin) s = amin;
if (amax < s) s = amax;
- int s2 = RINT(s);
- ((signed char *)buf)[(i+p*nch)*3 ] = s2 & 255; s2 >>= 8;
- ((signed char *)buf)[(i+p*nch)*3+1] = s2 & 255; s2 >>= 8;
- ((signed char *)buf)[(i+p*nch)*3+2] = s2 & 255;
+ ((float *)buf)[i+p*nch] = s;
}
- for(i=winlen*nch;i<tabsize*nch;i++)
- outbuf[i-winlen*nch] = outbuf[i];
+ for(i=state->winlen*nch;i<state->tabsize*nch;i++)
+ state->outbuf[i-state->winlen*nch] = state->outbuf[i];
- break;
- default:
- assert(0);
- }
-
- p += winlen-nbufsamples;
- nsamples -= winlen-nbufsamples;
- nbufsamples = 0;
+ p += state->winlen-state->nbufsamples;
+ nsamples -= state->winlen-state->nbufsamples;
+ state->nbufsamples = 0;
for(ch=0;ch<nch;ch++)
{
- ires = ch == 0 ? lires : rires;
+ ires = state->lires + ch * state->tabsize;
- if (bps == 24) {
- for(i=0;i<winlen;i++)
- fsamples[i] = ((int *)inbuf)[nch*i+ch];
- } else {
- for(i=0;i<winlen;i++)
- fsamples[i] = inbuf[nch*i+ch];
- }
+ for(i=0;i<state->winlen;i++)
+ state->fsamples[i] = state->finbuf[nch*i+ch];
- for(i=winlen;i<tabsize;i++)
- fsamples[i] = 0;
+ for(i=state->winlen;i<state->tabsize;i++)
+ state->fsamples[i] = 0;
- if (enable) {
- rfft(tabsize,1,fsamples);
+ if (state->enable) {
+ rfft(state->fft_bits,1,state->fsamples);
- fsamples[0] = ires[0]*fsamples[0];
- fsamples[1] = ires[1]*fsamples[1];
+ state->fsamples[0] = ires[0]*state->fsamples[0];
+ state->fsamples[1] = ires[1]*state->fsamples[1];
- for(i=1;i<tabsize/2;i++)
+ for(i=1;i<state->tabsize/2;i++)
{
REAL re,im;
- re = ires[i*2 ]*fsamples[i*2] - ires[i*2+1]*fsamples[i*2+1];
- im = ires[i*2+1]*fsamples[i*2] + ires[i*2 ]*fsamples[i*2+1];
+ re = ires[i*2 ]*state->fsamples[i*2] - ires[i*2+1]*state->fsamples[i*2+1];
+ im = ires[i*2+1]*state->fsamples[i*2] + ires[i*2 ]*state->fsamples[i*2+1];
- fsamples[i*2 ] = re;
- fsamples[i*2+1] = im;
+ state->fsamples[i*2 ] = re;
+ state->fsamples[i*2+1] = im;
}
- rfft(tabsize,-1,fsamples);
+ rfft(state->fft_bits,-1,state->fsamples);
} else {
- for(i=winlen-1+winlen/2;i>=winlen/2;i--) fsamples[i] = fsamples[i-winlen/2]*tabsize/2;
- for(;i>=0;i--) fsamples[i] = 0;
+ for(i=state->winlen-1+state->winlen/2;i>=state->winlen/2;i--) state->fsamples[i] = state->fsamples[i-state->winlen/2]*state->tabsize/2;
+ for(;i>=0;i--) state->fsamples[i] = 0;
}
- for(i=0;i<winlen;i++) outbuf[i*nch+ch] += fsamples[i]/tabsize*2;
+ for(i=0;i<state->winlen;i++) state->outbuf[i*nch+ch] += state->fsamples[i]/state->tabsize*2;
- for(i=winlen;i<tabsize;i++) outbuf[i*nch+ch] = fsamples[i]/tabsize*2;
+ for(i=state->winlen;i<state->tabsize;i++) state->outbuf[i*nch+ch] = state->fsamples[i]/state->tabsize*2;
}
}
- switch(bps)
- {
- case 8:
- for(i=0;i<nsamples*nch;i++)
- {
- inbuf[nbufsamples*nch+i] = ((unsigned char *)buf)[i+p*nch] - 0x80;
- float s = outbuf[nbufsamples*nch+i];
- if (dither) {
- float u;
- s -= hm1;
- u = s;
- s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
- if (s < amin) s = amin;
- if (amax < s) s = amax;
- s = RINT(s);
- hm1 = s - u;
- ((unsigned char *)buf)[i+p*nch] = s + 0x80;
- } else {
- if (s < amin) s = amin;
- if (amax < s) s = amax;
- ((unsigned char *)buf)[i+p*nch] = RINT(s) + 0x80;
- }
- }
- break;
-
- case 16:
for(i=0;i<nsamples*nch;i++)
{
- inbuf[nbufsamples*nch+i] = ((short *)buf)[i+p*nch];
- float s = outbuf[nbufsamples*nch+i];
- if (dither) {
+ state->finbuf[state->nbufsamples*nch+i] = ((float *)buf)[i+p*nch];
+ float s = state->outbuf[state->nbufsamples*nch+i];
+ if (state->dither) {
float u;
s -= hm1;
u = s;
- s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
+// s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
if (s < amin) s = amin;
if (amax < s) s = amax;
- s = RINT(s);
hm1 = s - u;
- ((short *)buf)[i+p*nch] = s;
+ ((float *)buf)[i+p*nch] = s;
} else {
if (s < amin) s = amin;
if (amax < s) s = amax;
- ((short *)buf)[i+p*nch] = RINT(s);
+ ((float *)buf)[i+p*nch] = s;
}
}
- break;
-
- case 24:
- for(i=0;i<nsamples*nch;i++)
- {
- ((int *)inbuf)[nbufsamples*nch+i] =
- (((unsigned char *)buf)[(i+p*nch)*3 ] ) +
- (((unsigned char *)buf)[(i+p*nch)*3+1] << 8) +
- ((( signed char *)buf)[(i+p*nch)*3+2] << 16) ;
-
- float s = outbuf[nbufsamples*nch+i];
- //if (dither) s += ditherbuf[(ditherptr++) & (DITHERLEN-1)];
- if (s < amin) s = amin;
- if (amax < s) s = amax;
- int s2 = RINT(s);
- ((signed char *)buf)[(i+p*nch)*3 ] = s2 & 255; s2 >>= 8;
- ((signed char *)buf)[(i+p*nch)*3+1] = s2 & 255; s2 >>= 8;
- ((signed char *)buf)[(i+p*nch)*3+2] = s2 & 255;
- }
- break;
-
- default:
- assert(0);
- }
p += nsamples;
- nbufsamples += nsamples;
+ state->nbufsamples += nsamples;
return p;
}
-#if 0
-void usage(void)
-{
- fprintf(stderr,"Ouch!\n");
-}
-
-int main(int argc,char **argv)
-{
- FILE *fpi,*fpo;
- char buf[576*2*2];
-
- static REAL bc[] =
- {1.0, 0,1.0, 0,1.0, 0,1.0, 0,1.0, 0,1.0, 0,1.0, 0,1.0, 0,1.0, 0};
-
- init(14);
- makeTable(bc,44100);
-
- if (argc != 3 && argc != 4) exit(-1);
-
- fpi = fopen(argv[1],"r");
- fpo = fopen(argv[2],"w");
-
- if (!fpi || !fpo) exit(-1);
-
- /* generate wav header */
-
- {
- short word;
- int dword;
-
- fwrite("RIFF",4,1,fpo);
- dword = 0;
- fwrite(&dword,4,1,fpo);
-
- fwrite("WAVEfmt ",8,1,fpo);
- dword = 16;
- fwrite(&dword,4,1,fpo);
- word = 1;
- fwrite(&word,2,1,fpo); /* format category, PCM */
- word = 2;
- fwrite(&word,2,1,fpo); /* channels */
- dword = 44100;
- fwrite(&dword,4,1,fpo); /* sampling rate */
- dword = 44100*2*2;
- fwrite(&dword,4,1,fpo); /* bytes per sec */
- word = 4;
- fwrite(&word,2,1,fpo); /* block alignment */
- word = 16;
- fwrite(&word,2,1,fpo); /* ??? */
-
- fwrite("data",4,1,fpo);
- dword = 0;
- fwrite(&dword,4,1,fpo);
- }
-
- preamp = 65536;
- maxamp = 0;
-
- if (argc == 4) {
- preamp = 32767*65536/atoi(argv[3]);
- fprintf(stderr,"preamp = %d\n",preamp);
- }
-
- for(;;)
- {
- int n,m;
-
- n = fread(buf,1,576*2*2,fpi);
- if (n <= 0) break;
- m = modifySamples((short *)buf,n/4,2);
- fwrite(buf,4,m,fpo);
- }
-
-#if 0
- for(;;)
- {
- int n = flushbuf((short *)buf,576);
- if (n == 0) break;
- fwrite(buf,4,n,fpo);
- }
-#endif
-
- {
- short word;
- int dword;
- int len = ftell(fpo);
-
- fseek(fpo,4,SEEK_SET);
- dword = len-8;
- fwrite(&dword,4,1,fpo);
-
- fseek(fpo,40,SEEK_SET);
- dword = len-44;
- fwrite(&dword,4,1,fpo);
- }
-
- if (maxamp != 0) {
- fprintf(stderr,"maxamp = %d\n",maxamp);
- }
-
- quit();
-}
-#endif
-
extern "C" void *paramlist_alloc (void) {
return (void *)(new paramlist);
}
diff --git a/plugins/supereq/Equ.h b/plugins/supereq/Equ.h new file mode 100644 index 00000000..a315741a --- /dev/null +++ b/plugins/supereq/Equ.h @@ -0,0 +1,56 @@ +/* + DeaDBeeF - ultimate music player for GNU/Linux systems with X11 + Copyright (C) 2009-2011 Alexey Yakovenko <waker@users.sourceforge.net> + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; either version 2 + of the License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+*/ +#ifndef __EQU_H +#define __EQU_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef float REAL; +typedef struct { + REAL *lires,*lires1,*lires2; + REAL *irest; + REAL *fsamples; + REAL *ditherbuf; + int ditherptr; + volatile int chg_ires,cur_ires; + int winlen,winlenbit,tabsize,nbufsamples; + REAL *finbuf; + REAL *outbuf; + int dither; + int channels; + int enable; + int fft_bits; +} SuperEqState; + +void *paramlist_alloc (void); +void paramlist_free (void *); +void equ_makeTable(SuperEqState *state, float *lbc,void *param,float fs); +int equ_modifySamples(SuperEqState *state, char *buf,int nsamples,int nch,int bps); +int equ_modifySamples_float (SuperEqState *state, char *buf,int nsamples,int nch); +void equ_clearbuf(SuperEqState *state); +void equ_init(SuperEqState *state, int wb, int channels); +void equ_quit(SuperEqState *state); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/plugins/supereq/Fftsg_fl.cpp b/plugins/supereq/Fftsg_fl.cpp index d48debfe..636f8b8a 100644 --- a/plugins/supereq/Fftsg_fl.cpp +++ b/plugins/supereq/Fftsg_fl.cpp @@ -285,6 +285,7 @@ Appendix : w[] and ip[] are compatible with all routines.
*/
+extern "C" {
void cdft(int n, int isgn, REAL *a, int *ip, REAL *w)
{
@@ -2649,32 +2650,4 @@ void dstsub(int n, REAL *a, int nc, REAL *c) }
a[m] *= c[0];
}
-
-void rfft(int n,int isign,REAL x[])
-{
- static int ipsize = 0,wsize=0;
- static int *ip = NULL;
- static REAL *w = NULL;
- int newipsize,newwsize;
-
- if (n == 0) {
- free(ip); ip = NULL; ipsize = 0;
- free(w); w = NULL; wsize = 0;
- return;
- }
-
- newipsize = 2+sqrt(n/2);
- if (newipsize > ipsize) {
- ipsize = newipsize;
- ip = (int *)realloc(ip,sizeof(int)*ipsize);
- ip[0] = 0;
- }
-
- newwsize = n/2;
- if (newwsize > wsize) {
- wsize = newwsize;
- w = (REAL *)realloc(w,sizeof(REAL)*wsize);
- }
-
- rdft(n,isign,x,ip,w);
}
diff --git a/plugins/supereq/Makefile.am b/plugins/supereq/Makefile.am index 0fffd6d6..45010ec8 100644 --- a/plugins/supereq/Makefile.am +++ b/plugins/supereq/Makefile.am @@ -3,8 +3,51 @@ supereqdir = $(libdir)/$(PACKAGE) pkglib_LTLIBRARIES = supereq.la supereq_la_SOURCES = supereq.c supereq.h Equ.cpp Fftsg_fl.cpp paramlist.hpp -supereq_la_LDFLAGS = -module +#nsfft-1.00/simd/SIMDBaseUndiff.c\ +#nsfft-1.00/simd/SIMDBase.c\ +#nsfft-1.00/dft/DFT.c\ +#nsfft-1.00/dft/DFTUndiff.c\ +#nsfft-1.00/simd/SIMDBase.h\ +#nsfft-1.00/simd/SIMDBaseUndiff.h\ +#nsfft-1.00/dft/DFTUndiff.h\ +#nsfft-1.00/dft/DFT.h\ +#shibatch_rdft.c + +#ffmpeg_fft/libavutil/mem.c\ +#ffmpeg_fft/libavutil/mathematics.c\ +#ffmpeg_fft/libavutil/rational.c\ +#ffmpeg_fft/libavutil/intfloat_readwrite.c\ +#ffmpeg_fft/libavcodec/dct.c\ +#ffmpeg_fft/libavcodec/avfft.c\ +#ffmpeg_fft/libavcodec/fft.c\ +#ffmpeg_fft/libavcodec/dct32.c\ +#ffmpeg_fft/libavcodec/rdft.c\ +#ffmpeg_fft/libavutil/intfloat_readwrite.h\ +#ffmpeg_fft/libavutil/avutil.h\ +#ffmpeg_fft/libavutil/common.h\ +#ffmpeg_fft/libavutil/attributes.h\ +#ffmpeg_fft/libavutil/mem.h\ +#ffmpeg_fft/libavutil/avconfig.h\ +#ffmpeg_fft/libavutil/mathematics.h\ +#ffmpeg_fft/libavutil/rational.h\ +#ffmpeg_fft/publik.h\ +#ffmpeg_fft/ffmpeg_fft.h\ +#ffmpeg_fft/libavcodec/dct32.h\ +#ffmpeg_fft/libavcodec/fft.h\ +#ffmpeg_fft/libavcodec/avfft.h\ +#ffmpeg_fft/config.h\ +#ff_rdft.c + +#AM_CFLAGS = $(CFLAGS) -I ffmpeg_fft -I ffmpeg_fft/libavcodec -I ffmpeg_fft/libavutil -std=c99 +#AM_CPPFLAGS = $(CXXFLAGS) -fno-exceptions -fno-rtti -nostdlib -fno-unwind-tables -I ffmpeg_fft -I ffmpeg_fft/libavcodec -I ffmpeg_fft/libavutil + +#AM_CFLAGS = $(CFLAGS) -I nsfft-1.00/dft -I nsfft-1.00/simd -std=c99 -msse -DENABLE_SSE_FLOAT -DUSE_SHIBATCH +#AM_CPPFLAGS = $(CXXFLAGS) -fno-exceptions -fno-rtti -nostdlib -fno-unwind-tables -I nsfft-1.00/dft -I nsfft-1.00/simd -msse -DENABLE_SSE_FLOAT -DUSE_SHIBATCH + +AM_CFLAGS = $(CFLAGS) -std=c99 -DUSE_OOURA +AM_CPPFLAGS = $(CXXFLAGS) 
-fno-exceptions -fno-rtti -nostdlib -fno-unwind-tables -DUSE_OOURA + +supereq_la_LDFLAGS = -module -nostdlib -lsupc++ supereq_la_LIBADD = $(LDADD) -AM_CFLAGS = -std=c99 endif diff --git a/plugins/supereq/ff_rdft.c b/plugins/supereq/ff_rdft.c new file mode 100644 index 00000000..70a09350 --- /dev/null +++ b/plugins/supereq/ff_rdft.c @@ -0,0 +1,63 @@ +#include <stdint.h> +#include <complex.h> +#include "libavcodec/avfft.h" +#include "libavutil/avutil.h" + +void rfft(int n,int isign,float *x) +{ + static int wsize=0; + static float *w = NULL; + static RDFTContext *s = NULL; + static RDFTContext *si = NULL; + int newwsize; + + if (n == 0) { + if (w) { + av_free(w); + w = NULL; + wsize = 0; + } + if (s) { + av_rdft_end (s); + s = NULL; + } + if (si) { + av_rdft_end (si); + si = NULL; + } + return; + } + + newwsize = n/2; + if (newwsize > wsize) { + wsize = newwsize; + if (s) { + av_rdft_end (s); + s = NULL; + } + if (si) { + av_rdft_end (si); + si = NULL; + } + if (w) { + av_free (w); + w = NULL; + } + w = (float *)av_malloc(sizeof(float)*wsize); + } + + if (!s) { + s = av_rdft_init(n,DFT_R2C); + } + if (!si) { + si = av_rdft_init(n,IDFT_C2R); + } + + if (isign == 1) { + av_rdft_calc (s, x); + } + else { + av_rdft_calc (si, x); + } +} + diff --git a/plugins/supereq/ffmpeg_fft/README b/plugins/supereq/ffmpeg_fft/README new file mode 100644 index 00000000..f53b2447 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/README @@ -0,0 +1,9 @@ +purpose: + +* compare fftw and ffmpeg fourier transforms using benchfft and / or libbench +* note: this is very specifically for neon. if you want to use ffmpeg_fft with + some other arch / fpu, then you will need to do some reorganization + +todo: + +1) fix benchees/ffmpeg/doitr.c diff --git a/plugins/supereq/ffmpeg_fft/config.h b/plugins/supereq/ffmpeg_fft/config.h new file mode 100644 index 00000000..0f36b47c --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/config.h @@ -0,0 +1,904 @@ +/* Automatically generated by configure - do not modify! 
*/ +#ifndef FFMPEG_CONFIG_H +#define FFMPEG_CONFIG_H +#define FFMPEG_CONFIGURATION "--prefix=/usr --enable-neon --enable-pic --cpu=cortex-a8 --arch=arm --cross-prefix=arm-none-linux-gnueabi- --enable-cross-compile --target-os=linux --extra-cflags='-mfpu=neon -mcpu=cortex-a8 -mfloat-abi=softfp' --enable-shared --disable-debug" +#define FFMPEG_LICENSE "LGPL version 2.1 or later" +#define FFMPEG_DATADIR "/usr/share/ffmpeg" +#define CC_TYPE "gcc" +#define CC_VERSION __VERSION__ +#define restrict restrict +#define ASMALIGN(ZEROBITS) ".p2align " #ZEROBITS "\n\t" +#define EXTERN_PREFIX "" +#define EXTERN_ASM +#define ARCH_ALPHA 0 +#define ARCH_ARM 0 +#define ARCH_AVR32 0 +#define ARCH_AVR32_AP 0 +#define ARCH_AVR32_UC 0 +#define ARCH_BFIN 0 +#define ARCH_IA64 0 +#define ARCH_M68K 0 +#define ARCH_MIPS 0 +#define ARCH_MIPS64 0 +#define ARCH_PARISC 0 +#define ARCH_PPC 0 +#define ARCH_PPC64 0 +#define ARCH_S390 0 +#define ARCH_SH4 0 +#define ARCH_SPARC 0 +#define ARCH_SPARC64 0 +#define ARCH_TOMI 0 +#define ARCH_X86 1 +#define ARCH_X86_32 1 +#define ARCH_X86_64 0 +#define HAVE_ALTIVEC 0 +#define HAVE_AMD3DNOW 0 +#define HAVE_AMD3DNOWEXT 0 +#define HAVE_ARMV5TE 1 +#define HAVE_ARMV6 1 +#define HAVE_ARMV6T2 1 +#define HAVE_ARMVFP 1 +#define HAVE_IWMMXT 0 +#define HAVE_MMI 0 +#define HAVE_MMX 0 +#define HAVE_MMX2 0 +#define HAVE_NEON 1 +#define HAVE_PPC4XX 0 +#define HAVE_SSE 1 +#define HAVE_SSSE3 1 +#define HAVE_VIS 0 +#define HAVE_BIGENDIAN 0 +#define HAVE_PTHREADS 1 +#define HAVE_W32THREADS 0 +#define HAVE_ALSA_ASOUNDLIB_H 0 +#define HAVE_ALTIVEC_H 0 +#define HAVE_ARPA_INET_H 1 +#define HAVE_ATTRIBUTE_MAY_ALIAS 1 +#define HAVE_ATTRIBUTE_PACKED 1 +#define HAVE_BSWAP 0 +#define HAVE_CLOSESOCKET 0 +#define HAVE_CMOV 0 +#define HAVE_CONIO_H 0 +#define HAVE_DCBZL 0 +#define HAVE_DEV_BKTR_IOCTL_BT848_H 0 +#define HAVE_DEV_BKTR_IOCTL_METEOR_H 0 +#define HAVE_DEV_IC_BT8XX_H 0 +#define HAVE_DEV_VIDEO_METEOR_IOCTL_METEOR_H 0 +#define HAVE_DEV_VIDEO_BKTR_IOCTL_BT848_H 0 +#define 
HAVE_DLFCN_H 1 +#define HAVE_DLOPEN 1 +#define HAVE_DOS_PATHS 0 +#define HAVE_EBP_AVAILABLE 0 +#define HAVE_EBX_AVAILABLE 0 +#define HAVE_EXP2 1 +#define HAVE_EXP2F 1 +#define HAVE_FAST_64BIT 0 +#define HAVE_FAST_CLZ 1 +#define HAVE_FAST_CMOV 0 +#define HAVE_FAST_UNALIGNED 1 +#define HAVE_FCNTL 1 +#define HAVE_FORK 1 +#define HAVE_GETADDRINFO 1 +#define HAVE_GETHRTIME 0 +#define HAVE_GETPROCESSMEMORYINFO 0 +#define HAVE_GETPROCESSTIMES 0 +#define HAVE_GETRUSAGE 1 +#define HAVE_GNU_AS 1 +#define HAVE_STRUCT_RUSAGE_RU_MAXRSS 1 +#define HAVE_IBM_ASM 0 +#define HAVE_INET_ATON 1 +#define HAVE_INLINE_ASM 1 +#define HAVE_ISATTY 1 +#define HAVE_LDBRX 0 +#define HAVE_LIBDC1394_1 0 +#define HAVE_LIBDC1394_2 0 +#define HAVE_LLRINT 1 +#define HAVE_LLRINTF 1 +#define HAVE_LOCAL_ALIGNED_16 0 +#define HAVE_LOCAL_ALIGNED_8 0 +#define HAVE_LOG2 1 +#define HAVE_LOG2F 1 +#define HAVE_LOONGSON 0 +#define HAVE_LRINT 1 +#define HAVE_LRINTF 1 +#define HAVE_LZO1X_999_COMPRESS 0 +#define HAVE_MACHINE_IOCTL_BT848_H 0 +#define HAVE_MACHINE_IOCTL_METEOR_H 0 +#define HAVE_MALLOC_H 1 +#define HAVE_MEMALIGN 1 +#define HAVE_MKSTEMP 1 +#define HAVE_PLD 1 +#define HAVE_POSIX_MEMALIGN 1 +#define HAVE_ROUND 1 +#define HAVE_ROUNDF 1 +#define HAVE_SDL 0 +#define HAVE_SDL_VIDEO_SIZE 0 +#define HAVE_SETMODE 0 +#define HAVE_SOCKLEN_T 1 +#define HAVE_SOUNDCARD_H 0 +#define HAVE_POLL_H 1 +#define HAVE_SETRLIMIT 1 +#define HAVE_STRERROR_R 1 +#define HAVE_STRUCT_ADDRINFO 1 +#define HAVE_STRUCT_IPV6_MREQ 1 +#define HAVE_STRUCT_SOCKADDR_IN6 1 +#define HAVE_STRUCT_SOCKADDR_SA_LEN 0 +#define HAVE_STRUCT_SOCKADDR_STORAGE 1 +#define HAVE_SYMVER 1 +#define HAVE_SYMVER_GNU_ASM 1 +#define HAVE_SYMVER_ASM_LABEL 0 +#define HAVE_SYS_MMAN_H 1 +#define HAVE_SYS_RESOURCE_H 1 +#define HAVE_SYS_SELECT_H 1 +#define HAVE_SYS_SOUNDCARD_H 1 +#define HAVE_SYS_VIDEOIO_H 0 +#define HAVE_TEN_OPERANDS 0 +#define HAVE_TERMIOS_H 1 +#define HAVE_THREADS 1 +#define HAVE_TRUNCF 1 +#define HAVE_VFP_ARGS 0 +#define HAVE_VIRTUALALLOC 0 
+#define HAVE_WINSOCK2_H 0 +#define HAVE_XFORM_ASM 0 +#define HAVE_YASM 0 +#define CONFIG_BSFS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_DEMUXERS 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_FILTERS 1 +#define CONFIG_HWACCELS 0 +#define CONFIG_INDEVS 1 +#define CONFIG_MUXERS 1 +#define CONFIG_OUTDEVS 1 +#define CONFIG_PARSERS 1 +#define CONFIG_PROTOCOLS 1 +#define CONFIG_AANDCT 1 +#define CONFIG_AVCODEC 1 +#define CONFIG_AVDEVICE 1 +#define CONFIG_AVFILTER 1 +#define CONFIG_AVFILTER_LAVF 0 +#define CONFIG_AVFORMAT 1 +#define CONFIG_AVISYNTH 0 +#define CONFIG_BZLIB 0 +#define CONFIG_DCT 1 +#define CONFIG_DOC 0 +#define CONFIG_DWT 1 +#define CONFIG_DXVA2 0 +#define CONFIG_FASTDIV 1 +#define CONFIG_FFMPEG 1 +#define CONFIG_FFPLAY 0 +#define CONFIG_FFPROBE 1 +#define CONFIG_FFSERVER 1 +#define CONFIG_FFT 1 +#define CONFIG_GOLOMB 1 +#define CONFIG_GPL 0 +#define CONFIG_GRAY 0 +#define CONFIG_H264DSP 1 +#define CONFIG_HARDCODED_TABLES 0 +#define CONFIG_LIBDC1394 0 +#define CONFIG_LIBDIRAC 0 +#define CONFIG_LIBFAAC 0 +#define CONFIG_LIBGSM 0 +#define CONFIG_LIBMP3LAME 0 +#define CONFIG_LIBNUT 0 +#define CONFIG_LIBOPENCORE_AMRNB 0 +#define CONFIG_LIBOPENCORE_AMRWB 0 +#define CONFIG_LIBOPENJPEG 0 +#define CONFIG_LIBRTMP 0 +#define CONFIG_LIBSCHROEDINGER 0 +#define CONFIG_LIBSPEEX 0 +#define CONFIG_LIBTHEORA 0 +#define CONFIG_LIBVORBIS 0 +#define CONFIG_LIBVPX 0 +#define CONFIG_LIBX264 0 +#define CONFIG_LIBXVID 0 +#define CONFIG_LPC 1 +#define CONFIG_LSP 1 +//#define CONFIG_MDCT 1 +#define CONFIG_MEMALIGN_HACK 0 +#define CONFIG_MLIB 0 +#define CONFIG_MPEGAUDIO_HP 1 +#define CONFIG_NETWORK 1 +#define CONFIG_NONFREE 0 +#define CONFIG_PIC 1 +#define CONFIG_POSTPROC 0 +#define CONFIG_RDFT 1 +#define CONFIG_RUNTIME_CPUDETECT 0 +#define CONFIG_SHARED 1 +#define CONFIG_SMALL 0 +#define CONFIG_SRAM 0 +#define CONFIG_STATIC 1 +#define CONFIG_SWSCALE 1 +#define CONFIG_SWSCALE_ALPHA 1 +#define CONFIG_VAAPI 0 +#define CONFIG_VDPAU 0 +#define CONFIG_VERSION3 0 +#define CONFIG_X11GRAB 
0 +#define CONFIG_ZLIB 0 +#define CONFIG_AVUTIL 1 +#define CONFIG_GPLV3 0 +#define CONFIG_LGPLV3 0 +#define CONFIG_AASC_DECODER 1 +#define CONFIG_AMV_DECODER 1 +#define CONFIG_ANM_DECODER 1 +#define CONFIG_ASV1_DECODER 1 +#define CONFIG_ASV2_DECODER 1 +#define CONFIG_AURA_DECODER 1 +#define CONFIG_AURA2_DECODER 1 +#define CONFIG_AVS_DECODER 1 +#define CONFIG_BETHSOFTVID_DECODER 1 +#define CONFIG_BFI_DECODER 1 +#define CONFIG_BINK_DECODER 1 +#define CONFIG_BMP_DECODER 1 +#define CONFIG_C93_DECODER 1 +#define CONFIG_CAVS_DECODER 1 +#define CONFIG_CDGRAPHICS_DECODER 1 +#define CONFIG_CINEPAK_DECODER 1 +#define CONFIG_CLJR_DECODER 1 +#define CONFIG_CSCD_DECODER 1 +#define CONFIG_CYUV_DECODER 1 +#define CONFIG_DNXHD_DECODER 1 +#define CONFIG_DPX_DECODER 1 +#define CONFIG_DSICINVIDEO_DECODER 1 +#define CONFIG_DVVIDEO_DECODER 1 +#define CONFIG_DXA_DECODER 0 +#define CONFIG_EACMV_DECODER 1 +#define CONFIG_EAMAD_DECODER 1 +#define CONFIG_EATGQ_DECODER 1 +#define CONFIG_EATGV_DECODER 1 +#define CONFIG_EATQI_DECODER 1 +#define CONFIG_EIGHTBPS_DECODER 1 +#define CONFIG_EIGHTSVX_EXP_DECODER 1 +#define CONFIG_EIGHTSVX_FIB_DECODER 1 +#define CONFIG_ESCAPE124_DECODER 1 +#define CONFIG_FFV1_DECODER 1 +#define CONFIG_FFVHUFF_DECODER 1 +#define CONFIG_FLASHSV_DECODER 0 +#define CONFIG_FLIC_DECODER 1 +#define CONFIG_FLV_DECODER 1 +#define CONFIG_FOURXM_DECODER 1 +#define CONFIG_FRAPS_DECODER 1 +#define CONFIG_FRWU_DECODER 1 +#define CONFIG_GIF_DECODER 1 +#define CONFIG_H261_DECODER 1 +#define CONFIG_H263_DECODER 1 +#define CONFIG_H263I_DECODER 1 +#define CONFIG_H264_DECODER 1 +#define CONFIG_H264_VDPAU_DECODER 0 +#define CONFIG_HUFFYUV_DECODER 1 +#define CONFIG_IDCIN_DECODER 1 +#define CONFIG_IFF_BYTERUN1_DECODER 1 +#define CONFIG_IFF_ILBM_DECODER 1 +#define CONFIG_INDEO2_DECODER 1 +#define CONFIG_INDEO3_DECODER 1 +#define CONFIG_INDEO5_DECODER 1 +#define CONFIG_INTERPLAY_VIDEO_DECODER 1 +#define CONFIG_JPEGLS_DECODER 1 +#define CONFIG_KGV1_DECODER 1 +#define CONFIG_KMVC_DECODER 1 
+#define CONFIG_LOCO_DECODER 1 +#define CONFIG_MDEC_DECODER 1 +#define CONFIG_MIMIC_DECODER 1 +#define CONFIG_MJPEG_DECODER 1 +#define CONFIG_MJPEGB_DECODER 1 +#define CONFIG_MMVIDEO_DECODER 1 +#define CONFIG_MOTIONPIXELS_DECODER 1 +#define CONFIG_MPEG_XVMC_DECODER 0 +#define CONFIG_MPEG1VIDEO_DECODER 1 +#define CONFIG_MPEG2VIDEO_DECODER 1 +#define CONFIG_MPEG4_DECODER 1 +#define CONFIG_MPEG4_VDPAU_DECODER 0 +#define CONFIG_MPEGVIDEO_DECODER 1 +#define CONFIG_MPEG_VDPAU_DECODER 0 +#define CONFIG_MPEG1_VDPAU_DECODER 0 +#define CONFIG_MSMPEG4V1_DECODER 1 +#define CONFIG_MSMPEG4V2_DECODER 1 +#define CONFIG_MSMPEG4V3_DECODER 1 +#define CONFIG_MSRLE_DECODER 1 +#define CONFIG_MSVIDEO1_DECODER 1 +#define CONFIG_MSZH_DECODER 1 +#define CONFIG_NUV_DECODER 1 +#define CONFIG_PAM_DECODER 1 +#define CONFIG_PBM_DECODER 1 +#define CONFIG_PCX_DECODER 1 +#define CONFIG_PGM_DECODER 1 +#define CONFIG_PGMYUV_DECODER 1 +#define CONFIG_PICTOR_DECODER 1 +#define CONFIG_PNG_DECODER 0 +#define CONFIG_PPM_DECODER 1 +#define CONFIG_PTX_DECODER 1 +#define CONFIG_QDRAW_DECODER 1 +#define CONFIG_QPEG_DECODER 1 +#define CONFIG_QTRLE_DECODER 1 +#define CONFIG_R210_DECODER 1 +#define CONFIG_RAWVIDEO_DECODER 1 +#define CONFIG_RL2_DECODER 1 +#define CONFIG_ROQ_DECODER 1 +#define CONFIG_RPZA_DECODER 1 +#define CONFIG_RV10_DECODER 1 +#define CONFIG_RV20_DECODER 1 +#define CONFIG_RV30_DECODER 1 +#define CONFIG_RV40_DECODER 1 +#define CONFIG_SGI_DECODER 1 +#define CONFIG_SMACKER_DECODER 1 +#define CONFIG_SMC_DECODER 1 +#define CONFIG_SNOW_DECODER 1 +#define CONFIG_SP5X_DECODER 1 +#define CONFIG_SUNRAST_DECODER 1 +#define CONFIG_SVQ1_DECODER 1 +#define CONFIG_SVQ3_DECODER 1 +#define CONFIG_TARGA_DECODER 1 +#define CONFIG_THEORA_DECODER 1 +#define CONFIG_THP_DECODER 1 +#define CONFIG_TIERTEXSEQVIDEO_DECODER 1 +#define CONFIG_TIFF_DECODER 1 +#define CONFIG_TMV_DECODER 1 +#define CONFIG_TRUEMOTION1_DECODER 1 +#define CONFIG_TRUEMOTION2_DECODER 1 +#define CONFIG_TSCC_DECODER 0 +#define CONFIG_TXD_DECODER 1 
+#define CONFIG_ULTI_DECODER 1 +#define CONFIG_V210_DECODER 1 +#define CONFIG_V210X_DECODER 1 +#define CONFIG_VB_DECODER 1 +#define CONFIG_VC1_DECODER 1 +#define CONFIG_VC1_VDPAU_DECODER 0 +#define CONFIG_VCR1_DECODER 1 +#define CONFIG_VMDVIDEO_DECODER 1 +#define CONFIG_VMNC_DECODER 1 +#define CONFIG_VP3_DECODER 1 +#define CONFIG_VP5_DECODER 1 +#define CONFIG_VP6_DECODER 1 +#define CONFIG_VP6A_DECODER 1 +#define CONFIG_VP6F_DECODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VQA_DECODER 1 +#define CONFIG_WMV1_DECODER 1 +#define CONFIG_WMV2_DECODER 1 +#define CONFIG_WMV3_DECODER 1 +#define CONFIG_WMV3_VDPAU_DECODER 0 +#define CONFIG_WNV1_DECODER 1 +#define CONFIG_XAN_WC3_DECODER 1 +#define CONFIG_XL_DECODER 1 +#define CONFIG_YOP_DECODER 1 +#define CONFIG_ZLIB_DECODER 0 +#define CONFIG_ZMBV_DECODER 0 +#define CONFIG_AAC_DECODER 1 +#define CONFIG_AC3_DECODER 1 +#define CONFIG_ALAC_DECODER 1 +#define CONFIG_ALS_DECODER 1 +#define CONFIG_AMRNB_DECODER 1 +#define CONFIG_APE_DECODER 1 +#define CONFIG_ATRAC1_DECODER 1 +#define CONFIG_ATRAC3_DECODER 1 +#define CONFIG_BINKAUDIO_DCT_DECODER 1 +#define CONFIG_BINKAUDIO_RDFT_DECODER 1 +#define CONFIG_COOK_DECODER 1 +/* #define CONFIG_DCA_DECODER 1 */ +#define CONFIG_DSICINAUDIO_DECODER 1 +#define CONFIG_EAC3_DECODER 1 +#define CONFIG_FLAC_DECODER 1 +#define CONFIG_GSM_DECODER 1 +#define CONFIG_GSM_MS_DECODER 1 +#define CONFIG_IMC_DECODER 1 +#define CONFIG_MACE3_DECODER 1 +#define CONFIG_MACE6_DECODER 1 +#define CONFIG_MLP_DECODER 1 +#define CONFIG_MP1_DECODER 1 +#define CONFIG_MP1FLOAT_DECODER 1 +#define CONFIG_MP2_DECODER 1 +#define CONFIG_MP2FLOAT_DECODER 1 +#define CONFIG_MP3_DECODER 1 +#define CONFIG_MP3FLOAT_DECODER 1 +#define CONFIG_MP3ADU_DECODER 1 +#define CONFIG_MP3ADUFLOAT_DECODER 1 +#define CONFIG_MP3ON4_DECODER 1 +#define CONFIG_MP3ON4FLOAT_DECODER 1 +#define CONFIG_MPC7_DECODER 1 +#define CONFIG_MPC8_DECODER 1 +#define CONFIG_NELLYMOSER_DECODER 1 +#define CONFIG_QCELP_DECODER 1 +#define CONFIG_QDM2_DECODER 1 
+#define CONFIG_RA_144_DECODER 1 +#define CONFIG_RA_288_DECODER 1 +#define CONFIG_SHORTEN_DECODER 1 +#define CONFIG_SIPR_DECODER 1 +#define CONFIG_SMACKAUD_DECODER 1 +#define CONFIG_SONIC_DECODER 1 +#define CONFIG_TRUEHD_DECODER 1 +#define CONFIG_TRUESPEECH_DECODER 1 +#define CONFIG_TTA_DECODER 1 +#define CONFIG_TWINVQ_DECODER 1 +#define CONFIG_VMDAUDIO_DECODER 1 +#define CONFIG_VORBIS_DECODER 1 +#define CONFIG_WAVPACK_DECODER 1 +#define CONFIG_WMAPRO_DECODER 1 +#define CONFIG_WMAV1_DECODER 1 +#define CONFIG_WMAV2_DECODER 1 +#define CONFIG_WMAVOICE_DECODER 1 +#define CONFIG_WS_SND1_DECODER 1 +#define CONFIG_PCM_ALAW_DECODER 1 +#define CONFIG_PCM_BLURAY_DECODER 1 +#define CONFIG_PCM_DVD_DECODER 1 +#define CONFIG_PCM_F32BE_DECODER 1 +#define CONFIG_PCM_F32LE_DECODER 1 +#define CONFIG_PCM_F64BE_DECODER 1 +#define CONFIG_PCM_F64LE_DECODER 1 +#define CONFIG_PCM_MULAW_DECODER 1 +#define CONFIG_PCM_S8_DECODER 1 +#define CONFIG_PCM_S16BE_DECODER 1 +#define CONFIG_PCM_S16LE_DECODER 1 +#define CONFIG_PCM_S16LE_PLANAR_DECODER 1 +#define CONFIG_PCM_S24BE_DECODER 1 +#define CONFIG_PCM_S24DAUD_DECODER 1 +#define CONFIG_PCM_S24LE_DECODER 1 +#define CONFIG_PCM_S32BE_DECODER 1 +#define CONFIG_PCM_S32LE_DECODER 1 +#define CONFIG_PCM_U8_DECODER 1 +#define CONFIG_PCM_U16BE_DECODER 1 +#define CONFIG_PCM_U16LE_DECODER 1 +#define CONFIG_PCM_U24BE_DECODER 1 +#define CONFIG_PCM_U24LE_DECODER 1 +#define CONFIG_PCM_U32BE_DECODER 1 +#define CONFIG_PCM_U32LE_DECODER 1 +#define CONFIG_PCM_ZORK_DECODER 1 +#define CONFIG_INTERPLAY_DPCM_DECODER 1 +#define CONFIG_ROQ_DPCM_DECODER 1 +#define CONFIG_SOL_DPCM_DECODER 1 +#define CONFIG_XAN_DPCM_DECODER 1 +#define CONFIG_ADPCM_4XM_DECODER 1 +#define CONFIG_ADPCM_ADX_DECODER 1 +#define CONFIG_ADPCM_CT_DECODER 1 +#define CONFIG_ADPCM_EA_DECODER 1 +#define CONFIG_ADPCM_EA_MAXIS_XA_DECODER 1 +#define CONFIG_ADPCM_EA_R1_DECODER 1 +#define CONFIG_ADPCM_EA_R2_DECODER 1 +#define CONFIG_ADPCM_EA_R3_DECODER 1 +#define CONFIG_ADPCM_EA_XAS_DECODER 1 +#define 
CONFIG_ADPCM_G726_DECODER 1 +#define CONFIG_ADPCM_IMA_AMV_DECODER 1 +#define CONFIG_ADPCM_IMA_DK3_DECODER 1 +#define CONFIG_ADPCM_IMA_DK4_DECODER 1 +#define CONFIG_ADPCM_IMA_EA_EACS_DECODER 1 +#define CONFIG_ADPCM_IMA_EA_SEAD_DECODER 1 +#define CONFIG_ADPCM_IMA_ISS_DECODER 1 +#define CONFIG_ADPCM_IMA_QT_DECODER 1 +#define CONFIG_ADPCM_IMA_SMJPEG_DECODER 1 +#define CONFIG_ADPCM_IMA_WAV_DECODER 1 +#define CONFIG_ADPCM_IMA_WS_DECODER 1 +#define CONFIG_ADPCM_MS_DECODER 1 +#define CONFIG_ADPCM_SBPRO_2_DECODER 1 +#define CONFIG_ADPCM_SBPRO_3_DECODER 1 +#define CONFIG_ADPCM_SBPRO_4_DECODER 1 +#define CONFIG_ADPCM_SWF_DECODER 1 +#define CONFIG_ADPCM_THP_DECODER 1 +#define CONFIG_ADPCM_XA_DECODER 1 +#define CONFIG_ADPCM_YAMAHA_DECODER 1 +#define CONFIG_DVBSUB_DECODER 1 +#define CONFIG_DVDSUB_DECODER 1 +#define CONFIG_PGSSUB_DECODER 1 +#define CONFIG_XSUB_DECODER 1 +#define CONFIG_LIBDIRAC_DECODER 0 +#define CONFIG_LIBGSM_DECODER 0 +#define CONFIG_LIBGSM_MS_DECODER 0 +#define CONFIG_LIBOPENCORE_AMRNB_DECODER 0 +#define CONFIG_LIBOPENCORE_AMRWB_DECODER 0 +#define CONFIG_LIBOPENJPEG_DECODER 0 +#define CONFIG_LIBSCHROEDINGER_DECODER 0 +#define CONFIG_LIBSPEEX_DECODER 0 +#define CONFIG_LIBVPX_DECODER 0 +#define CONFIG_ASV1_ENCODER 1 +#define CONFIG_ASV2_ENCODER 1 +#define CONFIG_BMP_ENCODER 1 +#define CONFIG_DNXHD_ENCODER 1 +#define CONFIG_DVVIDEO_ENCODER 1 +#define CONFIG_FFV1_ENCODER 1 +#define CONFIG_FFVHUFF_ENCODER 1 +#define CONFIG_FLASHSV_ENCODER 0 +#define CONFIG_FLV_ENCODER 1 +#define CONFIG_GIF_ENCODER 1 +#define CONFIG_H261_ENCODER 1 +#define CONFIG_H263_ENCODER 1 +#define CONFIG_H263P_ENCODER 1 +#define CONFIG_HUFFYUV_ENCODER 1 +#define CONFIG_JPEGLS_ENCODER 1 +#define CONFIG_LJPEG_ENCODER 1 +#define CONFIG_MJPEG_ENCODER 1 +#define CONFIG_MPEG1VIDEO_ENCODER 1 +#define CONFIG_MPEG2VIDEO_ENCODER 1 +#define CONFIG_MPEG4_ENCODER 1 +#define CONFIG_MSMPEG4V1_ENCODER 1 +#define CONFIG_MSMPEG4V2_ENCODER 1 +#define CONFIG_MSMPEG4V3_ENCODER 1 +#define CONFIG_PAM_ENCODER 1 
+#define CONFIG_PBM_ENCODER 1 +#define CONFIG_PCX_ENCODER 1 +#define CONFIG_PGM_ENCODER 1 +#define CONFIG_PGMYUV_ENCODER 1 +#define CONFIG_PNG_ENCODER 0 +#define CONFIG_PPM_ENCODER 1 +#define CONFIG_QTRLE_ENCODER 1 +#define CONFIG_RAWVIDEO_ENCODER 1 +#define CONFIG_ROQ_ENCODER 1 +#define CONFIG_RV10_ENCODER 1 +#define CONFIG_RV20_ENCODER 1 +#define CONFIG_SGI_ENCODER 1 +#define CONFIG_SNOW_ENCODER 1 +#define CONFIG_SVQ1_ENCODER 1 +#define CONFIG_TARGA_ENCODER 1 +#define CONFIG_TIFF_ENCODER 1 +#define CONFIG_V210_ENCODER 1 +#define CONFIG_WMV1_ENCODER 1 +#define CONFIG_WMV2_ENCODER 1 +#define CONFIG_ZLIB_ENCODER 0 +#define CONFIG_ZMBV_ENCODER 0 +#define CONFIG_AAC_ENCODER 1 +#define CONFIG_AC3_ENCODER 1 +#define CONFIG_ALAC_ENCODER 1 +#define CONFIG_FLAC_ENCODER 1 +#define CONFIG_MP2_ENCODER 1 +#define CONFIG_NELLYMOSER_ENCODER 1 +#define CONFIG_RA_144_ENCODER 1 +#define CONFIG_SONIC_ENCODER 1 +#define CONFIG_SONIC_LS_ENCODER 1 +#define CONFIG_VORBIS_ENCODER 1 +#define CONFIG_WMAV1_ENCODER 1 +#define CONFIG_WMAV2_ENCODER 1 +#define CONFIG_PCM_ALAW_ENCODER 1 +#define CONFIG_PCM_F32BE_ENCODER 1 +#define CONFIG_PCM_F32LE_ENCODER 1 +#define CONFIG_PCM_F64BE_ENCODER 1 +#define CONFIG_PCM_F64LE_ENCODER 1 +#define CONFIG_PCM_MULAW_ENCODER 1 +#define CONFIG_PCM_S8_ENCODER 1 +#define CONFIG_PCM_S16BE_ENCODER 1 +#define CONFIG_PCM_S16LE_ENCODER 1 +#define CONFIG_PCM_S24BE_ENCODER 1 +#define CONFIG_PCM_S24DAUD_ENCODER 1 +#define CONFIG_PCM_S24LE_ENCODER 1 +#define CONFIG_PCM_S32BE_ENCODER 1 +#define CONFIG_PCM_S32LE_ENCODER 1 +#define CONFIG_PCM_U8_ENCODER 1 +#define CONFIG_PCM_U16BE_ENCODER 1 +#define CONFIG_PCM_U16LE_ENCODER 1 +#define CONFIG_PCM_U24BE_ENCODER 1 +#define CONFIG_PCM_U24LE_ENCODER 1 +#define CONFIG_PCM_U32BE_ENCODER 1 +#define CONFIG_PCM_U32LE_ENCODER 1 +#define CONFIG_PCM_ZORK_ENCODER 1 +#define CONFIG_ROQ_DPCM_ENCODER 1 +#define CONFIG_ADPCM_ADX_ENCODER 1 +#define CONFIG_ADPCM_G726_ENCODER 1 +#define CONFIG_ADPCM_IMA_QT_ENCODER 1 +#define 
CONFIG_ADPCM_IMA_WAV_ENCODER 1 +#define CONFIG_ADPCM_MS_ENCODER 1 +#define CONFIG_ADPCM_SWF_ENCODER 1 +#define CONFIG_ADPCM_YAMAHA_ENCODER 1 +#define CONFIG_DVBSUB_ENCODER 1 +#define CONFIG_DVDSUB_ENCODER 1 +#define CONFIG_XSUB_ENCODER 1 +#define CONFIG_LIBDIRAC_ENCODER 0 +#define CONFIG_LIBFAAC_ENCODER 0 +#define CONFIG_LIBGSM_ENCODER 0 +#define CONFIG_LIBGSM_MS_ENCODER 0 +#define CONFIG_LIBMP3LAME_ENCODER 0 +#define CONFIG_LIBOPENCORE_AMRNB_ENCODER 0 +#define CONFIG_LIBSCHROEDINGER_ENCODER 0 +#define CONFIG_LIBTHEORA_ENCODER 0 +#define CONFIG_LIBVORBIS_ENCODER 0 +#define CONFIG_LIBVPX_ENCODER 0 +#define CONFIG_LIBX264_ENCODER 0 +#define CONFIG_LIBXVID_ENCODER 0 +#define CONFIG_H263_VAAPI_HWACCEL 0 +#define CONFIG_H264_DXVA2_HWACCEL 0 +#define CONFIG_H264_VAAPI_HWACCEL 0 +#define CONFIG_MPEG2_DXVA2_HWACCEL 0 +#define CONFIG_MPEG2_VAAPI_HWACCEL 0 +#define CONFIG_MPEG4_VAAPI_HWACCEL 0 +#define CONFIG_VC1_DXVA2_HWACCEL 0 +#define CONFIG_VC1_VAAPI_HWACCEL 0 +#define CONFIG_WMV3_DXVA2_HWACCEL 0 +#define CONFIG_WMV3_VAAPI_HWACCEL 0 +#define CONFIG_AAC_PARSER 1 +#define CONFIG_AC3_PARSER 1 +#define CONFIG_CAVSVIDEO_PARSER 1 +#define CONFIG_DCA_PARSER 1 +#define CONFIG_DIRAC_PARSER 1 +#define CONFIG_DNXHD_PARSER 1 +#define CONFIG_DVBSUB_PARSER 1 +#define CONFIG_DVDSUB_PARSER 1 +#define CONFIG_H261_PARSER 1 +#define CONFIG_H263_PARSER 1 +#define CONFIG_H264_PARSER 1 +#define CONFIG_MJPEG_PARSER 1 +#define CONFIG_MLP_PARSER 1 +#define CONFIG_MPEG4VIDEO_PARSER 1 +#define CONFIG_MPEGAUDIO_PARSER 1 +#define CONFIG_MPEGVIDEO_PARSER 1 +#define CONFIG_PNM_PARSER 1 +#define CONFIG_VC1_PARSER 1 +#define CONFIG_VP3_PARSER 1 +#define CONFIG_VP8_PARSER 1 +#define CONFIG_AAC_ADTSTOASC_BSF 1 +#define CONFIG_CHOMP_BSF 1 +#define CONFIG_DUMP_EXTRADATA_BSF 1 +#define CONFIG_H264_MP4TOANNEXB_BSF 1 +#define CONFIG_IMX_DUMP_HEADER_BSF 1 +#define CONFIG_MJPEGA_DUMP_HEADER_BSF 1 +#define CONFIG_MP3_HEADER_COMPRESS_BSF 1 +#define CONFIG_MP3_HEADER_DECOMPRESS_BSF 1 +#define CONFIG_MOV2TEXTSUB_BSF 
1 +#define CONFIG_NOISE_BSF 1 +#define CONFIG_REMOVE_EXTRADATA_BSF 1 +#define CONFIG_TEXT2MOVSUB_BSF 1 +#define CONFIG_AAC_DEMUXER 1 +#define CONFIG_AC3_DEMUXER 1 +#define CONFIG_AEA_DEMUXER 1 +#define CONFIG_AIFF_DEMUXER 1 +#define CONFIG_AMR_DEMUXER 1 +#define CONFIG_ANM_DEMUXER 1 +#define CONFIG_APC_DEMUXER 1 +#define CONFIG_APE_DEMUXER 1 +#define CONFIG_ASF_DEMUXER 1 +#define CONFIG_ASS_DEMUXER 1 +#define CONFIG_AU_DEMUXER 1 +#define CONFIG_AVI_DEMUXER 1 +#define CONFIG_AVISYNTH_DEMUXER 0 +#define CONFIG_AVS_DEMUXER 1 +#define CONFIG_BETHSOFTVID_DEMUXER 1 +#define CONFIG_BFI_DEMUXER 1 +#define CONFIG_BINK_DEMUXER 1 +#define CONFIG_C93_DEMUXER 1 +#define CONFIG_CAF_DEMUXER 1 +#define CONFIG_CAVSVIDEO_DEMUXER 1 +#define CONFIG_CDG_DEMUXER 1 +#define CONFIG_DAUD_DEMUXER 1 +#define CONFIG_DIRAC_DEMUXER 1 +#define CONFIG_DNXHD_DEMUXER 1 +#define CONFIG_DSICIN_DEMUXER 1 +#define CONFIG_DTS_DEMUXER 1 +#define CONFIG_DV_DEMUXER 1 +#define CONFIG_DXA_DEMUXER 1 +#define CONFIG_EA_DEMUXER 1 +#define CONFIG_EA_CDATA_DEMUXER 1 +#define CONFIG_EAC3_DEMUXER 1 +#define CONFIG_FFM_DEMUXER 1 +#define CONFIG_FILMSTRIP_DEMUXER 1 +#define CONFIG_FLAC_DEMUXER 1 +#define CONFIG_FLIC_DEMUXER 1 +#define CONFIG_FLV_DEMUXER 1 +#define CONFIG_FOURXM_DEMUXER 1 +#define CONFIG_GSM_DEMUXER 1 +#define CONFIG_GXF_DEMUXER 1 +#define CONFIG_H261_DEMUXER 1 +#define CONFIG_H263_DEMUXER 1 +#define CONFIG_H264_DEMUXER 1 +#define CONFIG_IDCIN_DEMUXER 1 +#define CONFIG_IFF_DEMUXER 1 +#define CONFIG_IMAGE2_DEMUXER 1 +#define CONFIG_IMAGE2PIPE_DEMUXER 1 +#define CONFIG_INGENIENT_DEMUXER 1 +#define CONFIG_IPMOVIE_DEMUXER 1 +#define CONFIG_ISS_DEMUXER 1 +#define CONFIG_IV8_DEMUXER 1 +#define CONFIG_IVF_DEMUXER 1 +#define CONFIG_LMLM4_DEMUXER 1 +#define CONFIG_M4V_DEMUXER 1 +#define CONFIG_MATROSKA_DEMUXER 1 +#define CONFIG_MJPEG_DEMUXER 1 +#define CONFIG_MLP_DEMUXER 1 +#define CONFIG_MM_DEMUXER 1 +#define CONFIG_MMF_DEMUXER 1 +#define CONFIG_MOV_DEMUXER 1 +#define CONFIG_MP3_DEMUXER 1 +#define 
CONFIG_MPC_DEMUXER 1 +#define CONFIG_MPC8_DEMUXER 1 +#define CONFIG_MPEGPS_DEMUXER 1 +#define CONFIG_MPEGTS_DEMUXER 1 +#define CONFIG_MPEGTSRAW_DEMUXER 1 +#define CONFIG_MPEGVIDEO_DEMUXER 1 +#define CONFIG_MSNWC_TCP_DEMUXER 1 +#define CONFIG_MTV_DEMUXER 1 +#define CONFIG_MVI_DEMUXER 1 +#define CONFIG_MXF_DEMUXER 1 +#define CONFIG_NC_DEMUXER 1 +#define CONFIG_NSV_DEMUXER 1 +#define CONFIG_NUT_DEMUXER 1 +#define CONFIG_NUV_DEMUXER 1 +#define CONFIG_OGG_DEMUXER 1 +#define CONFIG_OMA_DEMUXER 1 +#define CONFIG_PCM_ALAW_DEMUXER 1 +#define CONFIG_PCM_MULAW_DEMUXER 1 +#define CONFIG_PCM_F64BE_DEMUXER 1 +#define CONFIG_PCM_F64LE_DEMUXER 1 +#define CONFIG_PCM_F32BE_DEMUXER 1 +#define CONFIG_PCM_F32LE_DEMUXER 1 +#define CONFIG_PCM_S32BE_DEMUXER 1 +#define CONFIG_PCM_S32LE_DEMUXER 1 +#define CONFIG_PCM_S24BE_DEMUXER 1 +#define CONFIG_PCM_S24LE_DEMUXER 1 +#define CONFIG_PCM_S16BE_DEMUXER 1 +#define CONFIG_PCM_S16LE_DEMUXER 1 +#define CONFIG_PCM_S8_DEMUXER 1 +#define CONFIG_PCM_U32BE_DEMUXER 1 +#define CONFIG_PCM_U32LE_DEMUXER 1 +#define CONFIG_PCM_U24BE_DEMUXER 1 +#define CONFIG_PCM_U24LE_DEMUXER 1 +#define CONFIG_PCM_U16BE_DEMUXER 1 +#define CONFIG_PCM_U16LE_DEMUXER 1 +#define CONFIG_PCM_U8_DEMUXER 1 +#define CONFIG_PVA_DEMUXER 1 +#define CONFIG_QCP_DEMUXER 1 +#define CONFIG_R3D_DEMUXER 1 +#define CONFIG_RAWVIDEO_DEMUXER 1 +#define CONFIG_RL2_DEMUXER 1 +#define CONFIG_RM_DEMUXER 1 +#define CONFIG_ROQ_DEMUXER 1 +#define CONFIG_RPL_DEMUXER 1 +#define CONFIG_RTSP_DEMUXER 1 +#define CONFIG_SDP_DEMUXER 1 +#define CONFIG_SEGAFILM_DEMUXER 1 +#define CONFIG_SHORTEN_DEMUXER 1 +#define CONFIG_SIFF_DEMUXER 1 +#define CONFIG_SMACKER_DEMUXER 1 +#define CONFIG_SOL_DEMUXER 1 +#define CONFIG_SOX_DEMUXER 1 +#define CONFIG_STR_DEMUXER 1 +#define CONFIG_SWF_DEMUXER 1 +#define CONFIG_THP_DEMUXER 1 +#define CONFIG_TIERTEXSEQ_DEMUXER 1 +#define CONFIG_TMV_DEMUXER 1 +#define CONFIG_TRUEHD_DEMUXER 1 +#define CONFIG_TTA_DEMUXER 1 +#define CONFIG_TXD_DEMUXER 1 +#define CONFIG_VC1_DEMUXER 1 +#define 
CONFIG_VC1T_DEMUXER 1 +#define CONFIG_VMD_DEMUXER 1 +#define CONFIG_VOC_DEMUXER 1 +#define CONFIG_VQF_DEMUXER 1 +#define CONFIG_W64_DEMUXER 1 +#define CONFIG_WAV_DEMUXER 1 +#define CONFIG_WC3_DEMUXER 1 +#define CONFIG_WSAUD_DEMUXER 1 +#define CONFIG_WSVQA_DEMUXER 1 +#define CONFIG_WV_DEMUXER 1 +#define CONFIG_XA_DEMUXER 1 +#define CONFIG_YOP_DEMUXER 1 +#define CONFIG_YUV4MPEGPIPE_DEMUXER 1 +#define CONFIG_LIBNUT_DEMUXER 0 +#define CONFIG_AC3_MUXER 1 +#define CONFIG_ADTS_MUXER 1 +#define CONFIG_AIFF_MUXER 1 +#define CONFIG_AMR_MUXER 1 +#define CONFIG_ASF_MUXER 1 +#define CONFIG_ASS_MUXER 1 +#define CONFIG_ASF_STREAM_MUXER 1 +#define CONFIG_AU_MUXER 1 +#define CONFIG_AVI_MUXER 1 +#define CONFIG_AVM2_MUXER 1 +#define CONFIG_CRC_MUXER 1 +#define CONFIG_DAUD_MUXER 1 +#define CONFIG_DIRAC_MUXER 1 +#define CONFIG_DNXHD_MUXER 1 +#define CONFIG_DTS_MUXER 1 +#define CONFIG_DV_MUXER 1 +#define CONFIG_EAC3_MUXER 1 +#define CONFIG_FFM_MUXER 1 +#define CONFIG_FILMSTRIP_MUXER 1 +#define CONFIG_FLAC_MUXER 1 +#define CONFIG_FLV_MUXER 1 +#define CONFIG_FRAMECRC_MUXER 1 +#define CONFIG_FRAMEMD5_MUXER 1 +#define CONFIG_GIF_MUXER 1 +#define CONFIG_GXF_MUXER 1 +#define CONFIG_H261_MUXER 1 +#define CONFIG_H263_MUXER 1 +#define CONFIG_H264_MUXER 1 +#define CONFIG_IMAGE2_MUXER 1 +#define CONFIG_IMAGE2PIPE_MUXER 1 +#define CONFIG_IPOD_MUXER 1 +#define CONFIG_M4V_MUXER 1 +#define CONFIG_MD5_MUXER 1 +#define CONFIG_MATROSKA_MUXER 1 +#define CONFIG_MATROSKA_AUDIO_MUXER 1 +#define CONFIG_MJPEG_MUXER 1 +#define CONFIG_MLP_MUXER 1 +#define CONFIG_MMF_MUXER 1 +#define CONFIG_MOV_MUXER 1 +#define CONFIG_MP2_MUXER 1 +#define CONFIG_MP3_MUXER 1 +#define CONFIG_MP4_MUXER 1 +#define CONFIG_MPEG1SYSTEM_MUXER 1 +#define CONFIG_MPEG1VCD_MUXER 1 +#define CONFIG_MPEG1VIDEO_MUXER 1 +#define CONFIG_MPEG2DVD_MUXER 1 +#define CONFIG_MPEG2SVCD_MUXER 1 +#define CONFIG_MPEG2VIDEO_MUXER 1 +#define CONFIG_MPEG2VOB_MUXER 1 +#define CONFIG_MPEGTS_MUXER 1 +#define CONFIG_MPJPEG_MUXER 1 +#define CONFIG_MXF_MUXER 1 
+#define CONFIG_MXF_D10_MUXER 1 +#define CONFIG_NULL_MUXER 1 +#define CONFIG_NUT_MUXER 1 +#define CONFIG_OGG_MUXER 1 +#define CONFIG_PCM_ALAW_MUXER 1 +#define CONFIG_PCM_MULAW_MUXER 1 +#define CONFIG_PCM_F64BE_MUXER 1 +#define CONFIG_PCM_F64LE_MUXER 1 +#define CONFIG_PCM_F32BE_MUXER 1 +#define CONFIG_PCM_F32LE_MUXER 1 +#define CONFIG_PCM_S32BE_MUXER 1 +#define CONFIG_PCM_S32LE_MUXER 1 +#define CONFIG_PCM_S24BE_MUXER 1 +#define CONFIG_PCM_S24LE_MUXER 1 +#define CONFIG_PCM_S16BE_MUXER 1 +#define CONFIG_PCM_S16LE_MUXER 1 +#define CONFIG_PCM_S8_MUXER 1 +#define CONFIG_PCM_U32BE_MUXER 1 +#define CONFIG_PCM_U32LE_MUXER 1 +#define CONFIG_PCM_U24BE_MUXER 1 +#define CONFIG_PCM_U24LE_MUXER 1 +#define CONFIG_PCM_U16BE_MUXER 1 +#define CONFIG_PCM_U16LE_MUXER 1 +#define CONFIG_PCM_U8_MUXER 1 +#define CONFIG_PSP_MUXER 1 +#define CONFIG_RAWVIDEO_MUXER 1 +#define CONFIG_RM_MUXER 1 +#define CONFIG_ROQ_MUXER 1 +#define CONFIG_RTP_MUXER 1 +#define CONFIG_RTSP_MUXER 1 +#define CONFIG_SOX_MUXER 1 +#define CONFIG_SPDIF_MUXER 1 +#define CONFIG_SWF_MUXER 1 +#define CONFIG_TG2_MUXER 1 +#define CONFIG_TGP_MUXER 1 +#define CONFIG_TRUEHD_MUXER 1 +#define CONFIG_VC1T_MUXER 1 +#define CONFIG_VOC_MUXER 1 +#define CONFIG_WAV_MUXER 1 +#define CONFIG_WEBM_MUXER 1 +#define CONFIG_YUV4MPEGPIPE_MUXER 1 +#define CONFIG_LIBNUT_MUXER 0 +#define CONFIG_ASPECT_FILTER 1 +#define CONFIG_CROP_FILTER 1 +#define CONFIG_FORMAT_FILTER 1 +#define CONFIG_NOFORMAT_FILTER 1 +#define CONFIG_NULL_FILTER 1 +#define CONFIG_PAD_FILTER 1 +#define CONFIG_PIXDESCTEST_FILTER 1 +#define CONFIG_PIXELASPECT_FILTER 1 +#define CONFIG_SCALE_FILTER 1 +#define CONFIG_SLICIFY_FILTER 1 +#define CONFIG_UNSHARP_FILTER 1 +#define CONFIG_VFLIP_FILTER 1 +#define CONFIG_BUFFER_FILTER 1 +#define CONFIG_NULLSRC_FILTER 1 +#define CONFIG_NULLSINK_FILTER 1 +#define CONFIG_FILE_PROTOCOL 1 +#define CONFIG_GOPHER_PROTOCOL 1 +#define CONFIG_HTTP_PROTOCOL 1 +#define CONFIG_MMST_PROTOCOL 1 +#define CONFIG_PIPE_PROTOCOL 1 +#define CONFIG_RTMP_PROTOCOL 1 
+#define CONFIG_RTMPT_PROTOCOL 1 +#define CONFIG_RTMPE_PROTOCOL 1 +#define CONFIG_RTMPTE_PROTOCOL 1 +#define CONFIG_RTMPS_PROTOCOL 1 +#define CONFIG_RTP_PROTOCOL 1 +#define CONFIG_TCP_PROTOCOL 1 +#define CONFIG_UDP_PROTOCOL 1 +#define CONFIG_CONCAT_PROTOCOL 1 +#define CONFIG_ALSA_INDEV 0 +#define CONFIG_BKTR_INDEV 0 +#define CONFIG_DV1394_INDEV 1 +#define CONFIG_JACK_INDEV 0 +#define CONFIG_OSS_INDEV 1 +#define CONFIG_V4L2_INDEV 1 +#define CONFIG_V4L_INDEV 1 +#define CONFIG_VFWCAP_INDEV 0 +#define CONFIG_X11_GRAB_DEVICE_INDEV 0 +#define CONFIG_LIBDC1394_INDEV 0 +#define CONFIG_ALSA_OUTDEV 0 +#define CONFIG_OSS_OUTDEV 1 +#endif /* FFMPEG_CONFIG_H */ diff --git a/plugins/supereq/ffmpeg_fft/ffmpeg_fft.h b/plugins/supereq/ffmpeg_fft/ffmpeg_fft.h new file mode 100644 index 00000000..b98313d2 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/ffmpeg_fft.h @@ -0,0 +1,95 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AVFFT_H +#define AVCODEC_AVFFT_H + +typedef float FFTSample; + +typedef struct FFTComplex { + FFTSample re, im; +} FFTComplex; + +//#define FFTC_SZ 32 +typedef struct FFTContext FFTContext; + +/** + * Set up a complex FFT. 
+ * @param nbits log2 of the length of the input array + * @param inverse if 0 perform the forward transform, if 1 perform the inverse + */ +FFTContext *av_fft_init(int nbits, int inverse); + +/** + * Do the permutation needed BEFORE calling ff_fft_calc(). + */ +void av_fft_permute(FFTContext *s, FFTComplex *z); + +/** + * Do a complex FFT with the parameters defined in av_fft_init(). The + * input data must be permuted before. No 1.0/sqrt(n) normalization is done. + */ +void av_fft_calc(FFTContext *s, FFTComplex *z); + +void av_fft_end(FFTContext *s); + +/* Real Discrete Fourier Transform */ + +enum RDFTransformType { + DFT_R2C, + IDFT_C2R, + IDFT_R2C, + DFT_C2R, +}; + +//#define RDFTC_SZ 56 +typedef struct RDFTContext RDFTContext; + +/** + * Set up a real FFT. + * @param nbits log2 of the length of the input array + * @param trans the type of transform + */ +RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans); +void av_rdft_calc(RDFTContext *s, FFTSample *data); +void av_rdft_end(RDFTContext *s); + +/* Discrete Cosine Transform */ + +typedef struct DCTContext DCTContext; + +enum DCTTransformType { + DCT_II = 0, + DCT_III, + DCT_I, + DST_I, +}; + +/** + * Set up DCT. + * @param nbits size of the input array: + * (1 << nbits) for DCT-II, DCT-III and DST-I + * (1 << nbits) + 1 for DCT-I + * + * @note the first element of the input of DST-I is ignored + */ +DCTContext *av_dct_init(int nbits, enum DCTTransformType type); +void av_dct_calc(DCTContext *s, FFTSample *data); +void av_dct_end (DCTContext *s); + +#endif /* AVCODEC_AVFFT_H */ diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/asm.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/asm.S new file mode 100644 index 00000000..6860f1cf --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/asm.S @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#ifdef __ELF__ +# define ELF +#else +# define ELF @ +#endif + +.macro require8 val=1 +ELF .eabi_attribute 24, \val +.endm + +.macro preserve8 val=1 +ELF .eabi_attribute 25, \val +.endm + +/* +.macro function name, export=0 + .macro endfunc +ELF .size \name, . - \name + .endfunc + .purgem endfunc + .endm + .text + .if \export + .global EXTERN_ASM\name +EXTERN_ASM\name: + .endif +ELF .type \name, %function + .func \name +\name: +.endm +*/ + +.macro function name, export=0 + .macro endfunc +ELF .size \name, . 
- \name + .endfunc + .purgem endfunc + .endm + .text + .if \export + .hidden EXTERN_ASM\name + .global EXTERN_ASM\name +EXTERN_ASM\name: + .endif +ELF .type \name, %function + .func \name +\name: +.endm + +.macro mov32 rd, val +#if HAVE_ARMV6T2 + movw \rd, #(\val) & 0xffff + .if (\val) >> 16 + movt \rd, #(\val) >> 16 + .endif +#else + ldr \rd, =\val +#endif +.endm + +.macro movrel rd, val +#if HAVE_ARMV6T2 && !CONFIG_PIC + movw \rd, #:lower16:\val + movt \rd, #:upper16:\val +#else + ldr \rd, =\val +#endif +.endm + +#if HAVE_VFP_ARGS + .eabi_attribute 28, 1 +# define VFP +# define NOVFP @ +#else +# define VFP @ +# define NOVFP +#endif + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_init_arm.c b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_init_arm.c new file mode 100644 index 00000000..28148e92 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_init_arm.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/fft.h" +#if CONFIG_DCA_DECODER +#include "libavcodec/synth_filter.h" +#endif + +void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); +void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); + +#if 0 +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +#endif + +void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); + +void ff_synth_filter_float_neon(FFTContext *imdct, + float *synth_buf_ptr, int *synth_buf_offset, + float synth_buf2[32], const float window[512], + float out[32], const float in[32], + float scale, float bias); + +av_cold void ff_fft_init_arm(FFTContext *s) +{ + if (HAVE_NEON) { + s->fft_permute = ff_fft_permute_neon; + s->fft_calc = ff_fft_calc_neon; +#if 0 + s->imdct_calc = ff_imdct_calc_neon; + s->imdct_half = ff_imdct_half_neon; + s->mdct_calc = ff_mdct_calc_neon; + s->permutation = FF_MDCT_PERM_INTERLEAVE; +#endif + } +} + +#if CONFIG_RDFT +av_cold void ff_rdft_init_arm(RDFTContext *s) +{ + if (HAVE_NEON) + s->rdft_calc = ff_rdft_calc_neon; +} +#endif + +#if CONFIG_DCA_DECODER +av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) +{ + if (HAVE_NEON) + s->synth_filter_float = ff_synth_filter_float_neon; +} +#endif diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_neon.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_neon.S new file mode 100644 index 00000000..117f4fee --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_neon.S @@ -0,0 +1,372 @@ +/* + * ARM NEON optimised FFT + * + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2009 Naotoshi Nojiri 
+ * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +#define M_SQRT1_2 0.70710678118654752440 + + .text + +function fft4_neon + vld1.32 {d0-d3}, [r0,:128] + + vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2 + vsub.f32 d6, d0, d1 @ r0-r1,i0-i1 + vsub.f32 d7, d16, d17 @ r3-r2,i2-i3 + vadd.f32 d4, d0, d1 @ r0+r1,i0+i1 + vadd.f32 d5, d2, d3 @ i2+i3,r2+r3 + vadd.f32 d1, d6, d7 + vsub.f32 d3, d6, d7 + vadd.f32 d0, d4, d5 + vsub.f32 d2, d4, d5 + + vst1.32 {d0-d3}, [r0,:128] + + bx lr +endfunc + +function fft8_neon + mov r1, r0 + vld1.32 {d0-d3}, [r1,:128]! 
+ vld1.32 {d16-d19}, [r1,:128] + + movw r2, #0x04f3 @ sqrt(1/2) + movt r2, #0x3f35 + eor r3, r2, #1<<31 + vdup.32 d31, r2 + + vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2 + vadd.f32 d4, d16, d17 @ r4+r5,i4+i5 + vmov d28, r3, r2 + vadd.f32 d5, d18, d19 @ r6+r7,i6+i7 + vsub.f32 d17, d16, d17 @ r4-r5,i4-i5 + vsub.f32 d19, d18, d19 @ r6-r7,i6-i7 + vrev64.32 d29, d28 + vadd.f32 d20, d0, d1 @ r0+r1,i0+i1 + vadd.f32 d21, d2, d3 @ r2+r3,i2+i3 + vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w + vext.32 q3, q2, q2, #1 + vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w + vsub.f32 d23, d22, d23 @ i2-i3,r3-r2 + vsub.f32 d22, d0, d1 @ r0-r1,i0-i1 + vmul.f32 d24, d17, d31 @ a2r*w,a2i*w + vmul.f32 d25, d19, d31 @ a3r*w,a3i*w + vadd.f32 d0, d20, d21 + vsub.f32 d2, d20, d21 + vadd.f32 d1, d22, d23 + vrev64.32 q13, q13 + vsub.f32 d3, d22, d23 + vsub.f32 d6, d6, d7 + vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2 + vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6 + vadd.f32 d7, d4, d5 + vsub.f32 d18, d2, d6 + vext.32 q13, q12, q12, #1 + vadd.f32 d2, d2, d6 + vsub.f32 d16, d0, d7 + vadd.f32 d5, d25, d24 + vsub.f32 d4, d26, d27 + vadd.f32 d0, d0, d7 + vsub.f32 d17, d1, d5 + vsub.f32 d19, d3, d4 + vadd.f32 d3, d3, d4 + vadd.f32 d1, d1, d5 + + vst1.32 {d16-d19}, [r1,:128] + vst1.32 {d0-d3}, [r0,:128] + + bx lr +endfunc + +function fft16_neon + movrel r1, mppm + vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3} + pld [r0, #32] + vld1.32 {d2-d3}, [r1,:128] + vext.32 q13, q9, q9, #1 + vld1.32 {d22-d25}, [r0,:128]! 
@ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7} + vadd.f32 d4, d16, d17 + vsub.f32 d5, d16, d17 + vadd.f32 d18, d18, d19 + vsub.f32 d19, d26, d27 + + vadd.f32 d20, d22, d23 + vsub.f32 d22, d22, d23 + vsub.f32 d23, d24, d25 + vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1} + vadd.f32 d21, d24, d25 + vmul.f32 d24, d22, d2 + vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3} + vmul.f32 d25, d23, d3 + vuzp.32 d16, d17 @ {r0,r1,i0,i1} + vmul.f32 q1, q11, d2[1] + vuzp.32 d18, d19 @ {r2,r3,i2,i3} + vrev64.32 q12, q12 + vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6} + vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11} + vzip.32 q10, q11 + vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15} + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + sub r0, r0, #96 + vext.32 q13, q13, q13, #1 + vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5} + vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} + vext.32 q15, q15, q15, #1 + vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7} + vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10} + vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3} + vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14} + vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6} + vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a} + movrel r2, X(ff_cos_16) + vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8} + vrev64.32 d1, d1 + vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a} + vrev64.32 d3, d3 + movrel r3, pmmp + vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8} + vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a} + vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9} + vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13} + vld1.32 {d4-d5}, [r2,:64] + vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11} + vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15} + vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13} + vld1.32 {d6-d7}, [r3,:128] + vrev64.32 q1, q14 + vmul.f32 q14, q14, d4[1] + vmul.f32 q1, q1, q3 + vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a} + vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15} + vzip.32 q12, q14 + vadd.f32 d0, d28, d24 + vadd.f32 
d1, d25, d29 + vsub.f32 d2, d25, d29 + vsub.f32 d3, d28, d24 + vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9} + vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} + vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13} + mov r1, #32 + vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5} + vrev64.32 q0, q13 + vmul.f32 q13, q13, d5[0] + vrev64.32 q1, q15 + vmul.f32 q15, q15, d5[1] + vst2.32 {d16-d17},[r0,:128], r1 + vmul.f32 q0, q0, q3 + vst2.32 {d20-d21},[r0,:128], r1 + vmul.f32 q1, q1, q3 + vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6} + vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a} + vst2.32 {d24-d25},[r0,:128], r1 + vst2.32 {d28-d29},[r0,:128] + vzip.32 q13, q15 + sub r0, r0, #80 + vadd.f32 d0, d30, d26 + vadd.f32 d1, d27, d31 + vsub.f32 d2, d27, d31 + vsub.f32 d3, d30, d26 + vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11} + vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3} + vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15} + vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7} + vst2.32 {d18-d19},[r0,:128], r1 + vst2.32 {d22-d23},[r0,:128], r1 + vst2.32 {d26-d27},[r0,:128], r1 + vst2.32 {d30-d31},[r0,:128] + bx lr +endfunc + +function fft_pass_neon + push {r4-r6,lr} + mov r6, r2 @ n + lsl r5, r2, #3 @ 2 * n * sizeof FFTSample + lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex + lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex + add r3, r2, r4 + add r4, r4, r0 @ &z[o1] + add r2, r2, r0 @ &z[o2] + add r3, r3, r0 @ &z[o3] + vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} + movrel r12, pmmp + vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} + add r5, r5, r1 @ wim + vld1.32 {d6-d7}, [r12,:128] @ pmmp + vswp d21, d22 + vld1.32 {d4}, [r1,:64]! 
@ {wre[0],wre[1]} + sub r5, r5, #4 @ wim-- + vrev64.32 q1, q11 + vmul.f32 q11, q11, d4[1] + vmul.f32 q1, q1, q3 + vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1] + vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} + vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} + sub r6, r6, #1 @ n-- + vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} + vzip.32 q10, q11 + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + vsub.f32 q10, q8, q0 + vadd.f32 q8, q8, q0 + vsub.f32 q11, q9, q1 + vadd.f32 q9, q9, q1 + vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]} + vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]} + vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]} + vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]} + sub r5, r5, #8 @ wim -= 2 +1: + vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} + vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} + vswp d21, d22 + vld1.32 {d4}, [r1]! @ {wre[0],wre[1]} + vrev64.32 q0, q10 + vmul.f32 q10, q10, d4[0] + vrev64.32 q1, q11 + vmul.f32 q11, q11, d4[1] + vld1.32 {d5}, [r5] @ {wim[-1],wim[0]} + vmul.f32 q0, q0, q3 + sub r5, r5, #8 @ wim -= 2 + vmul.f32 q1, q1, q3 + vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6} + vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} + vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} + subs r6, r6, #1 @ n-- + vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} + vzip.32 q10, q11 + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + vsub.f32 q10, q8, q0 + vadd.f32 q8, q8, q0 + vsub.f32 q11, q9, q1 + vadd.f32 q9, q9, q1 + vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]} + vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]} + vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]} + vst2.32 {d18-d19}, [r4,:128]! 
@ {z[o1],z[o1+1]} + bne 1b + + pop {r4-r6,pc} +endfunc + +.macro def_fft n, n2, n4 + .align 6 +function fft\n\()_neon + push {r4, lr} + mov r4, r0 + bl fft\n2\()_neon + add r0, r4, #\n4*2*8 + bl fft\n4\()_neon + add r0, r4, #\n4*3*8 + bl fft\n4\()_neon + mov r0, r4 + pop {r4, lr} + movrel r1, X(ff_cos_\n) + mov r2, #\n4/2 + b fft_pass_neon +endfunc +.endm + + def_fft 32, 16, 8 + def_fft 64, 32, 16 + def_fft 128, 64, 32 + def_fft 256, 128, 64 + def_fft 512, 256, 128 + def_fft 1024, 512, 256 + def_fft 2048, 1024, 512 + def_fft 4096, 2048, 1024 + def_fft 8192, 4096, 2048 + def_fft 16384, 8192, 4096 + def_fft 32768, 16384, 8192 + def_fft 65536, 32768, 16384 + +function ff_fft_calc_neon, export=1 + ldr r2, [r0] + sub r2, r2, #2 + movrel r3, fft_tab_neon + ldr r3, [r3, r2, lsl #2] + mov r0, r1 + bx r3 +endfunc + +function ff_fft_permute_neon, export=1 + push {r4,lr} + mov r12, #1 + ldr r2, [r0] @ nbits + ldr r3, [r0, #12] @ tmp_buf + ldr r0, [r0, #8] @ revtab + lsl r12, r12, r2 + mov r2, r12 +1: + vld1.32 {d0-d1}, [r1,:128]! + ldr r4, [r0], #4 + uxth lr, r4 + uxth r4, r4, ror #16 + add lr, r3, lr, lsl #3 + add r4, r3, r4, lsl #3 + vst1.32 {d0}, [lr,:64] + vst1.32 {d1}, [r4,:64] + subs r12, r12, #2 + bgt 1b + + sub r1, r1, r2, lsl #3 +1: + vld1.32 {d0-d3}, [r3,:128]! + vst1.32 {d0-d3}, [r1,:128]! + subs r2, r2, #4 + bgt 1b + + pop {r4,pc} +endfunc + + .section .rodata + .align 4 +fft_tab_neon: + .word fft4_neon + .word fft8_neon + .word fft16_neon + .word fft32_neon + .word fft64_neon + .word fft128_neon + .word fft256_neon + .word fft512_neon + .word fft1024_neon + .word fft2048_neon + .word fft4096_neon + .word fft8192_neon + .word fft16384_neon + .word fft32768_neon + .word fft65536_neon +ELF .size fft_tab_neon, . 
- fft_tab_neon + + .align 4 +pmmp: .float +1.0, -1.0, -1.0, +1.0 +mppm: .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 + diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/rdft_neon.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/rdft_neon.S new file mode 100644 index 00000000..4f8a1032 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/rdft_neon.S @@ -0,0 +1,151 @@ +/* + * ARM NEON optimised RDFT + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + preserve8 + +function ff_rdft_calc_neon, export=1 + push {r4-r8,lr} + + ldr r6, [r0, #4] @ inverse + mov r4, r0 + mov r5, r1 + + lsls r6, r6, #31 + bne 1f + add r0, r4, #20 + bl X(ff_fft_permute_neon) + add r0, r4, #20 + mov r1, r5 + bl X(ff_fft_calc_neon) +1: + ldr r12, [r4, #0] @ nbits + mov r2, #1 + lsl r12, r2, r12 + add r0, r5, #8 + add r1, r5, r12, lsl #2 + lsr r12, r12, #2 + ldr r2, [r4, #12] @ tcos + sub r12, r12, #2 + ldr r3, [r4, #16] @ tsin + mov r7, r0 + sub r1, r1, #8 + mov lr, r1 + mov r8, #-8 + vld1.32 {d0}, [r0,:64]! @ d1[0,1] + vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] + vld1.32 {d4}, [r2,:64]! @ tcos[i] + vld1.32 {d5}, [r3,:64]! 
@ tsin[i] + vmov.f32 d18, #0.5 @ k1 + vdup.32 d19, r6 + pld [r0, #32] + veor d19, d18, d19 @ k2 + vmov.i32 d16, #0 + vmov.i32 d17, #1<<31 + pld [r1, #-32] + vtrn.32 d16, d17 + pld [r2, #32] + vrev64.32 d16, d16 @ d16=1,0 d17=0,1 + pld [r3, #32] +2: + veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] + vld1.32 {d24}, [r0,:64]! @ d1[0,1] + vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] + vld1.32 {d25}, [r1,:64], r8 @ d2[0,1] + vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] + veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1] + pld [r0, #32] + vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re + pld [r1, #-32] + vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1] + vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1] + vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re + veor d7, d21, d16 @ -od.im, od.re + vrev64.32 d3, d21 @ od.re, od.im + veor d6, d20, d17 @ ev.re,-ev.im + veor d2, d3, d16 @ -od.re, od.im + vmla.f32 d20, d3, d4[1] + vmla.f32 d20, d7, d5[1] + vmla.f32 d6, d2, d4[1] + vmla.f32 d6, d21, d5[1] + vld1.32 {d4}, [r2,:64]! @ tcos[i] + veor d7, d23, d16 @ -od.im, od.re + vld1.32 {d5}, [r3,:64]! @ tsin[i] + veor d24, d22, d17 @ ev.re,-ev.im + vrev64.32 d3, d23 @ od.re, od.im + pld [r2, #32] + veor d2, d3, d16 @ -od.re, od.im + pld [r3, #32] + vmla.f32 d22, d3, d4[0] + vmla.f32 d22, d7, d5[0] + vmla.f32 d24, d2, d4[0] + vmla.f32 d24, d23, d5[0] + vld1.32 {d0}, [r0,:64]! @ d1[0,1] + vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] + vst1.32 {d20}, [r7,:64]! + vst1.32 {d6}, [lr,:64], r8 + vst1.32 {d22}, [r7,:64]! 
+ vst1.32 {d24}, [lr,:64], r8 + subs r12, r12, #2 + bgt 2b + + veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] + vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] + vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] + ldr r2, [r4, #8] @ sign_convention + vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re + add r0, r0, #4 + bfc r2, #0, #31 + vld1.32 {d0[0]}, [r0,:32] + veor d7, d21, d16 @ -od.im, od.re + vrev64.32 d3, d21 @ od.re, od.im + veor d6, d20, d17 @ ev.re,-ev.im + vld1.32 {d22}, [r5,:64] + vdup.32 d1, r2 + vmov d23, d22 + veor d2, d3, d16 @ -od.re, od.im + vtrn.32 d22, d23 + veor d0, d0, d1 + veor d23, d23, d17 + vmla.f32 d20, d3, d4[1] + vmla.f32 d20, d7, d5[1] + vmla.f32 d6, d2, d4[1] + vmla.f32 d6, d21, d5[1] + vadd.f32 d22, d22, d23 + vst1.32 {d20}, [r7,:64] + vst1.32 {d6}, [lr,:64] + vst1.32 {d0[0]}, [r0,:32] + vst1.32 {d22}, [r5,:64] + + cmp r6, #0 + popeq {r4-r8,pc} + + vmul.f32 d22, d22, d18 + vst1.32 {d22}, [r5,:64] + add r0, r4, #20 + mov r1, r5 + bl X(ff_fft_permute_neon) + add r0, r4, #20 + mov r1, r5 + pop {r4-r8,lr} + b X(ff_fft_calc_neon) +endfunc diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/simple_idct_neon.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/simple_idct_neon.S new file mode 100644 index 00000000..17cde583 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/simple_idct_neon.S @@ -0,0 +1,372 @@ +/* + * ARM NEON IDCT + * + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * Based on Simple IDCT + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4c ((1<<(COL_SHIFT-1))/W4) +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define w1 d0[0] +#define w2 d0[1] +#define w3 d0[2] +#define w4 d0[3] +#define w5 d1[0] +#define w6 d1[1] +#define w7 d1[2] +#define w4c d1[3] + + .macro idct_col4_top + vmull.s16 q7, d6, w2 /* q9 = W2 * col[2] */ + vmull.s16 q8, d6, w6 /* q10 = W6 * col[2] */ + vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ + vadd.i32 q11, q15, q7 + vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ + vadd.i32 q12, q15, q8 + vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ + vsub.i32 q13, q15, q8 + vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ + vsub.i32 q14, q15, q7 + + vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ + vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ + vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ + vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ + .endm + + .text + .align 6 + +function idct_row4_pld_neon + pld [r0] + add r3, r0, r1, lsl #2 + pld [r0, r1] + pld [r0, r1, lsl #1] + pld [r3, -r1] + pld [r3] + pld [r3, r1] + add r3, r3, r1, lsl #1 + pld [r3] + pld [r3, r1] +endfunc + +function idct_row4_neon + vmov.i32 q15, 
#(1<<(ROW_SHIFT-1)) + vld1.64 {d2-d5}, [r2,:128]! + vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ + vld1.64 {d6,d7}, [r2,:128]! + vorr d10, d3, d5 + vld1.64 {d8,d9}, [r2,:128]! + add r2, r2, #-64 + + vorr d11, d7, d9 + vorr d10, d10, d11 + vmov r3, r4, d10 + + idct_col4_top + + orrs r3, r3, r4 + beq 1f + + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q7 + vsub.i32 q13, q13, q7 + vadd.i32 q14, q14, q7 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ + vmlal.s16 q9, d9, w7 + vmlsl.s16 q10, d9, w5 + vmlal.s16 q5, d9, w3 + vmlsl.s16 q6, d9, w1 + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q8 + vadd.i32 q13, q13, q8 + vsub.i32 q14, q14, q7 + +1: vadd.i32 q3, q11, q9 + vadd.i32 q4, q12, q10 + vshrn.i32 d2, q3, #ROW_SHIFT + vshrn.i32 d4, q4, #ROW_SHIFT + vadd.i32 q7, q13, q5 + vadd.i32 q8, q14, q6 + vtrn.16 d2, d4 + vshrn.i32 d6, q7, #ROW_SHIFT + vshrn.i32 d8, q8, #ROW_SHIFT + vsub.i32 q14, q14, q6 + vsub.i32 q11, q11, q9 + vtrn.16 d6, d8 + vsub.i32 q13, q13, q5 + vshrn.i32 d3, q14, #ROW_SHIFT + vtrn.32 d2, d6 + vsub.i32 q12, q12, q10 + vtrn.32 d4, d8 + vshrn.i32 d5, q13, #ROW_SHIFT + vshrn.i32 d7, q12, #ROW_SHIFT + vshrn.i32 d9, q11, #ROW_SHIFT + + vtrn.16 d3, d5 + vtrn.16 d7, d9 + vtrn.32 d3, d7 + vtrn.32 d5, d9 + + vst1.64 {d2-d5}, [r2,:128]! + vst1.64 {d6-d9}, [r2,:128]! 
+ + bx lr +endfunc + +function idct_col4_neon + mov ip, #16 + vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ + vdup.16 d30, w4c + vld1.64 {d4}, [r2,:64], ip /* d3 = col[1] */ + vadd.i16 d30, d30, d2 + vld1.64 {d6}, [r2,:64], ip /* d4 = col[2] */ + vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/ + vld1.64 {d8}, [r2,:64], ip /* d5 = col[3] */ + + ldrd r4, [r2] + ldrd r6, [r2, #16] + orrs r4, r4, r5 + + idct_col4_top + addeq r2, r2, #16 + beq 1f + + vld1.64 {d3}, [r2,:64], ip /* d6 = col[4] */ + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q7 + vsub.i32 q13, q13, q7 + vadd.i32 q14, q14, q7 + +1: orrs r6, r6, r7 + ldrd r4, [r2, #16] + addeq r2, r2, #16 + beq 2f + + vld1.64 {d5}, [r2,:64], ip /* d7 = col[5] */ + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ + +2: orrs r4, r4, r5 + ldrd r4, [r2, #16] + addeq r2, r2, #16 + beq 3f + + vld1.64 {d7}, [r2,:64], ip /* d8 = col[6] */ + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ + vadd.i32 q11, q11, q7 + vsub.i32 q14, q14, q7 + vsub.i32 q12, q12, q8 + vadd.i32 q13, q13, q8 + +3: orrs r4, r4, r5 + addeq r2, r2, #16 + beq 4f + + vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ + vmlal.s16 q9, d9, w7 + vmlsl.s16 q10, d9, w5 + vmlal.s16 q5, d9, w3 + vmlsl.s16 q6, d9, w1 + +4: vaddhn.i32 d2, q11, q9 + vaddhn.i32 d3, q12, q10 + vaddhn.i32 d4, q13, q5 + vaddhn.i32 d5, q14, q6 + vsubhn.i32 d9, q11, q9 + vsubhn.i32 d8, q12, q10 + vsubhn.i32 d7, q13, q5 + vsubhn.i32 d6, q14, q6 + + bx lr +endfunc + + .align 6 + +function idct_col4_st8_neon + vqshrun.s16 d2, q1, #COL_SHIFT-16 + vqshrun.s16 d3, q2, #COL_SHIFT-16 + vqshrun.s16 d4, q3, #COL_SHIFT-16 + vqshrun.s16 d5, q4, #COL_SHIFT-16 + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.32 {d2[1]}, [r0,:32], r1 + vst1.32 {d3[0]}, [r0,:32], r1 + vst1.32 {d3[1]}, 
[r0,:32], r1 + vst1.32 {d4[0]}, [r0,:32], r1 + vst1.32 {d4[1]}, [r0,:32], r1 + vst1.32 {d5[0]}, [r0,:32], r1 + vst1.32 {d5[1]}, [r0,:32], r1 + + bx lr +endfunc + + .section .rodata + .align 4 +idct_coeff_neon: + .short W1, W2, W3, W4, W5, W6, W7, W4c + + .macro idct_start data + push {r4-r7, lr} + pld [\data] + pld [\data, #64] + vpush {d8-d15} + movrel r3, idct_coeff_neon + vld1.64 {d0,d1}, [r3,:128] + .endm + + .macro idct_end + vpop {d8-d15} + pop {r4-r7, pc} + .endm + +/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */ +function ff_simple_idct_put_neon, export=1 + idct_start r2 + + bl idct_row4_pld_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + bl idct_col4_st8_neon + sub r0, r0, r1, lsl #3 + add r0, r0, #4 + add r2, r2, #-120 + bl idct_col4_neon + bl idct_col4_st8_neon + + idct_end +endfunc + + .align 6 + +function idct_col4_add8_neon + mov ip, r0 + + vld1.32 {d10[0]}, [r0,:32], r1 + vshr.s16 q1, q1, #COL_SHIFT-16 + vld1.32 {d10[1]}, [r0,:32], r1 + vshr.s16 q2, q2, #COL_SHIFT-16 + vld1.32 {d11[0]}, [r0,:32], r1 + vshr.s16 q3, q3, #COL_SHIFT-16 + vld1.32 {d11[1]}, [r0,:32], r1 + vshr.s16 q4, q4, #COL_SHIFT-16 + vld1.32 {d12[0]}, [r0,:32], r1 + vaddw.u8 q1, q1, d10 + vld1.32 {d12[1]}, [r0,:32], r1 + vaddw.u8 q2, q2, d11 + vld1.32 {d13[0]}, [r0,:32], r1 + vqmovun.s16 d2, q1 + vld1.32 {d13[1]}, [r0,:32], r1 + vaddw.u8 q3, q3, d12 + vst1.32 {d2[0]}, [ip,:32], r1 + vqmovun.s16 d3, q2 + vst1.32 {d2[1]}, [ip,:32], r1 + vaddw.u8 q4, q4, d13 + vst1.32 {d3[0]}, [ip,:32], r1 + vqmovun.s16 d4, q3 + vst1.32 {d3[1]}, [ip,:32], r1 + vqmovun.s16 d5, q4 + vst1.32 {d4[0]}, [ip,:32], r1 + vst1.32 {d4[1]}, [ip,:32], r1 + vst1.32 {d5[0]}, [ip,:32], r1 + vst1.32 {d5[1]}, [ip,:32], r1 + + bx lr +endfunc + +/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */ +function ff_simple_idct_add_neon, export=1 + idct_start r2 + + bl idct_row4_pld_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + bl 
idct_col4_add8_neon + sub r0, r0, r1, lsl #3 + add r0, r0, #4 + add r2, r2, #-120 + bl idct_col4_neon + bl idct_col4_add8_neon + + idct_end +endfunc + + .align 6 + +function idct_col4_st16_neon + mov ip, #16 + + vshr.s16 q1, q1, #COL_SHIFT-16 + vshr.s16 q2, q2, #COL_SHIFT-16 + vst1.64 {d2}, [r2,:64], ip + vshr.s16 q3, q3, #COL_SHIFT-16 + vst1.64 {d3}, [r2,:64], ip + vshr.s16 q4, q4, #COL_SHIFT-16 + vst1.64 {d4}, [r2,:64], ip + vst1.64 {d5}, [r2,:64], ip + vst1.64 {d6}, [r2,:64], ip + vst1.64 {d7}, [r2,:64], ip + vst1.64 {d8}, [r2,:64], ip + vst1.64 {d9}, [r2,:64], ip + + bx lr +endfunc + +/* void ff_simple_idct_neon(DCTELEM *data); */ +function ff_simple_idct_neon, export=1 + idct_start r0 + + mov r2, r0 + bl idct_row4_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + add r2, r2, #-128 + bl idct_col4_st16_neon + add r2, r2, #-120 + bl idct_col4_neon + add r2, r2, #-128 + bl idct_col4_st16_neon + + idct_end +endfunc diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/avfft.c b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.c new file mode 100644 index 00000000..25fc4e09 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.c @@ -0,0 +1,142 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mem.h" +#include "avfft.h" +#include "fft.h" + +/* FFT */ + +FFTContext *av_fft_init(int nbits, int inverse) +{ + FFTContext *s = av_malloc(sizeof(*s)); + + if (s) + ff_fft_init(s, nbits, inverse); + + return s; +} + +void av_fft_permute(FFTContext *s, FFTComplex *z) +{ + s->fft_permute(s, z); +} + +void av_fft_calc(FFTContext *s, FFTComplex *z) +{ + s->fft_calc(s, z); +} + +void av_fft_end(FFTContext *s) +{ + if (s) { + ff_fft_end(s); + av_free(s); + } +} + +#if CONFIG_MDCT + +FFTContext *av_mdct_init(int nbits, int inverse, double scale) +{ + FFTContext *s = av_malloc(sizeof(*s)); + + if (s) + ff_mdct_init(s, nbits, inverse, scale); + + return s; +} + +void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_calc(s, output, input); +} + +void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_half(s, output, input); +} + +void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->mdct_calc(s, output, input); +} + +void av_mdct_end(FFTContext *s) +{ + if (s) { + ff_mdct_end(s); + av_free(s); + } +} + +#endif /* CONFIG_MDCT */ + +#if CONFIG_RDFT + +RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans) +{ + RDFTContext *s = av_malloc(sizeof(*s)); + + if (s) + ff_rdft_init(s, nbits, trans); + + return s; +} + +void av_rdft_calc(RDFTContext *s, FFTSample *data) +{ + ff_rdft_calc(s, data); +} + +void av_rdft_end(RDFTContext *s) +{ + if (s) { + ff_rdft_end(s); + av_free(s); + } +} + +#endif /* CONFIG_RDFT */ + +#if CONFIG_DCT + +DCTContext *av_dct_init(int nbits, enum DCTTransformType inverse) +{ + DCTContext *s = av_malloc(sizeof(*s)); + + if (s) + ff_dct_init(s, nbits, inverse); + + return s; +} + +void 
av_dct_calc(DCTContext *s, FFTSample *data) +{ + ff_dct_calc(s, data); +} + +void av_dct_end(DCTContext *s) +{ + if (s) { + ff_dct_end(s); + av_free(s); + } +} + +#endif /* CONFIG_DCT */ diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/avfft.h b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.h new file mode 100644 index 00000000..fdf30237 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.h @@ -0,0 +1,103 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AVFFT_H +#define AVCODEC_AVFFT_H + +#include "publik.h" + +typedef float FFTSample; + +typedef struct FFTComplex { + FFTSample re, im; +} FFTComplex; + +typedef struct FFTContext FFTContext; + +/** + * Set up a complex FFT. + * @param nbits log2 of the length of the input array + * @param inverse if 0 perform the forward transform, if 1 perform the inverse + */ +PUBLIK FFTContext *av_fft_init(int nbits, int inverse); + +/** + * Do the permutation needed BEFORE calling ff_fft_calc(). + */ +PUBLIK void av_fft_permute(FFTContext *s, FFTComplex *z); + +/** + * Do a complex FFT with the parameters defined in av_fft_init(). The + * input data must be permuted before. No 1.0/sqrt(n) normalization is done. 
+ */ +PUBLIK void av_fft_calc(FFTContext *s, FFTComplex *z); + +PUBLIK void av_fft_end(FFTContext *s); + +#if 0 +FFTContext *av_mdct_init(int nbits, int inverse, double scale); +void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_mdct_end(FFTContext *s); +#endif + +/* Real Discrete Fourier Transform */ + +enum RDFTransformType { + DFT_R2C, + IDFT_C2R, + IDFT_R2C, + DFT_C2R, +}; + +typedef struct RDFTContext RDFTContext; + +/** + * Set up a real FFT. + * @param nbits log2 of the length of the input array + * @param trans the type of transform + */ +PUBLIK RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans); +PUBLIK void av_rdft_calc(RDFTContext *s, FFTSample *data); +PUBLIK void av_rdft_end(RDFTContext *s); + +/* Discrete Cosine Transform */ + +typedef struct DCTContext DCTContext; + +enum DCTTransformType { + DCT_II = 0, + DCT_III, + DCT_I, + DST_I, +}; + +/** + * Set up DCT. + * @param nbits size of the input array: + * (1 << nbits) for DCT-II, DCT-III and DST-I + * (1 << nbits) + 1 for DCT-I + * + * @note the first element of the input of DST-I is ignored + */ +PUBLIK DCTContext *av_dct_init(int nbits, enum DCTTransformType type); +PUBLIK void av_dct_calc(DCTContext *s, FFTSample *data); +PUBLIK void av_dct_end (DCTContext *s); + +#endif /* AVCODEC_AVFFT_H */ diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/dct.c b/plugins/supereq/ffmpeg_fft/libavcodec/dct.c new file mode 100644 index 00000000..6ea1936e --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/dct.c @@ -0,0 +1,228 @@ +/* + * (I)DCT Transforms + * Copyright (c) 2009 Peter Ross <pross@xvid.org> + * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com> + * Copyright (c) 2010 Vitor Sessak + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * (Inverse) Discrete Cosine Transforms. These are also known as the + * type II and type III DCTs respectively. + */ + +#include <math.h> +#include "libavutil/mathematics.h" +#include "fft.h" +#ifndef ARCH_ARM +#include "x86/fft.h" +#endif + +#define DCT32_FLOAT +#include "dct32.h" + +/* sin((M_PI * x / (2*n)) */ +#define SIN(s,n,x) (s->costab[(n) - (x)]) + +/* cos((M_PI * x / (2*n)) */ +#define COS(s,n,x) (s->costab[x]) + +static void ff_dst_calc_I_c(DCTContext *ctx, FFTSample *data) +{ + int n = 1 << ctx->nbits; + int i; + + data[0] = 0; + for(i = 1; i < n/2; i++) { + float tmp1 = data[i ]; + float tmp2 = data[n - i]; + float s = SIN(ctx, n, 2*i); + + s *= tmp1 + tmp2; + tmp1 = (tmp1 - tmp2) * 0.5f; + data[i ] = s + tmp1; + data[n - i] = s - tmp1; + } + + data[n/2] *= 2; + ff_rdft_calc(&ctx->rdft, data); + + data[0] *= 0.5f; + + for(i = 1; i < n-2; i += 2) { + data[i + 1] += data[i - 1]; + data[i ] = -data[i + 2]; + } + + data[n-1] = 0; +} + +static void ff_dct_calc_I_c(DCTContext *ctx, FFTSample *data) +{ + int n = 1 << ctx->nbits; + int i; + float next = -0.5f * (data[0] - data[n]); + + for(i = 0; i < n/2; i++) { + float tmp1 = data[i ]; + float tmp2 = data[n - i]; + float s = SIN(ctx, n, 2*i); + float c = COS(ctx, n, 
2*i); + + c *= tmp1 - tmp2; + s *= tmp1 - tmp2; + + next += c; + + tmp1 = (tmp1 + tmp2) * 0.5f; + data[i ] = tmp1 - s; + data[n - i] = tmp1 + s; + } + + ff_rdft_calc(&ctx->rdft, data); + data[n] = data[1]; + data[1] = next; + + for(i = 3; i <= n; i += 2) + data[i] = data[i - 2] - data[i]; +} + +static void ff_dct_calc_III_c(DCTContext *ctx, FFTSample *data) +{ + int n = 1 << ctx->nbits; + int i; + + float next = data[n - 1]; + float inv_n = 1.0f / n; + + for (i = n - 2; i >= 2; i -= 2) { + float val1 = data[i ]; + float val2 = data[i - 1] - data[i + 1]; + float c = COS(ctx, n, i); + float s = SIN(ctx, n, i); + + data[i ] = c * val1 + s * val2; + data[i + 1] = s * val1 - c * val2; + } + + data[1] = 2 * next; + + ff_rdft_calc(&ctx->rdft, data); + + for (i = 0; i < n / 2; i++) { + float tmp1 = data[i ] * inv_n; + float tmp2 = data[n - i - 1] * inv_n; + float csc = ctx->csc2[i] * (tmp1 - tmp2); + + tmp1 += tmp2; + data[i ] = tmp1 + csc; + data[n - i - 1] = tmp1 - csc; + } +} + +static void ff_dct_calc_II_c(DCTContext *ctx, FFTSample *data) +{ + int n = 1 << ctx->nbits; + int i; + float next; + + for (i=0; i < n/2; i++) { + float tmp1 = data[i ]; + float tmp2 = data[n - i - 1]; + float s = SIN(ctx, n, 2*i + 1); + + s *= tmp1 - tmp2; + tmp1 = (tmp1 + tmp2) * 0.5f; + + data[i ] = tmp1 + s; + data[n-i-1] = tmp1 - s; + } + + ff_rdft_calc(&ctx->rdft, data); + + next = data[1] * 0.5; + data[1] *= -1; + + for (i = n - 2; i >= 0; i -= 2) { + float inr = data[i ]; + float ini = data[i + 1]; + float c = COS(ctx, n, i); + float s = SIN(ctx, n, i); + + data[i ] = c * inr + s * ini; + + data[i+1] = next; + + next += s * inr - c * ini; + } +} + +static void dct32_func(DCTContext *ctx, FFTSample *data) +{ + ctx->dct32(data, data); +} + +void ff_dct_calc(DCTContext *s, FFTSample *data) +{ + s->dct_calc(s, data); +} + +av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse) +{ + int n = 1 << nbits; + int i; + + s->nbits = nbits; + s->inverse = inverse; + + 
ff_init_ff_cos_tabs(nbits+2); + + s->costab = ff_cos_tabs[nbits+2]; + + s->csc2 = av_malloc(n/2 * sizeof(FFTSample)); + + if (ff_rdft_init(&s->rdft, nbits, inverse == DCT_III) < 0) { + av_free(s->csc2); + return -1; + } + + for (i = 0; i < n/2; i++) + s->csc2[i] = 0.5 / sin((M_PI / (2*n) * (2*i + 1))); + + switch(inverse) { + case DCT_I : s->dct_calc = ff_dct_calc_I_c; break; + case DCT_II : s->dct_calc = ff_dct_calc_II_c ; break; + case DCT_III: s->dct_calc = ff_dct_calc_III_c; break; + case DST_I : s->dct_calc = ff_dst_calc_I_c; break; + } + + if (inverse == DCT_II && nbits == 5) + s->dct_calc = dct32_func; + + s->dct32 = dct32; + if (HAVE_MMX) ff_dct_init_mmx(s); + + return 0; +} + +av_cold void ff_dct_end(DCTContext *s) +{ + ff_rdft_end(&s->rdft); + av_free(s->csc2); +} diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/dct32.c b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.c new file mode 100644 index 00000000..3e6ad78d --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.c @@ -0,0 +1,262 @@ +/* + * Template for the Discrete Cosine Transform for 32 samples + * Copyright (c) 2001, 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "dct32.h" + +/* tab[i][j] = 1.0 / (2.0 * cos(pi*(2*k+1) / 2^(6 - j))) */ + +/* cos(i*pi/64) */ + +#define COS0_0 FIXHR(0.50060299823519630134/2) +#define COS0_1 FIXHR(0.50547095989754365998/2) +#define COS0_2 FIXHR(0.51544730992262454697/2) +#define COS0_3 FIXHR(0.53104259108978417447/2) +#define COS0_4 FIXHR(0.55310389603444452782/2) +#define COS0_5 FIXHR(0.58293496820613387367/2) +#define COS0_6 FIXHR(0.62250412303566481615/2) +#define COS0_7 FIXHR(0.67480834145500574602/2) +#define COS0_8 FIXHR(0.74453627100229844977/2) +#define COS0_9 FIXHR(0.83934964541552703873/2) +#define COS0_10 FIXHR(0.97256823786196069369/2) +#define COS0_11 FIXHR(1.16943993343288495515/4) +#define COS0_12 FIXHR(1.48416461631416627724/4) +#define COS0_13 FIXHR(2.05778100995341155085/8) +#define COS0_14 FIXHR(3.40760841846871878570/8) +#define COS0_15 FIXHR(10.19000812354805681150/32) + +#define COS1_0 FIXHR(0.50241928618815570551/2) +#define COS1_1 FIXHR(0.52249861493968888062/2) +#define COS1_2 FIXHR(0.56694403481635770368/2) +#define COS1_3 FIXHR(0.64682178335999012954/2) +#define COS1_4 FIXHR(0.78815462345125022473/2) +#define COS1_5 FIXHR(1.06067768599034747134/4) +#define COS1_6 FIXHR(1.72244709823833392782/4) +#define COS1_7 FIXHR(5.10114861868916385802/16) + +#define COS2_0 FIXHR(0.50979557910415916894/2) +#define COS2_1 FIXHR(0.60134488693504528054/2) +#define COS2_2 FIXHR(0.89997622313641570463/2) +#define COS2_3 FIXHR(2.56291544774150617881/8) + +#define COS3_0 FIXHR(0.54119610014619698439/2) +#define COS3_1 FIXHR(1.30656296487637652785/4) + +#define COS4_0 FIXHR(0.70710678118654752439/2) + +/* butterfly operator */ +#define BF(a, b, c, s)\ +{\ + tmp0 = val##a + val##b;\ + tmp1 = val##a - val##b;\ + val##a = tmp0;\ + val##b = MULH3(tmp1, c, 
1<<(s));\ +} + +#define BF0(a, b, c, s)\ +{\ + tmp0 = tab[a] + tab[b];\ + tmp1 = tab[a] - tab[b];\ + val##a = tmp0;\ + val##b = MULH3(tmp1, c, 1<<(s));\ +} + +#define BF1(a, b, c, d)\ +{\ + BF(a, b, COS4_0, 1);\ + BF(c, d,-COS4_0, 1);\ + val##c += val##d;\ +} + +#define BF2(a, b, c, d)\ +{\ + BF(a, b, COS4_0, 1);\ + BF(c, d,-COS4_0, 1);\ + val##c += val##d;\ + val##a += val##c;\ + val##c += val##b;\ + val##b += val##d;\ +} + +#define ADD(a, b) val##a += val##b + +/* DCT32 without 1/sqrt(2) coef zero scaling. */ +void dct32(INTFLOAT *out, const INTFLOAT *tab) +{ + INTFLOAT tmp0, tmp1; + + INTFLOAT val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7 , + val8 , val9 , val10, val11, val12, val13, val14, val15, + val16, val17, val18, val19, val20, val21, val22, val23, + val24, val25, val26, val27, val28, val29, val30, val31; + + /* pass 1 */ + BF0( 0, 31, COS0_0 , 1); + BF0(15, 16, COS0_15, 5); + /* pass 2 */ + BF( 0, 15, COS1_0 , 1); + BF(16, 31,-COS1_0 , 1); + /* pass 1 */ + BF0( 7, 24, COS0_7 , 1); + BF0( 8, 23, COS0_8 , 1); + /* pass 2 */ + BF( 7, 8, COS1_7 , 4); + BF(23, 24,-COS1_7 , 4); + /* pass 3 */ + BF( 0, 7, COS2_0 , 1); + BF( 8, 15,-COS2_0 , 1); + BF(16, 23, COS2_0 , 1); + BF(24, 31,-COS2_0 , 1); + /* pass 1 */ + BF0( 3, 28, COS0_3 , 1); + BF0(12, 19, COS0_12, 2); + /* pass 2 */ + BF( 3, 12, COS1_3 , 1); + BF(19, 28,-COS1_3 , 1); + /* pass 1 */ + BF0( 4, 27, COS0_4 , 1); + BF0(11, 20, COS0_11, 2); + /* pass 2 */ + BF( 4, 11, COS1_4 , 1); + BF(20, 27,-COS1_4 , 1); + /* pass 3 */ + BF( 3, 4, COS2_3 , 3); + BF(11, 12,-COS2_3 , 3); + BF(19, 20, COS2_3 , 3); + BF(27, 28,-COS2_3 , 3); + /* pass 4 */ + BF( 0, 3, COS3_0 , 1); + BF( 4, 7,-COS3_0 , 1); + BF( 8, 11, COS3_0 , 1); + BF(12, 15,-COS3_0 , 1); + BF(16, 19, COS3_0 , 1); + BF(20, 23,-COS3_0 , 1); + BF(24, 27, COS3_0 , 1); + BF(28, 31,-COS3_0 , 1); + + + + /* pass 1 */ + BF0( 1, 30, COS0_1 , 1); + BF0(14, 17, COS0_14, 3); + /* pass 2 */ + BF( 1, 14, COS1_1 , 1); + BF(17, 30,-COS1_1 , 1); + /* pass 1 */ + BF0( 
6, 25, COS0_6 , 1); + BF0( 9, 22, COS0_9 , 1); + /* pass 2 */ + BF( 6, 9, COS1_6 , 2); + BF(22, 25,-COS1_6 , 2); + /* pass 3 */ + BF( 1, 6, COS2_1 , 1); + BF( 9, 14,-COS2_1 , 1); + BF(17, 22, COS2_1 , 1); + BF(25, 30,-COS2_1 , 1); + + /* pass 1 */ + BF0( 2, 29, COS0_2 , 1); + BF0(13, 18, COS0_13, 3); + /* pass 2 */ + BF( 2, 13, COS1_2 , 1); + BF(18, 29,-COS1_2 , 1); + /* pass 1 */ + BF0( 5, 26, COS0_5 , 1); + BF0(10, 21, COS0_10, 1); + /* pass 2 */ + BF( 5, 10, COS1_5 , 2); + BF(21, 26,-COS1_5 , 2); + /* pass 3 */ + BF( 2, 5, COS2_2 , 1); + BF(10, 13,-COS2_2 , 1); + BF(18, 21, COS2_2 , 1); + BF(26, 29,-COS2_2 , 1); + /* pass 4 */ + BF( 1, 2, COS3_1 , 2); + BF( 5, 6,-COS3_1 , 2); + BF( 9, 10, COS3_1 , 2); + BF(13, 14,-COS3_1 , 2); + BF(17, 18, COS3_1 , 2); + BF(21, 22,-COS3_1 , 2); + BF(25, 26, COS3_1 , 2); + BF(29, 30,-COS3_1 , 2); + + /* pass 5 */ + BF1( 0, 1, 2, 3); + BF2( 4, 5, 6, 7); + BF1( 8, 9, 10, 11); + BF2(12, 13, 14, 15); + BF1(16, 17, 18, 19); + BF2(20, 21, 22, 23); + BF1(24, 25, 26, 27); + BF2(28, 29, 30, 31); + + /* pass 6 */ + + ADD( 8, 12); + ADD(12, 10); + ADD(10, 14); + ADD(14, 9); + ADD( 9, 13); + ADD(13, 11); + ADD(11, 15); + + out[ 0] = val0; + out[16] = val1; + out[ 8] = val2; + out[24] = val3; + out[ 4] = val4; + out[20] = val5; + out[12] = val6; + out[28] = val7; + out[ 2] = val8; + out[18] = val9; + out[10] = val10; + out[26] = val11; + out[ 6] = val12; + out[22] = val13; + out[14] = val14; + out[30] = val15; + + ADD(24, 28); + ADD(28, 26); + ADD(26, 30); + ADD(30, 25); + ADD(25, 29); + ADD(29, 27); + ADD(27, 31); + + out[ 1] = val16 + val24; + out[17] = val17 + val25; + out[ 9] = val18 + val26; + out[25] = val19 + val27; + out[ 5] = val20 + val28; + out[21] = val21 + val29; + out[13] = val22 + val30; + out[29] = val23 + val31; + out[ 3] = val24 + val20; + out[19] = val25 + val21; + out[11] = val26 + val22; + out[27] = val27 + val23; + out[ 7] = val28 + val18; + out[23] = val29 + val19; + out[15] = val30 + val17; + out[31] = val31; +} diff 
--git a/plugins/supereq/ffmpeg_fft/libavcodec/dct32.h b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.h new file mode 100644 index 00000000..dc2d847a --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.h @@ -0,0 +1,10 @@ +#ifndef DCT_32_H +#define DCT_32_H + +#define FIXHR(x) ((float)(x)) +#define MULH3(x, y, s) ((s)*(y)*(x)) +#define INTFLOAT float + +void dct32(INTFLOAT *out, const INTFLOAT *tab); + +#endif diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/fft.c b/plugins/supereq/ffmpeg_fft/libavcodec/fft.c new file mode 100644 index 00000000..04082bf4 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/fft.c @@ -0,0 +1,300 @@ +/* + * FFT/IFFT transforms + * Copyright (c) 2008 Loren Merritt + * Copyright (c) 2002 Fabrice Bellard + * Partly based on libdjbfft by D. J. Bernstein + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * FFT/IFFT transforms. 
+ */ + +#include <stdlib.h> +#include <string.h> +#include "libavutil/mathematics.h" +#include "fft.h" + +/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */ +#if !CONFIG_HARDCODED_TABLES +COSTABLE(16); +COSTABLE(32); +COSTABLE(64); +COSTABLE(128); +COSTABLE(256); +COSTABLE(512); +COSTABLE(1024); +COSTABLE(2048); +COSTABLE(4096); +COSTABLE(8192); +COSTABLE(16384); +COSTABLE(32768); +COSTABLE(65536); +#endif +COSTABLE_CONST FFTSample * const ff_cos_tabs[] = { + NULL, NULL, NULL, NULL, + ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024, + ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536, +}; + +static int split_radix_permutation(int i, int n, int inverse) +{ + int m; + if(n <= 2) return i&1; + m = n >> 1; + if(!(i&m)) return split_radix_permutation(i, m, inverse)*2; + m >>= 1; + if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1; + else return split_radix_permutation(i, m, inverse)*4 - 1; +} + +av_cold void ff_init_ff_cos_tabs(int index) +{ +#if !CONFIG_HARDCODED_TABLES + int i; + int m = 1<<index; + double freq = 2*M_PI/m; + FFTSample *tab = ff_cos_tabs[index]; + for(i=0; i<=m/4; i++) + tab[i] = cos(i*freq); + for(i=1; i<m/4; i++) + tab[m/2-i] = tab[i]; +#endif +} + +av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) +{ + int i, j, n; + + if (nbits < 2 || nbits > 16) + goto fail; + s->nbits = nbits; + n = 1 << nbits; + + s->revtab = av_malloc(n * sizeof(uint16_t)); + if (!s->revtab) + goto fail; + s->tmp_buf = av_malloc(n * sizeof(FFTComplex)); + if (!s->tmp_buf) + goto fail; + s->inverse = inverse; + + s->fft_permute = ff_fft_permute_c; + s->fft_calc = ff_fft_calc_c; +#if CONFIG_MDCT + s->imdct_calc = ff_imdct_calc_c; + s->imdct_half = ff_imdct_half_c; + s->mdct_calc = ff_mdct_calc_c; +#endif + +#if ARCH_ARM + ff_fft_init_arm(s); +#elif HAVE_ALTIVEC + if (HAVE_ALTIVEC) ff_fft_init_altivec(s); +#elif HAVE_MMX + if (HAVE_MMX) ff_fft_init_mmx(s); +#endif + + 
for(j=4; j<=nbits; j++) { + ff_init_ff_cos_tabs(j); + } + for(i=0; i<n; i++) + s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i; + + return 0; + fail: + av_freep(&s->revtab); + av_freep(&s->tmp_buf); + return -1; +} + +void ff_fft_permute_c(FFTContext *s, FFTComplex *z) +{ + int j, np; + const uint16_t *revtab = s->revtab; + np = 1 << s->nbits; + /* TODO: handle split-radix permute in a more optimal way, probably in-place */ + for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j]; + memcpy(z, s->tmp_buf, np * sizeof(FFTComplex)); +} + +av_cold void ff_fft_end(FFTContext *s) +{ + av_freep(&s->revtab); + av_freep(&s->tmp_buf); +} + +#define sqrthalf (float)M_SQRT1_2 + +#define BF(x,y,a,b) {\ + x = a - b;\ + y = a + b;\ +} + +#define BUTTERFLIES(a0,a1,a2,a3) {\ + BF(t3, t5, t5, t1);\ + BF(a2.re, a0.re, a0.re, t5);\ + BF(a3.im, a1.im, a1.im, t3);\ + BF(t4, t6, t2, t6);\ + BF(a3.re, a1.re, a1.re, t4);\ + BF(a2.im, a0.im, a0.im, t6);\ +} + +// force loading all the inputs before storing any. +// this is slightly slower for small data, but avoids store->load aliasing +// for addresses separated by large powers of 2. 
+#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\ + FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\ + BF(t3, t5, t5, t1);\ + BF(a2.re, a0.re, r0, t5);\ + BF(a3.im, a1.im, i1, t3);\ + BF(t4, t6, t2, t6);\ + BF(a3.re, a1.re, r1, t4);\ + BF(a2.im, a0.im, i0, t6);\ +} + +#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\ + t1 = a2.re * wre + a2.im * wim;\ + t2 = a2.im * wre - a2.re * wim;\ + t5 = a3.re * wre - a3.im * wim;\ + t6 = a3.im * wre + a3.re * wim;\ + BUTTERFLIES(a0,a1,a2,a3)\ +} + +#define TRANSFORM_ZERO(a0,a1,a2,a3) {\ + t1 = a2.re;\ + t2 = a2.im;\ + t5 = a3.re;\ + t6 = a3.im;\ + BUTTERFLIES(a0,a1,a2,a3)\ +} + +/* z[0...8n-1], w[1...2n-1] */ +#define PASS(name)\ +static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\ +{\ + FFTSample t1, t2, t3, t4, t5, t6;\ + int o1 = 2*n;\ + int o2 = 4*n;\ + int o3 = 6*n;\ + const FFTSample *wim = wre+o1;\ + n--;\ +\ + TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\ + TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ + do {\ + z += 2;\ + wre += 2;\ + wim -= 2;\ + TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\ + TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ + } while(--n);\ +} + +PASS(pass) +#undef BUTTERFLIES +#define BUTTERFLIES BUTTERFLIES_BIG +PASS(pass_big) + +#define DECL_FFT(n,n2,n4)\ +static void fft##n(FFTComplex *z)\ +{\ + fft##n2(z);\ + fft##n4(z+n4*2);\ + fft##n4(z+n4*3);\ + pass(z,ff_cos_##n,n4/2);\ +} + +static void fft4(FFTComplex *z) +{ + FFTSample t1, t2, t3, t4, t5, t6, t7, t8; + + BF(t3, t1, z[0].re, z[1].re); + BF(t8, t6, z[3].re, z[2].re); + BF(z[2].re, z[0].re, t1, t6); + BF(t4, t2, z[0].im, z[1].im); + BF(t7, t5, z[2].im, z[3].im); + BF(z[3].im, z[1].im, t4, t8); + BF(z[3].re, z[1].re, t3, t7); + BF(z[2].im, z[0].im, t2, t5); +} + +static void fft8(FFTComplex *z) +{ + FFTSample t1, t2, t3, t4, t5, t6, t7, t8; + + fft4(z); + + BF(t1, z[5].re, z[4].re, -z[5].re); + BF(t2, z[5].im, z[4].im, -z[5].im); + BF(t3, z[7].re, z[6].re, -z[7].re); + BF(t4, z[7].im, z[6].im, -z[7].im); + BF(t8, t1, 
t3, t1); + BF(t7, t2, t2, t4); + BF(z[4].re, z[0].re, z[0].re, t1); + BF(z[4].im, z[0].im, z[0].im, t2); + BF(z[6].re, z[2].re, z[2].re, t7); + BF(z[6].im, z[2].im, z[2].im, t8); + + TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf); +} + +#if !CONFIG_SMALL +static void fft16(FFTComplex *z) +{ + FFTSample t1, t2, t3, t4, t5, t6; + + fft8(z); + fft4(z+8); + fft4(z+12); + + TRANSFORM_ZERO(z[0],z[4],z[8],z[12]); + TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf); + TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]); + TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]); +} +#else +DECL_FFT(16,8,4) +#endif +DECL_FFT(32,16,8) +DECL_FFT(64,32,16) +DECL_FFT(128,64,32) +DECL_FFT(256,128,64) +DECL_FFT(512,256,128) +#if !CONFIG_SMALL +#define pass pass_big +#endif +DECL_FFT(1024,512,256) +DECL_FFT(2048,1024,512) +DECL_FFT(4096,2048,1024) +DECL_FFT(8192,4096,2048) +DECL_FFT(16384,8192,4096) +DECL_FFT(32768,16384,8192) +DECL_FFT(65536,32768,16384) + +static void (* const fft_dispatch[])(FFTComplex*) = { + fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024, + fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, +}; + +void ff_fft_calc_c(FFTContext *s, FFTComplex *z) +{ + fft_dispatch[s->nbits-2](z); +} + diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/fft.h b/plugins/supereq/ffmpeg_fft/libavcodec/fft.h new file mode 100644 index 00000000..b2e0f540 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/fft.h @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_FFT_H +#define AVCODEC_FFT_H + +#include <stdint.h> +#include "../config.h" +#include "libavutil/mem.h" +#include "avfft.h" + +/* FFT computation */ + +struct FFTContext { + int nbits; + int inverse; + uint16_t *revtab; + FFTComplex *tmp_buf; + int mdct_size; /* size of MDCT (i.e. number of input data * 2) */ + int mdct_bits; /* n = 2^nbits */ + /* pre/post rotation tables */ + FFTSample *tcos; + FFTSample *tsin; + void (*fft_permute)(struct FFTContext *s, FFTComplex *z); + void (*fft_calc)(struct FFTContext *s, FFTComplex *z); + void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); + void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input); + void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); + int permutation; +#define FF_MDCT_PERM_NONE 0 +#define FF_MDCT_PERM_INTERLEAVE 1 +}; + +#if CONFIG_HARDCODED_TABLES +#define COSTABLE_CONST const +#define SINTABLE_CONST const +#define SINETABLE_CONST const +#else +#define COSTABLE_CONST +#define SINTABLE_CONST +#define SINETABLE_CONST +#endif + +#define COSTABLE(size) \ + COSTABLE_CONST DECLARE_ALIGNED(16, FFTSample, ff_cos_##size)[size/2] +#define SINTABLE(size) \ + SINTABLE_CONST DECLARE_ALIGNED(16, FFTSample, ff_sin_##size)[size/2] +#define SINETABLE(size) \ + SINETABLE_CONST DECLARE_ALIGNED(16, float, ff_sine_##size)[size] +extern COSTABLE(16); +extern COSTABLE(32); +extern COSTABLE(64); +extern COSTABLE(128); +extern COSTABLE(256); +extern 
COSTABLE(512); +extern COSTABLE(1024); +extern COSTABLE(2048); +extern COSTABLE(4096); +extern COSTABLE(8192); +extern COSTABLE(16384); +extern COSTABLE(32768); +extern COSTABLE(65536); +extern COSTABLE_CONST FFTSample* const ff_cos_tabs[17]; + +/** + * Initialize the cosine table in ff_cos_tabs[index] + * \param index index in ff_cos_tabs array of the table to initialize + */ +void ff_init_ff_cos_tabs(int index); + +extern SINTABLE(16); +extern SINTABLE(32); +extern SINTABLE(64); +extern SINTABLE(128); +extern SINTABLE(256); +extern SINTABLE(512); +extern SINTABLE(1024); +extern SINTABLE(2048); +extern SINTABLE(4096); +extern SINTABLE(8192); +extern SINTABLE(16384); +extern SINTABLE(32768); +extern SINTABLE(65536); + +/** + * Set up a complex FFT. + * @param nbits log2 of the length of the input array + * @param inverse if 0 perform the forward transform, if 1 perform the inverse + */ +int ff_fft_init(FFTContext *s, int nbits, int inverse); +void ff_fft_permute_c(FFTContext *s, FFTComplex *z); +void ff_fft_calc_c(FFTContext *s, FFTComplex *z); + +void ff_fft_init_altivec(FFTContext *s); +void ff_fft_init_mmx(FFTContext *s); +void ff_fft_init_arm(FFTContext *s); +void ff_dct_init_mmx(DCTContext *s); + +/** + * Do the permutation needed BEFORE calling ff_fft_calc(). + */ +static inline void ff_fft_permute(FFTContext *s, FFTComplex *z) +{ + s->fft_permute(s, z); +} +/** + * Do a complex FFT with the parameters defined in ff_fft_init(). The + * input data must be permuted before. No 1.0/sqrt(n) normalization is done. 
+ */ +static inline void ff_fft_calc(FFTContext *s, FFTComplex *z) +{ + s->fft_calc(s, z); +} +void ff_fft_end(FFTContext *s); + +/* MDCT computation */ + +static inline void ff_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_calc(s, output, input); +} +static inline void ff_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_half(s, output, input); +} + +static inline void ff_mdct_calc(FFTContext *s, FFTSample *output, + const FFTSample *input) +{ + s->mdct_calc(s, output, input); +} + +/** + * Maximum window size for ff_kbd_window_init. + */ +#define FF_KBD_WINDOW_MAX 1024 + +/** + * Generate a Kaiser-Bessel Derived Window. + * @param window pointer to half window + * @param alpha determines window shape + * @param n size of half window, max FF_KBD_WINDOW_MAX + */ +void ff_kbd_window_init(float *window, float alpha, int n); + +/** + * Generate a sine window. + * @param window pointer to half window + * @param n size of half window + */ +void ff_sine_window_init(float *window, int n); + +/** + * initialize the specified entry of ff_sine_windows + */ +void ff_init_ff_sine_windows(int index); +extern SINETABLE( 32); +extern SINETABLE( 64); +extern SINETABLE( 128); +extern SINETABLE( 256); +extern SINETABLE( 512); +extern SINETABLE(1024); +extern SINETABLE(2048); +extern SINETABLE(4096); +extern SINETABLE_CONST float * const ff_sine_windows[13]; + +int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale); +void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_end(FFTContext *s); + +/* Real Discrete Fourier Transform */ + +struct RDFTContext { + int nbits; + int inverse; + int sign_convention; + + /* pre/post rotation tables */ + const FFTSample *tcos; + SINTABLE_CONST FFTSample *tsin; + 
FFTContext fft; + void (*rdft_calc)(struct RDFTContext *s, FFTSample *z); +}; + +/** + * Set up a real FFT. + * @param nbits log2 of the length of the input array + * @param trans the type of transform + */ +int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans); +void ff_rdft_end(RDFTContext *s); + +void ff_rdft_init_arm(RDFTContext *s); + +static av_always_inline void ff_rdft_calc(RDFTContext *s, FFTSample *data) +{ + s->rdft_calc(s, data); +} + +/* Discrete Cosine Transform */ + +struct DCTContext { + int nbits; + int inverse; + RDFTContext rdft; + const float *costab; + FFTSample *csc2; + void (*dct_calc)(struct DCTContext *s, FFTSample *data); + void (*dct32)(FFTSample *out, const FFTSample *in); +}; + +/** + * Set up DCT. + * @param nbits size of the input array: + * (1 << nbits) for DCT-II, DCT-III and DST-I + * (1 << nbits) + 1 for DCT-I + * + * @note the first element of the input of DST-I is ignored + */ +int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType type); +void ff_dct_calc(DCTContext *s, FFTSample *data); +void ff_dct_end (DCTContext *s); + +#endif /* AVCODEC_FFT_H */ diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/rdft.c b/plugins/supereq/ffmpeg_fft/libavcodec/rdft.c new file mode 100644 index 00000000..fe6014fb --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/rdft.c @@ -0,0 +1,137 @@ +/* + * (I)RDFT transforms + * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdlib.h> +#include <math.h> +#include "libavutil/mathematics.h" +#include "fft.h" + +/** + * @file + * (Inverse) Real Discrete Fourier Transforms. + */ + +/* sin(2*pi*x/n) for 0<=x<n/4, followed by n/2<=x<3n/4 */ +#if !CONFIG_HARDCODED_TABLES +SINTABLE(16); +SINTABLE(32); +SINTABLE(64); +SINTABLE(128); +SINTABLE(256); +SINTABLE(512); +SINTABLE(1024); +SINTABLE(2048); +SINTABLE(4096); +SINTABLE(8192); +SINTABLE(16384); +SINTABLE(32768); +SINTABLE(65536); +#endif +SINTABLE_CONST FFTSample * const ff_sin_tabs[] = { + NULL, NULL, NULL, NULL, + ff_sin_16, ff_sin_32, ff_sin_64, ff_sin_128, ff_sin_256, ff_sin_512, ff_sin_1024, + ff_sin_2048, ff_sin_4096, ff_sin_8192, ff_sin_16384, ff_sin_32768, ff_sin_65536, +}; + +/** Map one real FFT into two parallel real even and odd FFTs. Then interleave + * the two real FFTs into one complex FFT. Unmangle the results. + * ref: http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM + */ +static void ff_rdft_calc_c(RDFTContext* s, FFTSample* data) +{ + int i, i1, i2; + FFTComplex ev, od; + const int n = 1 << s->nbits; + const float k1 = 0.5; + const float k2 = 0.5 - s->inverse; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + + if (!s->inverse) { + ff_fft_permute(&s->fft, (FFTComplex*)data); + ff_fft_calc(&s->fft, (FFTComplex*)data); + } + /* i=0 is a special case because of packing, the DC term is real, so we + are going to throw the N/2 term (also real) in with it. 
*/ + ev.re = data[0]; + data[0] = ev.re+data[1]; + data[1] = ev.re-data[1]; + for (i = 1; i < (n>>2); i++) { + i1 = 2*i; + i2 = n-i1; + /* Separate even and odd FFTs */ + ev.re = k1*(data[i1 ]+data[i2 ]); + od.im = -k2*(data[i1 ]-data[i2 ]); + ev.im = k1*(data[i1+1]-data[i2+1]); + od.re = k2*(data[i1+1]+data[i2+1]); + /* Apply twiddle factors to the odd FFT and add to the even FFT */ + data[i1 ] = ev.re + od.re*tcos[i] - od.im*tsin[i]; + data[i1+1] = ev.im + od.im*tcos[i] + od.re*tsin[i]; + data[i2 ] = ev.re - od.re*tcos[i] + od.im*tsin[i]; + data[i2+1] = -ev.im + od.im*tcos[i] + od.re*tsin[i]; + } + data[2*i+1]=s->sign_convention*data[2*i+1]; + if (s->inverse) { + data[0] *= k1; + data[1] *= k1; + ff_fft_permute(&s->fft, (FFTComplex*)data); + ff_fft_calc(&s->fft, (FFTComplex*)data); + } +} + +av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans) +{ + int n = 1 << nbits; + int i; + const double theta = (trans == DFT_R2C || trans == DFT_C2R ? -1 : 1)*2*M_PI/n; + + s->nbits = nbits; + s->inverse = trans == IDFT_C2R || trans == DFT_C2R; + s->sign_convention = trans == IDFT_R2C || trans == DFT_C2R ? 
1 : -1; + + if (nbits < 4 || nbits > 16) { + return -1; + } + + if (ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C) < 0) { + return -1; + } + + ff_init_ff_cos_tabs(nbits); + s->tcos = ff_cos_tabs[nbits]; + s->tsin = ff_sin_tabs[nbits]+(trans == DFT_R2C || trans == DFT_C2R)*(n>>2); +#if !CONFIG_HARDCODED_TABLES + for (i = 0; i < (n>>2); i++) { + s->tsin[i] = sin(i*theta); + } +#endif + s->rdft_calc = ff_rdft_calc_c; + +#if ARCH_ARM + ff_rdft_init_arm(s); +#endif + + return 0; +} + +av_cold void ff_rdft_end(RDFTContext *s) +{ + ff_fft_end(&s->fft); +} diff --git a/plugins/supereq/ffmpeg_fft/libavutil/attributes.h b/plugins/supereq/ffmpeg_fft/libavutil/attributes.h new file mode 100644 index 00000000..50fbfc31 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavutil/attributes.h @@ -0,0 +1,122 @@ +/* + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * Macro definitions for various function/variable attributes + */ + +#ifndef AVUTIL_ATTRIBUTES_H +#define AVUTIL_ATTRIBUTES_H + +#ifdef __GNUC__ +# define AV_GCC_VERSION_AT_LEAST(x,y) (__GNUC__ > x || __GNUC__ == x && __GNUC_MINOR__ >= y) +#else +# define AV_GCC_VERSION_AT_LEAST(x,y) 0 +#endif + +#ifndef av_always_inline +#if AV_GCC_VERSION_AT_LEAST(3,1) +# define av_always_inline __attribute__((always_inline)) inline +#else +# define av_always_inline inline +#endif +#endif + +#ifndef av_noinline +#if AV_GCC_VERSION_AT_LEAST(3,1) +# define av_noinline __attribute__((noinline)) +#else +# define av_noinline +#endif +#endif + +#ifndef av_pure +#if AV_GCC_VERSION_AT_LEAST(3,1) +# define av_pure __attribute__((pure)) +#else +# define av_pure +#endif +#endif + +#ifndef av_const +#if AV_GCC_VERSION_AT_LEAST(2,6) +# define av_const __attribute__((const)) +#else +# define av_const +#endif +#endif + +#ifndef av_cold +#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,3) +# define av_cold __attribute__((cold)) +#else +# define av_cold +#endif +#endif + +#ifndef av_flatten +#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(4,1) +# define av_flatten __attribute__((flatten)) +#else +# define av_flatten +#endif +#endif + +#ifndef attribute_deprecated +#if AV_GCC_VERSION_AT_LEAST(3,1) +# define attribute_deprecated __attribute__((deprecated)) +#else +# define attribute_deprecated +#endif +#endif + +#ifndef av_unused +#if defined(__GNUC__) +# define av_unused __attribute__((unused)) +#else +# define av_unused +#endif +#endif + +#ifndef av_alias +#if (!defined(__ICC) || __ICC > 1110) && AV_GCC_VERSION_AT_LEAST(3,3) +# define av_alias __attribute__((may_alias)) +#else +# define av_alias +#endif +#endif + +#ifndef 
av_uninit +#if defined(__GNUC__) && !defined(__ICC) +# define av_uninit(x) x=x +#else +# define av_uninit(x) x +#endif +#endif + +#ifdef __GNUC__ +# define av_builtin_constant_p __builtin_constant_p +#else +# define av_builtin_constant_p(x) 0 +#endif + +#endif /* AVUTIL_ATTRIBUTES_H */ + diff --git a/plugins/supereq/ffmpeg_fft/libavutil/avconfig.h b/plugins/supereq/ffmpeg_fft/libavutil/avconfig.h new file mode 100644 index 00000000..b028bb4f --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavutil/avconfig.h @@ -0,0 +1,5 @@ +/* Generated by ffconf */ +#ifndef AVUTIL_AVCONFIG_H +#define AVUTIL_AVCONFIG_H +#define AV_HAVE_BIGENDIAN 0 +#endif /* AVUTIL_AVCONFIG_H */ diff --git a/plugins/supereq/ffmpeg_fft/libavutil/avutil.h b/plugins/supereq/ffmpeg_fft/libavutil/avutil.h new file mode 100644 index 00000000..f5d364be --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavutil/avutil.h @@ -0,0 +1,90 @@ +/* + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVUTIL_AVUTIL_H +#define AVUTIL_AVUTIL_H + +/** + * @file + * external API header + */ + + +#define AV_STRINGIFY(s) AV_TOSTRING(s) +#define AV_TOSTRING(s) #s + +#define AV_GLUE(a, b) a ## b +#define AV_JOIN(a, b) AV_GLUE(a, b) + +#define AV_PRAGMA(s) _Pragma(#s) + +#define AV_VERSION_INT(a, b, c) (a<<16 | b<<8 | c) +#define AV_VERSION_DOT(a, b, c) a ##.## b ##.## c +#define AV_VERSION(a, b, c) AV_VERSION_DOT(a, b, c) + +#define LIBAVUTIL_VERSION_MAJOR 50 +#define LIBAVUTIL_VERSION_MINOR 21 +#define LIBAVUTIL_VERSION_MICRO 0 + +#define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \ + LIBAVUTIL_VERSION_MINOR, \ + LIBAVUTIL_VERSION_MICRO) +#define LIBAVUTIL_VERSION AV_VERSION(LIBAVUTIL_VERSION_MAJOR, \ + LIBAVUTIL_VERSION_MINOR, \ + LIBAVUTIL_VERSION_MICRO) +#define LIBAVUTIL_BUILD LIBAVUTIL_VERSION_INT + +#define LIBAVUTIL_IDENT "Lavu" AV_STRINGIFY(LIBAVUTIL_VERSION) + +/** + * Return the LIBAVUTIL_VERSION_INT constant. + */ +unsigned avutil_version(void); + +/** + * Return the libavutil build-time configuration. + */ +const char *avutil_configuration(void); + +/** + * Return the libavutil license. 
+ */ +const char *avutil_license(void); + +enum AVMediaType { + AVMEDIA_TYPE_UNKNOWN = -1, + AVMEDIA_TYPE_VIDEO, + AVMEDIA_TYPE_AUDIO, + AVMEDIA_TYPE_DATA, + AVMEDIA_TYPE_SUBTITLE, + AVMEDIA_TYPE_ATTACHMENT, + AVMEDIA_TYPE_NB +}; + +#include "common.h" +/* #include "error.h" */ +#include "mathematics.h" +#include "rational.h" +#include "intfloat_readwrite.h" +/* #include "log.h" */ +/* #include "pixfmt.h" */ + +#endif /* AVUTIL_AVUTIL_H */ + diff --git a/plugins/supereq/ffmpeg_fft/libavutil/common.h b/plugins/supereq/ffmpeg_fft/libavutil/common.h new file mode 100644 index 00000000..9dff1435 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavutil/common.h @@ -0,0 +1,347 @@ +/* + * copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * common internal and external API header + */ + +#ifndef AVUTIL_COMMON_H +#define AVUTIL_COMMON_H + +#include <ctype.h> +#include <errno.h> +#include <inttypes.h> +#include <limits.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "attributes.h" + +//rounded division & shift +#define RSHIFT(a,b) ((a) > 0 ? 
/**
 * Clip a signed integer value into the amin-amax range.
 * @param a value to clip
 * @param amin minimum value of the clip range
 * @param amax maximum value of the clip range
 * @return clipped value
 */
static inline av_const int av_clip_c(int a, int amin, int amax)
{
    if      (a < amin) return amin;
    else if (a > amax) return amax;
    else               return a;
}

/**
 * Clip a signed integer value into the 0-255 range.
 * @param a value to clip
 * @return clipped value
 */
static inline av_const uint8_t av_clip_uint8_c(int a)
{
    /* Any bit set above the low byte means a is negative or larger than 255.
     * The sign is tested directly: negating a (as "(-a)>>31" did) overflows
     * for INT_MIN and made that input clip to 255 instead of 0. */
    if (a & ~0xFF) return a < 0 ? 0 : 0xFF;
    else           return a;
}

/**
 * Clip a signed integer value into the -128,127 range.
 * @param a value to clip
 * @return clipped value
 */
static inline av_const int8_t av_clip_int8_c(int a)
{
    /* The bias is added in unsigned arithmetic so it cannot overflow. */
    if ((a + 0x80U) & ~0xFF) return a < 0 ? INT8_MIN : INT8_MAX;
    else                     return a;
}

/**
 * Clip a signed integer value into the 0-65535 range.
 * @param a value to clip
 * @return clipped value
 */
static inline av_const uint16_t av_clip_uint16_c(int a)
{
    /* Same scheme as av_clip_uint8_c, including the INT_MIN case. */
    if (a & ~0xFFFF) return a < 0 ? 0 : 0xFFFF;
    else             return a;
}

/**
 * Clip a signed integer value into the -32768,32767 range.
 * @param a value to clip
 * @return clipped value
 */
static inline av_const int16_t av_clip_int16_c(int a)
{
    /* The bias is added in unsigned arithmetic so it cannot overflow. */
    if ((a + 0x8000U) & ~0xFFFF) return a < 0 ? INT16_MIN : INT16_MAX;
    else                         return a;
}

/**
 * Clip a signed 64-bit integer value into the -2147483648,2147483647 range.
 * @param a value to clip
 * @return clipped value
 */
static inline av_const int32_t av_clipl_int32_c(int64_t a)
{
    /* Bias in uint64_t: a + 2^31 as a signed sum overflows near INT64_MAX. */
    if (((uint64_t)a + 0x80000000U) & ~UINT64_C(0xFFFFFFFF))
        return a < 0 ? INT32_MIN : INT32_MAX;
    else
        return (int32_t)a;
}

/**
 * Clip a float value into the amin-amax range.
 * @param a value to clip
 * @param amin minimum value of the clip range
 * @param amax maximum value of the clip range
 * @return clipped value
 */
static inline av_const float av_clipf_c(float a, float amin, float amax)
{
    if      (a < amin) return amin;
    else if (a > amax) return amax;
    else               return a;
}
/**
 * Convert a UTF-8 character (up to 4 bytes) to its 32-bit UCS-4 encoded form.
 *
 * The macro expands to an assignment followed by a brace block, not to a
 * single statement, so it must not be used as the body of an unbraced
 * if/else/loop.  ERROR is pasted in as-is after an "if (...)" test and must
 * therefore carry its own terminating semicolon.
 *
 * @param val      Output value, must be an lvalue of type uint32_t.
 * @param GET_BYTE Expression reading one byte from the input.
 *                 Evaluated up to 7 times (4 for the currently
 *                 assigned Unicode range).  With a memory buffer
 *                 input, this could be *ptr++.
 * @param ERROR    Expression to be evaluated on invalid input,
 *                 typically a goto statement.
 */
#define GET_UTF8(val, GET_BYTE, ERROR)\
    val= GET_BYTE;\
    {\
        /* number of leading 1 bits in the first byte */\
        int ones= 7 - av_log2(val ^ 255);\
        if(ones==1)\
            ERROR\
        val&= 127>>ones;\
        while(--ones > 0){\
            /* continuation bytes must lie in 0x80..0xBF */\
            int tmp= GET_BYTE - 128;\
            if(tmp>>6)\
                ERROR\
            val= (val<<6) + tmp;\
        }\
    }

/**
 * Convert a UTF-16 character (2 or 4 bytes) to its 32-bit UCS-4 encoded form.
 *
 * Like GET_UTF8, this expands to an assignment followed by a brace block,
 * and ERROR must carry its own terminating semicolon.
 *
 * @param val       Output value, must be an lvalue of type uint32_t.
 * @param GET_16BIT Expression returning two bytes of UTF-16 data converted
 *                  to native byte order.  Evaluated one or two times.
 * @param ERROR     Expression to be evaluated on invalid input,
 *                  typically a goto statement.
 */
#define GET_UTF16(val, GET_16BIT, ERROR)\
    val = GET_16BIT;\
    {\
        /* hi < 0x800 exactly when val is a surrogate (0xD800..0xDFFF) */\
        unsigned int hi = val - 0xD800;\
        if (hi < 0x800) {\
            val = GET_16BIT - 0xDC00;\
            /* need a high surrogate first, then a low surrogate */\
            if (val > 0x3FFU || hi > 0x3FFU)\
                ERROR\
            val += (hi<<10) + 0x10000;\
        }\
    }
/*!
 * \def PUT_UTF8(val, tmp, PUT_BYTE)
 * Convert a 32-bit Unicode character to its UTF-8 encoded form (up to 4 bytes long).
 * \param val is an input-only argument and should be of type uint32_t. It holds
 * a UCS-4 encoded Unicode character that is to be converted to UTF-8. If
 * val is given as a function it is executed only once.
 * \param tmp is a temporary variable and should be of type uint8_t. It
 * represents an intermediate value during conversion that is to be
 * output by PUT_BYTE.
 * \param PUT_BYTE writes the converted UTF-8 bytes to any proper destination.
 * It could be a function or a statement, and uses tmp as the input byte.
 * For example, PUT_BYTE could be "*output++ = tmp;" PUT_BYTE will be
 * executed up to 4 times for values in the valid UTF-8 range and up to
 * 7 times in the general case, depending on the length of the converted
 * Unicode character.
 */
#define PUT_UTF8(val, tmp, PUT_BYTE)\
    {\
        int bytes, shift;\
        uint32_t in = val;\
        if (in < 0x80) {\
            tmp = in;\
            PUT_BYTE\
        } else {\
            /* each byte beyond the first carries 6 payload bits */\
            bytes = (av_log2(in) + 4) / 5;\
            shift = (bytes - 1) * 6;\
            /* lead byte: 'bytes' leading 1 bits, then the top payload bits */\
            tmp = (256 - (256 >> bytes)) | (in >> shift);\
            PUT_BYTE\
            while (shift >= 6) {\
                shift -= 6;\
                tmp = 0x80 | ((in >> shift) & 0x3f);\
                PUT_BYTE\
            }\
        }\
    }

/*!
 * \def PUT_UTF16(val, tmp, PUT_16BIT)
 * Convert a 32-bit Unicode character to its UTF-16 encoded form (2 or 4 bytes).
 * \param val is an input-only argument and should be of type uint32_t. It holds
 * a UCS-4 encoded Unicode character that is to be converted to UTF-16. If
 * val is given as a function it is executed only once.
 * \param tmp is a temporary variable and should be of type uint16_t. It
 * represents an intermediate value during conversion that is to be
 * output by PUT_16BIT.
 * \param PUT_16BIT writes the converted UTF-16 data to any proper destination
 * in desired endianness. It could be a function or a statement, and uses tmp
 * as the input 16-bit unit. For example, PUT_16BIT could be "*output++ = tmp;"
 * PUT_16BIT will be executed 1 or 2 times depending on input character.
 */
#define PUT_UTF16(val, tmp, PUT_16BIT)\
    {\
        uint32_t in = val;\
        if (in < 0x10000) {\
            tmp = in;\
            PUT_16BIT\
        } else {\
            /* high surrogate: top 10 bits of (in - 0x10000) */\
            tmp = 0xD800 | ((in - 0x10000) >> 10);\
            PUT_16BIT\
            /* low surrogate: bottom 10 bits */\
            tmp = 0xDC00 | ((in - 0x10000) & 0x3FF);\
            PUT_16BIT\
        }\
    }
/*
 * Fallback bindings: each public helper name is mapped to its portable
 * "_c" implementation unless the name has already been defined as a macro
 * by the time this point is reached.
 */
#ifndef av_log2
#   define av_log2       av_log2_c
#endif
#ifndef av_log2_16bit
#   define av_log2_16bit av_log2_16bit_c
#endif
#ifndef av_ceil_log2
#   define av_ceil_log2  av_ceil_log2_c
#endif
#ifndef av_clip
#   define av_clip       av_clip_c
#endif
#ifndef av_clip_uint8
#   define av_clip_uint8 av_clip_uint8_c
#endif
#ifndef av_clip_int8
#   define av_clip_int8  av_clip_int8_c
#endif
#ifndef av_clip_uint16
#   define av_clip_uint16 av_clip_uint16_c
#endif
#ifndef av_clip_int16
#   define av_clip_int16 av_clip_int16_c
#endif
#ifndef av_clipl_int32
#   define av_clipl_int32 av_clipl_int32_c
#endif
#ifndef av_clipf
#   define av_clipf      av_clipf_c
#endif
/**
 * Decode an IEEE 754 binary64 bit pattern into a double using arithmetic
 * only (no type punning).
 *
 * The pattern is split into sign (bit 63), biased exponent (bits 52-62) and
 * fraction (bits 0-51).  An all-ones exponent yields NaN for a non-zero
 * fraction and a signed infinity otherwise.  A zero exponent field is a
 * zero or subnormal and has no implicit leading 1, so av_int2dbl(0) is 0.0.
 *
 * @param v bit pattern to decode
 * @return the value encoded by v
 */
double av_int2dbl(int64_t v){
    const uint64_t bits = (uint64_t)v;   /* unsigned: no overflow, no sign-extending shifts */
    const uint64_t frac = bits & ((UINT64_C(1) << 52) - 1);
    const int      expo = (int)((bits >> 52) & 0x7FF);
    const int      neg  = (int)(bits >> 63);
    double mag;

    if (expo == 0x7FF) {
        if (frac)
            return 0.0/0.0;
        mag = 1.0/0.0;
    } else if (expo == 0) {
        mag = ldexp((double)frac, -1074);
    } else {
        mag = ldexp((double)(frac | (UINT64_C(1) << 52)), expo - 1075);
    }
    return neg ? -mag : mag;
}

/**
 * Decode an IEEE 754 binary32 bit pattern into a float using arithmetic
 * only.  Mirrors av_int2dbl(): sign in bit 31, biased exponent in bits
 * 23-30, fraction in bits 0-22; zero and subnormal patterns are decoded
 * without an implicit leading 1.
 *
 * @param v bit pattern to decode
 * @return the value encoded by v
 */
float av_int2flt(int32_t v){
    const uint32_t bits = (uint32_t)v;
    const uint32_t frac = bits & 0x7FFFFF;
    const int      expo = (int)((bits >> 23) & 0xFF);
    const int      neg  = (int)(bits >> 31);
    double mag;

    if (expo == 0xFF) {
        if (frac)
            return 0.0/0.0;
        mag = 1.0/0.0;
    } else if (expo == 0) {
        mag = ldexp((double)frac, -149);
    } else {
        mag = ldexp((double)(frac | (1U << 23)), expo - 150);
    }
    return (float)(neg ? -mag : mag);
}
/**
 * Encode a double as an IEEE 754 binary64 bit pattern using arithmetic only.
 *
 * Zero of either sign encodes as 0 (the sign of zero is not kept).
 * Infinities encode as 0x7FF0000000000000 plus the sign bit; NaN encodes as
 * 0x7FF0000000000001.  Subnormal inputs are written with a zero exponent
 * field and no implicit leading 1.
 *
 * @param d value to encode
 * @return bit pattern of d
 */
int64_t av_dbl2int(double d){
    uint64_t sign, bits;
    double m;
    int e;

    if (!d)
        return 0;
    sign = (uint64_t)(d < 0) << 63;
    /* d-d is NaN (hence non-zero) exactly when d is infinite or NaN */
    if (d - d)
        return (int64_t)(((UINT64_C(0x7FF) << 52) | sign) + (d != d));

    m = frexp(fabs(d), &e);                 /* m in [0.5, 1) */
    if (e < -1021)                          /* below 2^-1022: subnormal */
        bits = (uint64_t)ldexp(m, e + 1074);
    else
        bits = (uint64_t)(e + 1022) << 52 | (uint64_t)((m - 0.5) * (1LL << 53));
    return (int64_t)(sign | bits);
}

/**
 * Encode a float as an IEEE 754 binary32 bit pattern using arithmetic only.
 * Mirrors av_dbl2int(): zero encodes as 0, infinities as 0x7F800000 plus the
 * sign bit, NaN as 0x7F800001, and subnormals with a zero exponent field.
 *
 * @param d value to encode
 * @return bit pattern of d
 */
int32_t av_flt2int(float d){
    uint32_t sign, bits;
    double m;
    int e;

    if (!d)
        return 0;
    sign = (uint32_t)(d < 0) << 31;
    if (d - d)
        return (int32_t)((0x7F800000U | sign) + (d != d));

    m = frexp(fabs(d), &e);                 /* m in [0.5, 1) */
    if (e < -125)                           /* below 2^-126: subnormal */
        bits = (uint32_t)ldexp(m, e + 149);
    else
        bits = (uint32_t)(e + 126) << 23 | (uint32_t)((m - 0.5) * (1 << 24));
    return (int32_t)(sign | bits);
}
/*
 * IEEE 80 bits extended float, stored as raw bytes.
 * As read by av_ext2dbl(): bit 7 of exponent[0] is the sign, the remaining
 * 15 bits of exponent[0..1] are the biased exponent (most significant byte
 * first), and mantissa[0..7] holds the 64-bit mantissa, most significant
 * byte first.
 */
typedef struct AVExtFloat  {
    uint8_t exponent[2];
    uint8_t mantissa[8];
} AVExtFloat;

/* Bit pattern -> value conversions. */
double av_int2dbl(int64_t v) av_const;
float av_int2flt(int32_t v) av_const;
double av_ext2dbl(const AVExtFloat ext) av_const;
/* Value -> bit pattern conversions. */
int64_t av_dbl2int(double d) av_const;
int32_t av_flt2int(float d) av_const;
AVExtFloat av_dbl2ext(double d) av_const;
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * miscellaneous math routines and tables + */ + +#include <assert.h> +#include <stdint.h> +#include <limits.h> +#include "mathematics.h" + +const uint8_t ff_sqrt_tab[256]={ + 0, 16, 23, 28, 32, 36, 40, 43, 46, 48, 51, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 77, 79, 80, 82, 84, 85, 87, 88, 90, + 91, 92, 94, 95, 96, 98, 99,100,102,103,104,105,107,108,109,110,111,112,114,115,116,117,118,119,120,121,122,123,124,125,126,127, +128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,144,145,146,147,148,149,150,151,151,152,153,154,155,156,156, +157,158,159,160,160,161,162,163,164,164,165,166,167,168,168,169,170,171,171,172,173,174,174,175,176,176,177,178,179,179,180,181, +182,182,183,184,184,185,186,186,187,188,188,189,190,190,191,192,192,193,194,194,195,196,196,197,198,198,199,200,200,201,202,202, +203,204,204,205,205,206,207,207,208,208,209,210,210,211,212,212,213,213,214,215,215,216,216,217,218,218,219,219,220,220,221,222, +222,223,223,224,224,225,226,226,227,227,228,228,229,230,230,231,231,232,232,233,233,234,235,235,236,236,237,237,238,238,239,239, +240,240,241,242,242,243,243,244,244,245,245,246,246,247,247,248,248,249,249,250,250,251,251,252,252,253,253,254,254,255,255,255 +}; + +const uint8_t ff_log2_tab[256]={ + 0,0,1,1,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 
/*
 * Greatest common divisor by Euclid's algorithm, written as a loop:
 * (a, b) is replaced by (b, a mod b) until b reaches zero, at which point
 * a holds the result.  av_gcd(x, 0) therefore returns x unchanged.
 */
int64_t av_gcd(int64_t a, int64_t b){
    while (b) {
        const int64_t rem = a % b;
        a = b;
        b = rem;
    }
    return a;
}
/*
 * Compare a and b when only their low bits are meaningful.  The difference
 * a-b is reduced with the mask (mod-1); if the reduced value exceeds mod/2
 * it is shifted down by mod, so the result is negative when a is "behind" b.
 * The mask only equals a true modulo when mod is a power of two.
 */
int64_t av_compare_mod(uint64_t a, uint64_t b, uint64_t mod){
    const uint64_t wrapped = (a - b) & (mod - 1);
    const uint64_t half    = mod >> 1;

    return (int64_t)(wrapped > half ? wrapped - mod : wrapped);
}
/*
 * Rounding modes.  The values are explicit and the value 4 is not assigned.
 */
enum AVRounding {
    AV_ROUND_ZERO     = 0, ///< Round toward zero.
    AV_ROUND_INF      = 1, ///< Round away from zero.
    AV_ROUND_DOWN     = 2, ///< Round toward -infinity.
    AV_ROUND_UP       = 3, ///< Round toward +infinity.
    AV_ROUND_NEAR_INF = 5, ///< Round to nearest and halfway cases away from zero.
};
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * default memory allocator for libavutil + */ + +#include "config.h" + +#include <limits.h> +#include <stdlib.h> +#include <string.h> +#if HAVE_MALLOC_H +#include <malloc.h> +#endif + +#include "avutil.h" +#include "mem.h" + +/* here we can use OS-dependent allocation functions */ +#undef free +#undef malloc +#undef realloc + +#ifdef MALLOC_PREFIX + +#define malloc AV_JOIN(MALLOC_PREFIX, malloc) +#define memalign AV_JOIN(MALLOC_PREFIX, memalign) +#define posix_memalign AV_JOIN(MALLOC_PREFIX, posix_memalign) +#define realloc AV_JOIN(MALLOC_PREFIX, realloc) +#define free AV_JOIN(MALLOC_PREFIX, free) + +void *malloc(size_t size); +void *memalign(size_t align, size_t size); +int posix_memalign(void **ptr, size_t align, size_t size); +void *realloc(void *ptr, size_t size); +void free(void *ptr); + +#endif /* MALLOC_PREFIX */ + +/* You can redefine av_malloc and av_free in your project to use your + memory allocator. You do not need to suppress this file because the + linker will do it automatically. 
/**
 * Allocate size bytes.  Requests larger than INT_MAX-16 are refused.
 * Which allocator is used depends on build-time switches: an over-allocate
 * and shift scheme (CONFIG_MEMALIGN_HACK), posix_memalign(), memalign(),
 * or plain malloc() when none of them is enabled.
 *
 * @param size number of bytes to allocate
 * @return pointer to the block, or NULL on failure
 */
void *av_malloc(unsigned int size)
{
    void *ptr = NULL;
#if CONFIG_MEMALIGN_HACK
    long diff;
#endif

    /* let's disallow possible ambiguous cases */
    if(size > (INT_MAX-16) )
        return NULL;

#if CONFIG_MEMALIGN_HACK
    ptr = malloc(size+16);
    if(!ptr)
        return ptr;
    /* advance by 1..16 bytes to the next 16-byte boundary and remember the
     * offset in the byte just before the returned pointer */
    diff= ((-(long)ptr - 1)&15) + 1;
    ptr = (char*)ptr + diff;
    ((char*)ptr)[-1]= diff;
#elif HAVE_POSIX_MEMALIGN
    if (posix_memalign(&ptr,16,size))
        ptr = NULL;
#elif HAVE_MEMALIGN
    ptr = memalign(16,size);
    /* Why 64?
       Indeed, we should align it:
         on 4 for 386
         on 16 for 486
         on 32 for 586, PPro - K6-III
         on 64 for K7 (maybe for P3 too).
       Because L1 and L2 caches are aligned on those values.
       But I don't want to code such logic here!
     */
    /* Why 16?
       Because some CPUs need alignment, for example SSE2 on P4, & most RISC CPUs
       it will just trigger an exception and the unaligned load will be done in the
       exception handler or it will just segfault (SSE2 on P4).
       Why not larger? Because I did not see a difference in benchmarks ...
     */
    /* benchmarks with P3
       memalign(64)+1          3071,3051,3032
       memalign(64)+2          3051,3032,3041
       memalign(64)+4          2911,2896,2915
       memalign(64)+8          2545,2554,2550
       memalign(64)+16         2543,2572,2563
       memalign(64)+32         2546,2545,2571
       memalign(64)+64         2570,2533,2558

       BTW, malloc seems to do 8-byte alignment by default here.
     */
#else
    ptr = malloc(size);
#endif
    return ptr;
}

/**
 * Resize a block obtained from av_malloc()/av_realloc().
 * Requests larger than INT_MAX-16 are refused.
 *
 * @param ptr  block to resize, or NULL
 * @param size new size in bytes
 * @return pointer to the resized block, or NULL on failure
 */
void *av_realloc(void *ptr, unsigned int size)
{
#if CONFIG_MEMALIGN_HACK
    int diff;
#endif

    /* let's disallow possible ambiguous cases */
    if(size > (INT_MAX-16) )
        return NULL;

#if CONFIG_MEMALIGN_HACK
    //FIXME this isn't aligned correctly, though it probably isn't needed
    if(!ptr) return av_malloc(size);
    diff= ((char*)ptr)[-1];
    ptr = realloc((char*)ptr - diff, size + diff);
    /* only re-apply the offset on success, so a failed realloc() is
     * reported as NULL rather than as NULL+diff */
    if (ptr)
        ptr = (char*)ptr + diff;
    return ptr;
#else
    return realloc(ptr, size);
#endif
}

/**
 * Release a block obtained from av_malloc()/av_realloc().  NULL is accepted.
 *
 * @param ptr block to free, or NULL
 */
void av_free(void *ptr)
{
    /* the NULL test is required for the MEMALIGN_HACK branch, which reads
     * the byte stored just before ptr */
    if (ptr)
#if CONFIG_MEMALIGN_HACK
        free((char*)ptr - ((char*)ptr)[-1]);
#else
        free(ptr);
#endif
}

/**
 * Free the block a pointer refers to and reset that pointer to NULL.
 *
 * @param arg address of the pointer variable (passed as void * so that any
 *            pointer-to-pointer type is accepted)
 */
void av_freep(void *arg)
{
    void **ptr= (void**)arg;
    av_free(*ptr);
    *ptr = NULL;
}

/**
 * Allocate size bytes with av_malloc() and fill them with zeros.
 *
 * @param size number of bytes to allocate
 * @return pointer to the zeroed block, or NULL on failure
 */
void *av_mallocz(unsigned int size)
{
    void *ptr = av_malloc(size);
    if (ptr)
        memset(ptr, 0, size);
    return ptr;
}

/**
 * Duplicate a string into memory obtained from av_malloc().
 *
 * @param s string to copy, may be NULL
 * @return newly allocated copy, or NULL if s is NULL, the string is too
 *         long for av_malloc(), or allocation fails
 */
char *av_strdup(const char *s)
{
    char *ptr= NULL;
    if(s){
        /* keep the length in size_t: narrowing it to int could allocate a
         * short buffer for a very long string */
        size_t len = strlen(s) + 1;
        if (len <= (size_t)(INT_MAX-16)) {
            ptr = av_malloc((unsigned int)len);
            if (ptr)
                memcpy(ptr, s, len);
        }
    }
    return ptr;
}
/*
 * DECLARE_ALIGNED(n,t,v)   declares variable v of type t aligned to n bytes.
 * DECLARE_ASM_CONST(n,t,v) declares an aligned constant v of type t.
 * The spelling of the alignment request is selected per compiler; when no
 * known compiler is detected the macros fall back to plain declarations
 * without any alignment request.
 *
 * The first test checks the ICC version through __ICC, the same macro used
 * by the av_alloc_size test in this header.
 */
#if (defined(__ICC) && __ICC < 1200) || defined(__SUNPRO_C)
    #define DECLARE_ALIGNED(n,t,v)      t __attribute__ ((aligned (n))) v
    #define DECLARE_ASM_CONST(n,t,v)    const t __attribute__ ((aligned (n))) v
#elif defined(__TI_COMPILER_VERSION__)
    #define DECLARE_ALIGNED(n,t,v)                      \
        AV_PRAGMA(DATA_ALIGN(v,n))                      \
        t __attribute__((aligned(n))) v
    #define DECLARE_ASM_CONST(n,t,v)                    \
        AV_PRAGMA(DATA_ALIGN(v,n))                      \
        static const t __attribute__((aligned(n))) v
#elif defined(__GNUC__)
    #define DECLARE_ALIGNED(n,t,v)      t __attribute__ ((aligned (n))) v
    #define DECLARE_ASM_CONST(n,t,v)    static const t attribute_used __attribute__ ((aligned (n))) v
#elif defined(_MSC_VER)
    #define DECLARE_ALIGNED(n,t,v)      __declspec(align(n)) t v
    #define DECLARE_ASM_CONST(n,t,v)    __declspec(align(n)) static const t v
#else
    #define DECLARE_ALIGNED(n,t,v)      t v
    #define DECLARE_ASM_CONST(n,t,v)    static const t v
#endif
+ * @see av_mallocz() + */ +PUBLIK void *av_malloc(unsigned int size) av_malloc_attrib av_alloc_size(1); + +/** + * Allocate or reallocate a block of memory. + * If ptr is NULL and size > 0, allocate a new block. If + * size is zero, free the memory block pointed to by ptr. + * @param size Size in bytes for the memory block to be allocated or + * reallocated. + * @param ptr Pointer to a memory block already allocated with + * av_malloc(z)() or av_realloc() or NULL. + * @return Pointer to a newly reallocated block or NULL if the block + * cannot be reallocated or the function is used to free the memory block. + * @see av_fast_realloc() + */ +void *av_realloc(void *ptr, unsigned int size) av_alloc_size(2); + +/** + * Free a memory block which has been allocated with av_malloc(z)() or + * av_realloc(). + * @param ptr Pointer to the memory block which should be freed. + * @note ptr = NULL is explicitly allowed. + * @note It is recommended that you use av_freep() instead. + * @see av_freep() + */ +PUBLIK void av_free(void *ptr); + +/** + * Allocate a block of size bytes with alignment suitable for all + * memory accesses (including vectors if available on the CPU) and + * zero all the bytes of the block. + * @param size Size in bytes for the memory block to be allocated. + * @return Pointer to the allocated block, NULL if it cannot be allocated. + * @see av_malloc() + */ +void *av_mallocz(unsigned int size) av_malloc_attrib av_alloc_size(1); + +/** + * Duplicate the string s. + * @param s string to be duplicated + * @return Pointer to a newly allocated string containing a + * copy of s or NULL if the string cannot be allocated. + */ +char *av_strdup(const char *s) av_malloc_attrib; + +/** + * Free a memory block which has been allocated with av_malloc(z)() or + * av_realloc() and set the pointer pointing to it to NULL. + * @param ptr Pointer to the pointer to the memory block which should + * be freed. 
+ * @see av_free() + */ +void av_freep(void *ptr); + +#endif /* AVUTIL_MEM_H */ + diff --git a/plugins/supereq/ffmpeg_fft/libavutil/rational.c b/plugins/supereq/ffmpeg_fft/libavutil/rational.c new file mode 100644 index 00000000..3e8b885d --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavutil/rational.c @@ -0,0 +1,131 @@ +/* + * rational numbers + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * rational numbers + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#include <assert.h> +//#include <math.h> +#include <limits.h> + +#include "common.h" +#include "mathematics.h" +#include "rational.h" + +int av_reduce(int *dst_num, int *dst_den, int64_t num, int64_t den, int64_t max){ + AVRational a0={0,1}, a1={1,0}; + int sign= (num<0) ^ (den<0); + int64_t gcd= av_gcd(FFABS(num), FFABS(den)); + + if(gcd){ + num = FFABS(num)/gcd; + den = FFABS(den)/gcd; + } + if(num<=max && den<=max){ + a1= (AVRational){num, den}; + den=0; + } + + while(den){ + uint64_t x = num / den; + int64_t next_den= num - den*x; + int64_t a2n= x*a1.num + a0.num; + int64_t a2d= x*a1.den + a0.den; + + if(a2n > max || a2d > max){ + if(a1.num) x= (max - a0.num) / a1.num; + if(a1.den) x= FFMIN(x, (max 
- a0.den) / a1.den); + + if (den*(2*x*a1.den + a0.den) > num*a1.den) + a1 = (AVRational){x*a1.num + a0.num, x*a1.den + a0.den}; + break; + } + + a0= a1; + a1= (AVRational){a2n, a2d}; + num= den; + den= next_den; + } + assert(av_gcd(a1.num, a1.den) <= 1U); + + *dst_num = sign ? -a1.num : a1.num; + *dst_den = a1.den; + + return den==0; +} + +AVRational av_mul_q(AVRational b, AVRational c){ + av_reduce(&b.num, &b.den, b.num * (int64_t)c.num, b.den * (int64_t)c.den, INT_MAX); + return b; +} + +AVRational av_div_q(AVRational b, AVRational c){ + return av_mul_q(b, (AVRational){c.den, c.num}); +} + +AVRational av_add_q(AVRational b, AVRational c){ + av_reduce(&b.num, &b.den, b.num * (int64_t)c.den + c.num * (int64_t)b.den, b.den * (int64_t)c.den, INT_MAX); + return b; +} + +AVRational av_sub_q(AVRational b, AVRational c){ + return av_add_q(b, (AVRational){-c.num, c.den}); +} + +AVRational av_d2q(double d, int max){ + AVRational a; +#define LOG2 0.69314718055994530941723212145817656807550013436025 + int exponent= FFMAX( (int)(log(fabs(d) + 1e-20)/LOG2), 0); + int64_t den= 1LL << (61 - exponent); + if (isnan(d)) + return (AVRational){0,0}; + av_reduce(&a.num, &a.den, (int64_t)(d * den + 0.5), den, max); + + return a; +} + +int av_nearer_q(AVRational q, AVRational q1, AVRational q2) +{ + /* n/d is q, a/b is the median between q1 and q2 */ + int64_t a = q1.num * (int64_t)q2.den + q2.num * (int64_t)q1.den; + int64_t b = 2 * (int64_t)q1.den * q2.den; + + /* rnd_up(a*d/b) > n => a*d/b > n */ + int64_t x_up = av_rescale_rnd(a, q.den, b, AV_ROUND_UP); + + /* rnd_down(a*d/b) < n => a*d/b < n */ + int64_t x_down = av_rescale_rnd(a, q.den, b, AV_ROUND_DOWN); + + return ((x_up > q.num) - (x_down < q.num)) * av_cmp_q(q2, q1); +} + +int av_find_nearest_q_idx(AVRational q, const AVRational* q_list) +{ + int i, nearest_q_idx = 0; + for(i=0; q_list[i].den; i++) + if (av_nearer_q(q, q_list[i], q_list[nearest_q_idx]) > 0) + nearest_q_idx = i; + + return nearest_q_idx; +} diff --git 
a/plugins/supereq/ffmpeg_fft/libavutil/rational.h b/plugins/supereq/ffmpeg_fft/libavutil/rational.h new file mode 100644 index 00000000..cd0a945a --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavutil/rational.h @@ -0,0 +1,130 @@ +/* + * rational numbers + * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * rational numbers + * @author Michael Niedermayer <michaelni@gmx.at> + */ + +#ifndef AVUTIL_RATIONAL_H +#define AVUTIL_RATIONAL_H + +#include <stdint.h> +#include "attributes.h" + +/** + * rational number numerator/denominator + */ +typedef struct AVRational{ + int num; ///< numerator + int den; ///< denominator +} AVRational; + +/** + * Compare two rationals. + * @param a first rational + * @param b second rational + * @return 0 if a==b, 1 if a>b and -1 if a<b + */ +static inline int av_cmp_q(AVRational a, AVRational b){ + const int64_t tmp= a.num * (int64_t)b.den - b.num * (int64_t)a.den; + + if(tmp) return (tmp>>63)|1; + else return 0; +} + +/** + * Convert rational to double. + * @param a rational to convert + * @return (double) a + */ +static inline double av_q2d(AVRational a){ + return a.num / (double) a.den; +} + +/** + * Reduce a fraction. 
+ * This is useful for framerate calculations. + * @param dst_num destination numerator + * @param dst_den destination denominator + * @param num source numerator + * @param den source denominator + * @param max the maximum allowed for dst_num & dst_den + * @return 1 if exact, 0 otherwise + */ +int av_reduce(int *dst_num, int *dst_den, int64_t num, int64_t den, int64_t max); + +/** + * Multiply two rationals. + * @param b first rational + * @param c second rational + * @return b*c + */ +AVRational av_mul_q(AVRational b, AVRational c) av_const; + +/** + * Divide one rational by another. + * @param b first rational + * @param c second rational + * @return b/c + */ +AVRational av_div_q(AVRational b, AVRational c) av_const; + +/** + * Add two rationals. + * @param b first rational + * @param c second rational + * @return b+c + */ +AVRational av_add_q(AVRational b, AVRational c) av_const; + +/** + * Subtract one rational from another. + * @param b first rational + * @param c second rational + * @return b-c + */ +AVRational av_sub_q(AVRational b, AVRational c) av_const; + +/** + * Convert a double precision floating point number to a rational. + * @param d double to convert + * @param max the maximum allowed numerator and denominator + * @return (AVRational) d + */ +AVRational av_d2q(double d, int max) av_const; + +/** + * @return 1 if q1 is nearer to q than q2, -1 if q2 is nearer + * than q1, 0 if they have the same distance. + */ +int av_nearer_q(AVRational q, AVRational q1, AVRational q2); + +/** + * Find the nearest value in q_list to q. 
+ * @param q_list an array of rationals terminated by {0, 0} + * @return the index of the nearest value found in the array + */ +int av_find_nearest_q_idx(AVRational q, const AVRational* q_list); + +#endif /* AVUTIL_RATIONAL_H */ + diff --git a/plugins/supereq/ffmpeg_fft/libffmpeg_fft.ver b/plugins/supereq/ffmpeg_fft/libffmpeg_fft.ver new file mode 100644 index 00000000..07b44318 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libffmpeg_fft.ver @@ -0,0 +1,4 @@ +LIBFFMPEG_FFT_52 { + global: *; +}; + diff --git a/plugins/supereq/ffmpeg_fft/publik.h b/plugins/supereq/ffmpeg_fft/publik.h new file mode 100644 index 00000000..bb044756 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/publik.h @@ -0,0 +1,6 @@ +#ifndef PUBLIK_H_ +#define PUBLIK_H_ + +#define PUBLIK __attribute__ ((visibility ("default"))) + +#endif /* PUBLIK_H_ */ diff --git a/plugins/supereq/nsfft-1.00/README b/plugins/supereq/nsfft-1.00/README new file mode 100644 index 00000000..1ca873b1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/README @@ -0,0 +1,15 @@ + +NSFFT (Nonrestrictive SIMD FFT) is yet another FFT library for +performing 1-dimensional fast Fourier transforms. NSDFT is a simple, +small and portable library, and it is efficient since it can utilize +SIMD instruction sets in modern processors. It performs multiple +transforms simultaneously, and thus it is especially suitable for +digital signal processing. It does not need so much computation to +make a good execution plan. This library is in public domain, so that +you can incorporate this library into your product without any +obligation. + +Visit http://shibatch.sourceforge.net/ to get the latest version of +this library. 
+ +Contact : Naoki Shibata shibatch@users.sourceforge.net diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.c b/plugins/supereq/nsfft-1.00/dft/DFT.c new file mode 100644 index 00000000..d59e6ab8 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFT.c @@ -0,0 +1,327 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <stdint.h> +#include <sys/time.h> + +#include "SIMDBase.h" +#include "DFT.h" +#include "DFTUndiff.h" + +int32_t getModeParamInt_purec_float(int32_t paramId); +int32_t getModeParamInt_purec_double(int32_t paramId); +int32_t getModeParamInt_purec_longdouble(int32_t paramId); +int32_t getModeParamInt_sse_float(int32_t paramId); +int32_t getModeParamInt_sse2_double(int32_t paramId); +int32_t getModeParamInt_neon_float(int32_t paramId); +int32_t getModeParamInt_avx_float(int32_t paramId); +int32_t getModeParamInt_avx_double(int32_t paramId); +int32_t getModeParamInt_altivec_float(int32_t paramId); + +char * getModeParamString_purec_float(int32_t paramId); +char * getModeParamString_purec_double(int32_t paramId); +char * getModeParamString_purec_longdouble(int32_t paramId); +char * getModeParamString_sse_float(int32_t paramId); +char * getModeParamString_sse2_double(int32_t paramId); +char * getModeParamString_neon_float(int32_t paramId); +char * getModeParamString_avx_float(int32_t paramId); +char * getModeParamString_avx_double(int32_t paramId); +char * getModeParamString_altivec_float(int32_t paramId); + +void *makePlan_purec_float(uint64_t n, uint64_t flags); +void *makePlan_purec_double(uint64_t n, uint64_t flags); +void *makePlan_purec_longdouble(uint64_t n, uint64_t flags); +void *makePlan_sse_float(uint64_t n, uint64_t flags); +void *makePlan_sse2_double(uint64_t n, uint64_t flags); +void *makePlan_neon_float(uint64_t n, uint64_t flags); +void *makePlan_avx_float(uint64_t n, uint64_t flags); +void *makePlan_avx_double(uint64_t n, uint64_t flags); +void *makePlan_altivec_float(uint64_t n, uint64_t flags); + 
+void *makePlanSub_purec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_purec_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_purec_longdouble(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_sse_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_sse2_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_neon_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_avx_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_avx_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_altivec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); + +void destroyPlan_purec_float(void *p); +void destroyPlan_purec_double(void *p); +void destroyPlan_purec_longdouble(void *p); +void destroyPlan_sse_float(void *p); +void destroyPlan_sse2_double(void *p); +void destroyPlan_neon_float(void *p); +void destroyPlan_avx_float(void *p); +void destroyPlan_avx_double(void *p); +void destroyPlan_altivec_float(void *p); + +void execute_purec_float(void *p, void *s, int32_t dir); +void execute_purec_double(void *p, void *s, int32_t dir); +void execute_purec_longdouble(void *p, void *s, int32_t dir); +void execute_sse_float(void *p, void *s, int32_t dir); +void execute_sse2_double(void *p, void *s, int32_t dir); +void execute_neon_float(void *p, void *s, int32_t dir); +void execute_avx_float(void *p, void *s, int32_t dir); +void execute_avx_double(void *p, void *s, int32_t dir); +void execute_altivec_float(void *p, void *s, int32_t dir); + +void *DFT_init(int32_t mode, uint64_t n, uint64_t flags) { + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: return makePlan_purec_float(n, flags); break; +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 
2: return makePlan_purec_double(n, flags); break; +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: return makePlan_purec_longdouble(n, flags); break; +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: return makePlan_sse_float(n, flags); break; +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: return makePlan_sse2_double(n, flags); break; +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: return makePlan_neon_float(n, flags); break; +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: return makePlan_avx_float(n, flags); break; +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: return makePlan_avx_double(n, flags); break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: return makePlan_altivec_float(n, flags); break; +#endif + default: break; + } + + return NULL; +} + +void DFT_dispose(void *p, int32_t mode) { + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: destroyPlan_purec_float(p); break; +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: destroyPlan_purec_double(p); break; +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: destroyPlan_purec_longdouble(p); break; +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: destroyPlan_sse_float(p); break; +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: destroyPlan_sse2_double(p); break; +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: destroyPlan_neon_float(p); break; +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: destroyPlan_avx_float(p); break; +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: destroyPlan_avx_double(p); break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: destroyPlan_altivec_float(p); break; +#endif + default: break; + } +} + +void DFT_execute(void *p, int32_t mode, void *s, int32_t dir) { + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: return execute_purec_float(p, s, dir); break; +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: return execute_purec_double(p, s, dir); break; +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: return 
execute_purec_longdouble(p, s, dir); break; +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: return execute_sse_float(p, s, dir); break; +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: return execute_sse2_double(p, s, dir); break; +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: return execute_neon_float(p, s, dir); break; +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: return execute_avx_float(p, s, dir); break; +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: return execute_avx_double(p, s, dir); break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: return execute_altivec_float(p, s, dir); break; +#endif + default: break; + } +} + +#define FILE_FORMAT_VERSION 0 + +int32_t DFT_fwrite(void *p2, FILE *fp) { + DFTUndiff *p = (DFTUndiff *)p2; + if (p->magic != MAGIC_DFT) abort(); + + if (fprintf(fp, "nsfft file format : %d\n", FILE_FORMAT_VERSION) <= 0) return 0; + if (fprintf(fp, "arch : %s\n", SIMDBase_getProcessorNameString()) <= 0) return 0; + if (fprintf(fp, "computation mode : %d\n", p->mode) <= 0) return 0; + if (fprintf(fp, "length : %d\n", ((p->flags & DFT_FLAG_REAL) != 0 || (p->flags & DFT_FLAG_ALT_REAL) != 0)? 
p->length * 2 : p->length) <= 0) return 0; + if (fprintf(fp, "radix2 threshold : %d\n", p->radix2thres) <= 0) return 0; + if (fprintf(fp, "transpose : %d\n", p->flagTrans) <= 0) return 0; + if (fprintf(fp, "bit reversal : %d\n", p->useCobra) <= 0) return 0; + if (fprintf(fp, "flags : %llx\n", (unsigned long long int)p->flags) <= 0) return 0; + if (fprintf(fp, "%s\n", "end :") <= 0) return 0; + + return 1; +} + +static char *startsWith(char *str1, char *str2) { + if (strncmp(str1, str2, strlen(str2)) == 0) { + return str1 + strlen(str2); + } + + return NULL; +} + +DFT *DFT_fread(FILE *fp, int32_t *errcode) { + int length = -1, radix2thres = -1, flagTrans = -1, useCobra = -1; + int mode = -1, formatver = -1; + unsigned long long int flags = (1ULL << 63); + + if (errcode != NULL) *errcode = DFT_ERROR_NOERROR; + + for(;;) { + char buf[256], *q; + if (fgets(buf, 255, fp) == NULL) { if (errcode != NULL) *errcode = DFT_ERROR_UNEXPECTED_EOF; return NULL; } + + if ((q = startsWith(buf, "nsfft file format :")) != NULL) { + if (1 != sscanf(q, "%d", &formatver)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "computation mode :")) != NULL) { + if (1 != sscanf(q, "%d", &mode)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "length :")) != NULL) { + if (1 != sscanf(q, "%d", &length)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "radix2 threshold :")) != NULL) { + if (1 != sscanf(q, "%d", &radix2thres)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "transpose :")) != NULL) { + if (1 != sscanf(q, "%d", &flagTrans)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "bit reversal :")) != NULL) { + if (1 != sscanf(q, "%d", &useCobra)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if 
((q = startsWith(buf, "flags :")) != NULL) { + if (1 != sscanf(q, "%llx", &flags)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "end :")) != NULL) { + break; + } + } + + if (formatver > FILE_FORMAT_VERSION) { + if (errcode != NULL) *errcode = DFT_ERROR_FILE_VERSION; + return NULL; + } + + switch(SIMDBase_detect(mode)) { + case 1: + break; + case 0: + if (errcode != NULL) *errcode = DFT_ERROR_MODE_NOT_AVAILABLE; + return NULL; + case -1: + if (errcode != NULL) *errcode = DFT_ERROR_MODE_NOT_COMPILED_IN; + return NULL; + } + + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: return makePlanSub_purec_float(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: return makePlanSub_purec_double(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: return makePlanSub_purec_longdouble(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: return makePlanSub_sse_float(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: return makePlanSub_sse2_double(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: return makePlanSub_neon_float(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: return makePlanSub_avx_float(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: return makePlanSub_avx_double(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: return makePlanSub_altivec_float(length, radix2thres, useCobra, flags); +#endif + } + + if (errcode != NULL) *errcode = DFT_ERROR_UNKNOWN_MODE; + + return NULL; +} + +int32_t DFT_getPlanParamInt(int32_t paramId, void *p2) { + DFTUndiff *p = (DFTUndiff *)p2; + if (p->magic != MAGIC_DFT) abort(); + + switch(paramId) { + case DFT_PARAMID_MODE: return p->mode; + case 
DFT_PARAMID_FFT_LENGTH: + if ((p->flags & DFT_FLAG_REAL) != 0) return p->length * 2; + if ((p->flags & DFT_FLAG_ALT_REAL) != 0) return p->length * 2; + return p->length; + case DFT_PARAMID_IS_REAL_TRANSFORM: return (p->flags & DFT_FLAG_REAL) ? 1 : 0; + case DFT_PARAMID_IS_ALT_REAL_TRANSFORM: return (p->flags & DFT_FLAG_ALT_REAL) ? 1 : 0; + case DFT_PARAMID_NO_BIT_REVERSAL: return (p->flags & DFT_FLAG_NO_BITREVERSAL) ? 1 : 0; + case DFT_PARAMID_TEST_RUN: return p->flags & 3; + } + + return -1; +} + +#if 0 +char *DFT_getPlanParamString(int32_t paramId, void *p2) { + dft_t *p = (dft_t *)p2; + if (p->magic != MAGIC_NSDFT) abort(); + + return NULL; +} +#endif + +uint32_t DFT_ilog2(uint32_t q) { + static const uint32_t tab[] = {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4}; + uint32_t r = 0,qq; + + if (q & 0xffff0000) r = 16; + + q >>= r; + qq = q | (q >> 1); + qq |= (qq >> 2); + qq = ((qq & 0x10) >> 4) | ((qq & 0x100) >> 7) | ((qq & 0x1000) >> 10); + + return r + tab[qq] * 4 + tab[q >> (tab[qq] * 4)] - 1; +} + +double DFT_timeofday(void) { + struct timeval tp; + gettimeofday(&tp, NULL); + return (double)tp.tv_sec+(1e-6)*tp.tv_usec; +} diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.h b/plugins/supereq/nsfft-1.00/dft/DFT.h new file mode 100644 index 00000000..facb701a --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFT.h @@ -0,0 +1,56 @@ +#ifndef __DFT_H__ +#define __DFT_H__ + +#include <stdio.h> +#include <stdint.h> + +typedef void DFT; + +int32_t DFT_getParamInt(int32_t paramId); +char *DFT_getParamString(int32_t paramId); + +int32_t DFT_getModeParamInt(int32_t paramId, int32_t mode); +char *DFT_getModeParamString(int32_t paramId, int32_t mode); + +DFT *DFT_init(int32_t mode, uint64_t n, uint64_t flags); +void DFT_dispose(DFT *p, int32_t mode); + +int32_t DFT_fwrite(DFT *p, FILE *fp); +DFT *DFT_fread(FILE *fp, int32_t *errcode); + +int32_t DFT_getPlanParamInt(int32_t paramId, void *p); + +void DFT_execute(DFT *p, int32_t mode, void *s, int32_t dir); + +uint32_t 
DFT_ilog2(uint32_t q); +double DFT_timeofday(void); + +#define DFT_FLAG_NO_TEST_RUN ( 0ULL << 0) +#define DFT_FLAG_LIGHT_TEST_RUN ( 1ULL << 0) +#define DFT_FLAG_HEAVY_TEST_RUN ( 2ULL << 0) +#define DFT_FLAG_EXHAUSTIVE_TEST_RUN ( 3ULL << 0) + +#define DFT_FLAG_REAL (1ULL << 2) +#define DFT_FLAG_ALT_REAL (1ULL << 3) +#define DFT_FLAG_VERBOSE (1ULL << 4) +#define DFT_FLAG_NO_BITREVERSAL (1ULL << 5) +#define DFT_FLAG_FORCE_RECURSIVE (1ULL << 6) +#define DFT_FLAG_FORCE_COBRA (1ULL << 7) + +#define DFT_PARAMID_TYPE ( 1 | ( 3 << 24 )) +#define DFT_PARAMID_MODE ( 2 | ( 3 << 24 )) +#define DFT_PARAMID_FFT_LENGTH ( 3 | ( 3 << 24 )) +#define DFT_PARAMID_IS_REAL_TRANSFORM ( 4 | ( 3 << 24 )) +#define DFT_PARAMID_IS_ALT_REAL_TRANSFORM ( 5 | ( 3 << 24 )) +#define DFT_PARAMID_NO_BIT_REVERSAL ( 6 | ( 3 << 24 )) +#define DFT_PARAMID_TEST_RUN ( 7 | ( 3 << 24 )) + +#define DFT_ERROR_NOERROR 0 +#define DFT_ERROR_FILE_VERSION 1 +#define DFT_ERROR_FILE_IO 2 +#define DFT_ERROR_UNEXPECTED_EOF 3 +#define DFT_ERROR_MODE_NOT_COMPILED_IN 4 +#define DFT_ERROR_MODE_NOT_AVAILABLE 5 +#define DFT_ERROR_UNKNOWN_MODE 6 + +#endif diff --git a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c new file mode 100644 index 00000000..4985da33 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c @@ -0,0 +1,1807 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> + +#include "SIMDBase.h" +#include "SIMDBaseUndiff.h" +#include "DFT.h" +#include "DFTUndiff.h" + +// + +#define SIN(x) sin(x) +#define COS(x) cos(x) + +#define SQRT2_2 .7071067811865475244008443621048490392848359376884740365883398689953L + +#ifndef M_PIl +#define M_PIl 3.141592653589793238462643383279502884197169399375105820974944592307L +#endif + +// + +static inline void srBut2(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0, t1; + + t0 = SIMDBase_ADDm(&s[o ], &s[o+2]); t1 = SIMDBase_SUBm(&s[o ], &s[o+2]); + SIMDBase_STOR(&s[o ], 
t0); SIMDBase_STOR(&s[o+2], t1); + t0 = SIMDBase_ADDm(&s[o+1], &s[o+3]); t1 = SIMDBase_SUBm(&s[o+1], &s[o+3]); + SIMDBase_STOR(&s[o+1], t0); SIMDBase_STOR(&s[o+3], t1); +} + +static inline void srButForward4(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i; + + t0r = SIMDBase_ADDm(&s[o+0], &s[o+4]); t2r = SIMDBase_SUBm(&s[o+0], &s[o+4]); + t0i = SIMDBase_ADDm(&s[o+1], &s[o+5]); t2i = SIMDBase_SUBm(&s[o+1], &s[o+5]); + t1r = SIMDBase_ADDm(&s[o+2], &s[o+6]); t3i = SIMDBase_SUBm(&s[o+2], &s[o+6]); + t1i = SIMDBase_ADDm(&s[o+7], &s[o+3]); t3r = SIMDBase_SUBm(&s[o+7], &s[o+3]); + + SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(t0r, t1r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(t0i, t1i)); + SIMDBase_STOR(&s[o+2], SIMDBase_SUBi(t0r, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_SUBi(t0i, t1i)); + SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(t2r, t3r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(t2i, t3i)); + SIMDBase_STOR(&s[o+6], SIMDBase_ADDi(t2r, t3r)); SIMDBase_STOR(&s[o+7], SIMDBase_ADDi(t2i, t3i)); +} + +static inline void srButBackward4(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + + SIMDBase_VECT t0r, t0i, t1r, t1i; + SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+0]), s1 = SIMDBase_LOAD(&s[o+1]), s2 = SIMDBase_LOAD(&s[o+2]), s3 = SIMDBase_LOAD(&s[o+3]); + + t0r = SIMDBase_ADDi(s0, s2); t0i = SIMDBase_SUBi(s0, s2); s0 = t0r; s2 = t0i; + t0r = SIMDBase_ADDi(s1, s3); t0i = SIMDBase_SUBi(s1, s3); s1 = t0r; s3 = t0i; + t0r = SIMDBase_ADDm(&s[o+4], &s[o+6]); t1i = SIMDBase_SUBm(&s[o+4], &s[o+6]); + t0i = SIMDBase_ADDm(&s[o+7], &s[o+5]); t1r = SIMDBase_SUBm(&s[o+7], &s[o+5]); + + SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(s1, t0i)); + SIMDBase_STOR(&s[o+6], SIMDBase_SUBi(s2, t1r)); SIMDBase_STOR(&s[o+7], SIMDBase_SUBi(s3, t1i)); + SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(s0, t0r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(s1, t0i)); + SIMDBase_STOR(&s[o+2], SIMDBase_ADDi(s2, 
t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_ADDi(s3, t1i)); +} + +static inline void srButForward8(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i; + + SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]); + SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]); + SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]); + SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]); + + t2r = SIMDBase_SUBi(s0, s8); t2i = SIMDBase_SUBi(s1, s9); + t3r = SIMDBase_SUBi(sd, s5); t3i = SIMDBase_SUBi(s4, sc); + + s0 = SIMDBase_ADDi(s0, s8); s1 = SIMDBase_ADDi(s1, s9); + s4 = SIMDBase_ADDi(s4, sc); s5 = SIMDBase_ADDi(s5, sd); + + s8 = SIMDBase_SUBi(t2r, t3r); s9 = SIMDBase_SUBi(t2i, t3i); + sc = SIMDBase_ADDi(t2r, t3r); sd = SIMDBase_ADDi(t2i, t3i); + + t2r = SIMDBase_SUBi(s2, sa); t2i = SIMDBase_SUBi(s3, sb); + t3r = SIMDBase_SUBi(sf, s7); t3i = SIMDBase_SUBi(s6, se); + + s2 = SIMDBase_ADDi(s2, sa); s3 = SIMDBase_ADDi(s3, sb); + s6 = SIMDBase_ADDi(s6, se); s7 = SIMDBase_ADDi(s7, sf); + + t0r = SIMDBase_SUBi(t2r, t3r); t1r = SIMDBase_ADDi(t2r, t3r); + t0i = SIMDBase_SUBi(t2i, t3i); t1i = SIMDBase_ADDi(t2i, t3i); + + sa = SIMDBase_MULi(SIMDBase_ADDi(t0r, t0i), SIMDBase_SET1( SQRT2_2)); + sb = SIMDBase_MULi(SIMDBase_SUBi(t0i, t0r), SIMDBase_SET1( SQRT2_2)); + se = SIMDBase_MULi(SIMDBase_SUBi(t1i, t1r), SIMDBase_SET1( SQRT2_2)); + sf = SIMDBase_MULi(SIMDBase_ADDi(t1r, t1i), SIMDBase_SET1(-SQRT2_2)); + + SIMDBase_STOR(&s[o+ 8], SIMDBase_ADDi(s8, sa)); SIMDBase_STOR(&s[o+ 9], SIMDBase_ADDi(s9, sb)); + SIMDBase_STOR(&s[o+10], SIMDBase_SUBi(s8, sa)); SIMDBase_STOR(&s[o+11], SIMDBase_SUBi(s9, sb)); + + 
SIMDBase_STOR(&s[o+12], SIMDBase_ADDi(sc, se)); SIMDBase_STOR(&s[o+13], SIMDBase_ADDi(sd, sf)); + SIMDBase_STOR(&s[o+14], SIMDBase_SUBi(sc, se)); SIMDBase_STOR(&s[o+15], SIMDBase_SUBi(sd, sf)); + + t0r = SIMDBase_ADDi(s0, s4); t2r = SIMDBase_SUBi(s0, s4); + t0i = SIMDBase_ADDi(s1, s5); t2i = SIMDBase_SUBi(s1, s5); + + t1r = SIMDBase_ADDi(s2, s6); t3i = SIMDBase_SUBi(s2, s6); + t1i = SIMDBase_ADDi(s3, s7); t3r = SIMDBase_SUBi(s7, s3); + + SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(t0r, t1r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(t0i, t1i)); + SIMDBase_STOR(&s[o+2], SIMDBase_SUBi(t0r, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_SUBi(t0i, t1i)); + SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(t2r, t3r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(t2i, t3i)); + SIMDBase_STOR(&s[o+6], SIMDBase_ADDi(t2r, t3r)); SIMDBase_STOR(&s[o+7], SIMDBase_ADDi(t2i, t3i)); +} + +static void srButBackward8(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0r, t0i, t1r, t1i; + + SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]); + SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]); + SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]); + SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]); + + t0r = SIMDBase_ADDi(s8, sa); t0i = SIMDBase_SUBi(s8, sa); s8 = t0r; sa = t0i; + t0r = SIMDBase_ADDi(s9, sb); t0i = SIMDBase_SUBi(s9, sb); s9 = t0r; sb = t0i; + t0r = SIMDBase_ADDi(sc, se); t0i = SIMDBase_SUBi(sc, se); sc = t0r; se = t0i; + t0r = SIMDBase_ADDi(sd, sf); t0i = SIMDBase_SUBi(sd, sf); sd = t0r; sf = t0i; + t0r = SIMDBase_ADDi(s0, s2); t0i = SIMDBase_SUBi(s0, s2); s0 = t0r; s2 = t0i; + t0r = SIMDBase_ADDi(s1, s3); t0i = SIMDBase_SUBi(s1, s3); s1 = t0r; s3 
= t0i; + + t0r = SIMDBase_ADDi(s4, s6); t0i = SIMDBase_ADDi(s7, s5); + t1r = SIMDBase_SUBi(s7, s5); t1i = SIMDBase_SUBi(s4, s6); + + s4 = SIMDBase_SUBi(s0, t0r); s5 = SIMDBase_SUBi(s1, t0i); + s6 = SIMDBase_SUBi(s2, t1r); s7 = SIMDBase_SUBi(s3, t1i); + s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i); + s2 = SIMDBase_ADDi(s2, t1r); s3 = SIMDBase_ADDi(s3, t1i); + + t0r = SIMDBase_ADDi(s8, sc); t0i = SIMDBase_ADDi(s9, sd); + t1r = SIMDBase_SUBi(sd, s9); t1i = SIMDBase_SUBi(s8, sc); + + s8 = SIMDBase_SUBi(s0, t0r); s9 = SIMDBase_SUBi(s1, t0i); + sc = SIMDBase_SUBi(s4, t1r); sd = SIMDBase_SUBi(s5, t1i); + s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i); + s4 = SIMDBase_ADDi(s4, t1r); s5 = SIMDBase_ADDi(s5, t1i); + + t0r = SIMDBase_MULi(SIMDBase_SUBi(sa, sb), SIMDBase_SET1( SQRT2_2)); + t0i = SIMDBase_MULi(SIMDBase_ADDi(sa, sb), SIMDBase_SET1( SQRT2_2)); + t1r = SIMDBase_MULi(SIMDBase_ADDi(se, sf), SIMDBase_SET1(-SQRT2_2)); + t1i = SIMDBase_MULi(SIMDBase_SUBi(se, sf), SIMDBase_SET1( SQRT2_2)); + + sa = t0r; sb = t0i; se = t1r; sf = t1i; + + t0r = SIMDBase_ADDi(sa, se); t0i = SIMDBase_ADDi(sb, sf); + t1r = SIMDBase_SUBi(sf, sb); t1i = SIMDBase_SUBi(sa, se); + + sa = SIMDBase_SUBi(s2, t0r); sb = SIMDBase_SUBi(s3, t0i); + se = SIMDBase_SUBi(s6, t1r); sf = SIMDBase_SUBi(s7, t1i); + s2 = SIMDBase_ADDi(s2, t0r); s3 = SIMDBase_ADDi(s3, t0i); + s6 = SIMDBase_ADDi(s6, t1r); s7 = SIMDBase_ADDi(s7, t1i); + + SIMDBase_STOR(&s[o+ 0], s0); SIMDBase_STOR(&s[o+ 1], s1); SIMDBase_STOR(&s[o+ 2], s2); SIMDBase_STOR(&s[o+ 3], s3); + SIMDBase_STOR(&s[o+ 4], s4); SIMDBase_STOR(&s[o+ 5], s5); SIMDBase_STOR(&s[o+ 6], s6); SIMDBase_STOR(&s[o+ 7], s7); + SIMDBase_STOR(&s[o+ 8], s8); SIMDBase_STOR(&s[o+ 9], s9); SIMDBase_STOR(&s[o+10], sa); SIMDBase_STOR(&s[o+11], sb); + SIMDBase_STOR(&s[o+12], sc); SIMDBase_STOR(&s[o+13], sd); SIMDBase_STOR(&s[o+14], se); SIMDBase_STOR(&s[o+15], sf); +} + +#if 0 +static inline void srButForwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + 
SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + s00 = SIMDBase_LOAD(&s[i0+0]), s01 = SIMDBase_LOAD(&s[i0+1]); + s10 = SIMDBase_LOAD(&s[i1+0]), s11 = SIMDBase_LOAD(&s[i1+1]); + s20 = SIMDBase_LOAD(&s[i2+0]), s21 = SIMDBase_LOAD(&s[i2+1]); + s30 = SIMDBase_LOAD(&s[i3+0]), s31 = SIMDBase_LOAD(&s[i3+1]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]); + a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]); + + SIMDBase_STOR(&s[i0 ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1 ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2 ], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2 ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+1], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + i0 
+= 2; i1 += 2; i2 += 2; i3 += 2; + p0 += 4; + } +} +#endif + +#if 0 +static inline void srButBackwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i, u, v; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]); + a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]); + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); + + s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]); + a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]); + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]); + s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]); + + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+0], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, t1i)); + + i0 += 2; i1 += 2; i2 += 2; i3 += 2; + p0 += 4; + } +} + +static void srButBackwardSubUnrolled(DFTUndiff *p) { + srButBackwardSub(p); +} +#endif + +static inline void srButForwardSubUnrolled(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + SIMDBase_REAL *tbl = 
p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + // + + s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]); + s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]); + s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]); + s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]); + a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]); + + SIMDBase_STOR(&s[i0 ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1 ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2 ], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2 ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+1], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + s00 = 
SIMDBase_LOAD(&s[i0+2]); s01 = SIMDBase_LOAD(&s[i0+3]); + s10 = SIMDBase_LOAD(&s[i1+2]); s11 = SIMDBase_LOAD(&s[i1+3]); + s20 = SIMDBase_LOAD(&s[i2+2]); s21 = SIMDBase_LOAD(&s[i2+3]); + s30 = SIMDBase_LOAD(&s[i3+2]); s31 = SIMDBase_LOAD(&s[i3+3]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+4]); a1 = SIMDBase_LOAD1(&tbl[p0+5]); + a2 = SIMDBase_LOAD1(&tbl[p0+6]); a3 = SIMDBase_LOAD1(&tbl[p0+7]); + + SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1+2], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+3], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+3], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+2], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+3], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2+2], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+3], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+2], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+3], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + s00 = SIMDBase_LOAD(&s[i0+4]); s01 = SIMDBase_LOAD(&s[i0+5]); + s10 = SIMDBase_LOAD(&s[i1+4]); s11 = SIMDBase_LOAD(&s[i1+5]); + s20 = SIMDBase_LOAD(&s[i2+4]); s21 = SIMDBase_LOAD(&s[i2+5]); + s30 = SIMDBase_LOAD(&s[i3+4]); s31 = SIMDBase_LOAD(&s[i3+5]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), 
SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]); + a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]); + + SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1+4], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+5], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+5], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+4], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+5], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2+4], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+5], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+4], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+5], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + s00 = SIMDBase_LOAD(&s[i0+6]); s01 = SIMDBase_LOAD(&s[i0+7]); + s10 = SIMDBase_LOAD(&s[i1+6]); s11 = SIMDBase_LOAD(&s[i1+7]); + s20 = SIMDBase_LOAD(&s[i2+6]); s21 = SIMDBase_LOAD(&s[i2+7]); + s30 = SIMDBase_LOAD(&s[i3+6]); s31 = SIMDBase_LOAD(&s[i3+7]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]); + a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]); + + SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s00, s20)); 
SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1+6], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+7], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+7], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+6], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+7], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2+6], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+7], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+6], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+7], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + i0 += 8; i1 += 8; i2 += 8; i3 += 8; + p0 += 16; + } +} + +#if 1 +static void srButBackwardSubUnrolled(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i, u, v; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + // + + s20 = SIMDBase_LOAD(&s[i2+ 0]); s21 = SIMDBase_LOAD(&s[i2+ 1]); + a0 = SIMDBase_LOAD1(&tbl[p0+ 0]); a1 = SIMDBase_LOAD1(&tbl[p0+ 1]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 0]); s31 = SIMDBase_LOAD(&s[i3+ 1]); + a2 = SIMDBase_LOAD1(&tbl[p0+ 2]); a3 = SIMDBase_LOAD1(&tbl[p0+ 3]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = 
SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 0]); s01 = SIMDBase_LOAD(&s[i0+ 1]); + s10 = SIMDBase_LOAD(&s[i1+ 0]); s11 = SIMDBase_LOAD(&s[i1+ 1]); + + SIMDBase_STOR(&s[i2+ 0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 0], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 1], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+ 0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 0], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 1], SIMDBase_ADDi(s11, t1i)); + + // + + s20 = SIMDBase_LOAD(&s[i2+ 2]); s21 = SIMDBase_LOAD(&s[i2+ 3]); + a0 = SIMDBase_LOAD1(&tbl[p0+ 4]); a1 = SIMDBase_LOAD1(&tbl[p0+ 5]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 2]); s31 = SIMDBase_LOAD(&s[i3+ 3]); + a2 = SIMDBase_LOAD1(&tbl[p0+ 6]); a3 = SIMDBase_LOAD1(&tbl[p0+ 7]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, 
SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 2]); s01 = SIMDBase_LOAD(&s[i0+ 3]); + s10 = SIMDBase_LOAD(&s[i1+ 2]); s11 = SIMDBase_LOAD(&s[i1+ 3]); + + SIMDBase_STOR(&s[i2+ 2], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 2], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 3], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 3], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+ 2], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 2], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 3], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 3], SIMDBase_ADDi(s11, t1i)); + + // + + s20 = SIMDBase_LOAD(&s[i2+ 4]); s21 = SIMDBase_LOAD(&s[i2+ 5]); + a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 4]); s31 = SIMDBase_LOAD(&s[i3+ 5]); + a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 4]); s01 = SIMDBase_LOAD(&s[i0+ 5]); + s10 = SIMDBase_LOAD(&s[i1+ 4]); s11 = SIMDBase_LOAD(&s[i1+ 5]); + + SIMDBase_STOR(&s[i2+ 4], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 4], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 5], SIMDBase_SUBi(s01, t0i)); 
SIMDBase_STOR(&s[i0+ 5], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+ 4], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 4], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 5], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 5], SIMDBase_ADDi(s11, t1i)); + + // + + s20 = SIMDBase_LOAD(&s[i2+ 6]); s21 = SIMDBase_LOAD(&s[i2+ 7]); + a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 6]); s31 = SIMDBase_LOAD(&s[i3+ 7]); + a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 6]); s01 = SIMDBase_LOAD(&s[i0+ 7]); + s10 = SIMDBase_LOAD(&s[i1+ 6]); s11 = SIMDBase_LOAD(&s[i1+ 7]); + + SIMDBase_STOR(&s[i2+ 6], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 6], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 7], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 7], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+ 6], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 6], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 7], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 7], SIMDBase_ADDi(s11, t1i)); + + // + + i0 += 8; i1 += 8; i2 += 8; i3 += 8; + p0 += 16; + } +} +#endif + +static void r2ButForwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = 
p->s; + + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i2 = i0 + p->stride*2; + int32_t cp = 0, sp = p->butlen/4; + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+0], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+2], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+4], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = 
SIMDBase_LOAD1(&tbl[sp-3]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+6], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + // + + i0 += 8; i2 += 8; cp += 4; sp -= 4; + } while(sp > 0); + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+1], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = SIMDBase_LOAD1(&tbl[sp+1]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+3], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0i, 
t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+5], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+7], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + // + + i0 += 8; i2 += 8; cp -= 4; sp += 4; + } while(cp > 0); +} + +static void r2ButBackwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int i0 = p->offset1; + int i2 = i0 + p->stride*2; + + int cp = 0, sp = p->butlen/4; + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r)); + 
SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = SIMDBase_LOAD1(&tbl[sp-3]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i)); + + i0 += 8; i2 += 8; cp += 4; sp -= 4; + } while(sp > 0); + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = 
SIMDBase_LOAD1(&tbl[sp+1]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i)); + + i0 += 8; i2 += 8; cp -= 4; sp += 4; + } while(cp > 0); +} + +static void srButForward16(DFTUndiff *p) { + int32_t o = p->offset1; + + p->butlen = 16; p->log2butlen = 4; p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + 16*6/4; + srButForward4(p); + + p->offset1 = o + 16*4/4; + srButForward4(p); + + p->offset1 = o; + srButForward8(p); +} + +static void srButBackward16(DFTUndiff *p) { + int32_t o = p->offset1; + + p->offset1 = o + 16*6/4; + srButBackward4(p); + + p->offset1 = o + 16*4/4; + 
srButBackward4(p); + + p->offset1 = o; + srButBackward8(p); + + p->butlen = 16; p->log2butlen = 4; p->stride = p->butlen/2; + srButBackwardSubUnrolled(p); +} + +static void srButForward32(DFTUndiff *p) { + int32_t o = p->offset1; + + p->butlen = 32; p->log2butlen = 5; p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + 32*6/4; + srButForward8 (p); + + p->offset1 = o + 32*4/4; + srButForward8 (p); + + p->offset1 = o; + srButForward16(p); +} + +static void srButBackward32(DFTUndiff *p) { + int32_t o = p->offset1; + + p->offset1 = o + 32*6/4; + srButBackward8 (p); + + p->offset1 = o + 32*4/4; + srButBackward8 (p); + + p->offset1 = o; + srButBackward16(p); + + p->butlen = 32; p->log2butlen = 5; p->stride = p->butlen/2; + srButBackwardSubUnrolled(p); +} + +// + +#if 1 +static inline void bitReversalUnit(SIMDBase_VECT *p, SIMDBase_VECT *q) { + SIMDBase_VECT w, x, y, z; + + w = SIMDBase_LOAD(p); x = SIMDBase_LOAD(p+1); + y = SIMDBase_LOAD(q); z = SIMDBase_LOAD(q+1); + + SIMDBase_STOR(q, w); SIMDBase_STOR(q+1, x); + SIMDBase_STOR(p, y); SIMDBase_STOR(p+1, z); +} +#else +#define bitReversalUnit(p0, q0) { \ + SIMDBase_VECT *px = (p0), *qx = (q0); \ + SIMDBase_VECT wx, xx, yx, zx; \ + \ + wx = SIMDBase_LOAD(px); xx = SIMDBase_LOAD(px+1); \ + yx = SIMDBase_LOAD(qx); zx = SIMDBase_LOAD(qx+1); \ + \ + SIMDBase_STOR(qx, wx); SIMDBase_STOR(qx+1, xx); \ + SIMDBase_STOR(px, yx); SIMDBase_STOR(px+1, zx); \ +} +#endif + +static inline void bitReversal4s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int b1 = sc*2*1, b2 = b1*2; + p += b1; q += b2; + bitReversalUnit(p, q); +} + +static inline void bitReversal8s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int b1 = sc*2*1, b2 = b1*2, b4 = b2*2; + p += b1; q += b4; + bitReversalUnit(p, q); p += b2; q += b2; + bitReversalUnit(p, q); +} + +static inline void bitReversal8d(SIMDBase_VECT *s, int32_t sc, 
int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2; + bitReversalUnit(p, q); p += b1; q += b4; + bitReversalUnit(p, q); p += b2; q += b2; + bitReversalUnit(p, q); p -= b1; q -= b4; + bitReversalUnit(p, q); p += b4; q += b1; + bitReversalUnit(p, q); p += b1; q += b4; + bitReversalUnit(p, q); p -= b2; q -= b2; + bitReversalUnit(p, q); p -= b1; q -= b4; + bitReversalUnit(p, q); +} + +static inline void bitReversal16s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2; + p += b1; q += b8; + bitReversalUnit(p, q); p += b2; q += b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p += b1 + b4; q += b2 + b8; + bitReversalUnit(p, q); p -= b2; q -= b4; + bitReversalUnit(p, q); p += b2 + b4; q += b1 + b2; + bitReversalUnit(p, q); +} + +static inline void bitReversal16d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p += b2; q += b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p += b4; q += b2; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p -= b2; q -= b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p += b8; q += b1; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p += b2; q += b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p -= b4; q -= b2; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p -= b2; q -= b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); +} + +static inline void bitReversal32s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2, b16 = b8*2; 
+ p += b1; q += b16; + bitReversalUnit(p, q); p += b2; q += b8; + bitReversalUnit(p, q); p -= b1; q -= b16; + bitReversalUnit(p, q); p += b4; q += b4; + bitReversalUnit(p, q); p += b1; q += b16; + bitReversalUnit(p, q); p -= b2; q -= b8; + bitReversalUnit(p, q); p += b8; q += b2; + bitReversalUnit(p, q); p += b2; q += b8; + bitReversalUnit(p, q); p -= b4; q -= b4; + bitReversalUnit(p, q); p -= b2; q -= b8; + bitReversalUnit(p, q); p += b16 - b2; q += b1 + b2 + b8; + bitReversalUnit(p, q); p -= b4; q -= b4; + bitReversalUnit(p, q); +} + +static void bitReversal32d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + const int32_t k = 32; + + bitReversal8d(s,2*sc, sc*(k/2 )+o1, sc* 1 +o2); + bitReversal8d(s,2*sc, sc* 0 +o1, sc* 0 +o2); + bitReversal8d(s,2*sc, sc* 1 +o1, sc*(k/2 )+o2); + bitReversal8d(s,2*sc, sc*(k/2+1)+o1, sc*(k/2+1)+o2); +} + +static void bitReversalRecursive(SIMDBase_VECT *s, int32_t n, int32_t sc, int32_t o1, int32_t o2) { + if (n >= 64) { + if (o1 != o2) bitReversalRecursive(s, n/4, 2*sc, sc*(n/2)+o1, sc*1+o2); + + bitReversalRecursive(s, n/4, 2*sc, sc* 0 +o1, sc* 0 +o2); + bitReversalRecursive(s, n/4, 2*sc, sc* 1 +o1, sc*(n/2 )+o2); + bitReversalRecursive(s, n/4, 2*sc, sc*(n/2+1)+o1, sc*(n/2+1)+o2); + } else { + if (o1 == o2) { + switch(n) { + case 4: bitReversal4s (s,sc,o1,o2); return; + case 8: bitReversal8s (s,sc,o1,o2); return; + case 16: bitReversal16s(s,sc,o1,o2); return; + case 32: bitReversal32s(s,sc,o1,o2); return; + } + } else { + switch(n) { + case 8: bitReversal8d (s,sc,o1,o2); return; + case 16: bitReversal16d(s,sc,o1,o2); return; + case 32: bitReversal32d(s,sc,o1,o2); return; + } + } + } +} + +// + +static int bitR(int a, int logN) { + int ret = 0; + int i,j,k; + for(i=0,j=1,k=1<<(logN-1);i<logN;i++,j=j<<1,k=k>>1) { + if ((a & j) != 0) ret |= k; + } + return ret; +} + +static void bitReversalCobraInplace(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int cobraQ = p->cobraQ; + SIMDBase_VECT *cobraT = p->cobraT; + int *cobraR = 
p->cobraR; + int logN = p->log2len; + + int b; + + for(b=0;b<(1 << (logN-2*cobraQ));b++) { + int a,c; + int b2 = bitR(b, logN-2*cobraQ); + + if (b2 < b) continue; + + if (b2 == b) { + for(a=0;a<(1 << cobraQ);a++) { + int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1); + + int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2; + + while(a2c < a2cm) { + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + 
SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + } + } + + for(c=0;c<(1 << cobraQ);c++) { + int c2 = cobraR[c]; + int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1); + + int a2c = c << 1; + int a2ci = 1 << (cobraQ+1); + int c2b2a2m = c2b2a2 + (1 << cobraQ)*2; + + while(c2b2a2 < c2b2a2m) { + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], 
SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + } + } + } else { + for(a=0;a<(1 << cobraQ);a++) { + int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2; + int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1); + + while(a2c < a2cm) { + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], 
SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + } + } + + for(c=0;c<(1 << cobraQ);c++) { + int c2 = cobraR[c]; + int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1); + + int a2c = c << 1; + int a2ci = 1 << (cobraQ+1); + int c2b2a2m = c2b2a2 + (1 << cobraQ)*2; + + while(c2b2a2 < c2b2a2m) { + SIMDBase_VECT t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); 
SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c 
])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + } + } + + for(a=0;a<(1 << cobraQ);a++) { + int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2; + int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1); + + while(a2c < a2cm) { + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); 
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + } + } + } + } +} + +// + +static void srForwardMain2(DFTUndiff *p) { + int32_t o = p->offset1; + int32_t butlen = p->butlen; + int32_t log2butlen = p->log2butlen; + + if (butlen >= p->radix2thres) { + p->stride = p->butlen/2; + r2ButForwardSub(p); + + p->offset1 = o + butlen*4/4; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2(p); + + return; + } + + if (butlen >= 
256) { + p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + butlen*6/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srForwardMain2(p); + + p->offset1 = o + butlen*4/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srForwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2(p); + + return; + } + + if (butlen == 128) { + p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + butlen*6/4; + srButForward32(p); + + p->offset1 = o + butlen*4/4; + srButForward32(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2 (p); + + return; + } + + // butlen == 64 + + p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + butlen*6/4; + srButForward16(p); + + p->offset1 = o + butlen*4/4; + srButForward16(p); + + p->offset1 = o; + srButForward32(p); +} + +static void srBackwardMain2(DFTUndiff *p) { + int32_t o = p->offset1; + int32_t butlen = p->butlen; + int32_t log2butlen = p->log2butlen; + + if (butlen >= p->radix2thres) { + p->offset1 = o + butlen*4/4; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2(p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + r2ButBackwardSub(p); + + return; + } + + if (butlen >= 256) { + p->offset1 = o + butlen*6/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srBackwardMain2(p); + + p->offset1 = o + butlen*4/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srBackwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2(p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + srButBackwardSubUnrolled(p); + + return; + } + + if (butlen == 128) { + p->offset1 = o + butlen*6/4; + srButBackward32(p); + + p->offset1 
= o + butlen*4/4; + srButBackward32(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2 (p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + srButBackwardSubUnrolled(p); + + return; + } + + // butlen == 64 + + p->offset1 = o + butlen*6/4; + srButBackward16(p); + + p->offset1 = o + butlen*4/4; + srButBackward16(p); + + p->offset1 = o; + srButBackward32(p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + srButBackwardSubUnrolled(p); +} + +static void srForwardMain(DFTUndiff *p) { + if (p->length >= 64) { + p->butlen = p->length; + p->log2butlen = p->log2len; + p->offset1 = p->offset2 = 0; + + srForwardMain2(p); + } else { + switch(p->length) { + case 32: + srButForward32(p); + break; + case 16: + srButForward16(p); + break; + case 8: + srButForward8(p); + break; + case 4: + srButForward4(p); + break; + case 2: + srBut2(p); + break; + } + } +} + +static void srBackwardMain(DFTUndiff *p) { + if (p->length >= 64) { + p->butlen = p->length; + p->log2butlen = p->log2len; + p->offset1 = p->offset2 = 0; + + srBackwardMain2(p); + } else { + switch(p->length) { + case 32: + srButBackward32(p); + break; + case 16: + srButBackward16(p); + break; + case 8: + srButBackward8(p); + break; + case 4: + srButBackward4(p); + break; + case 2: + srBut2(p); + break; + } + } +} + +static void realSub0(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) { + SIMDBase_VECT tr, ti, ur, ui, mr, mi; + int32_t n = p->length*2; + int32_t k; + + for(k=1;k<n/4;k++) { + SIMDBase_VECT s00 = SIMDBase_LOAD(&s[k*2+0]), s01 = SIMDBase_LOAD(&s[k*2+1]); + SIMDBase_VECT s10 = SIMDBase_LOAD(&s[(n/2-k)*2+0]), s11 = SIMDBase_LOAD(&s[(n/2-k)*2+1]); + + tr = SIMDBase_SUBi(s00, s10); ti = SIMDBase_ADDi(s01, s11); + ur = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+0])); + ui = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+1])); + mr = SIMDBase_SUBi(SIMDBase_MULi(tr, ur), SIMDBase_MULi(ti, ui)); + mi = 
SIMDBase_ADDi(SIMDBase_MULi(tr, ui), SIMDBase_MULi(ti, ur)); + SIMDBase_STOR(&s[k*2+0], SIMDBase_SUBi(s00, mr)); + SIMDBase_STOR(&s[k*2+1], SIMDBase_SUBi(s01, mi)); + SIMDBase_STOR(&s[(n/2-k)*2+0], SIMDBase_ADDi(s10, mr)); + SIMDBase_STOR(&s[(n/2-k)*2+1], SIMDBase_SUBi(s11, mi)); + } + + tr = SIMDBase_LOAD(&s[0]); ti = SIMDBase_LOAD(&s[1]); + SIMDBase_STOR(&s[0], SIMDBase_ADDi(tr, ti)); + SIMDBase_STOR(&s[1], SIMDBase_SUBi(tr, ti)); +} + +static void realSub1(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) { + SIMDBase_VECT tr, ti, ur, ui, mr, mi; + int32_t n = p->length*2; + int32_t k; + + tr = SIMDBase_LOAD(&s[0]); ti = SIMDBase_LOAD(&s[1]); + SIMDBase_STOR(&s[0], SIMDBase_MULi(SIMDBase_ADDi(tr, ti), SIMDBase_SET1(0.5))); + SIMDBase_STOR(&s[1], SIMDBase_MULi(SIMDBase_SUBi(tr, ti), SIMDBase_SET1(0.5))); + + for(k=1;k<n/4;k++) { + SIMDBase_VECT s00 = SIMDBase_LOAD(&s[k*2+0]), s01 = SIMDBase_LOAD(&s[k*2+1]); + SIMDBase_VECT s10 = SIMDBase_LOAD(&s[(n/2-k)*2+0]), s11 = SIMDBase_LOAD(&s[(n/2-k)*2+1]); + + tr = SIMDBase_SUBi(s00, s10); ti = SIMDBase_ADDi(s01, s11); + ur = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+0])); + ui = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+1])); + mr = SIMDBase_SUBi(SIMDBase_MULi(tr, ur), SIMDBase_MULi(ti, ui)); + mi = SIMDBase_ADDi(SIMDBase_MULi(tr, ui), SIMDBase_MULi(ti, ur)); + tr = SIMDBase_SUBi(s00, mr); ti = SIMDBase_SUBi(mi, s01); + SIMDBase_STOR(&s[k*2+0], SIMDBase_ADDi(mr, s10)); + SIMDBase_STOR(&s[k*2+1], SIMDBase_SUBi(mi, s11)); + SIMDBase_STOR(&s[(n/2-k)*2+0], tr); + SIMDBase_STOR(&s[(n/2-k)*2+1], ti); + } +} + +void DFTUndiff_EXECUTE(void *p2, void *s2, int32_t dir) { + DFTUndiff *p = (DFTUndiff *)p2; + SIMDBase_VECT *s = (SIMDBase_VECT *)s2; + + if (p->magic != MAGIC_DFT) abort(); + + p->s = s; + + if (dir == -1) { + if ((p->flags & DFT_FLAG_ALT_REAL) != 0) { + realSub1(p, s, 0); + } + + srForwardMain(p); + + if ((p->flags & DFT_FLAG_NO_BITREVERSAL) == 0) { + if (p->useCobra) { + bitReversalCobraInplace(p); + } else { + 
bitReversalRecursive(p->s, p->length, 1, 0, 0); + } + } + + if ((p->flags & DFT_FLAG_REAL) != 0) { + realSub0(p, s, 0); + s[p->length+1] = SIMDBase_NEGi(s[p->length+1]); + } + } else { + if ((p->flags & DFT_FLAG_REAL) != 0) { + s[p->length+1] = SIMDBase_NEGi(s[p->length+1]); + realSub1(p, s, 1); + } + + if ((p->flags & DFT_FLAG_NO_BITREVERSAL) == 0) { + if (p->useCobra) { + bitReversalCobraInplace(p); + } else { + bitReversalRecursive(p->s, p->length, 1, 0, 0); + } + } + + srBackwardMain(p); + + if ((p->flags & DFT_FLAG_ALT_REAL) != 0) { + realSub0(p, s, 1); + } + } +} + +void DFTUndiff_DESTROYPLAN(void *p2) { + DFTUndiff *plan = (DFTUndiff *)p2; + if (plan->magic != MAGIC_DFT) abort(); + + free(*(plan->ptTable)); + free(plan->ptTable); + free(plan->cobraT); + free(plan->cobraR); + //free(plan->t); + if (plan->rtTable != NULL) { + free(plan->rtTable[0]); + free(plan->rtTable[1]); + free(plan->rtTable); + } + + plan->magic = 0; + free(plan); +} + +DFTUndiff *DFTUndiff_MAKEPLANSUB(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags) { + int32_t i, j, k; + + uint32_t linesize = SIMDBase_sizeOfCachelineInByte(); + uint32_t cachesize = SIMDBase_sizeOfDataCacheInByte(); + + // + + if ((flags & DFT_FLAG_REAL) != 0 || (flags & DFT_FLAG_ALT_REAL) != 0) n /= 2; + + DFTUndiff *d = calloc(1, sizeof(DFTUndiff)); + + d->magic = MAGIC_DFT; + d->mode = SIMDBase_MODE; + d->flags = flags; + + d->radix2thres = radix2thres; + d->useCobra = useCobra; + + d->length = (uint32_t) n; + d->log2len = DFT_ilog2((uint32_t) n); + + // + + SIMDBase_REAL *trigTable = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*n*2); + d->ptTable = malloc(sizeof(SIMDBase_REAL *) * (d->log2len+1)); + + SIMDBase_REAL *p = trigTable, **pp = d->ptTable; + + for(j=0;j<(int32_t)d->log2len+1;j++) { + *pp++ = p; + + if ((1 << j) >= d->radix2thres) { + for(i=0;i<(1 << j)/4+1;i++) { + *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j)); + } + const int32_t step = linesize / sizeof(SIMDBase_REAL); + p += (step - (p 
- trigTable) % step) % step; + } else { + for(i=0;i<(1 << j)/4;i++) { + *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j)); + *p++ = (SIMDBase_REAL)SIN(-2*M_PIl*i/(1 << j)); + *p++ = (SIMDBase_REAL)COS(-6*M_PIl*i/(1 << j)); + *p++ = (SIMDBase_REAL)SIN(-6*M_PIl*i/(1 << j)); + } + } + } + + // + + int32_t cobraQ; + + cobraQ = linesize / (sizeof(SIMDBase_VECT) * 2); + + for(;;) { + if (1 << (cobraQ*2) > + (cachesize / (sizeof(SIMDBase_VECT) * 2)/2)) + break; + + cobraQ++; + } + cobraQ--; + + d->cobraQ = cobraQ; + + if (cobraQ >= 4 && d->log2len >= 2*cobraQ) { + SIMDBase_VECT *cobraT; + int32_t *cobraR; + + if (d->log2len <= 2*cobraQ) cobraQ = d->log2len / 2; + + cobraT = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*2 * (1 << (cobraQ*2))); + cobraR = (int32_t *)SIMDBase_alignedMalloc(sizeof(int32_t) * (1 << cobraQ)); + + for(i=0;i<(1 << cobraQ);i++) cobraR[i] = bitR(i, cobraQ); + + d->cobraT = cobraT; d->cobraR = cobraR; + } else { + d->useCobra = 0; + } + + // + + if ((d->flags & DFT_FLAG_REAL) != 0 || (d->flags & DFT_FLAG_ALT_REAL) != 0) { + int32_t m = n*2; + + d->rtTable = malloc(sizeof(SIMDBase_REAL *)*2); + d->rtTable[0] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2); + d->rtTable[1] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2); + + for(k=0;k<m/4;k++) { + d->rtTable[0][k*2+0] = 0.5-0.5*SIN(-2*M_PIl*k/m); + d->rtTable[0][k*2+1] = 0.5*COS(-2*M_PIl*k/m); + d->rtTable[1][k*2+0] = 0.5-0.5*SIN( 2*M_PIl*k/m); + d->rtTable[1][k*2+1] = 0.5*COS( 2*M_PIl*k/m); + } + } + + // + + return (void *)d; +} + +void *DFTUndiff_MAKEPLAN(uint64_t n, uint64_t flags) { + if (flags & DFT_FLAG_VERBOSE) { + printf("\n--------------------------------\n"); + printf("Making plan, mode = %s, dft length = %d\n", SIMDBase_NAME, (int)n); + printf("Processor : %s\n", SIMDBase_getProcessorNameString()); + printf("Cache size (L2 + L3) : %d kbytes / thread\n", SIMDBase_sizeOfDataCacheInByte() / 1024); + printf("Cache Line Size : %d bytes\n", SIMDBase_sizeOfCachelineInByte()); + } + + if (n <= 
256 || (flags & 3) == 0) { + return DFTUndiff_MAKEPLANSUB(n, n*2, (flags & DFT_FLAG_FORCE_COBRA) != 0, flags); + } + + SIMDBase_REAL *s1 = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*n*2); + + int32_t i, j, ts, tsbest, useCobra = 0; + double tick, tickmin; + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nWarming up before calibration ..."); + fflush(stdout); + } + + // warming up + tick = DFT_timeofday(); + while(DFT_timeofday() - tick < 0.5) + ; + + if (flags & DFT_FLAG_VERBOSE) { + printf(" done\n"); + } + + int32_t ntimes = 20000000.0 / n / DFT_ilog2(n); + if (ntimes == 0) ntimes = 1; + + if (flags & DFT_FLAG_VERBOSE) { + printf("nTimes = %d\n", ntimes); + } + + // + + DFTUndiff *plan = DFTUndiff_MAKEPLANSUB(n, n*2, 0, flags); + + for(i=0;i<n*2*SIMDBase_VECTLEN;i++) { + s1[i] = 0; + } + + plan->s = (SIMDBase_VECT *)s1; + + if (plan->cobraT != NULL) { + double tcobra = 0, trecur = 0; + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nChecking which bit-reversal method is faster\n"); + } + + // + + bitReversalCobraInplace(plan); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalCobraInplace(plan); + } + + tcobra += DFT_timeofday() - tick; + + // + + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + } + + trecur += DFT_timeofday() - tick; + + // + + bitReversalCobraInplace(plan); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalCobraInplace(plan); + } + + tcobra += DFT_timeofday() - tick; + + // + + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + } + + trecur += DFT_timeofday() - tick; + + // + + useCobra = tcobra < trecur; + + if ((flags & DFT_FLAG_FORCE_RECURSIVE) != 0) useCobra = 0; + if ((flags & DFT_FLAG_FORCE_COBRA) != 0) useCobra = 1; + + if (flags & DFT_FLAG_VERBOSE) { + 
printf("cobra : %g\n", tcobra); + printf("recur : %g\n", trecur); + if (useCobra) { + printf("will use Cobra\n"); + } else { + printf("will use the recursive reverser\n"); + } + } + } + + DFTUndiff_DESTROYPLAN(plan); + + // + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nDetermining radix 2 threshold\n"); + } + + plan = DFTUndiff_MAKEPLANSUB(n, n*2, useCobra, flags); + + for(j=0;j<ntimes;j++) { + DFTUndiff_EXECUTE(plan, s1, -1); + DFTUndiff_EXECUTE(plan, s1, 1); + } + + DFTUndiff_DESTROYPLAN(plan); + + tsbest = -1; + tickmin = 0; + + for(ts = 1024;ts <= n*2;ts *= 2) { + plan = DFTUndiff_MAKEPLANSUB(n, ts, useCobra, flags); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes;j++) { + DFTUndiff_EXECUTE(plan, s1, -1); + DFTUndiff_EXECUTE(plan, s1, 1); + } + + tick = DFT_timeofday() - tick; + + DFTUndiff_DESTROYPLAN(plan); + + if (tickmin == 0) tickmin = tick; + + if (flags & DFT_FLAG_VERBOSE) { + printf("%d : %g\n",ts, (double)tick); + } + + if (tick < tickmin) { + tickmin = tick; + tsbest = ts; + } + } + + if (tsbest == -1) tsbest = n*2;; + + if (flags & DFT_FLAG_VERBOSE) { + //printf("forcing tsbest = 1024\n"); + //tsbest = 1024; + printf("radix 2 threshold : %d\n\n", tsbest); + + double t = tickmin / ntimes / 2; + double nf = 5 * n * log(n) / log(2) / (t * 1000000); + + printf("nFlops = %d x %g\n", SIMDBase_VECTLEN, nf); + } + + plan = DFTUndiff_MAKEPLANSUB(n, tsbest, useCobra, flags); + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nDone making plan\n--------------------------------\n"); + } + + return plan; +} diff --git a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h new file mode 100644 index 00000000..d26b0d9b --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h @@ -0,0 +1,114 @@ +#ifndef __DFTIMPL_H__ +#define __DFTIMPL_H__ + +#include "SIMDBaseUndiff.h" + +#define MAGIC_DFT 0x18839f6d82bb02b6ULL + +typedef struct { + uint64_t magic; + + SIMDBase_VECT *s; + uint32_t offset1, offset2; + uint32_t butlen, 
log2butlen; + uint32_t stride; + + SIMDBase_REAL **ptTable; + uint32_t length, log2len; + + int32_t radix2thres, flagTrans, useCobra; + + int32_t cobraQ; + SIMDBase_VECT *cobraT; + int32_t *cobraR; + + SIMDBase_REAL **rtTable; + + uint64_t flags; + int32_t mode; +} DFTUndiff; + +#if defined(ENABLE_PUREC_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_float +#define DFTUndiff_EXECUTE execute_purec_float +#define DFTUndiff_MAKEPLAN makePlan_purec_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_float +#define DFTUndiff_DESTROYPLAN destroyPlan_purec_float + +#elif defined(ENABLE_PUREC_DOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_double +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_double +#define DFTUndiff_EXECUTE execute_purec_double +#define DFTUndiff_MAKEPLAN makePlan_purec_double +#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_double +#define DFTUndiff_DESTROYPLAN destroyPlan_purec_double + +#elif defined(ENABLE_PUREC_LONGDOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_longdouble +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_longdouble +#define DFTUndiff_EXECUTE execute_purec_longdouble +#define DFTUndiff_MAKEPLAN makePlan_purec_longdouble +#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_longdouble +#define DFTUndiff_DESTROYPLAN destroyPlan_purec_longdouble + +#elif defined(ENABLE_SSE_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse_float +#define DFTUndiff_EXECUTE execute_sse_float +#define DFTUndiff_MAKEPLAN makePlan_sse_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_sse_float +#define DFTUndiff_DESTROYPLAN destroyPlan_sse_float + +#elif 
defined(ENABLE_SSE2_DOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse2_double +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse2_double +#define DFTUndiff_EXECUTE execute_sse2_double +#define DFTUndiff_MAKEPLAN makePlan_sse2_double +#define DFTUndiff_MAKEPLANSUB makePlanSub_sse2_double +#define DFTUndiff_DESTROYPLAN destroyPlan_sse2_double + +#elif defined(ENABLE_NEON_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_neon_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_neon_float +#define DFTUndiff_EXECUTE execute_neon_float +#define DFTUndiff_MAKEPLAN makePlan_neon_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_neon_float +#define DFTUndiff_DESTROYPLAN destroyPlan_neon_float + +#elif defined(ENABLE_AVX_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_float +#define DFTUndiff_EXECUTE execute_avx_float +#define DFTUndiff_MAKEPLAN makePlan_avx_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_float +#define DFTUndiff_DESTROYPLAN destroyPlan_avx_float + +#elif defined(ENABLE_AVX_DOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_double +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_double +#define DFTUndiff_EXECUTE execute_avx_double +#define DFTUndiff_MAKEPLAN makePlan_avx_double +#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_double +#define DFTUndiff_DESTROYPLAN destroyPlan_avx_double + +#elif defined(ENABLE_ALTIVEC_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_altivec_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_altivec_float +#define DFTUndiff_EXECUTE execute_altivec_float +#define DFTUndiff_MAKEPLAN makePlan_altivec_float +#define 
DFTUndiff_MAKEPLANSUB makePlanSub_altivec_float +#define DFTUndiff_DESTROYPLAN destroyPlan_altivec_float + +#endif //////////////////////////////////////////////////////////////////// + +#endif diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile b/plugins/supereq/nsfft-1.00/dft/Makefile new file mode 120000 index 00000000..fc484116 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile @@ -0,0 +1 @@ +Makefile.x86
\ No newline at end of file diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.altivec b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec new file mode 100644 index 00000000..fe7fc993 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec @@ -0,0 +1,26 @@ +CC=gcc +BASEOPT=-Wall -I ../simd -maltivec -mabi=altivec +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTaltivecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_ALTIVEC_FLOAT DFTUndiff.c -c -o DFTaltivecfloat.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_ALTIVEC_FLOAT DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.neon b/plugins/supereq/nsfft-1.00/dft/Makefile.neon new file mode 100644 index 00000000..111a04ae --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.neon @@ -0,0 +1,26 @@ +CC=gcc +BASEOPT=-Wall -I ../simd -mfloat-abi=softfp +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : 
DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTneonfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -mfpu=neon -DENABLE_NEON_FLOAT DFTUndiff.c -c -o DFTneonfloat.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_NEON_FLOAT DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.purec b/plugins/supereq/nsfft-1.00/dft/Makefile.purec new file mode 100644 index 00000000..2c8b04f1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.purec @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c DFT.h SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o + +SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE SIMDBase.c -c -o SIMDBase.o + +DFT.o : DFT.c 
DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86 b/plugins/supereq/nsfft-1.00/dft/Makefile.x86 new file mode 100644 index 00000000..6ecbacec --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86 @@ -0,0 +1,29 @@ +CC=gcc +BASEOPT=-Wall -I ../simd +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o + +DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE DFTUndiff.c -c -o DFTsse2double.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o 
+ +clean : + rm -f *~ *.o *.s *.a a.out diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx new file mode 100644 index 00000000..b38909cb --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall -I ../simd +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o + +DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE DFTUndiff.c -c -o DFTsse2double.o + +DFTavxfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT DFTUndiff.c -c -o DFTavxfloat.o + +DFTavxdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE DFTUndiff.c -c -o DFTavxdouble.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE -DENABLE_AVX_FLOAT -DENABLE_AVX_DOUBLE DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o + +clean : + rm -f *~ *.o *.s *.a a.out diff --git 
a/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c b/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c new file mode 100644 index 00000000..78ff14dc --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c @@ -0,0 +1,88 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <complex.h> + +#include "SIMDBase.h" +#include "DFT.h" + +typedef float REAL; +#define TYPE SIMDBase_TYPE_FLOAT + +#define THRES 1e-3 + +double complex omega(double n, double kn) { + return cexp((-2 * M_PI * _Complex_I / n) * kn); +} + +void forward(double complex *ts, double complex *fs, int len) { + int k, n; + + for(k=0;k<len;k++) { + fs[k] = 0; + + for(n=0;n<len;n++) { + fs[k] += ts[n] * omega(len, n*k); + } + } +} + +int main(int argc, char **argv) { + const int n = 256; + + int mode = SIMDBase_chooseBestMode(TYPE); + printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode)); + + int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + + // + + int i, j; + + DFT *p = DFT_init(mode, n, 0); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + double complex ts[veclen][n], fs[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + sx[(i*2+0)*veclen+j] = creal(ts[j][i]); + sx[(i*2+1)*veclen+j] = cimag(ts[j][i]); + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + forward(ts[j], fs[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if ((fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) > THRES) || + (fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) > THRES)) { + success = 0; + } + } + } + + printf("%s\n", success ? 
"OK" : "NG"); + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + exit(0); +} diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c new file mode 100644 index 00000000..42825ed9 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c @@ -0,0 +1,317 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <time.h> +#include <complex.h> + +#include <fftw3.h> + +#include "SIMDBase.h" +#include "DFT.h" + +#if 1 +typedef float REAL; +#define TYPE SIMDBase_TYPE_FLOAT +#else +typedef double REAL; +#define TYPE SIMDBase_TYPE_DOUBLE +#endif + +#define THRES 1e-3 + +// complex forward +int check_cf(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, 0); + fftw_plan w[n]; + + fftw_complex *in[sizeOfVect], *out[sizeOfVect]; + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + for(j=0;j<veclen;j++) { + in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); + out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); + w[j] = fftw_plan_dft_1d(n, in[j], out[j], FFTW_FORWARD, FFTW_ESTIMATE); + + for(i=0;i<n;i++) { + double re = random() / (double)RAND_MAX; + double im = random() / (double)RAND_MAX; + sx[(i*2+0)*veclen+j] = re; + sx[(i*2+1)*veclen+j] = im; + in[j][i] = re + im * _Complex_I; + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + fftw_execute(w[j]); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0; + if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0; + } + } + + // + + for(j=0;j<veclen;j++) { + fftw_destroy_plan(w[j]); + fftw_free(in[j]); + fftw_free(out[j]); + } + + SIMDBase_alignedFree(sx); + + DFT_dispose(p, mode); + + // + + return success; +} + +// complex backward +int check_cb(int n, int mode, int veclen, int sizeOfVect) { + int 
i, j; + + DFT *p = DFT_init(mode, n, 0); + fftw_plan w[n]; + + fftw_complex *in[sizeOfVect], *out[sizeOfVect]; + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + for(j=0;j<veclen;j++) { + in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); + out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); + w[j] = fftw_plan_dft_1d(n, in[j], out[j], FFTW_BACKWARD, FFTW_ESTIMATE); + + for(i=0;i<n;i++) { + double re = random() / (double)RAND_MAX; + double im = random() / (double)RAND_MAX; + sx[(i*2+0)*veclen+j] = re; + sx[(i*2+1)*veclen+j] = im; + in[j][i] = re + im * _Complex_I; + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + fftw_execute(w[j]); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0; + if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0; + } + } + + // + + for(j=0;j<veclen;j++) { + fftw_destroy_plan(w[j]); + fftw_free(in[j]); + fftw_free(out[j]); + } + + SIMDBase_alignedFree(sx); + + DFT_dispose(p, mode); + + // + + return success; +} + +// real forward +int check_rf(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_REAL); + fftw_plan w[n]; + + double *in[sizeOfVect]; + fftw_complex *out[sizeOfVect]; + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + for(j=0;j<veclen;j++) { + in[j] = (double *) fftw_malloc(sizeof(double) * n); + out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1)); + w[j] = fftw_plan_dft_r2c_1d(n, in[j], out[j], FFTW_ESTIMATE); + + for(i=0;i<n;i++) { + double re = random() / (double)RAND_MAX; + sx[i*veclen+j] = re; + in[j][i] = re; + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + fftw_execute(w[j]); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][0])) > THRES) 
success = 0; + if (fabs(sx[(i*2+1)*veclen+j] - creal(out[j][n/2])) > THRES) success = 0; + } else { + if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0; + if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0; + } + } + } + + // + + for(j=0;j<veclen;j++) { + fftw_destroy_plan(w[j]); + fftw_free(in[j]); + fftw_free(out[j]); + } + + SIMDBase_alignedFree(sx); + + DFT_dispose(p, mode); + + // + + return success; +} + +// real backward +int check_rb(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_REAL); + fftw_plan w[n]; + + fftw_complex *in[sizeOfVect]; + double *out[sizeOfVect]; + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + for(j=0;j<veclen;j++) { + in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1)); + out[j] = (double *) fftw_malloc(sizeof(double) * n); + w[j] = fftw_plan_dft_c2r_1d(n, in[j], out[j], FFTW_ESTIMATE); + + for(i=0;i<n/2;i++) { + if (i == 0) { + in[j][0 ] = (random() / (double)RAND_MAX); + in[j][n/2] = (random() / (double)RAND_MAX); + } else { + in[j][i ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + } + } + + for(i=0;i<n/2;i++) { + if (i == 0) { + sx[(2*0+0) * veclen + j] = creal(in[j][0 ]); + sx[(2*0+1) * veclen + j] = creal(in[j][n/2]); + } else { + sx[(2*i+0) * veclen + j] = creal(in[j][i]); + sx[(2*i+1) * veclen + j] = cimag(in[j][i]); + } + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + fftw_execute(w[j]); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if ((fabs(sx[i * veclen + j]*2 - out[j][i]) > THRES)) { + success = 0; + } + } + } + + // + + for(j=0;j<veclen;j++) { + fftw_destroy_plan(w[j]); + fftw_free(in[j]); + fftw_free(out[j]); + } + + SIMDBase_alignedFree(sx); + + DFT_dispose(p, mode); + + // + + return success; +} + +int main(int argc, char **argv) { + if (argc != 2) { + fprintf(stderr, "%s <log2n>\n", argv[0]); + 
exit(-1); + } + + const int n = 1 << atoi(argv[1]); + + srandom(time(NULL)); + + // + + int mode = SIMDBase_chooseBestMode(TYPE); + + printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode)); + + int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + + printf("complex forward : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("complex backward : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real forward : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real backward : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + + exit(0); +} diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c new file mode 100644 index 00000000..9d4bdaae --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c @@ -0,0 +1,419 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <time.h> +#include <complex.h> + +#include "SIMDBase.h" +#include "DFT.h" + +#if 1 +typedef float REAL; +#define TYPE SIMDBase_TYPE_FLOAT +#else +typedef double REAL; +#define TYPE SIMDBase_TYPE_DOUBLE +#endif + +#define THRES 1e-3 + +double complex omega(double n, double kn) { + return cexp((-2 * M_PI * _Complex_I / n) * kn); +} + +void forward(double complex *ts, double complex *fs, int len) { + int k, n; + + for(k=0;k<len;k++) { + fs[k] = 0; + + for(n=0;n<len;n++) { + fs[k] += ts[n] * omega(len, n*k); + } + } +} + +void backward(double complex *fs, double complex *ts, int len) { + int k, n; + + for(k=0;k<len;k++) { + ts[k] = 0; + + for(n=0;n<len;n++) { + ts[k] += fs[n] * omega(-len, n*k); + } + } +} + +// complex forward +int check_cf(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, 0); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + 
// + + double complex ts[veclen][n], fs[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + sx[(i*2+0)*veclen+j] = creal(ts[j][i]); + sx[(i*2+1)*veclen+j] = cimag(ts[j][i]); + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + forward(ts[j], fs[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if ((fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) > THRES) || + (fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) > THRES)) { + success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +// complex backward +int check_cb(int n, int mode, int veclen, int sizeOfVect) { + int i,j; + + DFT *p = DFT_init(mode, n, 0); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + double complex fs[veclen][n], ts[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + fs[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + + sx[(i*2+0)*veclen+j] = creal(fs[j][i]); + sx[(i*2+1)*veclen+j] = cimag(fs[j][i]); + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + backward(fs[j], ts[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if ((fabs(sx[(i*2+0)*veclen+j] - creal(ts[j][i])) > THRES) || + (fabs(sx[(i*2+1)*veclen+j] - cimag(ts[j][i])) > THRES)) { + success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +// real forward +int check_rf(int n, int mode, int veclen, int sizeOfVect) { + int i,j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_REAL); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + + // + + double complex ts[veclen][n], fs[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + ts[j][i] = (random() / (double)RAND_MAX); + sx[i*veclen+j] = creal(ts[j][i]); + } + } + + // + + 
DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + forward(ts[j], fs[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + if (fabs(sx[(2*0+0) * veclen + j] - creal(fs[j][0 ])) > THRES) success = 0; + if (fabs(sx[(2*0+1) * veclen + j] - creal(fs[j][n/2])) > THRES) success = 0; + } else { + if (fabs(sx[(2*i+0) * veclen + j] - creal(fs[j][i])) > THRES) success = 0; + if (fabs(sx[(2*i+1) * veclen + j] - cimag(fs[j][i])) > THRES) success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +// real backward +int check_rb(int n, int mode, int veclen, int sizeOfVect) { + int i,j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_REAL); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + + // + + double complex fs[veclen][n], ts[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + fs[j][0 ] = (random() / (double)RAND_MAX); + fs[j][n/2] = (random() / (double)RAND_MAX); + } else { + fs[j][i ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + fs[j][n-i] = conj(fs[j][i]); + } + } + } + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + sx[(2*0+0) * veclen + j] = creal(fs[j][0 ]); + sx[(2*0+1) * veclen + j] = creal(fs[j][n/2]); + } else { + sx[(2*i+0) * veclen + j] = creal(fs[j][i]); + sx[(2*i+1) * veclen + j] = cimag(fs[j][i]); + } + } + } + + // + + for(j=0;j<veclen;j++) { + backward(fs[j], ts[j], n); + } + + DFT_execute(p, mode, sx, 1); + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(cimag(ts[j][i])) > THRES) { + success = 0; + } + + if ((fabs(sx[i * veclen + j]*2 - creal(ts[j][i])) > THRES)) { + success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +// alt real forward +int check_arf(int n, int mode, int veclen, int sizeOfVect) { + int i,j; + + DFT *p = DFT_init(mode, n, 
DFT_FLAG_ALT_REAL); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + + // + + double complex ts[veclen][n], fs[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + ts[j][i] = (random() / (double)RAND_MAX); + sx[i*veclen+j] = creal(ts[j][i]); + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + backward(ts[j], fs[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + if (fabs(sx[(2*0+0) * veclen + j] - creal(fs[j][0 ])) > THRES) success = 0; + if (fabs(sx[(2*0+1) * veclen + j] - creal(fs[j][n/2])) > THRES) success = 0; + } else { + if (fabs(sx[(2*i+0) * veclen + j] - creal(fs[j][i])) > THRES) success = 0; + if (fabs(sx[(2*i+1) * veclen + j] - cimag(fs[j][i])) > THRES) success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +// alt real backward +int check_arb(int n, int mode, int veclen, int sizeOfVect) { + int i,j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + + // + + double complex fs[veclen][n], ts[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + fs[j][0 ] = (random() / (double)RAND_MAX); + fs[j][n/2] = (random() / (double)RAND_MAX); + } else { + fs[j][i ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + fs[j][n-i] = conj(fs[j][i]); + } + } + } + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + sx[(2*0+0) * veclen + j] = creal(fs[j][0 ]); + sx[(2*0+1) * veclen + j] = creal(fs[j][n/2]); + } else { + sx[(2*i+0) * veclen + j] = creal(fs[j][i]); + sx[(2*i+1) * veclen + j] = cimag(fs[j][i]); + } + } + } + + // + + for(j=0;j<veclen;j++) { + forward(fs[j], ts[j], n); + } + + DFT_execute(p, mode, sx, -1); + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(cimag(ts[j][i])) > THRES) { + success = 0; + } + + if ((fabs(sx[i * veclen + j]*2 - 
creal(ts[j][i])) > THRES)) { + success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +int main(int argc, char **argv) { + if (argc != 2) { + fprintf(stderr, "%s <log2n>\n", argv[0]); + exit(-1); + } + + const int n = 1 << atoi(argv[1]); + + srandom(time(NULL)); + + // + + int mode = SIMDBase_chooseBestMode(TYPE); + + printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode)); + + int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + + printf("complex forward : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("complex backward : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real forward : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real backward : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("alt real forward : %s\n", check_arf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("alt real backward : %s\n", check_arb(n, mode, veclen, sizeOfVect) ? 
"OK" : "NG"); + + exit(0); +} diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c new file mode 100644 index 00000000..08c8315f --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c @@ -0,0 +1,260 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <time.h> + +#include "SIMDBase.h" +#include "DFT.h" + +void cdft(int, int, double *, int *, double *); +void rdft(int, int, double *, int *, double *); + +#if 1 +typedef float REAL; +#define TYPE SIMDBase_TYPE_FLOAT +#else +typedef double REAL; +#define TYPE SIMDBase_TYPE_DOUBLE +#endif + +#define THRES 1e-3 + +// complex forward +int check_cf(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, 0); + + int *ip = calloc(n, sizeof(int)); + double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2); + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n*2); + + // + + for(j=0;j<veclen;j++) { + for(i=0;i<n*2;i++) { + sx[i*veclen + j] = random() / (double)RAND_MAX; + sy[j*n*2 + i] = sx[i*veclen + j]; + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + cdft(n*2, -1, &sy[j*n*2], ip, trigTable); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n*2;i++) { + if (fabs(sx[i*veclen+j] - sy[j*n*2 + i]) > THRES) success = 0; + } + } + + // + + SIMDBase_alignedFree(sy); + SIMDBase_alignedFree(sx); + SIMDBase_alignedFree(trigTable); + free(ip); + + DFT_dispose(p, mode); + + // + + return success; +} + +// complex backward +int check_cb(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, 0); + + int *ip = calloc(n, sizeof(int)); + double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2); + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n*2); + + // + + 
for(j=0;j<veclen;j++) { + for(i=0;i<n*2;i++) { + sx[i*veclen + j] = random() / (double)RAND_MAX; + sy[j*n*2 + i] = sx[i*veclen + j]; + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + cdft(n*2, 1, &sy[j*n*2], ip, trigTable); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n*2;i++) { + if (fabs(sx[i*veclen+j] - sy[j*n*2 + i]) > THRES) success = 0; + } + } + + // + + SIMDBase_alignedFree(sy); + SIMDBase_alignedFree(sx); + SIMDBase_alignedFree(trigTable); + free(ip); + + DFT_dispose(p, mode); + + // + + return success; +} + +// real forward +int check_rf(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL); + + int *ip = calloc(n, sizeof(int)); + double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2); + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n); + + // + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + sx[i*veclen + j] = random() / (double)RAND_MAX; + sy[j*n + i] = sx[i*veclen + j]; + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + rdft(n, -1, &sy[j*n], ip, trigTable); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(sx[i*veclen+j] - sy[j*n + i]) > THRES) success = 0; + } + } + + // + + SIMDBase_alignedFree(sy); + SIMDBase_alignedFree(sx); + SIMDBase_alignedFree(trigTable); + free(ip); + + DFT_dispose(p, mode); + + // + + return success; +} + +// real backward +int check_rb(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL); + + int *ip = calloc(n, sizeof(int)); + double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2); + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n); + + // + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + sx[i*veclen + j] = random() / (double)RAND_MAX; + 
sy[j*n + i] = sx[i*veclen + j]; + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + rdft(n, 1, &sy[j*n], ip, trigTable); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(sx[i*veclen+j] - sy[j*n + i]) > THRES) success = 0; + } + } + + // + + SIMDBase_alignedFree(sy); + SIMDBase_alignedFree(sx); + SIMDBase_alignedFree(trigTable); + free(ip); + + DFT_dispose(p, mode); + + // + + return success; +} + +int main(int argc, char **argv) { + if (argc != 2) { + fprintf(stderr, "%s <log2n>\n", argv[0]); + exit(-1); + } + + const int n = 1 << atoi(argv[1]); + + srandom(time(NULL)); + + // + + int mode = SIMDBase_chooseBestMode(TYPE); + + printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode)); + + int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + + printf("complex forward : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("complex backward : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real forward : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real backward : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? 
"OK" : "NG"); + + exit(0); +} diff --git a/plugins/supereq/nsfft-1.00/dfttest/Makefile b/plugins/supereq/nsfft-1.00/dfttest/Makefile new file mode 100644 index 00000000..924b8656 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/Makefile @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall -g -I ../simd -I ../dft -L../simd -L../dft +OPT=$(BASEOPT) -O + +all : DFTExample DFTTestNaive + +clean : + rm -f *~ *.o nsfftplan.*.txt *.log *.dat a.out DFTExample DFTTestNaive DFTTestOoura DFTTestFFTW pi_fft_mod pi_fft_mod.c + +../simd/libSIMD.a : + @cd ../simd; make + +../dft/libDFT.a : + @cd ../dft; make + +../ooura/fftsg.o : + @cd ../ooura; make + +DFTExample : DFTExample.c ../simd/libSIMD.a ../dft/libDFT.a + $(CC) $(OPT) DFTExample.c -lDFT -lSIMD -lm -o DFTExample + +DFTTestNaive : DFTTestNaive.c ../simd/libSIMD.a ../dft/libDFT.a + $(CC) $(OPT) DFTTestNaive.c -lDFT -lSIMD -lm -o DFTTestNaive + +DFTTestOoura : DFTTestOoura.c ../ooura/fftsg.o ../simd/libSIMD.a ../dft/libDFT.a + $(CC) $(OPT) DFTTestOoura.c ../ooura/fftsg.o -lDFT -lSIMD -lm -o DFTTestOoura + +DFTTestFFTW : DFTTestFFTW.c ../simd/libSIMD.a ../dft/libDFT.a + $(CC) $(OPT) DFTTestFFTW.c -lDFT -lSIMD -lfftw3 -lm -o DFTTestFFTW + +pi_fft_mod.c : ../ooura/pi_fft.c pi_fft.c.patch + patch -o pi_fft_mod.c ../ooura/pi_fft.c pi_fft.c.patch + +pi_fft_mod : ../simd/libSIMD.a ../dft/libDFT.a pi_fft_mod.c + $(CC) $(OPT) pi_fft_mod.c -I ../dft -I ../simd -L../dft -L../simd -lm -lDFT -lSIMD -o pi_fft_mod diff --git a/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch b/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch new file mode 100644 index 00000000..c50133cc --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch @@ -0,0 +1,131 @@ +--- pi_fft.c 2010-07-30 13:04:25.000000000 +0900 ++++ pi_fft_mod.c 2010-07-31 20:50:11.000000000 +0900 +@@ -25,7 +25,75 @@ + #include <stdio.h> + #include <stdlib.h> + #include <time.h> ++#include <sys/time.h> ++#include <unistd.h> + ++/****/ ++ ++#include <stdint.h> ++#include "SIMDBase.h" 
++#include "DFT.h" ++ ++DFT* dft[64]; ++ ++void initdft(int n) { ++ int i, logn = 31 - __builtin_clz(n), writeflag = 0; ++ char buf[20], fn[256]; ++ gethostname(buf, 19); ++ sprintf(fn, "nsfftplan.%s.txt", buf); ++ FILE *fp = fopen(fn, "r"); ++ if (fp != NULL) { ++ for(i=1;i<=logn;i++) { ++ int err; ++ dft[i] = DFT_fread(fp, &err); ++ if (err != DFT_ERROR_NOERROR) { ++ printf("error when reading plan %d : %d\n", i, err); ++ break; ++ } ++ if (DFT_getPlanParamInt(DFT_PARAMID_MODE, dft[i]) != SIMDBase_MODE_PUREC_DOUBLE || ++ DFT_getPlanParamInt(DFT_PARAMID_FFT_LENGTH, dft[i]) != (1 << i) || ++ DFT_getPlanParamInt(DFT_PARAMID_IS_ALT_REAL_TRANSFORM, dft[i]) != 1) { ++ fprintf(stderr, "plan not compatible : %d\n", i); ++ break; ++ } ++ } ++ } ++ if (fp != NULL) fclose(fp); ++ ++ for(i=1;i<=logn;i++) { ++ if (dft[i] == NULL) { ++ dft[i] = DFT_init(SIMDBase_MODE_PUREC_DOUBLE, 1 << i, DFT_FLAG_ALT_REAL | DFT_FLAG_LIGHT_TEST_RUN | DFT_FLAG_VERBOSE); ++ if (dft[i] == NULL) { ++ printf("dft[%d] == NULL\n", i); ++ exit(-1); ++ } ++ writeflag = 1; ++ } ++ } ++ ++ if (writeflag) { ++ fp = fopen(fn, "w"); ++ if (fp != NULL) { ++ for(i=1;i<=logn;i++) { ++ DFT_fwrite(dft[i], fp); ++ } ++ fclose(fp); ++ } ++ } ++} ++ ++void rdft(int n, int isgn, double *a, int *ip, double *w) { ++ int logn = 31 - __builtin_clz(n); ++ DFT_execute(dft[logn], SIMDBase_MODE_PUREC_DOUBLE, a, isgn); ++} ++ ++double timeofday(void) { ++ struct timeval tp; ++ gettimeofday(&tp, NULL); ++ return (double)tp.tv_sec+(1e-6)*tp.tv_usec; ++} ++ ++/****/ + + void mp_load_0(int n, int radix, int out[]); + void mp_load_1(int n, int radix, int out[]); +@@ -67,7 +135,7 @@ + double err, d_time, n_op; + int *a, *b, *c, *e, *i1, *i2, *ip; + double *d1, *d2, *d3, *w; +- time_t t_1, t_2; ++ double t_1, t_2; + FILE *f_log, *f_out; + + f_log = fopen("pi.log", "w"); +@@ -96,6 +164,8 @@ + exit(1); + } + ip[0] = 0; ++ ++ initdft(nfft); + /* ---- radix test ---- */ + log10_radix = 1; + radix = 10; +@@ -111,7 +181,7 @@ + 
printf("calculating %d digits of PI...\n", log10_radix * (n - 2)); + fprintf(f_log, "calculating %d digits of PI...\n", log10_radix * (n - 2)); + /* ---- time check ---- */ +- time(&t_1); ++ t_1 = timeofday(); + /* + * ---- a formula based on the AGM (Arithmetic-Geometric Mean) ---- + * c = sqrt(0.125); +@@ -216,10 +286,10 @@ + mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w); + mp_idiv(n, radix, a, npow, a); + /* ---- time check ---- */ +- time(&t_2); ++ t_2 = timeofday(); + /* ---- output ---- */ + f_out = fopen("pi_mod.dat", "w"); +- printf("writing pi.dat...\n"); ++ printf("writing pi_mod.dat...\n"); + mp_fprintf(n - 1, log10_radix, a, f_out); + fclose(f_out); + free(d3); +@@ -238,9 +308,9 @@ + printf("floating point operation: %g op.\n", n_op); + fprintf(f_log, "floating point operation: %g op.\n", n_op); + /* ---- difftime ---- */ +- d_time = difftime(t_2, t_1); +- printf("execution time: %g sec. (real time)\n", d_time); +- fprintf(f_log, "execution time: %g sec. (real time)\n", d_time); ++ d_time = t_2 - t_1; ++ printf("execution time: %.5g sec. (real time)\n", d_time); ++ fprintf(f_log, "execution time: %.5g sec. 
(real time)\n", d_time); + fclose(f_log); + return 0; + } diff --git a/plugins/supereq/nsfft-1.00/doc/default.css b/plugins/supereq/nsfft-1.00/doc/default.css new file mode 100644 index 00000000..09721163 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/doc/default.css @@ -0,0 +1,34 @@ +body {margin-left: 1.5cm; padding-left: 0.1cm; margin-right: 1.5cm; padding-right: 0.1cm; margin-top: 2.5cm; padding-top: 0.5cm; margin-bottom: 1cm; padding-bottom: 1.0cm; border-top-style:solid; border-bottom-style:solid; } +h1 {font-family: arial, sansserif; font-weight: bold; font-style: italic; margin-top: 0.8cm; } +h2 {font-family: arial, sansserif; font-weight: bold; font-style: italic; margin-top: 0.8cm; } +h3 {font-family: arial, sansserif; font-weight: bold; margin-top: 1.2cm; margin-bottom: 0.8cm; } +h4 {font-family: arial, sansserif; font-weight: bold; margin-top: 1.2cm; margin-bottom: 0.8cm; } +p {font-family: Georgia, "Times New Roman", times, serif; margin-top: 0.3cm; margin-left: 0.5cm; margin-bottom: 0.3cm;} +p.dir {font-family: arial, sansserif; margin-top: 0cm; margin-bottom: 0cm;} +dl { margin-left: 0.5cm; } +dt { font-weight: bold; } +a:link {color: black;} +a:visited {color: black;} +ul.disc {list-style-type: disc; font-family: times, serif;} +ul.circle {list-style-type: circle; font-family: times, serif;} +ul.square {list-style-type: square; font-family: times, serif;} +ul.none {list-style-type: none; font-family: times, serif;} +pre.code { margin-top: 1.0cm; margin-bottom: 1.0cm; margin-left: 1.0cm; margin-right: 1.0cm; border:3px solid #c0c0c0; padding: 0.5cm; font-family: tahoma, sansserif; font-weight: normal; background-color:#f8f8f8; } +pre.command { margin-top: 1.0cm; margin-bottom: 1.0cm; margin-left: 1.5cm; margin-right: 0.0cm; border:0px; padding:0.0cm; font-family: tahoma, sansserif; font-weight: bold; background-color:#f8fffc; } +ol.level1 { font-family: arial, sansserif; font-weight: bold; font-style: italic; font-size:1.5em; } +ol.level2 { 
font-family: "Times New Roman", serif; font-weight: normal; font-style: normal; font-size:0.85em; margin-top: 0.2cm; margin-bottom: 0.5cm; } +table.figure { margin-left:auto; margin-right:auto; margin-top:1.0cm; margin-bottom:1.0cm; } + +td.caption { font-family: arial, sansserif; font-size: 75%; color: black; } +td { font-family: times, serif; } + +table.lt { border-collapse: collapse; border-style: none; } +td.lt- { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-width: 1px; border-style: none; padding-left=0.2cm; padding-right=0.2cm; } +td.lt-r { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-right-style: solid; border-width: 1px; border-color: black; } +td.lt-l { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-left-style: solid; border-width: 1px; border-color: black; } +td.lt-lr { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-right-style: solid; border-left-style: solid; border-width: 1px; border-color: black; } +td.lt-b { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-width: 1px; border-color: black; } +td.lt-hl { margin: 0px; border-style: none; border-bottom-style: solid; border-width: 1px; border-color: black; height: 2px; } +td.lt-bl { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-left-style: solid; border-width: 1px; border-color: black; } +td.lt-br { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-right-style: solid; border-width: 1px; border-color: black; } +td.lt-blr { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-left-style: solid; border-right-style: solid; border-width: 1px; 
border-color: black; } diff --git a/plugins/supereq/nsfft-1.00/doc/index.xhtml b/plugins/supereq/nsfft-1.00/doc/index.xhtml new file mode 100644 index 00000000..8b7e2c97 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/doc/index.xhtml @@ -0,0 +1,2016 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> +<link rel="stylesheet" type="text/css" href="default.css"/> +<title>NSFFT Reference Manual</title> +</head> +<body> +<h1>NSFFT Reference Manual</h1> + +<h3>Introduction</h3> + +<p> +This is a library for performing 1-dimensional discrete Fourier +transforms. NSDFT is a simple, small and portable library, and it is +efficient since it can utilize SIMD instruction sets in modern +processors. It performs multiple transforms simultaneously, and thus +it is especially suitable for digital signal processing. It does not +need so much computation to make a good execution plan. This library +is in public domain, so that you can incorporate this library into +your product without any obligation. +</p> + +<h3>API Reference</h3> + +<p> +In this section, the API functions are explained. +</p> + +<h4>Include files</h4> + +<p> +You have to include two include files in dft directory. +</p> + +<pre class="code"> +#include <stdint.h> +#include "SIMDBase.h" +#include "DFT.h" +</pre> + +<h4>Data types</h4> + +<p> +First, you have to choose a data type to represent an element in the +input and output sequence of numbers. You can choose from the +following three types. 
+</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-b" align="center">Data Type</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_TYPE_FLOAT</td> + <td class="lt-" align="left">float type in C language</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_TYPE_DOUBLE</td> + <td class="lt-" align="left">double type in C language</td> + </tr> + <tr> + <td class="lt-br" align="left">SIMDBase_TYPE_LONGDOUBLE</td> + <td class="lt-b" align="left">long double type in C language</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 1 Data types</td> + </tr> +</table> + + +<h4>Computation modes</h4> + +<p> +Next, a computation mode has to be chosen. You can choose from the +following modes. +</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-br" align="center">Type</td> + <td class="lt-br" align="center">Vector Length</td> + <td class="lt-b" align="center">Computation Mode</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_PUREC_FLOAT</td> + <td class="lt-r" align="center">float</td> + <td class="lt-r" align="center">1</td> + <td class="lt-" align="center">Scalar float</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_PUREC_DOUBLE</td> + <td class="lt-r" align="center">double</td> + <td class="lt-r" align="center">1</td> + <td class="lt-" align="center">Scalar double</td> + </tr> + <tr> + <td class="lt-r" 
align="left">SIMDBase_MODE_PUREC_LONGDOUBLE</td> + <td class="lt-r" align="center">long double</td> + <td class="lt-r" align="center">1</td> + <td class="lt-" align="center">Scalar long double</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_SSE_FLOAT</td> + <td class="lt-r" align="center">float</td> + <td class="lt-r" align="center">4</td> + <td class="lt-" align="center">x86 SSE</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_SSE2_DOUBLE</td> + <td class="lt-r" align="center">double</td> + <td class="lt-r" align="center">2</td> + <td class="lt-" align="center">x86 SSE2</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_NEON_FLOAT</td> + <td class="lt-r" align="center">float</td> + <td class="lt-r" align="center">4</td> + <td class="lt-" align="center">ARM NEON</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_AVX_FLOAT</td> + <td class="lt-r" align="center">float</td> + <td class="lt-r" align="center">8</td> + <td class="lt-" align="center">x86 AVX (float)</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_AVX_DOUBLE</td> + <td class="lt-r" align="center">double</td> + <td class="lt-r" align="center">4</td> + <td class="lt-" align="center">x86 AVX (double)</td> + </tr> + <tr> + <td class="lt-br" align="left">SIMDBase_MODE_ALTIVEC_FLOAT</td> + <td class="lt-br" align="center">float</td> + <td class="lt-br" align="center">4</td> + <td class="lt-b" align="center">PowerPC Altivec</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 2 Computation modes</td> + </tr> +</table> + +<p> +The following function automatically checks the availability of each +instruction set on your computer, and chooses the best computation +mode. +</p> + +<pre class="code"> +int32_t SIMDBase_chooseBestMode(int32_t type); +</pre> + +<p> +The return value is the best mode chosen by this routine. +<i>type</i> is the data type you chose. 
+</p> + + +<h4>Retrieving parameters</h4> + +<p> +You can make queries for any mode using the following function. +</p> + +<pre class="code"> +int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode); +</pre> + +<p> +<i>mode</i> is the computation mode you chose. <i>paramId</i> is one +of the following. +</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-b" align="center">Meaning</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_PARAMID_SIZE_OF_REAL</td> + <td class="lt-" align="left">Size of an element in a vector in bytes</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_PARAMID_SIZE_OF_VECT</td> + <td class="lt-" align="left">Size of the vector in bytes</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_PARAMID_VECTOR_LEN</td> + <td class="lt-" align="left">Number of elements in a vector</td> + </tr> + <tr> + <td class="lt-br" align="left">SIMDBase_PARAMID_MODE_AVAILABILITY</td> + <td class="lt-b" align="left">Whether the given mode is available or not</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 3 Querying parameter for computation mode</td> + </tr> +</table> + +<p> +Here, a vector is a set of multiple primitive data elements (single or +double precision FP numbers) which can be stored in one SIMD register, +and can be processed by one SIMD instruction at the same time. +</p> + +<p> +You can get the mode name in string data type. In this +case, <i>paramId</i> must be SIMDBase_PARAMID_MODE_NAME. +</p> + +<pre class="code"> +char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode); +</pre> + +<p> +You should not modify the data returned by the above function. 
+</p> + + +<h4>Making and destroying execution plan</h4> + +<p> +An execution plan can be made by the following function. +</p> + +<pre class="code"> +DFT *DFT_init(int32_t mode, int32_t n, int32_t flags); +</pre> + +<p> +The return value is a pointer to a newly made plan. +<i>mode</i> is the mode you chose above. <i>n</i> is the length of a +transform. You can specify a bitwise OR of the following symbols +as <i>flags</i>. You should not specify more than one flag related +to test runs. You should not specify DFT_FLAG_FORCE_RECURSIVE and +DFT_FLAG_FORCE_COBRA at the same time. If neither DFT_FLAG_REAL nor +DFT_FLAG_ALT_REAL is specified, an execution plan for complex +transforms is made. +</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-b" align="center">Meaning</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_NO_TEST_RUN</td> + <td class="lt-" align="left">Make execution plan without performing a test run</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_LIGHT_TEST_RUN</td> + <td class="lt-" align="left">Perform small amount of test run to make an execution plan</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_HEAVY_TEST_RUN</td> + <td class="lt-" align="left">Perform large amount of test run to make an execution plan</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_EXHAUSTIVE_TEST_RUN</td> + <td class="lt-" align="left">Perform exhaustive search of parameters and find the optimal execution plan</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_REAL</td> + <td class="lt-" align="left">Make an execution plan for a real transform</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_ALT_REAL</td> + <td class="lt-" align="left">Make an execution plan for an 
alternative real transform</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_VERBOSE</td> + <td class="lt-" align="left">Make some noise during making an execution plan</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_NOBITREVERSAL</td> + <td class="lt-" align="left">Does not perform bit-reversal operation during a transform</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_FORCE_RECURSIVE</td> + <td class="lt-" align="left">Force using the recursive bit-reversal routine. This routine is suited for small transforms.</td> + </tr> + <tr> + <td class="lt-br" align="left">DFT_FLAG_FORCE_COBRA</td> + <td class="lt-b" align="left">Force using the Cobra bit-reversal routine. This routine is suited for large transforms.</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 4 Options for making execution plan</td> + </tr> +</table> + +<p> +You can destroy the plan you made by the following function. +</p> + +<pre class="code"> +void DFT_dispose(DFT *p, int32_t mode); +</pre> + +<p> +<i>p</i> is a pointer to the execution plan. <i>mode</i> is the +corresponding execution mode. +</p> + +<p> +You can retrieve parameters of a plan using the following function. +</p> + +<pre class="code"> +int32_t DFT_getPlanParamInt(int32_t paramId, void *p); +</pre> + +<p> +<i>p</i> is a pointer to an execution plan. <i>paramId</i> is one +of the following. 
+</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-b" align="center">Meaning</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_PARAMID_TYPE</td> + <td class="lt-" align="left">Data type</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_PARAMID_MODE</td> + <td class="lt-" align="left">Computation mode</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_PARAMID_FFT_LENGTH</td> + <td class="lt-" align="left">Length of the transform</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_PARAMID_IS_REAL_TRANSFORM</td> + <td class="lt-" align="left">Whether the plan is for real transforms</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_PARAMID_NO_BIT_REVERSAL</td> + <td class="lt-" align="left">Whether the plan does not perform bit reversal operation</td> + </tr> + <tr> + <td class="lt-br" align="left">DFT_PARAMID_TEST_RUN</td> + <td class="lt-b" align="left">How much test run is performed when making this plan</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 5 Querying parameter for execution plan</td> + </tr> +</table> + +<h4>Writing and reading execution plan to/from file</h4> + +<p> +You can write or read an execution plan to/from a file using the following functions. +</p> + +<pre class="code"> +int32_t DFT_fwrite(DFT *p, FILE *fp); +DFT *DFT_fread(FILE *fp, int32_t *errcode); +</pre> + +<p> +<i>p</i> is a pointer to a plan. <i>fp</i> is a file +pointer. DFT_fwrite returns 1 if the plan is successfully written, and +0 if an error occurs. DFT_fread returns the pointer to the read plan +if the plan is successfully read, and NULL if an error occurs. If an +error occurs, an error code is returned to a variable whose pointer is +specified by <i>errcode</i>. 
The interpretation of error codes is +given below. +</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-b" align="center">Meaning</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_NOERROR</td> + <td class="lt-" align="left">No error</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_FILE_VERSION</td> + <td class="lt-" align="left">File format version mismatch</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_FILE_IO</td> + <td class="lt-" align="left">I/O error</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_UNEXPECTED_EOF</td> + <td class="lt-" align="left">Unexpected EOF</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_MODE_NOT_COMPILED_IN</td> + <td class="lt-" align="left">Tried to read a plan with mode that is not compiled in</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_MODE_NOT_AVAILABLE</td> + <td class="lt-" align="left">Tried to read a plan with mode that is not supported by hardware</td> + </tr> + <tr> + <td class="lt-br" align="left">DFT_ERROR_UNKNOWN_MODE</td> + <td class="lt-b" align="left">Tried to read a plan with mode that is unknown by library</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 6 Errors that may happen during file I/O</td> + </tr> +</table> + + +<h4>Allocating and freeing buffers for transforms</h4> + +<p> +In order to allocate word-aligned buffers for storing data which is +fed to the FFT routine, you have to use the following function. +</p> + +<pre class="code"> +void *DFT_alignedMalloc(uint64_t size); +</pre> + +<p> +This function allocates <i>size</i> bytes of word-aligned memory and +returns the pointer. In order to free this memory, you have to use the +following function. 
+</p> + +<pre class="code"> +void DFT_alignedFree(void *ptr); +</pre> + +<p> +<i>ptr</i> is the pointer returned from DFT_alignedMalloc function. +</p> + +<h4>Executing transform</h4> + +<p> +By the following function, the planned transform can be executed. +</p> + +<pre class="code"> +void DFT_execute(DFT *p, int32_t mode, void *s, int32_t dir); +</pre> + +<p> +<i>p</i> is a pointer to the plan. <i>mode</i> is the computation +mode. <i>s</i> is the pointer to the buffer in which the sequence of +input values is stored. This pointer must be a pointer returned from +DFT_alignedMalloc function. +<i>dir</i> specifies the direction of transform. +</p> + +<p> +The forward and backward discrete Fourier transforms are defined by +the following formula (1) and (2), respectively. +</p> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <msub><mi>X</mi><mi>k</mi></msub> + <mo>=</mo> + <munderover> + <mo style="font-size:140%;">∑</mo> + <mrow><mi>n</mi><mo>=</mo><mn>0</mn></mrow> + <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow> + </munderover> + <msub><mi>x</mi><mi>n</mi></msub> + <msup> + <mi>e</mi> + <mrow> + <mo>-</mo> + <mfrac> + <mrow><mn>2</mn><mi>π</mi><mi>i</mi></mrow> + <mi>N</mi> + </mfrac> + <mi>k</mi><mi>n</mi> + </mrow> + </msup> + + <mo> </mo> + + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </math> + </td> + <td> + <p>(1)</p> + </td> + </tr> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <msub><mi>x</mi><mi>n</mi></msub> + <mo>=</mo> + <mfrac> + <mn>1</mn> + <mi>N</mi> + </mfrac> + <munderover> + <mo style="font-size:140%;">∑</mo> + 
<mrow><mi>k</mi><mo>=</mo><mn>0</mn></mrow> + <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow> + </munderover> + <msub><mi>X</mi><mi>k</mi></msub> + <msup> + <mi>e</mi> + <mrow> + <mfrac> + <mrow><mn>2</mn><mi>π</mi><mi>i</mi></mrow> + <mi>N</mi> + </mfrac> + <mi>k</mi><mi>n</mi> + </mrow> + </msup> + + <mo> </mo> + + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + </mrow> + </math> + </td> + <td> + <p>(2)</p> + </td> + </tr> +</table> + +<p> +The complex forward and backward transforms perform the transforms +defined by the following formula (3) and (4), respectively. <i>V</i> +is the vector length mentioned above. Again, calling DFT_execute once +performs <i>V</i> forward or backward transforms at a time. Please +note that (4) gives values multiplied by <i>N</i> compared to +(2). Specifying -1 as the direction of transform performs the +transform defined by (3). In this case, the input should be given as +in (5) , and the output is given as in (6). Specifying 1 as the +direction of transform performs the transform defined by (4), and in +this case, the input should be given as in (6) , and the output is +given as in (5). 
+</p> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>=</mo> + <munderover> + <mo style="font-size:140%;">∑</mo> + <mrow><mi>n</mi><mo>=</mo><mn>0</mn></mrow> + <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow> + </munderover> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <msup> + <mi>e</mi> + <mrow> + <mo>-</mo> + <mfrac> + <mrow><mn>2</mn><mi>π</mi><mi>i</mi></mrow> + <mi>N</mi> + </mfrac> + <mi>k</mi><mi>n</mi> + </mrow> + </msup> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(3)</p> + </td> + </tr> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>=</mo> + <munderover> + <mo style="font-size:140%;">∑</mo> + <mrow><mi>k</mi><mo>=</mo><mn>0</mn></mrow> + <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow> + </munderover> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <msup> + <mi>e</mi> + <mrow> + <mfrac> + <mrow><mn>2</mn><mi>π</mi><mi>i</mi></mrow> + <mi>N</mi> + </mfrac> + <mi>k</mi><mi>n</mi> + </mrow> + </msup> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + 
<mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(4)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>n</mi> + <mo>+</mo> + <mn>0</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>n</mi> + <mo>+</mo> + <mn>1</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Im</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(5)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> 
+ <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>k</mi> + <mo>+</mo> + <mn>0</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>k</mi> + <mo>+</mo> + <mn>1</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Im</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(6)</p> + </td> + </tr> +</table> + +<p> +The real forward transform performs the transform defined by (3) when +the condition (7) is satisfied. In this case, the output satisfies +(8). You should specify -1 as the direction of transform, and the +input should be given as in (9), and the output is given as in (10). +The real backward transform is the opposite of the real forward +transform. The input should satisfy (8) and the output satisfies (7). +You should specify 1 as the direction of transform, and the input +should be given as in (10), and the output is given as in (11). 
+</p> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + <mo>=</mo> + <mn>0</mn> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + + </mrow> + </math> + </td> + <td> + <p>(7)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>=</mo> + <msubsup> + <mi>X</mi> + <mrow><mi>N</mi><mo>-</mo><mi>k</mi><mo>,</mo><mi>v</mi></mrow> + <mo>*</mo> + </msubsup> + </mrow> + </mtd> + + <mtd> + <mo> </mo> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>1</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + <mo>=</mo> + <mn>0</mn> + </mrow> + </mtd> + + <mtd> + <mo> </mo> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + 
<mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(8)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mi>n</mi> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(9)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>k</mi> + <mo>+</mo> + <mn>0</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Re</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + 
</mrow> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Re</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>N</mi><mo>/</mo><mn>2</mn><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + + <mtd> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>k</mi> + <mo>+</mo> + <mn>1</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>1</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(10)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mrow> + <mn>2</mn> + <mo> </mo> + <mi>s</mi> + <mo>[</mo> + <mi>n</mi> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + 
<msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(11)</p> + </td> + </tr> +</table> + +<p> +The alternative real transforms are defined by (12) to (16), similarly +to the real transforms. The alternative transforms are handy if you +are migrating from the FFT library made by Prof. Takuya Ooura. You +should specify 1 as the direction in order to perform a forward +transform, and -1 when you perform a backward transform. +</p> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + <mo>=</mo> + <mn>0</mn> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + + </mrow> + </math> + </td> + <td> + <p>(12)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" 
xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>=</mo> + <msubsup> + <mi>x</mi> + <mrow><mi>N</mi><mo>-</mo><mi>n</mi><mo>,</mo><mi>v</mi></mrow> + <mo>*</mo> + </msubsup> + </mrow> + </mtd> + + <mtd> + <mo> </mo> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>1</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + <mo>=</mo> + <mn>0</mn> + </mrow> + </mtd> + + <mtd> + <mo> </mo> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(13)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mi>n</mi> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + 
<mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(14)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>n</mi> + <mo>+</mo> + <mn>0</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Re</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Re</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>N</mi><mo>/</mo><mn>2</mn><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + + <mtd> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>n</mi> + <mo>+</mo> + <mn>1</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + 
<mn>1</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(15)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mrow> + <mn>2</mn> + <mo> </mo> + <mi>s</mi> + <mo>[</mo> + <mi>n</mi> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(16)</p> + </td> + </tr> +</table> + + +<h3>Examples</h3> + +<p> +Below is an example code using nsfft library. 
+</p> + +<pre class="code"> +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <complex.h> + +#include "SIMDBase.h" +#include "DFT.h" + +typedef float REAL; +#define TYPE SIMDBase_TYPE_FLOAT + +#define THRES 1e-3 + +double complex omega(double n, double kn) { + return cexp((-2 * M_PI * _Complex_I / n) * kn); +} + +void forward(double complex *ts, double complex *fs, int len) { + int k, n; + + for(k=0;k<len;k++) { + fs[k] = 0; + + for(n=0;n<len;n++) { + fs[k] += ts[n] * omega(len, n*k); + } + } +} + +int main(int argc, char **argv) { + const int n = 256; + + int mode = SIMDBase_chooseBestMode(TYPE); + printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode)); + + int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + + // + + int i, j; + + DFT *p = DFT_init(mode, n, 0); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + double complex ts[veclen][n], fs[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + sx[(i*2+0)*veclen+j] = creal(ts[j][i]); + sx[(i*2+1)*veclen+j] = cimag(ts[j][i]); + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + forward(ts[j], fs[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if ((fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) > THRES) || + (fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) > THRES)) { + success = 0; + } + } + } + + printf("%s\n", success ? "OK" : "NG"); + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + exit(0); +} +</pre> + +<p> +You should put this code under a directory in the root directory of +the library, and then you can compile this code with the following +command. 
+</p> + +<pre class="code"> +gcc -Wall -g -I ../simd -I ../dft -L../simd -L../dft -O DFTExample.c -lDFT -lSIMD -lm -o DFTExample +</pre> + +<h3>Compilation</h3> + +<p> +The nsfft source package includes a few makefiles for various +architectures. You should make symbolic links to makefiles suited for +your computer under the <i>dft</i> and <i>simd</i> directories. +</p> + +</body> +</html> diff --git a/plugins/supereq/nsfft-1.00/doc/nsfft.pdf b/plugins/supereq/nsfft-1.00/doc/nsfft.pdf Binary files differnew file mode 100644 index 00000000..ed4ad5db --- /dev/null +++ b/plugins/supereq/nsfft-1.00/doc/nsfft.pdf diff --git a/plugins/supereq/nsfft-1.00/ooura/Makefile b/plugins/supereq/nsfft-1.00/ooura/Makefile new file mode 100644 index 00000000..bad1679e --- /dev/null +++ b/plugins/supereq/nsfft-1.00/ooura/Makefile @@ -0,0 +1,11 @@ +CC=gcc +BASEOPT=-Wall -g +OPT=$(BASEOPT) -O3 + +all : fftsg.o + +clean : + rm -f *~ *.o a.out + +fftsg.o : fftsg.c + $(CC) $(OPT) -c fftsg.c diff --git a/plugins/supereq/nsfft-1.00/ooura/README b/plugins/supereq/nsfft-1.00/ooura/README new file mode 100644 index 00000000..d7ddefc2 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/ooura/README @@ -0,0 +1,2 @@ +Please put fftsg.c and pi_fft.c, which are included in Prof. Takuya +Ooura's FFT package, in this directory. 
diff --git a/plugins/supereq/nsfft-1.00/ooura/fftsg.c b/plugins/supereq/nsfft-1.00/ooura/fftsg.c new file mode 100644 index 00000000..43d75344 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/ooura/fftsg.c @@ -0,0 +1,3314 @@ +/* +Fast Fourier/Cosine/Sine Transform + dimension :one + data length :power of 2 + decimation :frequency + radix :split-radix + data :inplace + table :use +functions + cdft: Complex Discrete Fourier Transform + rdft: Real Discrete Fourier Transform + ddct: Discrete Cosine Transform + ddst: Discrete Sine Transform + dfct: Cosine Transform of RDFT (Real Symmetric DFT) + dfst: Sine Transform of RDFT (Real Anti-symmetric DFT) +function prototypes + void cdft(int, int, double *, int *, double *); + void rdft(int, int, double *, int *, double *); + void ddct(int, int, double *, int *, double *); + void ddst(int, int, double *, int *, double *); + void dfct(int, double *, double *, int *, double *); + void dfst(int, double *, double *, int *, double *); +macro definitions + USE_CDFT_PTHREADS : default=not defined + CDFT_THREADS_BEGIN_N : must be >= 512, default=8192 + CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536 + USE_CDFT_WINTHREADS : default=not defined + CDFT_THREADS_BEGIN_N : must be >= 512, default=32768 + CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288 + + +-------- Complex DFT (Discrete Fourier Transform) -------- + [definition] + <case1> + X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n + <case2> + X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n + (notes: sum_j=0^n-1 is a summation from j=0 to n-1) + [usage] + <case1> + ip[0] = 0; // first time only + cdft(2*n, 1, a, ip, w); + <case2> + ip[0] = 0; // first time only + cdft(2*n, -1, a, ip, w); + [parameters] + 2*n :data length (int) + n >= 1, n = power of 2 + a[0...2*n-1] :input/output data (double *) + input data + a[2*j] = Re(x[j]), + a[2*j+1] = Im(x[j]), 0<=j<n + output data + a[2*k] = Re(X[k]), + a[2*k+1] = Im(X[k]), 0<=k<n + ip[0...*] :work area for bit reversal (int 
*) + length of ip >= 2+sqrt(n) + strictly, + length of ip >= + 2+(1<<(int)(log(n+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n/2-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + cdft(2*n, -1, a, ip, w); + is + cdft(2*n, 1, a, ip, w); + for (j = 0; j <= 2 * n - 1; j++) { + a[j] *= 1.0 / n; + } + . + + +-------- Real DFT / Inverse of Real DFT -------- + [definition] + <case1> RDFT + R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2 + I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2 + <case2> IRDFT (excluding scale) + a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + + sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + + sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n + [usage] + <case1> + ip[0] = 0; // first time only + rdft(n, 1, a, ip, w); + <case2> + ip[0] = 0; // first time only + rdft(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + <case1> + output data + a[2*k] = R[k], 0<=k<n/2 + a[2*k+1] = I[k], 0<k<n/2 + a[1] = R[n/2] + <case2> + input data + a[2*j] = R[j], 0<=j<n/2 + a[2*j+1] = I[j], 0<j<n/2 + a[1] = R[n/2] + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n/2-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + rdft(n, 1, a, ip, w); + is + rdft(n, -1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . 
+ + +-------- DCT (Discrete Cosine Transform) / Inverse of DCT -------- + [definition] + <case1> IDCT (excluding scale) + C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n + <case2> DCT + C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n + [usage] + <case1> + ip[0] = 0; // first time only + ddct(n, 1, a, ip, w); + <case2> + ip[0] = 0; // first time only + ddct(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + output data + a[k] = C[k], 0<=k<n + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/4-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + ddct(n, -1, a, ip, w); + is + a[0] *= 0.5; + ddct(n, 1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- DST (Discrete Sine Transform) / Inverse of DST -------- + [definition] + <case1> IDST (excluding scale) + S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n + <case2> DST + S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n + [usage] + <case1> + ip[0] = 0; // first time only + ddst(n, 1, a, ip, w); + <case2> + ip[0] = 0; // first time only + ddst(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + <case1> + input data + a[j] = A[j], 0<j<n + a[0] = A[n] + output data + a[k] = S[k], 0<=k<n + <case2> + output data + a[k] = S[k], 0<k<n + a[0] = S[n] + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/4-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. 
+ [remark] + Inverse of + ddst(n, -1, a, ip, w); + is + a[0] *= 0.5; + ddst(n, 1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- Cosine Transform of RDFT (Real Symmetric DFT) -------- + [definition] + C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n + [usage] + ip[0] = 0; // first time only + dfct(n, a, t, ip, w); + [parameters] + n :data length - 1 (int) + n >= 2, n = power of 2 + a[0...n] :input/output data (double *) + output data + a[k] = C[k], 0<=k<=n + t[0...n/2] :work area (double *) + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/4) + strictly, + length of ip >= + 2+(1<<(int)(log(n/4+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/8-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + a[0] *= 0.5; + a[n] *= 0.5; + dfct(n, a, t, ip, w); + is + a[0] *= 0.5; + a[n] *= 0.5; + dfct(n, a, t, ip, w); + for (j = 0; j <= n; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- Sine Transform of RDFT (Real Anti-symmetric DFT) -------- + [definition] + S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n + [usage] + ip[0] = 0; // first time only + dfst(n, a, t, ip, w); + [parameters] + n :data length + 1 (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (double *) + output data + a[k] = S[k], 0<k<n + (a[0] is used for work area) + t[0...n/2-1] :work area (double *) + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/4) + strictly, + length of ip >= + 2+(1<<(int)(log(n/4+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/8-1] :cos/sin table (double *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + dfst(n, a, t, ip, w); + is + dfst(n, a, t, ip, w); + for (j = 1; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +Appendix : + The cos/sin table is recalculated when a larger table is required. + w[] and ip[] are compatible with all routines. 
+*/ + + +void cdft(int n, int isgn, double *a, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + int nw; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + if (isgn >= 0) { + cftfsub(n, a, ip, nw, w); + } else { + cftbsub(n, a, ip, nw, w); + } +} + + +void rdft(int n, int isgn, double *a, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void rftbsub(int n, double *a, int nc, double *c); + int nw, nc; + double xi; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 2)) { + nc = n >> 2; + makect(nc, ip, w + nw); + } + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xi = a[0] - a[1]; + a[0] += a[1]; + a[1] = xi; + } else { + a[1] = 0.5 * (a[0] - a[1]); + a[0] -= a[1]; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } +} + + +void ddct(int n, int isgn, double *a, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void rftbsub(int n, double *a, int nc, double *c); + void dctsub(int n, double *a, int nc, double *c); + int j, nw, nc; + double xr; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + if (isgn 
< 0) { + xr = a[n - 1]; + for (j = n - 2; j >= 2; j -= 2) { + a[j + 1] = a[j] - a[j - 1]; + a[j] += a[j - 1]; + } + a[1] = a[0] - xr; + a[0] += xr; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } + dctsub(n, a, nc, w + nw); + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xr = a[0] - a[1]; + a[0] += a[1]; + for (j = 2; j < n; j += 2) { + a[j - 1] = a[j] - a[j + 1]; + a[j] += a[j + 1]; + } + a[n - 1] = xr; + } +} + + +void ddst(int n, int isgn, double *a, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void rftbsub(int n, double *a, int nc, double *c); + void dstsub(int n, double *a, int nc, double *c); + int j, nw, nc; + double xr; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + if (isgn < 0) { + xr = a[n - 1]; + for (j = n - 2; j >= 2; j -= 2) { + a[j + 1] = -a[j] - a[j - 1]; + a[j] -= a[j - 1]; + } + a[1] = a[0] + xr; + a[0] -= xr; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } + dstsub(n, a, nc, w + nw); + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xr = a[0] - a[1]; + a[0] += a[1]; + for (j = 2; j < n; j += 2) { + a[j - 1] = -a[j] - a[j + 1]; + a[j] -= a[j + 1]; + } + a[n - 1] = -xr; + } +} + + +void dfct(int n, double *a, double *t, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, 
double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void dctsub(int n, double *a, int nc, double *c); + int j, k, l, m, mh, nw, nc; + double xr, xi, yr, yi; + + nw = ip[0]; + if (n > (nw << 3)) { + nw = n >> 3; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 1)) { + nc = n >> 1; + makect(nc, ip, w + nw); + } + m = n >> 1; + yi = a[m]; + xi = a[0] + a[n]; + a[0] -= a[n]; + t[0] = xi - yi; + t[m] = xi + yi; + if (n > 2) { + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + xr = a[j] - a[n - j]; + xi = a[j] + a[n - j]; + yr = a[k] - a[n - k]; + yi = a[k] + a[n - k]; + a[j] = xr; + a[k] = yr; + t[j] = xi - yi; + t[k] = xi + yi; + } + t[mh] = a[mh] + a[n - mh]; + a[mh] -= a[n - mh]; + dctsub(m, a, nc, w + nw); + if (m > 4) { + cftfsub(m, a, ip, nw, w); + rftfsub(m, a, nc, w + nw); + } else if (m == 4) { + cftfsub(m, a, ip, nw, w); + } + a[n - 1] = a[0] - a[1]; + a[1] = a[0] + a[1]; + for (j = m - 2; j >= 2; j -= 2) { + a[2 * j + 1] = a[j] + a[j + 1]; + a[2 * j - 1] = a[j] - a[j + 1]; + } + l = 2; + m = mh; + while (m >= 2) { + dctsub(m, t, nc, w + nw); + if (m > 4) { + cftfsub(m, t, ip, nw, w); + rftfsub(m, t, nc, w + nw); + } else if (m == 4) { + cftfsub(m, t, ip, nw, w); + } + a[n - l] = t[0] - t[1]; + a[l] = t[0] + t[1]; + k = 0; + for (j = 2; j < m; j += 2) { + k += l << 2; + a[k - l] = t[j] - t[j + 1]; + a[k + l] = t[j] + t[j + 1]; + } + l <<= 1; + mh = m >> 1; + for (j = 0; j < mh; j++) { + k = m - j; + t[j] = t[m + k] - t[m + j]; + t[k] = t[m + k] + t[m + j]; + } + t[mh] = t[m + mh]; + m = mh; + } + a[l] = t[0]; + a[n] = t[2] - t[1]; + a[0] = t[2] + t[1]; + } else { + a[1] = a[0]; + a[2] = t[0]; + a[0] = t[1]; + } +} + + +void dfst(int n, double *a, double *t, int *ip, double *w) +{ + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void dstsub(int n, 
double *a, int nc, double *c);
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+
+    nw = ip[0];
+    if (n > (nw << 3)) { /* table in w too small for this n: rebuild it */
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 1)) { /* table at w + nw too small: rebuild it */
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    if (n > 2) {
+        m = n >> 1;
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) { /* sums stay in a, differences go to t */
+            k = m - j;
+            xr = a[j] + a[n - j];
+            xi = a[j] - a[n - j];
+            yr = a[k] + a[n - k];
+            yi = a[k] - a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi + yi;
+            t[k] = xi - yi;
+        }
+        t[0] = a[mh] - a[n - mh];
+        a[mh] += a[n - mh];
+        a[0] = a[m];
+        dstsub(m, a, nc, w + nw);
+        if (m > 4) {
+            cftfsub(m, a, ip, nw, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, ip, nw, w);
+        }
+        a[n - 1] = a[1] - a[0];
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {
+            a[2 * j + 1] = a[j] - a[j + 1];
+            a[2 * j - 1] = -a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) { /* m is halved each pass (m = mh at the bottom), l doubled */
+            dstsub(m, t, nc, w + nw);
+            if (m > 4) {
+                cftfsub(m, t, ip, nw, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, ip, nw, w);
+            }
+            a[n - l] = t[1] - t[0];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = -t[j] - t[j + 1];
+                a[k + l] = t[j] - t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 1; j < mh; j++) {
+                k = m - j;
+                t[j] = t[m + k] + t[m + j];
+                t[k] = t[m + k] - t[m + j];
+            }
+            t[0] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0];
+    }
+    a[0] = 0; /* executed for every n, including n <= 2 */
+}
+
+
+/* -------- initializing routines -------- */
+
+
+#include <math.h>
+/*
+ * makewt: records nw in ip[0], resets ip[1] to 1 and, when nw > 2,
+ * fills w with cos/sin values stepped by delta = atan(1.0) / (nw / 2).
+ * For nw / 2 > 4 it also calls makeipt() to fill the index table in ip.
+ * After the first table, progressively shorter tables are appended to w
+ * by copying every second group of entries from the previous one.
+ */
+void makewt(int nw, int *ip, double *w)
+{
+    void makeipt(int nw, int *ip);
+    int j, nwh, nw0, nw1;
+    double delta, wn4r, wk1r, wk1i, wk3r, wk3i;
+
+    ip[0] = nw;
+    ip[1] = 1;
+    if (nw > 2) {
+        nwh = nw >> 1;
+        delta = atan(1.0) / nwh;
+        wn4r = cos(delta * nwh); /* delta * nwh == atan(1.0) */
+        w[0] = 1;
+        w[1] = wn4r;
+        if (nwh == 4) {
+            w[2] = cos(delta * 2);
+            w[3] = sin(delta * 2);
+        } else if (nwh > 4) {
+            makeipt(nw, ip);
+            w[2] = 0.5 / cos(delta * 2);
+            w[3] = 0.5 / cos(delta * 6);
+            for (j = 4; j < nwh; j += 
4) {
+                w[j] = cos(delta * j);
+                w[j + 1] = sin(delta * j);
+                w[j + 2] = cos(3 * delta * j);
+                w[j + 3] = -sin(3 * delta * j);
+            }
+        }
+        nw0 = 0;
+        while (nwh > 2) { /* append a half-size table at w[nw1..] on each pass */
+            nw1 = nw0 + nwh;
+            nwh >>= 1;
+            w[nw1] = 1;
+            w[nw1 + 1] = wn4r;
+            if (nwh == 4) {
+                wk1r = w[nw0 + 4];
+                wk1i = w[nw0 + 5];
+                w[nw1 + 2] = wk1r;
+                w[nw1 + 3] = wk1i;
+            } else if (nwh > 4) {
+                wk1r = w[nw0 + 4];
+                wk3r = w[nw0 + 6];
+                w[nw1 + 2] = 0.5 / wk1r;
+                w[nw1 + 3] = 0.5 / wk3r;
+                for (j = 4; j < nwh; j += 4) { /* copy every second 4-entry group */
+                    wk1r = w[nw0 + 2 * j];
+                    wk1i = w[nw0 + 2 * j + 1];
+                    wk3r = w[nw0 + 2 * j + 2];
+                    wk3i = w[nw0 + 2 * j + 3];
+                    w[nw1 + j] = wk1r;
+                    w[nw1 + j + 1] = wk1i;
+                    w[nw1 + j + 2] = wk3r;
+                    w[nw1 + j + 3] = wk3i;
+                }
+            }
+            nw0 = nw1;
+        }
+    }
+}
+
+/*
+ * makeipt: builds the integer index table stored in ip from ip[2] on.
+ * It seeds ip[2] = 0 and ip[3] = 16, then on each pass derives the
+ * entries ip[2m..4m-1] from ip[m..2m-1]; the number of passes depends
+ * on nw (the loop runs while l > 32, dividing l by 4).
+ * NOTE(review): bitrv2()/bitrv2conj() read this table as ip[m + k];
+ * presumably it is a bit-reversal index table - confirm.
+ */
+void makeipt(int nw, int *ip)
+{
+    int j, l, m, m2, p, q;
+
+    ip[2] = 0;
+    ip[3] = 16;
+    m = 2;
+    for (l = nw; l > 32; l >>= 2) {
+        m2 = m << 1;
+        q = m2 << 3;
+        for (j = m; j < m2; j++) {
+            p = ip[j] << 2;
+            ip[m + j] = p;
+            ip[m2 + j] = p + q;
+        }
+        m = m2;
+    }
+}
+
+/*
+ * makect: records nc in ip[1] and, when nc > 1, fills c[0..nc-1]:
+ * c[0] = cos(atan(1.0)), c[nc/2] = 0.5 * c[0], and for 0 < j < nc/2
+ * c[j] = 0.5 * cos(delta * j), c[nc - j] = 0.5 * sin(delta * j)
+ * with delta = atan(1.0) / (nc / 2).
+ */
+void makect(int nc, int *ip, double *c)
+{
+    int j, nch;
+    double delta;
+
+    ip[1] = nc;
+    if (nc > 1) {
+        nch = nc >> 1;
+        delta = atan(1.0) / nch;
+        c[0] = cos(delta * nch);
+        c[nch] = 0.5 * c[0];
+        for (j = 1; j < nch; j++) {
+            c[j] = 0.5 * cos(delta * j);
+            c[nc - j] = 0.5 * sin(delta * j);
+        }
+    }
+}
+
+
+/* -------- child routines -------- */
+
+/*
+ * Optional pthread back end, enabled by USE_CDFT_PTHREADS.  It defines
+ * USE_CDFT_THREADS, default size thresholds (overridable from outside)
+ * and create/wait wrappers that print "cdft thread error" to stderr and
+ * call exit(1) when pthread_create() or pthread_join() fails.
+ */
+#ifdef USE_CDFT_PTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 8192
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 65536
+#endif
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t pthread_t
+#define cdft_thread_create(thp,func,argp) { \
+    if (pthread_create(thp, NULL, func, (void *) argp) != 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#define cdft_thread_wait(th) { \
+    if (pthread_join(th, NULL) != 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#endif /* USE_CDFT_PTHREADS 
*/
+
+/*
+ * Optional Win32 back end, enabled by USE_CDFT_WINTHREADS.  It defines
+ * USE_CDFT_THREADS, default size thresholds (overridable from outside)
+ * and create/wait wrappers.  cdft_thread_create prints "cdft thread
+ * error" and calls exit(1) when CreateThread() returns 0; the result of
+ * WaitForSingleObject() in cdft_thread_wait is not checked.
+ */
+#ifdef USE_CDFT_WINTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 32768
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 524288
+#endif
+#include <windows.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t HANDLE
+#define cdft_thread_create(thp,func,argp) { \
+    DWORD thid; \
+    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \
+    if (*(thp) == 0) { \
+        fprintf(stderr, "cdft thread error\n"); \
+        exit(1); \
+    } \
+}
+#define cdft_thread_wait(th) { \
+    WaitForSingleObject(th, INFINITE); \
+    CloseHandle(th); \
+}
+#endif /* USE_CDFT_WINTHREADS */
+
+/*
+ * cftfsub: size dispatcher for the forward pass over a (n doubles).
+ *   n > 32       : cftf1st, then cftrec4_th (threads enabled and
+ *                  n > CDFT_THREADS_BEGIN_N), cftrec4 (n > 512),
+ *                  cftleaf (n > 128) or cftfx41, then bitrv2
+ *   n == 32      : cftf161 + bitrv216
+ *   8 < n < 32   : cftf081 + bitrv208
+ *   n == 8       : cftf040
+ *   n == 4       : cftx020
+ * Any other n leaves a untouched.
+ */
+void cftfsub(int n, double *a, int *ip, int nw, double *w)
+{
+    void bitrv2(int n, int *ip, double *a);
+    void bitrv216(double *a);
+    void bitrv208(double *a);
+    void cftf1st(int n, double *a, double *w);
+    void cftrec4(int n, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftfx41(int n, double *a, int nw, double *w);
+    void cftf161(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftf040(double *a);
+    void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+    void cftrec4_th(int n, double *a, int nw, double *w);
+#endif /* USE_CDFT_THREADS */
+
+    if (n > 8) {
+        if (n > 32) {
+            cftf1st(n, a, &w[nw - (n >> 2)]); /* uses the last n/4 entries of w[0..nw-1] */
+#ifdef USE_CDFT_THREADS
+            if (n > CDFT_THREADS_BEGIN_N) {
+                cftrec4_th(n, a, nw, w);
+            } else
+#endif /* USE_CDFT_THREADS */
+            if (n > 512) {
+                cftrec4(n, a, nw, w);
+            } else if (n > 128) {
+                cftleaf(n, 1, a, nw, w);
+            } else {
+                cftfx41(n, a, nw, w);
+            }
+            bitrv2(n, ip, a);
+        } else if (n == 32) {
+            cftf161(a, &w[nw - 8]);
+            bitrv216(a);
+        } else {
+            cftf081(a, w);
+            bitrv208(a);
+        }
+    } else if (n == 8) {
+        cftf040(a);
+    } else if (n == 4) {
+        cftx020(a);
+    }
+}
+
+/*
+ * cftbsub: size dispatcher for the backward pass; same structure as
+ * cftfsub but using cftb1st, bitrv2conj, bitrv216neg, bitrv208neg and
+ * cftb040 in place of their forward counterparts.
+ */
+void cftbsub(int n, double *a, int *ip, int nw, double *w)
+{
+    void bitrv2conj(int n, int *ip, double *a);
+    void bitrv216neg(double *a);
+    void 
bitrv208neg(double *a);
+    void cftb1st(int n, double *a, double *w);
+    void cftrec4(int n, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftfx41(int n, double *a, int nw, double *w);
+    void cftf161(double *a, double *w);
+    void cftf081(double *a, double *w);
+    void cftb040(double *a);
+    void cftx020(double *a);
+#ifdef USE_CDFT_THREADS
+    void cftrec4_th(int n, double *a, int nw, double *w);
+#endif /* USE_CDFT_THREADS */
+
+    if (n > 8) {
+        if (n > 32) {
+            cftb1st(n, a, &w[nw - (n >> 2)]); /* uses the last n/4 entries of w[0..nw-1] */
+#ifdef USE_CDFT_THREADS
+            if (n > CDFT_THREADS_BEGIN_N) {
+                cftrec4_th(n, a, nw, w);
+            } else
+#endif /* USE_CDFT_THREADS */
+            if (n > 512) {
+                cftrec4(n, a, nw, w);
+            } else if (n > 128) {
+                cftleaf(n, 1, a, nw, w);
+            } else {
+                cftfx41(n, a, nw, w);
+            }
+            bitrv2conj(n, ip, a);
+        } else if (n == 32) {
+            cftf161(a, &w[nw - 8]);
+            bitrv216neg(a);
+        } else {
+            cftf081(a, w);
+            bitrv208neg(a);
+        }
+    } else if (n == 8) {
+        cftb040(a);
+    } else if (n == 4) {
+        cftx020(a);
+    }
+}
+
+/*
+ * bitrv2: in-place reordering of a, exchanging pairs of adjacent
+ * doubles (a[i], a[i + 1]) between positions j1 and k1.  The positions
+ * are derived from the loop counters and the index table ip[m + ...].
+ * Two variants are selected by the value l reaches after repeatedly
+ * dividing n / 4 by 4 (l == 8 or not).
+ * NOTE(review): presumably the bit-reversal permutation of complex
+ * data - confirm against the package documentation.
+ */
+void bitrv2(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, nh, nm;
+    double xr, xi, yr, yi;
+
+    m = 1;
+    for (l = n >> 2; l > 8; l >>= 2) {
+        m <<= 1;
+    }
+    nh = n >> 1;
+    nm = 4 * m;
+    if (l == 8) {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + 2 * ip[m + k];
+                k1 = 4 * k + 2 * ip[m + j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = 
a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 
nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m + k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = 
a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 4 * k + ip[m + k];
+            j1 = k1 + 2;
+            k1 += nh;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            j1 += nm;
+            k1 += nm;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+        }
+    }
+}
+
+/*
+ * bitrv2conj: same exchange pattern as bitrv2, but the second double of
+ * every exchanged pair is negated while it is moved (xi = -a[j1 + 1],
+ * yi = -a[k1 + 1]).
+ * NOTE(review): presumably bit reversal combined with complex
+ * conjugation - confirm against the package documentation.
+ */
+void bitrv2conj(int n, int *ip, double *a)
+{
+    int j, j1, k, k1, l, m, nh, nm;
+    double xr, xi, yr, yi;
+
+    m = 1;
+    for (l = n >> 2; l > 8; l >>= 2) {
+        m <<= 1;
+    }
+    nh = n >> 1;
+    nm = 4 * m;
+    if (l == 8) {
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 4 * j + 2 * ip[m + k];
+                k1 = 4 * k + 2 * ip[m + j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 -= nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nm;
+                k1 += 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += nh;
+                k1 += 2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 += nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 -= nm;
+                k1 -= 2 * nm;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 
+= 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi 
= -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m + k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += nm; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi 
= -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            a[k1 + 3] = -a[k1 + 3];
+        }
+    }
+}
+
+/*
+ * bitrv216: fixed in-place reordering of a[0..31], viewed as 16 pairs
+ * (a[2i], a[2i + 1]).  Pairs are exchanged as 1<->8, 2<->4, 3<->12,
+ * 5<->10, 7<->14 and 11<->13; pairs 0, 6, 9 and 15 are not touched.
+ */
+void bitrv216(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,
+        x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i,
+        x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i;
+
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x7r = a[14];
+    x7i = a[15];
+    x8r = a[16];
+    x8i = a[17];
+    x10r = a[20];
+    x10i = a[21];
+    x11r = a[22];
+    x11i = a[23];
+    x12r = a[24];
+    x12i = a[25];
+    x13r = a[26];
+    x13i = a[27];
+    x14r = a[28];
+    x14i = a[29];
+    a[2] = x8r;
+    a[3] = x8i;
+    a[4] = x4r;
+    a[5] = x4i;
+    a[6] = x12r;
+    a[7] = x12i;
+    a[8] = x2r;
+    a[9] = x2i;
+    a[10] = x10r;
+    a[11] = x10i;
+    a[14] = x14r;
+    a[15] = x14i;
+    a[16] = x1r;
+    a[17] = x1i;
+    a[20] = x5r;
+    a[21] = x5i;
+    a[22] = x13r;
+    a[23] = x13i;
+    a[24] = x3r;
+    a[25] = x3i;
+    a[26] = x11r;
+    a[27] = x11i;
+    a[28] = x7r;
+    a[29] = x7i;
+}
+
+/*
+ * bitrv216neg: fixed in-place reordering of pairs 1..15 of a[0..31]
+ * (pair 0 is not touched).  All 15 pairs are loaded first and then
+ * stored back in a different order; no sign change is applied here.
+ */
+void bitrv216neg(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,
+        x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i,
+        x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i,
+        x13r, x13i, x14r, x14i, x15r, x15i;
+
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x6r = a[12];
+    x6i = a[13];
+    x7r = a[14];
+    x7i = a[15];
+    x8r = a[16];
+    x8i = a[17];
+    x9r = a[18];
+    x9i = a[19];
+    x10r = a[20];
+    x10i = a[21];
+    x11r = a[22];
+    x11i = a[23];
+    x12r = a[24];
+    x12i = a[25];
+    x13r = a[26];
+    x13i = a[27];
+    x14r = a[28];
+    x14i = a[29];
+    x15r = a[30];
+    x15i = a[31];
+    a[2] = x15r;
+    a[3] = x15i;
+    a[4] = x7r;
+    a[5] = x7i;
+    a[6] = x11r;
+    a[7] = x11i;
+    a[8] = x3r;
+    a[9] = x3i;
+    a[10] = x13r;
+    a[11] = x13i;
+    a[12] = x5r;
+    a[13] = x5i;
+    a[14] = x9r;
+    a[15] = x9i;
+    a[16] = x1r;
+    a[17] = x1i;
+    a[18] = x14r;
+    a[19] = x14i;
+    a[20] = x6r;
+    a[21] = x6i;
+    a[22] = 
x10r;
+    a[23] = x10i;
+    a[24] = x2r;
+    a[25] = x2i;
+    a[26] = x12r;
+    a[27] = x12i;
+    a[28] = x4r;
+    a[29] = x4i;
+    a[30] = x8r;
+    a[31] = x8i;
+}
+
+/*
+ * bitrv208: fixed in-place reordering of a[0..15], viewed as 8 pairs
+ * (a[2i], a[2i + 1]).  Pairs 1<->4 and 3<->6 are exchanged; pairs
+ * 0, 2, 5 and 7 are not touched.
+ */
+void bitrv208(double *a)
+{
+    double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i;
+
+    x1r = a[2];
+    x1i = a[3];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x6r = a[12];
+    x6i = a[13];
+    a[2] = x4r;
+    a[3] = x4i;
+    a[6] = x6r;
+    a[7] = x6i;
+    a[8] = x1r;
+    a[9] = x1i;
+    a[12] = x3r;
+    a[13] = x3i;
+}
+
+/*
+ * bitrv208neg: fixed in-place reordering of pairs 1..7 of a[0..15]
+ * (pair 0 is not touched): new pair 1..7 = old pair 7, 3, 5, 1, 6, 2, 4.
+ * No sign change is applied here.
+ */
+void bitrv208neg(double *a)
+{
+    double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,
+        x5r, x5i, x6r, x6i, x7r, x7i;
+
+    x1r = a[2];
+    x1i = a[3];
+    x2r = a[4];
+    x2i = a[5];
+    x3r = a[6];
+    x3i = a[7];
+    x4r = a[8];
+    x4i = a[9];
+    x5r = a[10];
+    x5i = a[11];
+    x6r = a[12];
+    x6i = a[13];
+    x7r = a[14];
+    x7i = a[15];
+    a[2] = x7r;
+    a[3] = x7i;
+    a[4] = x3r;
+    a[5] = x3i;
+    a[6] = x5r;
+    a[7] = x5i;
+    a[8] = x1r;
+    a[9] = x1i;
+    a[10] = x6r;
+    a[11] = x6i;
+    a[12] = x2r;
+    a[13] = x2i;
+    a[14] = x4r;
+    a[15] = x4i;
+}
+
+/*
+ * cftf1st: one in-place pass over a[0..n-1] that combines the four
+ * elements at offsets j, j + m, j + 2m and j + 3m (m = 2 * (n >> 3))
+ * and scales the results with factors read from w (w[1], w[2], w[3]
+ * and groups of four entries from w[4] on).
+ * NOTE(review): presumably the first radix-4 butterfly stage of the
+ * forward complex FFT - confirm against the package documentation.
+ */
+void cftf1st(int n, double *a, double *w)
+{
+    int j, j0, j1, j2, j3, k, m, mh;
+    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i,
+        wd1r, wd1i, wd3r, wd3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
+
+    mh = n >> 3;
+    m = 2 * mh;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2];
+    x0i = a[1] + a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = a[1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    a[j2] = x1r - x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r + x3i;
+    a[j3 + 1] = x1i - x3r;
+    wn4r = w[1];
+    csc1 = w[2];
+    csc3 = w[3];
+    wd1r = 1;
+    wd1i = 0;
+    wd3r = 1;
+    wd3i = 0;
+    k = 0;
+    for (j = 2; j < mh - 2; j += 4) {
+        k += 4;
+        wk1r = csc1 * (wd1r + w[k]);
+        wk1i = csc1 * (wd1i + w[k + 1]);
+        wk3r = csc3 * (wd3r + w[k + 2]);
+        wk3i = csc3 * (wd3i + w[k + 3]);
+        wd1r = w[k];
+        wd1i 
= w[k + 1]; + wd3r = w[k + 2]; + wd3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + y0r = a[j + 2] + a[j2 + 2]; + y0i = a[j + 3] + a[j2 + 3]; + y1r = a[j + 2] - a[j2 + 2]; + y1i = a[j + 3] - a[j2 + 3]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 + 2] + a[j3 + 2]; + y2i = a[j1 + 3] + a[j3 + 3]; + y3r = a[j1 + 2] - a[j3 + 2]; + y3i = a[j1 + 3] - a[j3 + 3]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = a[j0 - 1] + a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = 
wk1i * x0i + wk1r * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = a[j0 - 1] + a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i + x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = a[j0 + 3] + a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = a[j0 + 3] - a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i + x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i - x2i; + 
x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+/*
+ * cftb1st: backward counterpart of cftf1st.  It combines the same four
+ * offsets j, j + m, j + 2m, j + 3m (m = 2 * (n >> 3)) with factors from
+ * w, but the odd-indexed doubles of the first and third operand enter
+ * negated (x0i = -a[1] - a[j2 + 1], ...), with matching sign changes in
+ * the stored results.
+ * NOTE(review): presumably the first radix-4 stage of the backward
+ * complex FFT - confirm against the package documentation.
+ */
+void cftb1st(int n, double *a, double *w)
+{
+    int j, j0, j1, j2, j3, k, m, mh;
+    double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i,
+        wd1r, wd1i, wd3r, wd3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,
+        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i;
+
+    mh = n >> 3;
+    m = 2 * mh;
+    j1 = m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[0] + a[j2];
+    x0i = -a[1] - a[j2 + 1];
+    x1r = a[0] - a[j2];
+    x1i = -a[1] + a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[0] = x0r + x2r;
+    a[1] = x0i - x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    a[j2] = x1r + x3i;
+    a[j2 + 1] = x1i + x3r;
+    a[j3] = x1r - x3i;
+    a[j3 + 1] = x1i - x3r;
+    wn4r = w[1];
+    csc1 = w[2];
+    csc3 = w[3];
+    wd1r = 1;
+    wd1i = 0;
+    wd3r = 1;
+    wd3i = 0;
+    k = 0;
+    for (j = 2; j < mh - 2; j += 4) {
+        k += 4;
+        wk1r = csc1 * (wd1r + w[k]);
+        wk1i = csc1 * (wd1i + w[k + 1]);
+        wk3r = csc3 * (wd3r + w[k + 2]);
+        wk3i = csc3 * (wd3i + w[k + 3]);
+        wd1r = w[k];
+        wd1i = w[k + 1];
+        wd3r = w[k + 2];
+        wd3i = w[k + 3];
+        j1 = j + m;
+        j2 = j1 + m;
+        j3 = j2 + m;
+        x0r = a[j] + a[j2];
+        x0i = -a[j + 1] - a[j2 + 1];
+        x1r = a[j] - a[j2];
+        x1i = -a[j + 1] + a[j2 + 1];
+        y0r = a[j + 2] + a[j2 + 2];
+        y0i = -a[j + 3] - a[j2 + 3];
+        y1r = a[j + 2] - a[j2 + 2];
+        y1i = -a[j + 3] + a[j2 + 3];
+        x2r = a[j1] + a[j3];
+        x2i = a[j1 + 1] + a[j3 + 1];
+        x3r = a[j1] - a[j3];
+        x3i = a[j1 + 1] - a[j3 + 1];
+        y2r = a[j1 + 2] + a[j3 + 2];
+        y2i = a[j1 + 3] + a[j3 + 3];
+        y3r = a[j1 + 2] - a[j3 + 2];
+        y3i = a[j1 + 3] - a[j3 + 3];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i - x2i;
+        a[j + 2] = y0r + y2r;
+        a[j + 3] = y0i - y2i;
+        a[j1] = x0r - x2r;
+        a[j1 + 1] = x0i + x2i;
+        a[j1 + 2] = y0r - y2r;
+        a[j1 + 3] = y0i + y2i;
+        x0r = 
x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = -a[j0 - 1] - a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i - y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i + y2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = -a[j0 - 1] - a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + 
x3r = a[j1 - 2] - a[j3 - 2];
+    x3i = a[j1 - 1] - a[j3 - 1];
+    a[j0 - 2] = x0r + x2r;
+    a[j0 - 1] = x0i - x2i;
+    a[j1 - 2] = x0r - x2r;
+    a[j1 - 1] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+    a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+    a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+    x0r = a[j0] + a[j2];
+    x0i = -a[j0 + 1] - a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = -a[j0 + 1] + a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i - x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2] = wn4r * (x0r - x0i);
+    a[j2 + 1] = wn4r * (x0i + x0r);
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3] = -wn4r * (x0r + x0i);
+    a[j3 + 1] = -wn4r * (x0i - x0r);
+    x0r = a[j0 + 2] + a[j2 + 2];
+    x0i = -a[j0 + 3] - a[j2 + 3];
+    x1r = a[j0 + 2] - a[j2 + 2];
+    x1i = -a[j0 + 3] + a[j2 + 3];
+    x2r = a[j1 + 2] + a[j3 + 2];
+    x2i = a[j1 + 3] + a[j3 + 3];
+    x3r = a[j1 + 2] - a[j3 + 2];
+    x3i = a[j1 + 3] - a[j3 + 3];
+    a[j0 + 2] = x0r + x2r;
+    a[j0 + 3] = x0i - x2i;
+    a[j1 + 2] = x0r - x2r;
+    a[j1 + 3] = x0i + x2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+/*
+ * Threaded section, compiled only when USE_CDFT_THREADS is defined.
+ * struct cdft_arg_st is the argument record handed to each worker:
+ * n0 = size of the whole transform, n = size of the worker's chunk,
+ * a = start of that chunk, nw / w = table size and table.
+ */
+#ifdef USE_CDFT_THREADS
+struct cdft_arg_st {
+    int n0;
+    int n;
+    double *a;
+    int nw;
+    double *w;
+};
+typedef struct cdft_arg_st cdft_arg_t;
+
+/*
+ * cftrec4_th: splits a into 2 chunks of n / 2 doubles, or into 4 chunks
+ * of n / 4 when n > CDFT_4THREADS_BEGIN_N, starts one worker per chunk
+ * and waits for all of them before returning.
+ */
+void cftrec4_th(int n, double *a, int nw, double *w)
+{
+    void *cftrec1_th(void *p);
+    void *cftrec2_th(void *p);
+    int i, idiv4, m, nthread;
+    cdft_thread_t th[4];
+    cdft_arg_t ag[4];
+
+    nthread = 2;
+    idiv4 = 0;
+    m = n >> 1;
+    if (n > CDFT_4THREADS_BEGIN_N) {
+        nthread = 4;
+        idiv4 = 1;
+        m >>= 1;
+    }
+    for (i = 0; i < nthread; i++) {
+        ag[i].n0 
= n;
+        ag[i].n = m;
+        ag[i].a = &a[i * m];
+        ag[i].nw = nw;
+        ag[i].w = w;
+        if (i != idiv4) { /* chunk idiv4 runs cftrec2_th, every other chunk cftrec1_th */
+            cdft_thread_create(&th[i], cftrec1_th, &ag[i]);
+        } else {
+            cdft_thread_create(&th[i], cftrec2_th, &ag[i]);
+        }
+    }
+    for (i = 0; i < nthread; i++) {
+        cdft_thread_wait(th[i]);
+    }
+}
+
+/*
+ * cftrec1_th: worker entry point.  p points to a cdft_arg_t.  Starting
+ * from m = n0 it applies cftmdl1 to the last m doubles of the chunk
+ * while dividing m by 4 until m <= 512, runs cftleaf on that tail, then
+ * walks the rest of the chunk backwards in steps of m calling cfttree
+ * and cftleaf.  Always returns a null pointer.
+ */
+void *cftrec1_th(void *p)
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl1(int n, double *a, double *w);
+    int isplt, j, k, m, n, n0, nw;
+    double *a, *w;
+
+    n0 = ((cdft_arg_t *) p)->n0;
+    n = ((cdft_arg_t *) p)->n;
+    a = ((cdft_arg_t *) p)->a;
+    nw = ((cdft_arg_t *) p)->nw;
+    w = ((cdft_arg_t *) p)->w;
+    m = n0;
+    while (m > 512) {
+        m >>= 2;
+        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
+    }
+    cftleaf(m, 1, &a[n - m], nw, w);
+    k = 0;
+    for (j = n - m; j > 0; j -= m) {
+        k++;
+        isplt = cfttree(m, j, k, a, nw, w);
+        cftleaf(m, isplt, &a[j - m], nw, w);
+    }
+    return (void *) 0;
+}
+
+/*
+ * cftrec2_th: worker entry point with the same structure as cftrec1_th,
+ * but it uses cftmdl2 with &w[nw - m], passes isplt = 0 to the first
+ * cftleaf call, and starts the block counter k from (4^steps) / 2
+ * instead of 0.  Always returns a null pointer.
+ */
+void *cftrec2_th(void *p)
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl2(int n, double *a, double *w);
+    int isplt, j, k, m, n, n0, nw;
+    double *a, *w;
+
+    n0 = ((cdft_arg_t *) p)->n0;
+    n = ((cdft_arg_t *) p)->n;
+    a = ((cdft_arg_t *) p)->a;
+    nw = ((cdft_arg_t *) p)->nw;
+    w = ((cdft_arg_t *) p)->w;
+    k = 1;
+    m = n0;
+    while (m > 512) {
+        m >>= 2;
+        k <<= 2;
+        cftmdl2(m, &a[n - m], &w[nw - m]);
+    }
+    cftleaf(m, 0, &a[n - m], nw, w);
+    k >>= 1;
+    for (j = n - m; j > 0; j -= m) {
+        k++;
+        isplt = cfttree(m, j, k, a, nw, w);
+        cftleaf(m, isplt, &a[j - m], nw, w);
+    }
+    return (void *) 0;
+}
+#endif /* USE_CDFT_THREADS */
+
+/*
+ * cftrec4: single-threaded driver over a[0..n-1].  It applies cftmdl1
+ * to the last m doubles while dividing m by 4 until m <= 512, runs
+ * cftleaf on that tail, then walks the rest of a backwards in steps of
+ * m, calling cfttree followed by cftleaf on each block.
+ */
+void cftrec4(int n, double *a, int nw, double *w)
+{
+    int cfttree(int n, int j, int k, double *a, int nw, double *w);
+    void cftleaf(int n, int isplt, double *a, int nw, double *w);
+    void cftmdl1(int n, double *a, double *w);
+    int isplt, j, k, m;
+
+    m = n;
+    while (m > 512) {
+        m 
>>= 2; + cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); + } + cftleaf(m, 1, &a[n - m], nw, w); + k = 0; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } +} + + +int cfttree(int n, int j, int k, double *a, int nw, double *w) +{ + void cftmdl1(int n, double *a, double *w); + void cftmdl2(int n, double *a, double *w); + int i, isplt, m; + + if ((k & 3) != 0) { + isplt = k & 1; + if (isplt != 0) { + cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]); + } else { + cftmdl2(n, &a[j - n], &w[nw - n]); + } + } else { + m = n; + for (i = k; (i & 3) == 0; i >>= 2) { + m <<= 2; + } + isplt = i & 1; + if (isplt != 0) { + while (m > 128) { + cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]); + m >>= 2; + } + } else { + while (m > 128) { + cftmdl2(m, &a[j - m], &w[nw - m]); + m >>= 2; + } + } + } + return isplt; +} + + +void cftleaf(int n, int isplt, double *a, int nw, double *w) +{ + void cftmdl1(int n, double *a, double *w); + void cftmdl2(int n, double *a, double *w); + void cftf161(double *a, double *w); + void cftf162(double *a, double *w); + void cftf081(double *a, double *w); + void cftf082(double *a, double *w); + + if (n == 512) { + cftmdl1(128, a, &w[nw - 64]); + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + cftmdl2(128, &a[128], &w[nw - 128]); + cftf161(&a[128], &w[nw - 8]); + cftf162(&a[160], &w[nw - 32]); + cftf161(&a[192], &w[nw - 8]); + cftf162(&a[224], &w[nw - 32]); + cftmdl1(128, &a[256], &w[nw - 64]); + cftf161(&a[256], &w[nw - 8]); + cftf162(&a[288], &w[nw - 32]); + cftf161(&a[320], &w[nw - 8]); + cftf161(&a[352], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(128, &a[384], &w[nw - 64]); + cftf161(&a[480], &w[nw - 8]); + } else { + cftmdl2(128, &a[384], &w[nw - 128]); + cftf162(&a[480], &w[nw - 32]); + } + cftf161(&a[384], &w[nw - 8]); + cftf162(&a[416], &w[nw - 32]); + cftf161(&a[448], &w[nw - 8]); + } else { + cftmdl1(64, a, &w[nw - 32]); + 
cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], &w[nw - 8]); + cftmdl2(64, &a[64], &w[nw - 64]); + cftf081(&a[64], &w[nw - 8]); + cftf082(&a[80], &w[nw - 8]); + cftf081(&a[96], &w[nw - 8]); + cftf082(&a[112], &w[nw - 8]); + cftmdl1(64, &a[128], &w[nw - 32]); + cftf081(&a[128], &w[nw - 8]); + cftf082(&a[144], &w[nw - 8]); + cftf081(&a[160], &w[nw - 8]); + cftf081(&a[176], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(64, &a[192], &w[nw - 32]); + cftf081(&a[240], &w[nw - 8]); + } else { + cftmdl2(64, &a[192], &w[nw - 64]); + cftf082(&a[240], &w[nw - 8]); + } + cftf081(&a[192], &w[nw - 8]); + cftf082(&a[208], &w[nw - 8]); + cftf081(&a[224], &w[nw - 8]); + } +} + + +void cftmdl1(int n, double *a, double *w) +{ + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, wk1r, wk1i, wk3r, wk3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + k = 0; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r 
+ wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + } + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); +} + + +void cftmdl2(int n, double *a, double *w) +{ + int j, j0, j1, j2, j3, k, kr, m, mh; + double wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i; + + mh = n >> 3; + m = 2 * mh; + wn4r = w[1]; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] - a[j2 + 1]; + x0i = a[1] + a[j2]; + x1r = a[0] + a[j2 + 1]; + x1i = a[1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wn4r * (x2r - x2i); + y0i = wn4r * (x2i + x2r); + a[0] = x0r + y0r; + a[1] = x0i + y0i; + a[j1] = x0r - y0r; + a[j1 + 1] = x0i - y0i; + y0r = wn4r * (x3r - x3i); + y0i = wn4r * (x3i + x3r); + a[j2] = x1r - y0i; + a[j2 + 1] = x1i + y0r; + a[j3] = x1r + y0i; + 
a[j3 + 1] = x1i - y0r; + k = 0; + kr = 2 * m; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + kr -= 4; + wd1i = w[kr]; + wd1r = w[kr + 1]; + wd3i = w[kr + 2]; + wd3r = w[kr + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] - a[j2 + 1]; + x0i = a[j + 1] + a[j2]; + x1r = a[j] + a[j2 + 1]; + x1i = a[j + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wd1r * x2r - wd1i * x2i; + y2i = wd1r * x2i + wd1i * x2r; + a[j] = y0r + y2r; + a[j + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk3r * x1r + wk3i * x1i; + y0i = wk3r * x1i - wk3i * x1r; + y2r = wd3r * x3r + wd3i * x3i; + y2i = wd3r * x3i - wd3i * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wd1i * x0r - wd1r * x0i; + y0i = wd1i * x0i + wd1r * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wd3i * x1r + wd3r * x1i; + y0i = wd3i * x1i - wd3r * x1r; + y2r = wk3i * x3r + wk3r * x3i; + y2i = wk3i * x3i - wk3r * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + } + wk1r = w[m]; + wk1i = w[m + 1]; + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * 
x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk1i * x1r - wk1r * x1i; + y0i = wk1i * x1i + wk1r * x1r; + y2r = wk1r * x3r - wk1i * x3i; + y2i = wk1r * x3i + wk1i * x3r; + a[j2] = y0r - y2r; + a[j2 + 1] = y0i - y2i; + a[j3] = y0r + y2r; + a[j3 + 1] = y0i + y2i; +} + + +void cftfx41(int n, double *a, int nw, double *w) +{ + void cftf161(double *a, double *w); + void cftf162(double *a, double *w); + void cftf081(double *a, double *w); + void cftf082(double *a, double *w); + + if (n == 128) { + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + } else { + cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], &w[nw - 8]); + } +} + + +void cftf161(double *a, double *w) +{ + double wn4r, wk1r, wk1i, + x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, + y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, + y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i; + + wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + x0r = a[0] + a[16]; + x0i = a[1] + a[17]; + x1r = a[0] - a[16]; + x1i = a[1] - a[17]; + x2r = a[8] + a[24]; + x2i = a[9] + a[25]; + x3r = a[8] - a[24]; + x3i = a[9] - a[25]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y4r = x0r - x2r; + y4i = x0i - x2i; + y8r = x1r - x3i; + y8i = x1i + x3r; + y12r = x1r + x3i; + y12i = x1i - x3r; + x0r = a[2] + a[18]; + x0i = a[3] + a[19]; + x1r = a[2] - a[18]; + x1i = a[3] - a[19]; + x2r = a[10] + a[26]; + x2i = a[11] + a[27]; + x3r = a[10] - a[26]; + x3i = a[11] - a[27]; + y1r = x0r + x2r; + y1i = x0i + x2i; + y5r = x0r - x2r; + y5i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y9r = wk1r * x0r - wk1i * x0i; + y9i = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y13r = 
wk1i * x0r - wk1r * x0i; + y13i = wk1i * x0i + wk1r * x0r; + x0r = a[4] + a[20]; + x0i = a[5] + a[21]; + x1r = a[4] - a[20]; + x1i = a[5] - a[21]; + x2r = a[12] + a[28]; + x2i = a[13] + a[29]; + x3r = a[12] - a[28]; + x3i = a[13] - a[29]; + y2r = x0r + x2r; + y2i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y10r = wn4r * (x0r - x0i); + y10i = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + y14r = wn4r * (x0r + x0i); + y14i = wn4r * (x0i - x0r); + x0r = a[6] + a[22]; + x0i = a[7] + a[23]; + x1r = a[6] - a[22]; + x1i = a[7] - a[23]; + x2r = a[14] + a[30]; + x2i = a[15] + a[31]; + x3r = a[14] - a[30]; + x3i = a[15] - a[31]; + y3r = x0r + x2r; + y3i = x0i + x2i; + y7r = x0r - x2r; + y7i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y11r = wk1i * x0r - wk1r * x0i; + y11i = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y15r = wk1r * x0r - wk1i * x0i; + y15i = wk1r * x0i + wk1i * x0r; + x0r = y12r - y14r; + x0i = y12i - y14i; + x1r = y12r + y14r; + x1i = y12i + y14i; + x2r = y13r - y15r; + x2i = y13i - y15i; + x3r = y13r + y15r; + x3i = y13i + y15i; + a[24] = x0r + x2r; + a[25] = x0i + x2i; + a[26] = x0r - x2r; + a[27] = x0i - x2i; + a[28] = x1r - x3i; + a[29] = x1i + x3r; + a[30] = x1r + x3i; + a[31] = x1i - x3r; + x0r = y8r + y10r; + x0i = y8i + y10i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + x3r = y9r - y11r; + x3i = y9i - y11i; + a[16] = x0r + x2r; + a[17] = x0i + x2i; + a[18] = x0r - x2r; + a[19] = x0i - x2i; + a[20] = x1r - x3i; + a[21] = x1i + x3r; + a[22] = x1r + x3i; + a[23] = x1i - x3r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + x0r = y5r + y7i; + x0i = y5i - y7r; + x3r = wn4r * (x0r - x0i); + x3i = wn4r * (x0i + x0r); + x0r = y4r - y6i; + x0i = y4i + y6r; + x1r = y4r + y6i; + x1i = y4i - y6r; + a[8] = x0r + x2r; + a[9] = x0i + x2i; + a[10] = x0r - x2r; + a[11] = x0i - x2i; + a[12] = x1r 
- x3i; + a[13] = x1i + x3r; + a[14] = x1r + x3i; + a[15] = x1i - x3r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + x3r = y1r - y3r; + x3i = y1i - y3i; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x0r - x2r; + a[3] = x0i - x2i; + a[4] = x1r - x3i; + a[5] = x1i + x3r; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + + +void cftf162(double *a, double *w) +{ + double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, + x0r, x0i, x1r, x1i, x2r, x2i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, + y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, + y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i; + + wn4r = w[1]; + wk1r = w[4]; + wk1i = w[5]; + wk3r = w[6]; + wk3i = -w[7]; + wk2r = w[8]; + wk2i = w[9]; + x1r = a[0] - a[17]; + x1i = a[1] + a[16]; + x0r = a[8] - a[25]; + x0i = a[9] + a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y0r = x1r + x2r; + y0i = x1i + x2i; + y4r = x1r - x2r; + y4i = x1i - x2i; + x1r = a[0] + a[17]; + x1i = a[1] - a[16]; + x0r = a[8] + a[25]; + x0i = a[9] - a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y8r = x1r - x2i; + y8i = x1i + x2r; + y12r = x1r + x2i; + y12i = x1i - x2r; + x0r = a[2] - a[19]; + x0i = a[3] + a[18]; + x1r = wk1r * x0r - wk1i * x0i; + x1i = wk1r * x0i + wk1i * x0r; + x0r = a[10] - a[27]; + x0i = a[11] + a[26]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y1r = x1r + x2r; + y1i = x1i + x2i; + y5r = x1r - x2r; + y5i = x1i - x2i; + x0r = a[2] + a[19]; + x0i = a[3] - a[18]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[10] + a[27]; + x0i = a[11] - a[26]; + x2r = wk1r * x0r + wk1i * x0i; + x2i = wk1r * x0i - wk1i * x0r; + y9r = x1r - x2r; + y9i = x1i - x2i; + y13r = x1r + x2r; + y13i = x1i + x2i; + x0r = a[4] - a[21]; + x0i = a[5] + a[20]; + x1r = wk2r * x0r - wk2i * x0i; + x1i = wk2r * x0i + wk2i * x0r; + x0r = a[12] - a[29]; + x0i = a[13] + a[28]; + 
x2r = wk2i * x0r - wk2r * x0i; + x2i = wk2i * x0i + wk2r * x0r; + y2r = x1r + x2r; + y2i = x1i + x2i; + y6r = x1r - x2r; + y6i = x1i - x2i; + x0r = a[4] + a[21]; + x0i = a[5] - a[20]; + x1r = wk2i * x0r - wk2r * x0i; + x1i = wk2i * x0i + wk2r * x0r; + x0r = a[12] + a[29]; + x0i = a[13] - a[28]; + x2r = wk2r * x0r - wk2i * x0i; + x2i = wk2r * x0i + wk2i * x0r; + y10r = x1r - x2r; + y10i = x1i - x2i; + y14r = x1r + x2r; + y14i = x1i + x2i; + x0r = a[6] - a[23]; + x0i = a[7] + a[22]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[14] - a[31]; + x0i = a[15] + a[30]; + x2r = wk1i * x0r - wk1r * x0i; + x2i = wk1i * x0i + wk1r * x0r; + y3r = x1r + x2r; + y3i = x1i + x2i; + y7r = x1r - x2r; + y7i = x1i - x2i; + x0r = a[6] + a[23]; + x0i = a[7] - a[22]; + x1r = wk1i * x0r + wk1r * x0i; + x1i = wk1i * x0i - wk1r * x0r; + x0r = a[14] + a[31]; + x0i = a[15] - a[30]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y11r = x1r + x2r; + y11i = x1i + x2i; + y15r = x1r - x2r; + y15i = x1i - x2i; + x1r = y0r + y2r; + x1i = y0i + y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + a[0] = x1r + x2r; + a[1] = x1i + x2i; + a[2] = x1r - x2r; + a[3] = x1i - x2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r - y3r; + x2i = y1i - y3i; + a[4] = x1r - x2i; + a[5] = x1i + x2r; + a[6] = x1r + x2i; + a[7] = x1i - x2r; + x1r = y4r - y6i; + x1i = y4i + y6r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[8] = x1r + x2r; + a[9] = x1i + x2i; + a[10] = x1r - x2r; + a[11] = x1i - x2i; + x1r = y4r + y6i; + x1i = y4i - y6r; + x0r = y5r + y7i; + x0i = y5i - y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[12] = x1r - x2i; + a[13] = x1i + x2r; + a[14] = x1r + x2i; + a[15] = x1i - x2r; + x1r = y8r + y10r; + x1i = y8i + y10i; + x2r = y9r - y11r; + x2i = y9i - y11i; + a[16] = x1r + x2r; + a[17] = x1i + x2i; + a[18] = x1r - x2r; + a[19] = x1i - x2i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r 
+ y11r; + x2i = y9i + y11i; + a[20] = x1r - x2i; + a[21] = x1i + x2r; + a[22] = x1r + x2i; + a[23] = x1i - x2r; + x1r = y12r - y14i; + x1i = y12i + y14r; + x0r = y13r + y15i; + x0i = y13i - y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[24] = x1r + x2r; + a[25] = x1i + x2i; + a[26] = x1r - x2r; + a[27] = x1i - x2i; + x1r = y12r + y14i; + x1i = y12i - y14r; + x0r = y13r - y15i; + x0i = y13i + y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[28] = x1r - x2i; + a[29] = x1i + x2r; + a[30] = x1r + x2i; + a[31] = x1i - x2r; +} + + +void cftf081(double *a, double *w) +{ + double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + x0r = a[0] + a[8]; + x0i = a[1] + a[9]; + x1r = a[0] - a[8]; + x1i = a[1] - a[9]; + x2r = a[4] + a[12]; + x2i = a[5] + a[13]; + x3r = a[4] - a[12]; + x3i = a[5] - a[13]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y2r = x0r - x2r; + y2i = x0i - x2i; + y1r = x1r - x3i; + y1i = x1i + x3r; + y3r = x1r + x3i; + y3i = x1i - x3r; + x0r = a[2] + a[10]; + x0i = a[3] + a[11]; + x1r = a[2] - a[10]; + x1i = a[3] - a[11]; + x2r = a[6] + a[14]; + x2i = a[7] + a[15]; + x3r = a[6] - a[14]; + x3i = a[7] - a[15]; + y4r = x0r + x2r; + y4i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + x2r = x1r + x3i; + x2i = x1i - x3r; + y5r = wn4r * (x0r - x0i); + y5i = wn4r * (x0r + x0i); + y7r = wn4r * (x2r - x2i); + y7i = wn4r * (x2r + x2i); + a[8] = y1r + y5r; + a[9] = y1i + y5i; + a[10] = y1r - y5r; + a[11] = y1i - y5i; + a[12] = y3r - y7i; + a[13] = y3i + y7r; + a[14] = y3r + y7i; + a[15] = y3i - y7r; + a[0] = y0r + y4r; + a[1] = y0i + y4i; + a[2] = y0r - y4r; + a[3] = y0i - y4i; + a[4] = y2r - y6i; + a[5] = y2i + y6r; + a[6] = y2r + y6i; + a[7] = y2i - y6r; +} + + +void cftf082(double *a, double *w) +{ + double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, + y4r, y4i, 
y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + y0r = a[0] - a[9]; + y0i = a[1] + a[8]; + y1r = a[0] + a[9]; + y1i = a[1] - a[8]; + x0r = a[4] - a[13]; + x0i = a[5] + a[12]; + y2r = wn4r * (x0r - x0i); + y2i = wn4r * (x0i + x0r); + x0r = a[4] + a[13]; + x0i = a[5] - a[12]; + y3r = wn4r * (x0r - x0i); + y3i = wn4r * (x0i + x0r); + x0r = a[2] - a[11]; + x0i = a[3] + a[10]; + y4r = wk1r * x0r - wk1i * x0i; + y4i = wk1r * x0i + wk1i * x0r; + x0r = a[2] + a[11]; + x0i = a[3] - a[10]; + y5r = wk1i * x0r - wk1r * x0i; + y5i = wk1i * x0i + wk1r * x0r; + x0r = a[6] - a[15]; + x0i = a[7] + a[14]; + y6r = wk1i * x0r - wk1r * x0i; + y6i = wk1i * x0i + wk1r * x0r; + x0r = a[6] + a[15]; + x0i = a[7] - a[14]; + y7r = wk1r * x0r - wk1i * x0i; + y7i = wk1r * x0i + wk1i * x0r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y4r + y6r; + x1i = y4i + y6i; + a[0] = x0r + x1r; + a[1] = x0i + x1i; + a[2] = x0r - x1r; + a[3] = x0i - x1i; + x0r = y0r - y2r; + x0i = y0i - y2i; + x1r = y4r - y6r; + x1i = y4i - y6i; + a[4] = x0r - x1i; + a[5] = x0i + x1r; + a[6] = x0r + x1i; + a[7] = x0i - x1r; + x0r = y1r - y3i; + x0i = y1i + y3r; + x1r = y5r - y7r; + x1i = y5i - y7i; + a[8] = x0r + x1r; + a[9] = x0i + x1i; + a[10] = x0r - x1r; + a[11] = x0i - x1i; + x0r = y1r + y3i; + x0i = y1i - y3r; + x1r = y5r + y7r; + x1i = y5i + y7i; + a[12] = x0r - x1i; + a[13] = x0i + x1r; + a[14] = x0r + x1i; + a[15] = x0i - x1r; +} + + +void cftf040(double *a) +{ + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r - x3i; + a[3] = x1i + x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + + +void cftb040(double *a) +{ + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i 
= a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r + x3i; + a[3] = x1i - x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r - x3i; + a[7] = x1i + x3r; +} + + +void cftx020(double *a) +{ + double x0r, x0i; + + x0r = a[0] - a[2]; + x0i = a[1] - a[3]; + a[0] += a[2]; + a[1] += a[3]; + a[2] = x0r; + a[3] = x0i; +} + + +void rftfsub(int n, double *a, int nc, double *c) +{ + int j, k, kk, ks, m; + double wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + + +void rftbsub(int n, double *a, int nc, double *c) +{ + int j, k, kk, ks, m; + double wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + + +void dctsub(int n, double *a, int nc, double *c) +{ + int j, k, kk, ks, m; + double wkr, wki, xr; + + m = n >> 1; + ks = nc / n; + kk = 0; + for (j = 1; j < m; j++) { + k = n - j; + kk += ks; + wkr = c[kk] - c[nc - kk]; + wki = c[kk] + c[nc - kk]; + xr = wki * a[j] - wkr * a[k]; + a[j] = wkr * a[j] + wki * a[k]; + a[k] = xr; + } + a[m] *= c[0]; +} + + +void dstsub(int n, double *a, int nc, double *c) +{ + int j, k, kk, ks, m; + double wkr, wki, xr; + + m = n >> 1; + ks = nc / n; + kk = 0; + for (j = 1; j < m; j++) { + k = n - j; + kk += ks; + wkr = c[kk] - c[nc - kk]; + wki = c[kk] + c[nc - kk]; + xr = wki * a[k] - wkr * a[j]; + a[k] = wkr * a[k] + wki * a[j]; + a[j] = xr; + } + a[m] *= c[0]; +} + 
diff --git a/plugins/supereq/nsfft-1.00/ooura/pi_fft.c b/plugins/supereq/nsfft-1.00/ooura/pi_fft.c new file mode 100644 index 00000000..c9a76bf8 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/ooura/pi_fft.c @@ -0,0 +1,1616 @@ +/* +---- calculation of PI(= 3.14159...) using FFT ---- + by T.Ooura, ver. LG1.1.2-MP1.5a Sep. 2001. + +This is a test program to estimate the performance of +the FFT routines: fft*g.c. + +Example compilation: + GNU : gcc -O6 -ffast-math pi_fft.c fftsg.c -lm -o pi_fftsg + SUN : cc -fast -xO5 pi_fft.c fft8g.c -lm -o pi_fft8g + Microsoft: cl /O2 /G6 pi_fft.c fft4g.c /Fepi_fft4g.exe + ... + etc. +*/ + +/* Please check the following macros before compiling */ +#ifndef DBL_ERROR_MARGIN +#define DBL_ERROR_MARGIN 0.3 /* must be < 0.5 */ +#endif + + +#include <math.h> +#include <limits.h> +#include <float.h> +#include <stdio.h> +#include <stdlib.h> +#include <time.h> + + +void mp_load_0(int n, int radix, int out[]); +void mp_load_1(int n, int radix, int out[]); +void mp_copy(int n, int radix, int in[], int out[]); +void mp_round(int n, int radix, int m, int inout[]); +int mp_cmp(int n, int radix, int in1[], int in2[]); +void mp_add(int n, int radix, int in1[], int in2[], int out[]); +void mp_sub(int n, int radix, int in1[], int in2[], int out[]); +void mp_imul(int n, int radix, int in1[], int in2, int out[]); +int mp_idiv(int n, int radix, int in1[], int in2, int out[]); +void mp_idiv_2(int n, int radix, int in[], int out[]); +double mp_mul_radix_test(int n, int radix, int nfft, + double tmpfft[], int ip[], double w[]); +void mp_mul(int n, int radix, int in1[], int in2[], int out[], + int tmp[], int nfft, double tmp1fft[], double tmp2fft[], + double tmp3fft[], int ip[], double w[]); +void mp_squ(int n, int radix, int in[], int out[], int tmp[], + int nfft, double tmp1fft[], double tmp2fft[], + int ip[], double w[]); +void mp_mulh(int n, int radix, int in1[], int in2[], int out[], + int nfft, double in1fft[], double outfft[], + int ip[], double w[]); 
+void mp_squh(int n, int radix, int in[], int out[], + int nfft, double inoutfft[], int ip[], double w[]); +int mp_inv(int n, int radix, int in[], int out[], + int tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]); +int mp_sqrt(int n, int radix, int in[], int out[], + int tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]); +void mp_sprintf(int n, int log10_radix, int in[], char out[]); +void mp_sscanf(int n, int log10_radix, char in[], int out[]); +void mp_fprintf(int n, int log10_radix, int in[], FILE *fout); + + +int main() +{ + int nfft, log2_nfft, radix, log10_radix, n, npow, nprc; + double err, d_time, n_op; + int *a, *b, *c, *e, *i1, *i2, *ip; + double *d1, *d2, *d3, *w; + time_t t_1, t_2; + FILE *f_log, *f_out; + + f_log = fopen("pi.log", "w"); + printf("PI calculation to estimate the FFT benchmarks\n"); + fprintf(f_log, "PI calculation to estimate the FFT benchmarks\n"); + printf("length of FFT =?\n"); + scanf("%d", &nfft); + + printf("initializing...\n"); + for (log2_nfft = 1; (1 << log2_nfft) < nfft; log2_nfft++); + nfft = 1 << log2_nfft; + n = nfft + 2; + ip = (int *) malloc((3 + (int) sqrt(0.5 * nfft)) * sizeof(int)); + w = (double *) malloc(nfft / 2 * sizeof(double)); + a = (int *) malloc((n + 2) * sizeof(int)); + b = (int *) malloc((n + 2) * sizeof(int)); + c = (int *) malloc((n + 2) * sizeof(int)); + e = (int *) malloc((n + 2) * sizeof(int)); + i1 = (int *) malloc((n + 2) * sizeof(int)); + i2 = (int *) malloc((n + 2) * sizeof(int)); + d1 = (double *) malloc((nfft + 2) * sizeof(double)); + d2 = (double *) malloc((nfft + 2) * sizeof(double)); + d3 = (double *) malloc((nfft + 2) * sizeof(double)); + if (d3 == NULL) { + printf("Allocation Failure!\n"); + exit(1); + } + ip[0] = 0; + /* ---- radix test ---- */ + log10_radix = 1; + radix = 10; + err = mp_mul_radix_test(n, radix, nfft, d1, ip, w); + err += DBL_EPSILON * (n * radix * radix / 4); + while (100 * err < 
DBL_ERROR_MARGIN && radix <= INT_MAX / 20) { + err *= 100; + log10_radix++; + radix *= 10; + } + printf("nfft= %d\nradix= %d\nerror_margin= %g\n", nfft, radix, err); + fprintf(f_log, "nfft= %d\nradix= %d\nerror_margin= %g\n", nfft, radix, err); + printf("calculating %d digits of PI...\n", log10_radix * (n - 2)); + fprintf(f_log, "calculating %d digits of PI...\n", log10_radix * (n - 2)); + /* ---- time check ---- */ + time(&t_1); + /* + * ---- a formula based on the AGM (Arithmetic-Geometric Mean) ---- + * c = sqrt(0.125); + * a = 1 + 3 * c; + * b = sqrt(a); + * e = b - 0.625; + * b = 2 * b; + * c = e - c; + * a = a + e; + * npow = 4; + * do { + * npow = 2 * npow; + * e = (a + b) / 2; + * b = sqrt(a * b); + * e = e - b; + * b = 2 * b; + * c = c - e; + * a = e + b; + * } while (e > SQRT_SQRT_EPSILON); + * e = e * e / 4; + * a = a + b; + * pi = (a * a - e - e / 2) / (a * c - e) / npow; + * ---- modification ---- + * This is a modified version of Gauss-Legendre formula + * (by T.Ooura). It is faster than original version. + * ---- reference ---- + * 1. E.Salamin, + * Computation of PI Using Arithmetic-Geometric Mean, + * Mathematics of Computation, Vol.30 1976. + * 2. R.P.Brent, + * Fast Multiple-Precision Evaluation of Elementary Functions, + * J. ACM 23 1976. + * 3. D.Takahasi, Y.Kanada, + * Calculation of PI to 51.5 Billion Decimal Digits on + * Distributed Memoriy Parallel Processors, + * Transactions of Information Processing Society of Japan, + * Vol.39 No.7 1998. + * 4. T.Ooura, + * Improvement of the PI Calculation Algorithm and + * Implementation of Fast Multiple-Precision Computation, + * Information Processing Society of Japan SIG Notes, + * 98-HPC-74, 1998. 
+ */ + /* ---- c = sqrt(0.125) ---- */ + mp_sscanf(n, log10_radix, "0.125", a); + mp_sqrt(n, radix, a, c, i1, i2, nfft, d1, d2, ip, w); + /* ---- a = 1 + 3 * c ---- */ + mp_imul(n, radix, c, 3, e); + mp_sscanf(n, log10_radix, "1", a); + mp_add(n, radix, a, e, a); + /* ---- b = sqrt(a) ---- */ + mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2, ip, w); + /* ---- e = b - 0.625 ---- */ + mp_sscanf(n, log10_radix, "0.625", e); + mp_sub(n, radix, b, e, e); + /* ---- b = 2 * b ---- */ + mp_add(n, radix, b, b, b); + /* ---- c = e - c ---- */ + mp_sub(n, radix, e, c, c); + /* ---- a = a + e ---- */ + mp_add(n, radix, a, e, a); + printf("AGM iteration\n"); + fprintf(f_log, "AGM iteration\n"); + npow = 4; + do { + npow *= 2; + /* ---- e = (a + b) / 2 ---- */ + mp_add(n, radix, a, b, e); + mp_idiv_2(n, radix, e, e); + /* ---- b = sqrt(a * b) ---- */ + mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w); + mp_sqrt(n, radix, a, b, i1, i2, nfft, d1, d2, ip, w); + /* ---- e = e - b ---- */ + mp_sub(n, radix, e, b, e); + /* ---- b = 2 * b ---- */ + mp_add(n, radix, b, b, b); + /* ---- c = c - e ---- */ + mp_sub(n, radix, c, e, c); + /* ---- a = e + b ---- */ + mp_add(n, radix, e, b, a); + /* ---- convergence check ---- */ + nprc = -e[1]; + if (e[0] == 0) { + nprc = n; + } + printf("precision= %d\n", 4 * nprc * log10_radix); + fprintf(f_log, "precision= %d\n", 4 * nprc * log10_radix); + } while (4 * nprc <= n); + /* ---- e = e * e / 4 (half precision) ---- */ + mp_idiv_2(n, radix, e, e); + mp_squh(n, radix, e, e, nfft, d1, ip, w); + /* ---- a = a + b ---- */ + mp_add(n, radix, a, b, a); + /* ---- a = (a * a - e - e / 2) / (a * c - e) / npow ---- */ + mp_mul(n, radix, a, c, c, i1, nfft, d1, d2, d3, ip, w); + mp_sub(n, radix, c, e, c); + mp_inv(n, radix, c, b, i1, i2, nfft, d1, d2, ip, w); + mp_squ(n, radix, a, a, i1, nfft, d1, d2, ip, w); + mp_sub(n, radix, a, e, a); + mp_idiv_2(n, radix, e, e); + mp_sub(n, radix, a, e, a); + mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w); + 
mp_idiv(n, radix, a, npow, a); + /* ---- time check ---- */ + time(&t_2); + /* ---- output ---- */ + f_out = fopen("pi.dat", "w"); + printf("writing pi.dat...\n"); + mp_fprintf(n - 1, log10_radix, a, f_out); + fclose(f_out); + free(d3); + free(d2); + free(d1); + free(i2); + free(i1); + free(e); + free(c); + free(b); + free(a); + free(w); + free(ip); + /* ---- benchmark ---- */ + n_op = 50.0 * nfft * log2_nfft * log2_nfft; + printf("floating point operation: %g op.\n", n_op); + fprintf(f_log, "floating point operation: %g op.\n", n_op); + /* ---- difftime ---- */ + d_time = difftime(t_2, t_1); + printf("execution time: %g sec. (real time)\n", d_time); + fprintf(f_log, "execution time: %g sec. (real time)\n", d_time); + fclose(f_log); + return 0; +} + + +/* -------- multiple precision routines -------- */ + + +#include <math.h> +#include <float.h> +#include <stdio.h> + +/* ---- floating point format ---- + data := data[0] * pow(radix, data[1]) * + (data[2] + data[3]/radix + data[4]/radix/radix + ...), + data[0] : sign (1;data>0, -1;data<0, 0;data==0) + data[1] : exponent (0;data==0) + data[2...n+1] : digits + ---- function prototypes ---- + void mp_load_0(int n, int radix, int out[]); + void mp_load_1(int n, int radix, int out[]); + void mp_copy(int n, int radix, int in[], int out[]); + void mp_round(int n, int radix, int m, int inout[]); + int mp_cmp(int n, int radix, int in1[], int in2[]); + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_sub(int n, int radix, int in1[], int in2[], int out[]); + void mp_imul(int n, int radix, int in1[], int in2, int out[]); + int mp_idiv(int n, int radix, int in1[], int in2, int out[]); + void mp_idiv_2(int n, int radix, int in[], int out[]); + double mp_mul_radix_test(int n, int radix, int nfft, + double tmpfft[], int ip[], double w[]); + void mp_mul(int n, int radix, int in1[], int in2[], int out[], + int tmp[], int nfft, double tmp1fft[], double tmp2fft[], + double tmp3fft[], int ip[], double w[]); + 
void mp_squ(int n, int radix, int in[], int out[], int tmp[], + int nfft, double tmp1fft[], double tmp2fft[], + int ip[], double w[]); + void mp_mulh(int n, int radix, int in1[], int in2[], int out[], + int nfft, double in1fft[], double outfft[], + int ip[], double w[]); + void mp_squh(int n, int radix, int in[], int out[], + int nfft, double inoutfft[], int ip[], double w[]); + int mp_inv(int n, int radix, int in[], int out[], + int tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]); + int mp_sqrt(int n, int radix, int in[], int out[], + int tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]); + void mp_sprintf(int n, int log10_radix, int in[], char out[]); + void mp_sscanf(int n, int log10_radix, char in[], int out[]); + void mp_fprintf(int n, int log10_radix, int in[], FILE *fout); + ---- +*/ + + +/* -------- mp_load routines -------- */ + + +void mp_load_0(int n, int radix, int out[]) +{ + int j; + + for (j = 0; j <= n + 1; j++) { + out[j] = 0; + } +} + + +void mp_load_1(int n, int radix, int out[]) +{ + int j; + + out[0] = 1; + out[1] = 0; + out[2] = 1; + for (j = 3; j <= n + 1; j++) { + out[j] = 0; + } +} + + +void mp_copy(int n, int radix, int in[], int out[]) +{ + int j; + + for (j = 0; j <= n + 1; j++) { + out[j] = in[j]; + } +} + + +void mp_round(int n, int radix, int m, int inout[]) +{ + int j, x; + + if (m < n) { + for (j = n + 1; j > m + 2; j--) { + inout[j] = 0; + } + x = 2 * inout[m + 2]; + inout[m + 2] = 0; + if (x >= radix) { + for (j = m + 1; j >= 2; j--) { + x = inout[j] + 1; + if (x < radix) { + inout[j] = x; + break; + } + inout[j] = 0; + } + if (x >= radix) { + inout[2] = 1; + inout[1]++; + } + } + } +} + + +/* -------- mp_add routines -------- */ + + +int mp_cmp(int n, int radix, int in1[], int in2[]) +{ + int mp_unsgn_cmp(int n, int in1[], int in2[]); + + if (in1[0] > in2[0]) { + return 1; + } else if (in1[0] < in2[0]) { + return -1; + } + return in1[0] * 
mp_unsgn_cmp(n, &in1[1], &in2[1]); +} + + +void mp_add(int n, int radix, int in1[], int in2[], int out[]) +{ + int mp_unsgn_cmp(int n, int in1[], int in2[]); + int mp_unexp_add(int n, int radix, int expdif, + int in1[], int in2[], int out[]); + int mp_unexp_sub(int n, int radix, int expdif, + int in1[], int in2[], int out[]); + int outsgn, outexp, expdif; + + expdif = in1[1] - in2[1]; + outexp = in1[1]; + if (expdif < 0) { + outexp = in2[1]; + } + outsgn = in1[0] * in2[0]; + if (outsgn >= 0) { + if (outsgn > 0) { + outsgn = in1[0]; + } else { + outsgn = in1[0] + in2[0]; + outexp = in1[1] + in2[1]; + expdif = 0; + } + if (expdif >= 0) { + outexp += mp_unexp_add(n, radix, expdif, + &in1[2], &in2[2], &out[2]); + } else { + outexp += mp_unexp_add(n, radix, -expdif, + &in2[2], &in1[2], &out[2]); + } + } else { + outsgn = mp_unsgn_cmp(n, &in1[1], &in2[1]); + if (outsgn >= 0) { + expdif = mp_unexp_sub(n, radix, expdif, + &in1[2], &in2[2], &out[2]); + } else { + expdif = mp_unexp_sub(n, radix, -expdif, + &in2[2], &in1[2], &out[2]); + } + outexp -= expdif; + outsgn *= in1[0]; + if (expdif == n) { + outsgn = 0; + } + } + if (outsgn == 0) { + outexp = 0; + } + out[0] = outsgn; + out[1] = outexp; +} + + +void mp_sub(int n, int radix, int in1[], int in2[], int out[]) +{ + int mp_unsgn_cmp(int n, int in1[], int in2[]); + int mp_unexp_add(int n, int radix, int expdif, + int in1[], int in2[], int out[]); + int mp_unexp_sub(int n, int radix, int expdif, + int in1[], int in2[], int out[]); + int outsgn, outexp, expdif; + + expdif = in1[1] - in2[1]; + outexp = in1[1]; + if (expdif < 0) { + outexp = in2[1]; + } + outsgn = in1[0] * in2[0]; + if (outsgn <= 0) { + if (outsgn < 0) { + outsgn = in1[0]; + } else { + outsgn = in1[0] - in2[0]; + outexp = in1[1] + in2[1]; + expdif = 0; + } + if (expdif >= 0) { + outexp += mp_unexp_add(n, radix, expdif, + &in1[2], &in2[2], &out[2]); + } else { + outexp += mp_unexp_add(n, radix, -expdif, + &in2[2], &in1[2], &out[2]); + } + } else { + outsgn = 
mp_unsgn_cmp(n, &in1[1], &in2[1]); + if (outsgn >= 0) { + expdif = mp_unexp_sub(n, radix, expdif, + &in1[2], &in2[2], &out[2]); + } else { + expdif = mp_unexp_sub(n, radix, -expdif, + &in2[2], &in1[2], &out[2]); + } + outexp -= expdif; + outsgn *= in1[0]; + if (expdif == n) { + outsgn = 0; + } + } + if (outsgn == 0) { + outexp = 0; + } + out[0] = outsgn; + out[1] = outexp; +} + + +/* -------- mp_add child routines -------- */ + + +int mp_unsgn_cmp(int n, int in1[], int in2[]) +{ + int j, cmp; + + cmp = 0; + for (j = 0; j <= n && cmp == 0; j++) { + cmp = in1[j] - in2[j]; + } + if (cmp > 0) { + cmp = 1; + } else if (cmp < 0) { + cmp = -1; + } + return cmp; +} + + +int mp_unexp_add(int n, int radix, int expdif, + int in1[], int in2[], int out[]) +{ + int j, x, carry; + + carry = 0; + if (expdif == 0 && in1[0] + in2[0] >= radix) { + x = in1[n - 1] + in2[n - 1]; + carry = x >= radix ? -1 : 0; + for (j = n - 1; j > 0; j--) { + x = in1[j - 1] + in2[j - 1] - carry; + carry = x >= radix ? -1 : 0; + out[j] = x - (radix & carry); + } + out[0] = -carry; + } else { + if (expdif > n) { + expdif = n; + } + for (j = n - 1; j >= expdif; j--) { + x = in1[j] + in2[j - expdif] - carry; + carry = x >= radix ? -1 : 0; + out[j] = x - (radix & carry); + } + for (j = expdif - 1; j >= 0; j--) { + x = in1[j] - carry; + carry = x >= radix ? -1 : 0; + out[j] = x - (radix & carry); + } + if (carry != 0) { + for (j = n - 1; j > 0; j--) { + out[j] = out[j - 1]; + } + out[0] = -carry; + } + } + return -carry; +} + + +int mp_unexp_sub(int n, int radix, int expdif, + int in1[], int in2[], int out[]) +{ + int j, x, borrow, ncancel; + + if (expdif > n) { + expdif = n; + } + borrow = 0; + for (j = n - 1; j >= expdif; j--) { + x = in1[j] - in2[j - expdif] + borrow; + borrow = x < 0 ? -1 : 0; + out[j] = x + (radix & borrow); + } + for (j = expdif - 1; j >= 0; j--) { + x = in1[j] + borrow; + borrow = x < 0 ? 
-1 : 0; + out[j] = x + (radix & borrow); + } + ncancel = 0; + for (j = 0; j < n && out[j] == 0; j++) { + ncancel = j + 1; + } + if (ncancel > 0 && ncancel < n) { + for (j = 0; j < n - ncancel; j++) { + out[j] = out[j + ncancel]; + } + for (j = n - ncancel; j < n; j++) { + out[j] = 0; + } + } + return ncancel; +} + + +/* -------- mp_imul routines -------- */ + + +void mp_imul(int n, int radix, int in1[], int in2, int out[]) +{ + void mp_unsgn_imul(int n, double dradix, int in1[], double din2, + int out[]); + + if (in2 > 0) { + out[0] = in1[0]; + } else if (in2 < 0) { + out[0] = -in1[0]; + in2 = -in2; + } else { + out[0] = 0; + } + mp_unsgn_imul(n, radix, &in1[1], in2, &out[1]); + if (out[0] == 0) { + out[1] = 0; + } +} + + +int mp_idiv(int n, int radix, int in1[], int in2, int out[]) +{ + void mp_load_0(int n, int radix, int out[]); + void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, + int out[]); + + if (in2 == 0) { + return -1; + } + if (in2 > 0) { + out[0] = in1[0]; + } else { + out[0] = -in1[0]; + in2 = -in2; + } + if (in1[0] == 0) { + mp_load_0(n, radix, out); + return 0; + } + mp_unsgn_idiv(n, radix, &in1[1], in2, &out[1]); + return 0; +} + + +void mp_idiv_2(int n, int radix, int in[], int out[]) +{ + int j, ix, carry, shift; + + out[0] = in[0]; + shift = 0; + if (in[2] == 1) { + shift = 1; + } + out[1] = in[1] - shift; + carry = -shift; + for (j = 2; j <= n + 1 - shift; j++) { + ix = in[j + shift] + (radix & carry); + carry = -(ix & 1); + out[j] = ix >> 1; + } + if (shift > 0) { + out[n + 1] = (radix & carry) >> 1; + } +} + + +/* -------- mp_imul child routines -------- */ + + +void mp_unsgn_imul(int n, double dradix, int in1[], double din2, + int out[]) +{ + int j, carry, shift; + double x, d1_radix; + + d1_radix = 1.0 / dradix; + carry = 0; + for (j = n; j >= 1; j--) { + x = din2 * in1[j] + carry + 0.5; + carry = (int) (d1_radix * x); + out[j] = (int) (x - dradix * carry); + } + shift = 0; + x = carry + 0.5; + while (x > 1) { + x *= d1_radix; 
+ shift++; + } + out[0] = in1[0] + shift; + if (shift > 0) { + while (shift > n) { + carry = (int) (d1_radix * carry + 0.5); + shift--; + } + for (j = n; j >= shift + 1; j--) { + out[j] = out[j - shift]; + } + for (j = shift; j >= 1; j--) { + x = carry + 0.5; + carry = (int) (d1_radix * x); + out[j] = (int) (x - dradix * carry); + } + } +} + + +void mp_unsgn_idiv(int n, double dradix, int in1[], double din2, + int out[]) +{ + int j, ix, carry, shift; + double x, d1_in2; + + d1_in2 = 1.0 / din2; + shift = 0; + x = 0; + do { + shift++; + x *= dradix; + if (shift <= n) { + x += in1[shift]; + } + } while (x < din2 - 0.5); + x += 0.5; + ix = (int) (d1_in2 * x); + carry = (int) (x - din2 * ix); + out[1] = ix; + shift--; + out[0] = in1[0] - shift; + if (shift >= n) { + shift = n - 1; + } + for (j = 2; j <= n - shift; j++) { + x = in1[j + shift] + dradix * carry + 0.5; + ix = (int) (d1_in2 * x); + carry = (int) (x - din2 * ix); + out[j] = ix; + } + for (j = n - shift + 1; j <= n; j++) { + x = dradix * carry + 0.5; + ix = (int) (d1_in2 * x); + carry = (int) (x - din2 * ix); + out[j] = ix; + } +} + + +/* -------- mp_mul routines -------- */ + + +double mp_mul_radix_test(int n, int radix, int nfft, + double tmpfft[], int ip[], double w[]) +{ + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_csqu(int nfft, double dinout[]); + double mp_mul_d2i_test(int radix, int nfft, double din[]); + int j, ndata, radix_2; + + ndata = (nfft >> 1) + 1; + if (ndata > n) { + ndata = n; + } + tmpfft[nfft + 1] = radix - 1; + for (j = nfft; j > ndata; j--) { + tmpfft[j] = 0; + } + radix_2 = (radix + 1) / 2; + for (j = ndata; j > 2; j--) { + tmpfft[j] = radix_2; + } + tmpfft[2] = radix; + tmpfft[1] = radix - 1; + tmpfft[0] = 0; + rdft(nfft, 1, &tmpfft[1], ip, w); + mp_mul_csqu(nfft, tmpfft); + rdft(nfft, -1, &tmpfft[1], ip, w); + return 2 * mp_mul_d2i_test(radix, nfft, tmpfft); +} + + +void mp_mul(int n, int radix, int in1[], int in2[], int out[], + int tmp[], int nfft, 
double tmp1fft[], double tmp2fft[], + double tmp3fft[], int ip[], double w[]) +{ + void mp_copy(int n, int radix, int in[], int out[]); + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_i2d(int n, int radix, int nfft, int shift, + int in[], double dout[]); + void mp_mul_cmul(int nfft, double din[], double dinout[]); + void mp_mul_cmuladd(int nfft, double din1[], double din2[], + double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + int n_h, shift; + + shift = (nfft >> 1) + 1; + while (n > shift) { + if (in1[shift + 2] + in2[shift + 2] != 0) { + break; + } + shift++; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp3fft = (upper) in1 * (lower) in2 ---- */ + mp_mul_i2d(n, radix, nfft, 0, in1, tmp1fft); + rdft(nfft, 1, &tmp1fft[1], ip, w); + mp_mul_i2d(n, radix, nfft, shift, in2, tmp3fft); + rdft(nfft, 1, &tmp3fft[1], ip, w); + mp_mul_cmul(nfft, tmp1fft, tmp3fft); + /* ---- tmp = (upper) in1 * (upper) in2 ---- */ + mp_mul_i2d(n, radix, nfft, 0, in2, tmp2fft); + rdft(nfft, 1, &tmp2fft[1], ip, w); + mp_mul_cmul(nfft, tmp2fft, tmp1fft); + rdft(nfft, -1, &tmp1fft[1], ip, w); + mp_mul_d2i(n, radix, nfft, tmp1fft, tmp); + /* ---- tmp3fft += (upper) in2 * (lower) in1 ---- */ + mp_mul_i2d(n, radix, nfft, shift, in1, tmp1fft); + rdft(nfft, 1, &tmp1fft[1], ip, w); + mp_mul_cmuladd(nfft, tmp1fft, tmp2fft, tmp3fft); + /* ---- out = tmp + tmp3fft ---- */ + rdft(nfft, -1, &tmp3fft[1], ip, w); + mp_mul_d2i(n_h, radix, nfft, tmp3fft, out); + if (out[0] != 0) { + mp_add(n, radix, out, tmp, out); + } else { + mp_copy(n, radix, tmp, out); + } +} + + +void mp_squ(int n, int radix, int in[], int out[], int tmp[], + int nfft, double tmp1fft[], double tmp2fft[], + int ip[], double w[]) +{ + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void rdft(int n, int isgn, double *a, int *ip, double *w); + void 
mp_mul_i2d(int n, int radix, int nfft, int shift, + int in[], double dout[]); + void mp_mul_cmul(int nfft, double din[], double dinout[]); + void mp_mul_csqu(int nfft, double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + int n_h, shift; + + shift = (nfft >> 1) + 1; + while (n > shift) { + if (in[shift + 2] != 0) { + break; + } + shift++; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp = (upper) in * (lower) in ---- */ + mp_mul_i2d(n, radix, nfft, 0, in, tmp1fft); + rdft(nfft, 1, &tmp1fft[1], ip, w); + mp_mul_i2d(n, radix, nfft, shift, in, tmp2fft); + rdft(nfft, 1, &tmp2fft[1], ip, w); + mp_mul_cmul(nfft, tmp1fft, tmp2fft); + rdft(nfft, -1, &tmp2fft[1], ip, w); + mp_mul_d2i(n_h, radix, nfft, tmp2fft, tmp); + /* ---- out = 2 * tmp + ((upper) in)^2 ---- */ + mp_mul_csqu(nfft, tmp1fft); + rdft(nfft, -1, &tmp1fft[1], ip, w); + mp_mul_d2i(n, radix, nfft, tmp1fft, out); + if (tmp[0] != 0) { + mp_add(n_h, radix, tmp, tmp, tmp); + mp_add(n, radix, out, tmp, out); + } +} + + +void mp_mulh(int n, int radix, int in1[], int in2[], int out[], + int nfft, double in1fft[], double outfft[], int ip[], double w[]) +{ + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_i2d(int n, int radix, int nfft, int shift, + int in[], double dout[]); + void mp_mul_cmul(int nfft, double din[], double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + + mp_mul_i2d(n, radix, nfft, 0, in1, in1fft); + rdft(nfft, 1, &in1fft[1], ip, w); + mp_mul_i2d(n, radix, nfft, 0, in2, outfft); + rdft(nfft, 1, &outfft[1], ip, w); + mp_mul_cmul(nfft, in1fft, outfft); + rdft(nfft, -1, &outfft[1], ip, w); + mp_mul_d2i(n, radix, nfft, outfft, out); +} + + +void mp_mulh_use_in1fft(int n, int radix, double in1fft[], + int shift, int in2[], int out[], int nfft, double outfft[], + int ip[], double w[]) +{ + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_i2d(int n, int 
radix, int nfft, int shift, + int in[], double dout[]); + void mp_mul_cmul(int nfft, double din[], double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + int n_h; + + while (n > shift) { + if (in2[shift + 2] != 0) { + break; + } + shift++; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + mp_mul_i2d(n, radix, nfft, shift, in2, outfft); + rdft(nfft, 1, &outfft[1], ip, w); + mp_mul_cmul(nfft, in1fft, outfft); + rdft(nfft, -1, &outfft[1], ip, w); + mp_mul_d2i(n_h, radix, nfft, outfft, out); +} + + +void mp_squh(int n, int radix, int in[], int out[], + int nfft, double inoutfft[], int ip[], double w[]) +{ + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_i2d(int n, int radix, int nfft, int shift, + int in[], double dout[]); + void mp_mul_csqu(int nfft, double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + + mp_mul_i2d(n, radix, nfft, 0, in, inoutfft); + rdft(nfft, 1, &inoutfft[1], ip, w); + mp_mul_csqu(nfft, inoutfft); + rdft(nfft, -1, &inoutfft[1], ip, w); + mp_mul_d2i(n, radix, nfft, inoutfft, out); +} + + +void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], + int nfft, int ip[], double w[]) +{ + void rdft(int n, int isgn, double *a, int *ip, double *w); + void mp_mul_csqu(int nfft, double dinout[]); + void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]); + + mp_mul_csqu(nfft, inoutfft); + rdft(nfft, -1, &inoutfft[1], ip, w); + mp_mul_d2i(n, radix, nfft, inoutfft, out); +} + + +/* -------- mp_mul child routines -------- */ + + +void mp_mul_i2d(int n, int radix, int nfft, int shift, + int in[], double dout[]) +{ + int j, x, carry, ndata, radix_2, topdgt; + + ndata = 0; + topdgt = 0; + if (n > shift) { + topdgt = in[shift + 2]; + ndata = (nfft >> 1) + 1; + if (ndata > n - shift) { + ndata = n - shift; + } + } + dout[nfft + 1] = in[0] * topdgt; + for (j = nfft; j > ndata; j--) { + dout[j] = 0; + } + /* ---- 
abs(dout[j]) <= radix/2 (to keep FFT precision) ---- */ + if (ndata > 1) { + radix_2 = radix / 2; + carry = 0; + for (j = ndata + 1; j > 3; j--) { + x = in[j + shift] - carry; + carry = x >= radix_2 ? -1 : 0; + dout[j - 1] = x - (radix & carry); + } + dout[2] = in[shift + 3] - carry; + } + dout[1] = topdgt; + dout[0] = in[1] - shift; +} + + +void mp_mul_cmul(int nfft, double din[], double dinout[]) +{ + int j; + double xr, xi, yr, yi; + + dinout[0] += din[0]; + dinout[1] *= din[1]; + dinout[2] *= din[2]; + for (j = 3; j < nfft; j += 2) { + xr = din[j]; + xi = din[j + 1]; + yr = dinout[j]; + yi = dinout[j + 1]; + dinout[j] = xr * yr - xi * yi; + dinout[j + 1] = xr * yi + xi * yr; + } + dinout[nfft + 1] *= din[nfft + 1]; +} + + +void mp_mul_cmuladd(int nfft, double din1[], double din2[], + double dinout[]) +{ + int j; + double xr, xi, yr, yi; + + dinout[1] += din1[1] * din2[1]; + dinout[2] += din1[2] * din2[2]; + for (j = 3; j < nfft; j += 2) { + xr = din1[j]; + xi = din1[j + 1]; + yr = din2[j]; + yi = din2[j + 1]; + dinout[j] += xr * yr - xi * yi; + dinout[j + 1] += xr * yi + xi * yr; + } + dinout[nfft + 1] += din1[nfft + 1] * din2[nfft + 1]; +} + + +void mp_mul_csqu(int nfft, double dinout[]) +{ + int j; + double xr, xi; + + dinout[0] *= 2; + dinout[1] *= dinout[1]; + dinout[2] *= dinout[2]; + for (j = 3; j < nfft; j += 2) { + xr = dinout[j]; + xi = dinout[j + 1]; + dinout[j] = xr * xr - xi * xi; + dinout[j + 1] = 2 * xr * xi; + } + dinout[nfft + 1] *= dinout[nfft + 1]; +} + + +void mp_mul_d2i(int n, int radix, int nfft, double din[], int out[]) +{ + int j, carry, carry1, carry2, shift, ndata; + double x, scale, d1_radix, d1_radix2, pow_radix, topdgt; + + scale = 2.0 / nfft; + d1_radix = 1.0 / radix; + d1_radix2 = d1_radix * d1_radix; + topdgt = din[nfft + 1]; + x = topdgt < 0 ? -topdgt : topdgt; + shift = x + 0.5 >= radix ? 
1 : 0; + /* ---- correction of cyclic convolution of din[1] ---- */ + x *= nfft * 0.5; + din[nfft + 1] = din[1] - x; + din[1] = x; + /* ---- output of digits ---- */ + ndata = n; + if (n > nfft + 1 + shift) { + ndata = nfft + 1 + shift; + for (j = n + 1; j > ndata + 1; j--) { + out[j] = 0; + } + } + x = 0; + pow_radix = 1; + for (j = ndata + 1 - shift; j <= nfft + 1; j++) { + x += pow_radix * din[j]; + pow_radix *= d1_radix; + if (pow_radix < DBL_EPSILON) { + break; + } + } + x = d1_radix2 * (scale * x + 0.5); + carry2 = ((int) x) - 1; + carry = (int) (radix * (x - carry2) + 0.5); + for (j = ndata; j > 1; j--) { + x = d1_radix2 * (scale * din[j - shift] + carry + 0.5); + carry = carry2; + carry2 = ((int) x) - 1; + x = radix * (x - carry2); + carry1 = (int) x; + out[j + 1] = (int) (radix * (x - carry1)); + carry += carry1; + } + x = carry + ((double) radix) * carry2 + 0.5; + if (shift == 0) { + x += scale * din[1]; + } + carry = (int) (d1_radix * x); + out[2] = (int) (x - ((double) radix) * carry); + if (carry > 0) { + for (j = n + 1; j > 2; j--) { + out[j] = out[j - 1]; + } + out[2] = carry; + shift++; + } + /* ---- output of exp, sgn ---- */ + x = din[0] + shift + 0.5; + shift = ((int) x) - 1; + out[1] = shift + ((int) (x - shift)); + out[0] = topdgt > 0.5 ? 
1 : -1; + if (out[2] == 0) { + out[0] = 0; + out[1] = 0; + } +} + + +double mp_mul_d2i_test(int radix, int nfft, double din[]) +{ + int j, carry, carry1, carry2; + double x, scale, d1_radix, d1_radix2, err; + + scale = 2.0 / nfft; + d1_radix = 1.0 / radix; + d1_radix2 = d1_radix * d1_radix; + /* ---- correction of cyclic convolution of din[1] ---- */ + x = din[nfft + 1] * nfft * 0.5; + if (x < 0) { + x = -x; + } + din[nfft + 1] = din[1] - x; + /* ---- check of digits ---- */ + err = 0; + carry = 0; + carry2 = 0; + for (j = nfft + 1; j > 1; j--) { + x = d1_radix2 * (scale * din[j] + carry + 0.5); + carry = carry2; + carry2 = ((int) x) - 1; + x = radix * (x - carry2); + carry1 = (int) x; + x = radix * (x - carry1); + carry += carry1; + x = x - 0.5 - ((int) x); + if (x > err) { + err = x; + } else if (-x > err) { + err = -x; + } + } + return err; +} + + +/* -------- mp_inv routines -------- */ + + +int mp_inv(int n, int radix, int in[], int out[], + int tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]) +{ + int mp_get_nfft_init(int radix, int nfft_max); + void mp_inv_init(int n, int radix, int in[], int out[]); + int mp_inv_newton(int n, int radix, int in[], int inout[], + int tmp1[], int tmp2[], int nfft, double tmp1fft[], + double tmp2fft[], int ip[], double w[]); + int n_nwt, nfft_nwt, thr, prc; + + if (in[0] == 0) { + return -1; + } + nfft_nwt = mp_get_nfft_init(radix, nfft); + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + mp_inv_init(n_nwt, radix, in, out); + thr = 8; + do { + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + prc = mp_inv_newton(n_nwt, radix, in, out, + tmp1, tmp2, nfft_nwt, tmp1fft, tmp2fft, ip, w); + if (thr * nfft_nwt >= nfft) { + thr = 0; + if (2 * prc <= n_nwt - 2) { + nfft_nwt >>= 1; + } + } else { + if (3 * prc < n_nwt - 2) { + nfft_nwt >>= 1; + } + } + nfft_nwt <<= 1; + } while (nfft_nwt <= nfft); + return 0; +} + + +int mp_sqrt(int n, int radix, int in[], int out[], + int 
tmp1[], int tmp2[], int nfft, + double tmp1fft[], double tmp2fft[], int ip[], double w[]) +{ + void mp_load_0(int n, int radix, int out[]); + int mp_get_nfft_init(int radix, int nfft_max); + void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[]); + int mp_sqrt_newton(int n, int radix, int in[], int inout[], + int inout_rev[], int tmp[], int nfft, double tmp1fft[], + double tmp2fft[], int ip[], double w[], int *n_tmp1fft); + int n_nwt, nfft_nwt, thr, prc, n_tmp1fft; + + if (in[0] < 0) { + return -1; + } else if (in[0] == 0) { + mp_load_0(n, radix, out); + return 0; + } + nfft_nwt = mp_get_nfft_init(radix, nfft); + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + mp_sqrt_init(n_nwt, radix, in, out, tmp1); + n_tmp1fft = 0; + thr = 8; + do { + n_nwt = nfft_nwt + 2; + if (n_nwt > n) { + n_nwt = n; + } + prc = mp_sqrt_newton(n_nwt, radix, in, out, + tmp1, tmp2, nfft_nwt, tmp1fft, tmp2fft, + ip, w, &n_tmp1fft); + if (thr * nfft_nwt >= nfft) { + thr = 0; + if (2 * prc <= n_nwt - 2) { + nfft_nwt >>= 1; + } + } else { + if (3 * prc < n_nwt - 2) { + nfft_nwt >>= 1; + } + } + nfft_nwt <<= 1; + } while (nfft_nwt <= nfft); + return 0; +} + + +/* -------- mp_inv child routines -------- */ + + +int mp_get_nfft_init(int radix, int nfft_max) +{ + int nfft_init; + double r; + + r = radix; + nfft_init = 1; + do { + r *= r; + nfft_init <<= 1; + } while (DBL_EPSILON * r < 1 && nfft_init < nfft_max); + return nfft_init; +} + + +void mp_inv_init(int n, int radix, int in[], int out[]) +{ + void mp_unexp_d2mp(int n, int radix, double din, int out[]); + double mp_unexp_mp2d(int n, int radix, int in[]); + int outexp; + double din; + + out[0] = in[0]; + outexp = -in[1]; + din = 1.0 / mp_unexp_mp2d(n, radix, &in[2]); + while (din < 1) { + din *= radix; + outexp--; + } + out[1] = outexp; + mp_unexp_d2mp(n, radix, din, &out[2]); +} + + +void mp_sqrt_init(int n, int radix, int in[], int out[], int out_rev[]) +{ + void mp_unexp_d2mp(int n, int radix, double din, int 
out[]); + double mp_unexp_mp2d(int n, int radix, int in[]); + int outexp; + double din; + + out[0] = 1; + out_rev[0] = 1; + outexp = in[1]; + din = mp_unexp_mp2d(n, radix, &in[2]); + if (outexp % 2 != 0) { + din *= radix; + outexp--; + } + outexp /= 2; + din = sqrt(din); + if (din < 1) { + din *= radix; + outexp--; + } + out[1] = outexp; + mp_unexp_d2mp(n, radix, din, &out[2]); + outexp = -outexp; + din = 1.0 / din; + while (din < 1) { + din *= radix; + outexp--; + } + out_rev[1] = outexp; + mp_unexp_d2mp(n, radix, din, &out_rev[2]); +} + + +void mp_unexp_d2mp(int n, int radix, double din, int out[]) +{ + int j, x; + + for (j = 0; j < n; j++) { + x = (int) din; + if (x >= radix) { + x = radix - 1; + din = radix; + } + din = radix * (din - x); + out[j] = x; + } +} + + +double mp_unexp_mp2d(int n, int radix, int in[]) +{ + int j; + double d1_radix, dout; + + d1_radix = 1.0 / radix; + dout = 0; + for (j = n - 1; j >= 0; j--) { + dout = d1_radix * dout + in[j]; + } + return dout; +} + + +int mp_inv_newton(int n, int radix, int in[], int inout[], + int tmp1[], int tmp2[], int nfft, double tmp1fft[], + double tmp2fft[], int ip[], double w[]) +{ + void mp_load_1(int n, int radix, int out[]); + void mp_round(int n, int radix, int m, int inout[]); + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_sub(int n, int radix, int in1[], int in2[], int out[]); + void mp_mulh(int n, int radix, int in1[], int in2[], int out[], + int nfft, double in1fft[], double outfft[], + int ip[], double w[]); + void mp_mulh_use_in1fft(int n, int radix, double in1fft[], + int shift, int in2[], int out[], int nfft, double outfft[], + int ip[], double w[]); + int n_h, shift, prc; + + shift = (nfft >> 1) + 1; + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp1 = inout * (upper) in (half to normal precision) ---- */ + mp_round(n, radix, shift, inout); + mp_mulh(n, radix, inout, in, tmp1, + nfft, tmp1fft, tmp2fft, ip, w); + /* ---- tmp2 = 1 - tmp1 
---- */ + mp_load_1(n, radix, tmp2); + mp_sub(n, radix, tmp2, tmp1, tmp2); + /* ---- tmp2 -= inout * (lower) in (half precision) ---- */ + mp_mulh_use_in1fft(n, radix, tmp1fft, shift, in, tmp1, + nfft, tmp2fft, ip, w); + mp_sub(n_h, radix, tmp2, tmp1, tmp2); + /* ---- get precision ---- */ + prc = -tmp2[1]; + if (tmp2[0] == 0) { + prc = nfft + 1; + } + /* ---- tmp2 *= inout (half precision) ---- */ + mp_mulh_use_in1fft(n_h, radix, tmp1fft, 0, tmp2, tmp2, + nfft, tmp2fft, ip, w); + /* ---- inout += tmp2 ---- */ + if (tmp2[0] != 0) { + mp_add(n, radix, inout, tmp2, inout); + } + return prc; +} + + +int mp_sqrt_newton(int n, int radix, int in[], int inout[], + int inout_rev[], int tmp[], int nfft, double tmp1fft[], + double tmp2fft[], int ip[], double w[], int *n_tmp1fft) +{ + void mp_round(int n, int radix, int m, int inout[]); + void mp_add(int n, int radix, int in1[], int in2[], int out[]); + void mp_sub(int n, int radix, int in1[], int in2[], int out[]); + void mp_idiv_2(int n, int radix, int in[], int out[]); + void mp_mulh(int n, int radix, int in1[], int in2[], int out[], + int nfft, double in1fft[], double outfft[], + int ip[], double w[]); + void mp_squh(int n, int radix, int in[], int out[], + int nfft, double inoutfft[], int ip[], double w[]); + void mp_squh_use_in1fft(int n, int radix, double inoutfft[], int out[], + int nfft, int ip[], double w[]); + int n_h, nfft_h, shift, prc; + + nfft_h = nfft >> 1; + shift = nfft_h + 1; + if (nfft_h < 2) { + nfft_h = 2; + } + n_h = n / 2 + 1; + if (n_h < n - shift) { + n_h = n - shift; + } + /* ---- tmp = inout_rev^2 (1/4 to half precision) ---- */ + mp_round(n_h, radix, (nfft_h >> 1) + 1, inout_rev); + if (*n_tmp1fft != nfft_h) { + mp_squh(n_h, radix, inout_rev, tmp, + nfft_h, tmp1fft, ip, w); + } else { + mp_squh_use_in1fft(n_h, radix, tmp1fft, tmp, + nfft_h, ip, w); + } + /* ---- tmp = inout_rev - inout * tmp (half precision) ---- */ + mp_round(n, radix, shift, inout); + mp_mulh(n_h, radix, inout, tmp, tmp, + nfft, 
tmp1fft, tmp2fft, ip, w); + mp_sub(n_h, radix, inout_rev, tmp, tmp); + /* ---- inout_rev += tmp ---- */ + mp_add(n_h, radix, inout_rev, tmp, inout_rev); + /* ---- tmp = in - inout^2 (half to normal precision) ---- */ + mp_squh_use_in1fft(n, radix, tmp1fft, tmp, + nfft, ip, w); + mp_sub(n, radix, in, tmp, tmp); + /* ---- get precision ---- */ + prc = in[1] - tmp[1]; + if (in[2] > tmp[2]) { + prc++; + } + if (tmp[0] == 0) { + prc = nfft + 1; + } + /* ---- tmp = tmp * inout_rev / 2 (half precision) ---- */ + mp_round(n_h, radix, shift, inout_rev); + mp_mulh(n_h, radix, inout_rev, tmp, tmp, + nfft, tmp1fft, tmp2fft, ip, w); + *n_tmp1fft = nfft; + mp_idiv_2(n_h, radix, tmp, tmp); + /* ---- inout += tmp ---- */ + if (tmp[0] != 0) { + mp_add(n, radix, inout, tmp, inout); + } + return prc; +} + + +/* -------- mp_io routines -------- */ + + +void mp_sprintf(int n, int log10_radix, int in[], char out[]) +{ + int j, k, x, y, outexp, shift; + + if (in[0] < 0) { + *out++ = '-'; + } + x = in[2]; + shift = log10_radix; + for (k = log10_radix; k > 0; k--) { + y = x % 10; + x /= 10; + out[k] = '0' + y; + if (y != 0) { + shift = k; + } + } + out[0] = out[shift]; + out[1] = '.'; + for (k = 1; k <= log10_radix - shift; k++) { + out[k + 1] = out[k + shift]; + } + outexp = log10_radix - shift; + out += outexp + 2; + for (j = 3; j <= n + 1; j++) { + x = in[j]; + for (k = log10_radix - 1; k >= 0; k--) { + y = x % 10; + x /= 10; + out[k] = '0' + y; + } + out += log10_radix; + } + *out++ = 'e'; + outexp += log10_radix * in[1]; + sprintf(out, "%d", outexp); +} + + +void mp_sscanf(int n, int log10_radix, char in[], int out[]) +{ + char *s; + int j, x, outexp, outexp_mod; + + while (*in == ' ') { + in++; + } + out[0] = 1; + if (*in == '-') { + out[0] = -1; + in++; + } else if (*in == '+') { + in++; + } + while (*in == ' ' || *in == '0') { + in++; + } + outexp = 0; + for (s = in; *s != '\0'; s++) { + if (*s == 'e' || *s == 'E' || *s == 'd' || *s == 'D') { + if (sscanf(++s, "%d", &outexp) != 1) 
{ + outexp = 0; + } + break; + } + } + if (*in == '.') { + do { + outexp--; + while (*++in == ' '); + } while (*in == '0' && *in != '\0'); + } else if (*in != '\0') { + s = in; + while (*++s == ' '); + while (*s >= '0' && *s <= '9' && *s != '\0') { + outexp++; + while (*++s == ' '); + } + } + x = outexp / log10_radix; + outexp_mod = outexp - log10_radix * x; + if (outexp_mod < 0) { + x--; + outexp_mod += log10_radix; + } + out[1] = x; + x = 0; + j = 2; + for (s = in; *s != '\0'; s++) { + if (*s == '.' || *s == ' ') { + continue; + } + if (*s < '0' || *s > '9') { + break; + } + x = 10 * x + (*s - '0'); + if (--outexp_mod < 0) { + if (j > n + 1) { + break; + } + out[j++] = x; + x = 0; + outexp_mod = log10_radix - 1; + } + } + while (outexp_mod-- >= 0) { + x *= 10; + } + while (j <= n + 1) { + out[j++] = x; + x = 0; + } + if (out[2] == 0) { + out[0] = 0; + out[1] = 0; + } +} + + +void mp_fprintf(int n, int log10_radix, int in[], FILE *fout) +{ + int j, k, x, y, outexp, shift; + char out[256]; + + if (in[0] < 0) { + putc('-', fout); + } + x = in[2]; + shift = log10_radix; + for (k = log10_radix; k > 0; k--) { + y = x % 10; + x /= 10; + out[k] = '0' + y; + if (y != 0) { + shift = k; + } + } + putc(out[shift], fout); + putc('.', fout); + for (k = 1; k <= log10_radix - shift; k++) { + putc(out[k + shift], fout); + } + outexp = log10_radix - shift; + for (j = 3; j <= n + 1; j++) { + x = in[j]; + for (k = log10_radix - 1; k >= 0; k--) { + y = x % 10; + x /= 10; + out[k] = '0' + y; + } + for (k = 0; k < log10_radix; k++) { + putc(out[k], fout); + } + } + putc('e', fout); + outexp += log10_radix * in[1]; + sprintf(out, "%d", outexp); + for (k = 0; out[k] != '\0'; k++) { + putc(out[k], fout); + } +} + + diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile b/plugins/supereq/nsfft-1.00/simd/Makefile new file mode 120000 index 00000000..fc484116 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile @@ -0,0 +1 @@ +Makefile.x86
\ No newline at end of file diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.altivec b/plugins/supereq/nsfft-1.00/simd/Makefile.altivec new file mode 100644 index 00000000..eeaed6a1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile.altivec @@ -0,0 +1,26 @@ +CC=gcc +BASEOPT=-Wall -maltivec -mabi=altivec +OPT=$(BASEOPT) -O3 + +all : libSIMD.a + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o + +SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o + +SIMDBaseUndiff_altivecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_ALTIVEC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_altivecfloat.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_ALTIVEC_FLOAT SIMDBase.c -c -o SIMDBase.o + +libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_altivecfloat.o + rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_altivecfloat.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.neon b/plugins/supereq/nsfft-1.00/simd/Makefile.neon new file mode 100644 index 00000000..ace704f1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile.neon @@ -0,0 +1,26 @@ +CC=gcc +BASEOPT=-Wall -mfloat-abi=softfp +OPT=$(BASEOPT) -O3 + +all : libSIMD.a + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) 
-DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o + +SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o + +SIMDBaseUndiff_neonfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -mfpu=neon -DENABLE_NEON_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_neonfloat.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_NEON_FLOAT SIMDBase.c -c -o SIMDBase.o + +libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_neonfloat.o + rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_neonfloat.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.purec b/plugins/supereq/nsfft-1.00/simd/Makefile.purec new file mode 100644 index 00000000..2c8b04f1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile.purec @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBaseUndiff_purecdouble.o : 
SIMDBaseUndiff.c DFT.h SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o + +SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE SIMDBase.c -c -o SIMDBase.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.x86 b/plugins/supereq/nsfft-1.00/simd/Makefile.x86 new file mode 100644 index 00000000..02f49610 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile.x86 @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall +OPT=$(BASEOPT) -O3 + +all : libSIMD.a + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBase_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_purecdouble.o + +SIMDBase_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBase_pureclongdouble.o + +SIMDBase_ssefloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT SIMDBaseUndiff.c -c -o SIMDBase_ssefloat.o + +SIMDBase_sse2double.o : SIMDBaseUndiff.c SIMDBase.h 
SIMDBaseUndiff.h + $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_sse2double.o + +SIMDBase_avxfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT SIMDBaseUndiff.c -c -o SIMDBase_avxfloat.o + +SIMDBase_avxdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_avxdouble.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE SIMDBase.c -c -o SIMDBase.o + +libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBase_purecdouble.o SIMDBase_pureclongdouble.o SIMDBase_ssefloat.o SIMDBase_sse2double.o + rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBase_purecdouble.o SIMDBase_pureclongdouble.o SIMDBase_ssefloat.o SIMDBase_sse2double.o + +clean : + rm -f *~ *.o *.s *.a a.out diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx b/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx new file mode 100644 index 00000000..d9d27a2e --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall +OPT=$(BASEOPT) -O3 + +all : libSIMD.a + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o + +SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o + +SIMDBaseUndiff_ssefloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_ssefloat.o + +SIMDBaseUndiff_sse2double.o : SIMDBaseUndiff.c 
SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_sse2double.o + +SIMDBaseUndiff_avxfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_avxfloat.o + +SIMDBaseUndiff_avxdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_avxdouble.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE -DENABLE_AVX_FLOAT -DENABLE_AVX_DOUBLE SIMDBase.c -c -o SIMDBase.o + +libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_ssefloat.o SIMDBaseUndiff_sse2double.o SIMDBaseUndiff_avxfloat.o SIMDBaseUndiff_avxdouble.o + rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_ssefloat.o SIMDBaseUndiff_sse2double.o SIMDBaseUndiff_avxfloat.o SIMDBaseUndiff_avxdouble.o + +clean : + rm -f *~ *.o *.s *.a a.out diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBase.c b/plugins/supereq/nsfft-1.00/simd/SIMDBase.c new file mode 100644 index 00000000..eb51ee10 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/SIMDBase.c @@ -0,0 +1,454 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <signal.h> +#include <setjmp.h> +#include <string.h> + +#include "SIMDBase.h" + +void detect_purec_float(void); +void detect_purec_double(void); +void detect_purec_longdouble(void); +void detect_sse_float(void); +void detect_sse2_double(void); +void detect_neon_float(void); +void detect_avx_float(void); +void detect_avx_double(void); +void detect_altivec_float(void); + +int32_t getModeParamInt_purec_float(int32_t paramId); +int32_t getModeParamInt_purec_double(int32_t 
paramId); +int32_t getModeParamInt_purec_longdouble(int32_t paramId); +int32_t getModeParamInt_sse_float(int32_t paramId); +int32_t getModeParamInt_sse2_double(int32_t paramId); +int32_t getModeParamInt_neon_float(int32_t paramId); +int32_t getModeParamInt_avx_float(int32_t paramId); +int32_t getModeParamInt_avx_double(int32_t paramId); +int32_t getModeParamInt_altivec_float(int32_t paramId); + +char * getModeParamString_purec_float(int32_t paramId); +char * getModeParamString_purec_double(int32_t paramId); +char * getModeParamString_purec_longdouble(int32_t paramId); +char * getModeParamString_sse_float(int32_t paramId); +char * getModeParamString_sse2_double(int32_t paramId); +char * getModeParamString_neon_float(int32_t paramId); +char * getModeParamString_avx_float(int32_t paramId); +char * getModeParamString_avx_double(int32_t paramId); +char * getModeParamString_altivec_float(int32_t paramId); + +uint8_t detectBuffer[256]; +char SIMDBase_processorNameString[256]; + +static char *startsWith(char *str1, char *str2) { + if (strncmp(str1, str2, strlen(str2)) == 0) { + return str1 + strlen(str2); + } + + return NULL; +} + +#if defined(__linux__) +static char *tryReadingProcCpuinfo(char *entry) { + int i; + + FILE *fp = fopen("/proc/cpuinfo", "r"); + if (fp == NULL) return NULL; + + for(i=0;i<100;i++) { + char *q; + bzero(SIMDBase_processorNameString, 256); + if (fgets(SIMDBase_processorNameString, 255, fp) == NULL) break; + + if ((q = startsWith(SIMDBase_processorNameString, entry)) != NULL) { + int j; + fclose(fp); + + for(j=0;j<256;j++) { + if (SIMDBase_processorNameString[j] == '\n') SIMDBase_processorNameString[j] = ' '; + } + while(*q != '\0' && *q != ':' && q - SIMDBase_processorNameString < 200) q++; + if (q - SIMDBase_processorNameString >= 200) return NULL; + if (*q == ':' && *(q+1) == ' ') return q + 2; + return NULL; + } + } + + fclose(fp); + return NULL; +} +#else +static char *tryReadingProcCpuinfo(char *entry) { return NULL; } +#endif + +#if 
defined(__i386__) +static void SIMDBase_x86cpuid(uint32_t out[4], uint32_t eax, uint32_t ecx) { + uint32_t a, b, c, d; + __asm__ __volatile__("pushl %%eax; \n\t" + "pushl %%ebx; \n\t" + "pushl %%ecx; \n\t" + "pushl %%edx; \n\t" + "cpuid; \n\t" + "movl %%eax, %0; \n\t" + "movl %%ebx, %1; \n\t" + "movl %%ecx, %2; \n\t" + "movl %%edx, %3; \n\t" + "popl %%edx; \n\t" + "popl %%ecx; \n\t" + "popl %%ebx; \n\t" + "popl %%eax; \n\t" + : "=m"(a), "=m"(b), "=m"(c), "=m"(d) + : "a"(eax), "c"(ecx) + : "cc"); + out[0] = a; out[1] = b; out[2] = c; out[3] = d; +} +#endif + +#if defined(__x86_64__) +static void SIMDBase_x86cpuid(uint32_t out[4], uint32_t eax, uint32_t ecx) { + uint32_t a, b, c, d; + __asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx)); + out[0] = a; out[1] = b; out[2] = c; out[3] = d; +} +#endif + +#if defined(__i386__) || defined(__x86_64__) +static void getCacheParam(CacheParam *p) { + static int l2assoc[] = {0,1,2,0,4,0,8,0,16,0,32,48,64,96,128,-1}; + int32_t i; + uint32_t out[4]; + + for(i=0;i<8;i++) { + p->size[i] = p->assoc[i] = 0; + } + + SIMDBase_x86cpuid(out, 4, 0); + + if ((out[0] & 0xf) != 0) { + p->linesize = ((out[1] >> 0) & 2047)+1; + for(i=0;i<8;i++) { + SIMDBase_x86cpuid(out, 4, i); + if ((out[0] & 0xf) == 0) break; + int level = (out[0] >> 5) & 0x7; + int type = (out[0] >> 0) & 0xf; + int assoc = ((out[1] >> 22) & 1023)+1; + int part = ((out[1] >> 12) & 1023)+1; + int lsize = ((out[1] >> 0) & 2047)+1; + int nsets = ((out[2] >> 0))+1; + int nthre = ((out[0] >> 14) & 1023)+1; + + if (type != 1 && type != 3) continue; + p->assoc[level-1] = assoc; + p->size[level-1] = (uint64_t)assoc * part * lsize * nsets / nthre; + } + } else { + SIMDBase_x86cpuid(out, 0x80000008U, 0); + int ncores = (out[2] & 0xff) + 1; + + SIMDBase_x86cpuid(out, 0x80000005U, 0); + p->linesize = out[2] & 255; + p->size[0] = (out[2] >> 24) * 1024 / ncores; + p->assoc[0] = (out[2] >> 16) & 0xff; + + SIMDBase_x86cpuid(out, 0x80000006U, 0); + 
p->size[1] = (out[2] >> 16) * 1024 / ncores; + p->assoc[1] = l2assoc[(out[2] >> 12) & 0xf]; + p->size[2] = (out[3] >> 18) * 512 * 1024 / ncores; + p->assoc[2] = l2assoc[(out[3] >> 12) & 0xf]; + } + + if (p->size[0] == 0) { + p->size[0] = 16 * 1024; + p->assoc[0] = 4; + } + + if (p->size[1] == 0) { + p->size[1] = 256 * 1024; + p->assoc[1] = 4; + } +} + +char *SIMDBase_getProcessorNameString() { + union { + uint32_t info[4]; + uint8_t str[16]; + } u; + int i,j; + char *p; + + p = SIMDBase_processorNameString; + + SIMDBase_x86cpuid(u.info, 0, 0); + + for(i=0;i<4;i++) *p++ = u.str[i+4]; + for(i=0;i<4;i++) *p++ = u.str[i+12]; + for(i=0;i<4;i++) *p++ = u.str[i+8]; + + *p++ = ' '; + + for(i=0;i<3;i++) { + SIMDBase_x86cpuid(u.info, i + 0x80000002, 0); + + for(j=0;j<16;j++) { + *p++ = u.str[j]; + } + } + + *p++ = '\n'; + + return SIMDBase_processorNameString; +} +#else +char *SIMDBase_getProcessorNameString() { + char *p = "Unknown"; +#if defined(__powerpc__) + if ((p = tryReadingProcCpuinfo("cpu")) == NULL) p = "PowerPC"; +#elif defined(__arm__) + if ((p = tryReadingProcCpuinfo("Processor")) == NULL) p = "ARM"; +#endif + + return p; +} +#endif + +int32_t SIMDBase_sizeOfCachelineInByte() { +#if defined(__i386__) || defined(__x86_64__) + CacheParam p; + getCacheParam(&p); + return p.linesize; +#else + return 64; +#endif +} + +int32_t SIMDBase_sizeOfDataCacheInByte() { +#if defined(__i386__) || defined(__x86_64__) + CacheParam p; + getCacheParam(&p); + return p.size[1] + p.size[2]; // L2 + L3 +#else + return 256 * 1024; +#endif +} + +static jmp_buf sigjmp; + +static void sighandler(int signum) { + longjmp(sigjmp, 1); +} + +int32_t SIMDBase_detect(int32_t paramId) { +#if defined(__i386__) || defined(__x86_64__) + uint32_t reg[4]; +#endif + + switch(paramId) { + case SIMDBase_MODE_PUREC_FLOAT: +#if defined(ENABLE_PUREC_FLOAT) + return 1; +#else + return -1; +#endif + case SIMDBase_MODE_PUREC_DOUBLE: +#if defined(ENABLE_PUREC_DOUBLE) + return 1; +#else + return -1; +#endif + 
case SIMDBase_MODE_PUREC_LONGDOUBLE: +#if defined(ENABLE_PUREC_LONGDOUBLE) + return 1; +#else + return -1; +#endif + case SIMDBase_MODE_SSE_FLOAT: +#if defined(ENABLE_SSE_FLOAT) + SIMDBase_x86cpuid(reg, 1, 0); + return (reg[3] & (1 << 25)) != 0; +#else + return -1; +#endif + case SIMDBase_MODE_SSE2_DOUBLE: +#if defined(ENABLE_SSE2_DOUBLE) + SIMDBase_x86cpuid(reg, 1, 0); + return (reg[3] & (1 << 26)) != 0; +#else + return -1; +#endif + case SIMDBase_MODE_AVX_FLOAT: +#if defined(ENABLE_AVX_FLOAT) + SIMDBase_x86cpuid(reg, 1, 0); + return (reg[2] & (1 << 28)) != 0; +#else + return -1; +#endif + case SIMDBase_MODE_AVX_DOUBLE: +#if defined(ENABLE_AVX_DOUBLE) + SIMDBase_x86cpuid(reg, 1, 0); + return (reg[2] & (1 << 28)) != 0; +#else + return -1; +#endif + default: + break; + } + + signal(SIGILL, sighandler); + + if (setjmp(sigjmp) == 0) { + switch(paramId) { +#if defined(ENABLE_NEON_FLOAT) + case SIMDBase_MODE_NEON_FLOAT: + detect_neon_float(); + break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case SIMDBase_MODE_ALTIVEC_FLOAT: + detect_altivec_float(); + break; +#endif + default: + signal(SIGILL, SIG_DFL); + return -1; + } + signal(SIGILL, SIG_DFL); + return 1; + } else { + signal(SIGILL, SIG_DFL); + return 0; + } +} + +int32_t SIMDBase_chooseBestMode(int32_t typeId) { + switch(typeId) { + case SIMDBase_TYPE_HALF: + break; + case SIMDBase_TYPE_FLOAT: + if (SIMDBase_detect(SIMDBase_MODE_AVX_FLOAT) == 1) return SIMDBase_MODE_AVX_FLOAT; + if (SIMDBase_detect(SIMDBase_MODE_SSE_FLOAT) == 1) return SIMDBase_MODE_SSE_FLOAT; + if (SIMDBase_detect(SIMDBase_MODE_NEON_FLOAT) == 1) return SIMDBase_MODE_NEON_FLOAT; + if (SIMDBase_detect(SIMDBase_MODE_ALTIVEC_FLOAT) == 1) return SIMDBase_MODE_ALTIVEC_FLOAT; + if (SIMDBase_detect(SIMDBase_MODE_PUREC_FLOAT) == 1) return SIMDBase_MODE_PUREC_FLOAT; + break; + + case SIMDBase_TYPE_DOUBLE: + if (SIMDBase_detect(SIMDBase_MODE_AVX_DOUBLE) == 1) return SIMDBase_MODE_AVX_DOUBLE; + if (SIMDBase_detect(SIMDBase_MODE_SSE2_DOUBLE) == 1) return 
SIMDBase_MODE_SSE2_DOUBLE; + if (SIMDBase_detect(SIMDBase_MODE_PUREC_DOUBLE) == 1) return SIMDBase_MODE_PUREC_DOUBLE; + break; + + case SIMDBase_TYPE_LONGDOUBLE: + if (SIMDBase_detect(SIMDBase_MODE_PUREC_LONGDOUBLE) == 1) return SIMDBase_MODE_PUREC_LONGDOUBLE; + break; + + case SIMDBase_TYPE_EXTENDED: + break; + + case SIMDBase_TYPE_QUAD: + break; + } + + return SIMDBase_MODE_NONE; +} + +int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode) { + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: return getModeParamInt_purec_float(paramId); break; +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: return getModeParamInt_purec_double(paramId); break; +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: return getModeParamInt_purec_longdouble(paramId); break; +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: return getModeParamInt_sse_float(paramId); break; +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: return getModeParamInt_sse2_double(paramId); break; +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: return getModeParamInt_neon_float(paramId); break; +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: return getModeParamInt_avx_float(paramId); break; +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: return getModeParamInt_avx_double(paramId); break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: return getModeParamInt_altivec_float(paramId); break; +#endif + } + + return -1; +} + +char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode) { + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: return getModeParamString_purec_float(paramId); break; +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: return getModeParamString_purec_double(paramId); break; +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: return getModeParamString_purec_longdouble(paramId); break; +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: return getModeParamString_sse_float(paramId); break; +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: return 
getModeParamString_sse2_double(paramId); break; +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: return getModeParamString_neon_float(paramId); break; +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: return getModeParamString_avx_float(paramId); break; +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: return getModeParamString_avx_double(paramId); break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: return getModeParamString_altivec_float(paramId); break; +#endif + } + + return NULL; +} + +#ifdef ANDROID +int posix_memalign (void **memptr, size_t alignment, size_t size) { + *memptr = malloc (size); + return *memptr ? 0 : -1; +} +#endif + +void *SIMDBase_alignedMalloc(uint64_t size) { + void *p; + if (posix_memalign(&p, SIMDBase_sizeOfCachelineInByte(), size) != 0) abort(); + return p; +} + +void SIMDBase_alignedFree(void *ptr) { + free(ptr); +} + +int32_t SIMDBase_getParamInt(int32_t paramId) { + switch(paramId) { + case SIMDBase_PARAMID_MODE_MAX: + return SIMDBase_LAST_MODE + 1; + } + + return -1; +} + +int32_t SIMDBase_getTypeParamInt(int32_t paramId, int32_t typeId) { + switch(typeId) { + } + + return -1; +} diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBase.h b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h new file mode 100644 index 00000000..10cdeb81 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h @@ -0,0 +1,53 @@ +#ifndef _SIMDBase_H_ +#define _SIMDBase_H_ + +#include <stdint.h> + +#define SIMDBase_TYPE_FLOAT ( 1 | ( 1 << 24 )) +#define SIMDBase_TYPE_DOUBLE ( 2 | ( 1 << 24 )) +#define SIMDBase_TYPE_LONGDOUBLE ( 3 | ( 1 << 24 )) +#define SIMDBase_TYPE_EXTENDED ( 4 | ( 1 << 24 )) +#define SIMDBase_TYPE_QUAD ( 5 | ( 1 << 24 )) +#define SIMDBase_TYPE_HALF ( 6 | ( 1 << 24 )) + +#define SIMDBase_MODE_NONE 0 +#define SIMDBase_MODE_PUREC_FLOAT 1 +#define SIMDBase_MODE_PUREC_DOUBLE 2 +#define SIMDBase_MODE_PUREC_LONGDOUBLE 3 +#define SIMDBase_MODE_SSE_FLOAT 4 +#define SIMDBase_MODE_SSE2_DOUBLE 5 +#define SIMDBase_MODE_NEON_FLOAT 6 +#define 
SIMDBase_MODE_AVX_FLOAT 7
+#define SIMDBase_MODE_AVX_DOUBLE 8
+#define SIMDBase_MODE_ALTIVEC_FLOAT 9
+
+#define SIMDBase_LAST_MODE SIMDBase_MODE_ALTIVEC_FLOAT
+
+#define SIMDBase_PARAMID_MODE_MAX ( 1 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_TYPE_AVAILABILITY ( 2 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_SIZE_OF_REAL ( 3 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_SIZE_OF_VECT ( 4 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_VECTOR_LEN ( 5 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_MODE_AVAILABILITY ( 6 | ( 2 << 24 ))
+#define SIMDBase_PARAMID_MODE_NAME ( 7 | ( 2 << 24 ))
+
+//
+
+typedef struct {
+  uint32_t linesize;
+  uint32_t size[8], assoc[8];
+} CacheParam;
+
+void *SIMDBase_alignedMalloc(uint64_t size);
+void SIMDBase_alignedFree(void *ptr);
+int32_t SIMDBase_sizeOfCachelineInByte();
+int32_t SIMDBase_sizeOfDataCacheInByte();
+int32_t SIMDBase_chooseBestMode(int32_t typeId);
+char *SIMDBase_getProcessorNameString();
+int32_t SIMDBase_detect(int32_t paramId);
+int32_t SIMDBase_getParamInt(int32_t paramId);
+int32_t SIMDBase_getTypeParamInt(int32_t paramId, int32_t typeId);
+int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode);
+char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode);
+
+#endif
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c
new file mode 100644
index 00000000..257a5ff0
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "SIMDBase.h"
+#include "SIMDBaseUndiff.h"
+
+/*
+ * Exercises a vector load, add and store of this file's mode on
+ * detectBuffer (inputs at offsets 0 and 64, sum written at offset 128).
+ */
+void SIMDBaseUndiff_DETECT() {
+  extern uint8_t detectBuffer[256];
+  SIMDBase_VECT a = SIMDBase_LOAD((SIMDBase_VECT *)&detectBuffer[0]);
+  SIMDBase_VECT b = SIMDBase_LOAD((SIMDBase_VECT *)&detectBuffer[64]);
+  SIMDBase_VECT c = SIMDBase_ADDi(a, b);
+  SIMDBase_STOR((SIMDBase_VECT *)&detectBuffer[128], c);
+}
+
+/*
+ * Integer properties of the mode this translation unit is compiled for
+ * (SIMDBase_MODE).  Returns -1 for a paramId that is not handled here.
+ */
+int32_t SIMDBaseUndiff_GETMODEPARAMINT(int32_t paramId) {
+  switch(paramId) {
+  case SIMDBase_PARAMID_SIZE_OF_REAL:
+    return sizeof(SIMDBase_REAL);
+  case SIMDBase_PARAMID_SIZE_OF_VECT:
+    return sizeof(SIMDBase_VECT);
+  case SIMDBase_PARAMID_VECTOR_LEN:
+    return SIMDBase_VECTLEN;
+  case SIMDBase_PARAMID_MODE_AVAILABILITY:
+    /* SIMDBase_detect() switches on SIMDBase_MODE_* ids.  Handing it paramId
+     * (a SIMDBase_PARAMID_* value) matches none of its cases and always
+     * yields -1, so probe this file's own mode instead. */
+    return SIMDBase_detect(SIMDBase_MODE);
+  }
+
+  return -1;
+}
+
+/* String properties of this mode; NULL for a paramId that is not handled. */
+char * SIMDBaseUndiff_GETMODEPARAMSTRING(int32_t paramId) {
+  switch(paramId) {
+  case SIMDBase_PARAMID_MODE_NAME:
+    return SIMDBase_NAME;
+  }
+
+  return NULL;
+}
diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h
new file mode 100644
index 00000000..1af849a8
--- /dev/null
+++ b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h
@@ -0,0 +1,231 @@
+#ifndef _SIMDBaseUndiff_H_
+#define _SIMDBaseUndiff_H_
+
+#if defined(ENABLE_PUREC_FLOAT) ////////////////////////////////////////////
+
+typedef float SIMDBase_REAL;
+typedef float SIMDBase_VECT;
+
+#define SIMDBase_MODE 1
+#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT
+#define SIMDBase_VECTLEN 1
+#define SIMDBase_NAME "Pure C float"
+#define SIMDBaseUndiff_DETECT detect_purec_float
+#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_float
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_float
+
+static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; }
+static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; }
+static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; }
+static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; }
+static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; }
+static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; }
+static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; }
+static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; }
+
+#elif defined(ENABLE_PUREC_DOUBLE) ////////////////////////////////////////////
+
+typedef 
double SIMDBase_REAL; +typedef double SIMDBase_VECT; + +#define SIMDBase_MODE 2 +#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE +#define SIMDBase_VECTLEN 1 +#define SIMDBase_NAME "Pure C double" +#define SIMDBaseUndiff_DETECT detect_purec_double +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_double +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_double + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; } + +#elif defined(ENABLE_PUREC_LONGDOUBLE) //////////////////////////////////////////// + +typedef long double SIMDBase_REAL; +typedef long double SIMDBase_VECT; + +#define SIMDBase_MODE 3 +#define SIMDBase_TYPE SIMDBase_TYPE_LONGDOUBLE +#define SIMDBase_VECTLEN 1 +#define SIMDBase_NAME "Pure C long double" +#define SIMDBaseUndiff_DETECT detect_purec_longdouble +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_longdouble +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_longdouble + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; } +static inline 
SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; } + +#elif defined(ENABLE_SSE_FLOAT) //////////////////////////////////////////// + +#include <xmmintrin.h> + +typedef float SIMDBase_REAL; +typedef __m128 SIMDBase_VECT; + +#define SIMDBase_MODE 4 +#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT +#define SIMDBase_VECTLEN 4 +#define SIMDBase_NAME "x86 SSE float" +#define SIMDBaseUndiff_DETECT detect_sse_float +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_sse_float +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_sse_float + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm_load_ps((float *)p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm_store_ps((float *)p, u); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm_set1_ps(f); } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm_load1_ps(p); } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_add_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_sub_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_mul_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm_xor_ps(u, _mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f)); } + +#elif defined(ENABLE_SSE2_DOUBLE) //////////////////////////////////////////// + +#include <emmintrin.h> + +typedef double SIMDBase_REAL; +typedef __m128d SIMDBase_VECT; + +#define SIMDBase_MODE 5 +#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE +#define SIMDBase_VECTLEN 2 +#define SIMDBase_NAME "x86 SSE2 double" +#define SIMDBaseUndiff_DETECT detect_sse2_double +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_sse2_double 
+#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_sse2_double + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm_load_pd((double *)p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm_store_pd((double *)p, u); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm_set1_pd(f); } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm_load1_pd(p); } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_add_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_sub_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_mul_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm_xor_pd(u, _mm_set_pd(-0.0, -0.0)); } + +#elif defined(ENABLE_NEON_FLOAT) //////////////////////////////////////////// + +#include <arm_neon.h> + +typedef float32_t SIMDBase_REAL; +typedef float32x4_t SIMDBase_VECT; + +#define SIMDBase_MODE 6 +#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT +#define SIMDBase_VECTLEN 4 +#define SIMDBase_NAME "ARM NEON float" +#define SIMDBaseUndiff_DETECT detect_neon_float +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_neon_float +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_neon_float + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return vld1q_f32((float32_t *)p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { vst1q_f32((float32_t *)p, u); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return vdupq_n_f32(f); } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return vdupq_n_f32(*p); } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return vaddq_f32(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return vsubq_f32(u, v); } +static inline 
SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return vmulq_f32(u, v); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { + return vreinterpretq_f32_u32( veorq_u32(vreinterpretq_u32_f32(u), vdupq_n_u32(0x80000000U))); +} + +#define SIMDBase_FMADD_AVAILABLE + +static inline SIMDBase_VECT SIMDBase_FMADDi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vmlaq_f32(w, u, v); } // w + u * v +static inline SIMDBase_VECT SIMDBase_FMSUBi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vmlsq_f32(w, u, v); } // w - u * v + +#elif defined(ENABLE_AVX_FLOAT) //////////////////////////////////////////// + +#include <immintrin.h> + +typedef float SIMDBase_REAL; +typedef __m256 SIMDBase_VECT; + +#define SIMDBase_MODE 7 +#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT +#define SIMDBase_VECTLEN 8 +#define SIMDBase_NAME "x86 AVX float" +#define SIMDBaseUndiff_DETECT detect_avx_float +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_avx_float +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_avx_float + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm256_load_ps((float *)p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm256_store_ps((float *)p, u); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm256_set1_ps(f); } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm256_set1_ps(*p); } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_add_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_sub_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_mul_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm256_xor_ps(u, _mm256_set1_ps(-0.0f)); } + +#elif defined(ENABLE_AVX_DOUBLE) //////////////////////////////////////////// + +#include <immintrin.h> + 
+typedef double SIMDBase_REAL; +typedef __m256d SIMDBase_VECT; + +#define SIMDBase_MODE 8 +#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE +#define SIMDBase_VECTLEN 4 +#define SIMDBase_NAME "x86 AVX double" +#define SIMDBaseUndiff_DETECT detect_avx_double +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_avx_double +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_avx_double + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm256_load_pd((double *)p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm256_store_pd((double *)p, u); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm256_set1_pd(f); } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm256_set1_pd(*p); } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_add_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_sub_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_mul_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm256_xor_pd(u, _mm256_set1_pd(-0.0)); } + +#elif defined(ENABLE_ALTIVEC_FLOAT) //////////////////////////////////////////// + +#include <altivec.h> + +typedef float SIMDBase_REAL; +typedef vector float SIMDBase_VECT; + +#define SIMDBase_MODE 9 +#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT +#define SIMDBase_VECTLEN 4 +#define SIMDBase_NAME "PowerPC AltiVec float" +#define SIMDBaseUndiff_DETECT detect_altivec_float +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_altivec_float +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_altivec_float + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return vec_ld(0, p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { vec_st(u, 0, p); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return (vector 
float){f, f, f, f}; } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return (vector float){*p, *p, *p, *p}; } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_add(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_sub(u, v); } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_madd(u, v, (vector float){0, 0, 0, 0}); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return vec_xor(u, (vector float){-0.0f, -0.0f, -0.0f, -0.0f}); } + +#define SIMDBase_FMADD_AVAILABLE + +static inline SIMDBase_VECT SIMDBase_FMADDi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vec_madd(u, v, w); } // w + u * v +static inline SIMDBase_VECT SIMDBase_FMSUBi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vec_nmsub(u, v, w); } // w - u * v + +#endif //////////////////////////////////////////////////////////////////// + +static inline SIMDBase_VECT SIMDBase_ADDm(SIMDBase_VECT *p, SIMDBase_VECT *q) { return SIMDBase_ADDi(SIMDBase_LOAD(p), SIMDBase_LOAD(q)); } +static inline SIMDBase_VECT SIMDBase_SUBm(SIMDBase_VECT *p, SIMDBase_VECT *q) { return SIMDBase_SUBi(SIMDBase_LOAD(p), SIMDBase_LOAD(q)); } + +#endif diff --git a/plugins/supereq/paramlist.hpp b/plugins/supereq/paramlist.hpp index 0c513b78..9c5b09c4 100644 --- a/plugins/supereq/paramlist.hpp +++ b/plugins/supereq/paramlist.hpp @@ -1,4 +1,22 @@ -//#include <iostream.h>
+/*
+ DeaDBeeF - ultimate music player for GNU/Linux systems with X11
+ Copyright (C) 2009-2011 Alexey Yakovenko <waker@users.sourceforge.net>
+ Original SuperEQ code (C) Naoki Shibata <shibatch@users.sf.net>
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License
+ as published by the Free Software Foundation; either version 2
+ of the License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -7,12 +25,10 @@ class paramlistelm { public:
class paramlistelm *next;
- char left,right;
float lower,upper,gain,gain2;
int sortindex;
paramlistelm(void) {
- left = right = 1;
lower = upper = gain = 0;
next = NULL;
};
@@ -21,13 +37,6 @@ public: delete next;
next = NULL;
};
-
- char *getString(void) {
- static char str[64];
- sprintf(str,"%gHz to %gHz, %gdB %c%c",
- (double)lower,(double)upper,(double)gain,left?'L':' ',right?'R':' ');
- return str;
- }
};
class paramlist {
@@ -52,8 +61,6 @@ public: for(p=&elm,q=src.elm;q != NULL;q = q->next,p = &(*p)->next)
{
*p = new paramlistelm;
- (*p)->left = q->left;
- (*p)->right = q->right;
(*p)->lower = q->lower;
(*p)->upper = q->upper;
(*p)->gain = q->gain;
diff --git a/plugins/supereq/shibatch_rdft.c b/plugins/supereq/shibatch_rdft.c new file mode 100644 index 00000000..db453eb8 --- /dev/null +++ b/plugins/supereq/shibatch_rdft.c @@ -0,0 +1,71 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> + +#include "SIMDBase.h" +#include "DFT.h" + +#define TYPE SIMDBase_TYPE_FLOAT + +void rfft(int n,int isign,float *x) { + static DFT *p = NULL; + static float *buf = NULL; + static int ipsize = 0; + static int mode = 0; + static int veclen = 0; + int newipsize; + if (n == 0) { + if (buf) { + SIMDBase_alignedFree (buf); + buf = NULL; + } + if (p) { + DFT_dispose(p, mode); + p = NULL; + } + return; + } + int nn = n; + n = 1<<n; + newipsize = n; + if (newipsize != ipsize) { + ipsize = newipsize; + + if (buf) { + SIMDBase_alignedFree (buf); + buf = NULL; + } + + if (p) { + DFT_dispose(p, mode); + p = NULL; + } + + buf = SIMDBase_alignedMalloc (n * sizeof (float)); + + mode = SIMDBase_chooseBestMode(TYPE); + veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + printf ("n: %d, veclen: %d, sizeOfVect: %d\n", n, veclen, sizeOfVect); + p = DFT_init(mode, n/veclen, DFT_FLAG_REAL); + } + + // store in simd order + int asize = n / veclen; + int i, j; + for(j=0;j<veclen;j++) { + for (i = 0; i < asize; i++) { + buf[i * veclen + j] = x[j * asize + i]; + } + } + + DFT_execute(p, mode, buf, isign); + +#define THRES 1e-3 + for(j=0;j<veclen;j++) { + for (i = 0; i < asize; i++) { + x[j * asize + i] = buf[i * veclen + j]; + } + } +} diff --git a/plugins/supereq/supereq.c b/plugins/supereq/supereq.c index af4000fd..a773b4ef 100644 --- a/plugins/supereq/supereq.c +++ b/plugins/supereq/supereq.c @@ -1,6 +1,6 @@ /* DeaDBeeF - ultimate music player for GNU/Linux systems with X11 - Copyright (C) 2009-2010 Alexey Yakovenko <waker@users.sourceforge.net> + Copyright (C) 2009-2011 Alexey Yakovenko <waker@users.sourceforge.net> 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License @@ -18,217 +18,301 @@ */ #include <stdio.h> #include <string.h> +#include <stdlib.h> +#include <math.h> #include "../../deadbeef.h" -#include "supereq.h" +#include "Equ.h" static DB_functions_t *deadbeef; -static DB_supereq_dsp_t plugin; - -void *paramlist_alloc (void); -void paramlist_free (void *); -void equ_makeTable(float *lbc,float *rbc,void *param,float fs); -int equ_modifySamples(char *buf,int nsamples,int nch,int bps); -void equ_clearbuf(int bps,int srate); -void equ_init(int wb); -void equ_quit(void); - -void supereq_reset (void); - -static float last_srate = 0; -static int last_nch = 0, last_bps = 0; -static float lbands[18] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; -static float rbands[18] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; -static float preamp = 1; -static void *paramsroot; - -static int params_changed = 0; -static intptr_t tid = 0; -static uintptr_t mutex = 0; -static int enabled = 0; - -static int -supereq_on_configchanged (DB_event_t *ev, uintptr_t data) { - int e = deadbeef->conf_get_int ("supereq.enable", 0); - if (e != enabled) { - if (e) { - supereq_reset (); - } - enabled = e; - } - - return 0; -} +static DB_dsp_t plugin; + +typedef struct { + ddb_dsp_context_t ctx; + float last_srate; + int last_nch; + float bands[18]; + float preamp; + void *paramsroot; + int params_changed; + uintptr_t mutex; + SuperEqState state; + int enabled; +} ddb_supereq_ctx_t; + +void supereq_reset (ddb_dsp_context_t *ctx); void -recalc_table (void) { +recalc_table (ddb_supereq_ctx_t *eq) { void *params = paramlist_alloc (); - deadbeef->mutex_lock (mutex); - float lbands_copy[18]; - float rbands_copy[18]; - float srate = last_srate; - memcpy (lbands_copy, lbands, sizeof (lbands)); - memcpy (rbands_copy, rbands, sizeof (rbands)); + deadbeef->mutex_lock (eq->mutex); + float 
bands_copy[18]; + float srate = eq->last_srate; + memcpy (bands_copy, eq->bands, sizeof (eq->bands)); for (int i = 0; i < 18; i++) { - lbands_copy[i] *= preamp; - rbands_copy[i] *= preamp; + bands_copy[i] *= eq->preamp; } - deadbeef->mutex_unlock (mutex); + deadbeef->mutex_unlock (eq->mutex); - equ_makeTable (lbands_copy, rbands_copy, params, srate); + equ_makeTable (&eq->state, bands_copy, params, srate); - deadbeef->mutex_lock (mutex); - paramlist_free (paramsroot); - paramsroot = params; - deadbeef->mutex_unlock (mutex); + deadbeef->mutex_lock (eq->mutex); + paramlist_free (eq->paramsroot); + eq->paramsroot = params; + deadbeef->mutex_unlock (eq->mutex); } int supereq_plugin_start (void) { - enabled = deadbeef->conf_get_int ("supereq.enable", 0); - // load bands from config - preamp = deadbeef->conf_get_float ("eq.preamp", 1); - for (int i = 0; i < 18; i++) { - char key[100]; - snprintf (key, sizeof (key), "eq.band%d", i); - lbands[i] = rbands[i] = deadbeef->conf_get_float (key, 1); - } - - equ_init (14); - paramsroot = paramlist_alloc (); - last_srate = 44100; - last_nch = 2; - last_bps = 16; - mutex = deadbeef->mutex_create (); - recalc_table (); - equ_clearbuf (last_bps,last_srate); - deadbeef->ev_subscribe (DB_PLUGIN (&plugin), DB_EV_CONFIGCHANGED, DB_CALLBACK (supereq_on_configchanged), 0); return 0; } int supereq_plugin_stop (void) { - deadbeef->ev_unsubscribe (DB_PLUGIN (&plugin), DB_EV_CONFIGCHANGED, DB_CALLBACK (supereq_on_configchanged), 0); - if (tid) { - deadbeef->thread_join (tid); - tid = 0; - } - if (mutex) { - deadbeef->mutex_free (mutex); - mutex = 0; - } - equ_quit (); - paramlist_free (paramsroot); return 0; } -void -supereq_regen_table_thread (void *param) { - recalc_table (); - tid = 0; -} - int -supereq_process_int16 (int16_t *samples, int nsamples, int nch, int bps, int srate) { - if ((nch != 1 && nch != 2) || (bps != 8 && bps != 16 && bps != 24)) return nsamples; - if (params_changed && !tid) { - tid = deadbeef->thread_start 
(supereq_regen_table_thread, NULL); - params_changed = 0; +supereq_process (ddb_dsp_context_t *ctx, float *samples, int frames, int maxframes, ddb_waveformat_t *fmt, float *r) { + ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx; + if (supereq->enabled != ctx->enabled) { + if (ctx->enabled && !supereq->enabled) { + supereq_reset (ctx); + } + supereq->enabled = ctx->enabled; + +// this causes a glitch on 1st track +// DB_playItem_t *it = deadbeef->streamer_get_playing_track (); +// if (it) { +// float playpos = deadbeef->streamer_get_playpos (); +// deadbeef->streamer_seek (playpos); +// deadbeef->pl_item_unref (it); +// } } - if (last_srate != srate) { - deadbeef->mutex_lock (mutex); - //equ_makeTable (lbands, rbands, paramsroot, srate); - last_srate = srate; - last_nch = nch; - last_bps = bps; - recalc_table (); - deadbeef->mutex_unlock (mutex); - equ_clearbuf(bps,srate); + if (supereq->params_changed) { + recalc_table (supereq); + supereq->params_changed = 0; } - else if (last_nch != nch || last_bps != bps) { - deadbeef->mutex_lock (mutex); - last_nch = nch; - last_bps = bps; - deadbeef->mutex_unlock (mutex); - equ_clearbuf(bps,srate); + if (supereq->last_srate != fmt->samplerate || supereq->last_nch != fmt->channels) { + deadbeef->mutex_lock (supereq->mutex); + supereq->last_srate = fmt->samplerate; + supereq->last_nch = fmt->channels; + equ_init (&supereq->state, 10, fmt->channels); + recalc_table (supereq); + equ_clearbuf(&supereq->state); + deadbeef->mutex_unlock (supereq->mutex); } - equ_modifySamples((char *)samples,nsamples,nch,bps); - return nsamples; + equ_modifySamples_float(&supereq->state, (char *)samples,frames,fmt->channels); + return frames; } float -supereq_get_band (int band) { - return lbands[band]; +supereq_get_band (ddb_dsp_context_t *ctx, int band) { + ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx; + return supereq->bands[band]; } void -supereq_set_band (int band, float value) { - deadbeef->mutex_lock (mutex); - lbands[band] = 
rbands[band] = value; - deadbeef->mutex_unlock (mutex); - params_changed = 1; - char key[100]; - snprintf (key, sizeof (key), "eq.band%d", band); - deadbeef->conf_set_float (key, value); +supereq_set_band (ddb_dsp_context_t *ctx, int band, float value) { + ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx; + deadbeef->mutex_lock (supereq->mutex); + supereq->bands[band] = value; + deadbeef->mutex_unlock (supereq->mutex); + supereq->params_changed = 1; } float -supereq_get_preamp (void) { - return preamp; +supereq_get_preamp (ddb_dsp_context_t *ctx) { + ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx; + return supereq->preamp; } void -supereq_set_preamp (float value) { - deadbeef->mutex_lock (mutex); - preamp = value; - deadbeef->mutex_unlock (mutex); - params_changed = 1; - deadbeef->conf_set_float ("eq.preamp", value); +supereq_set_preamp (ddb_dsp_context_t *ctx, float value) { + ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx; + deadbeef->mutex_lock (supereq->mutex); + supereq->preamp = value; + deadbeef->mutex_unlock (supereq->mutex); + supereq->params_changed = 1; } void -supereq_reset (void) { - deadbeef->mutex_lock (mutex); - equ_clearbuf(last_bps,last_srate); - deadbeef->mutex_unlock (mutex); +supereq_reset (ddb_dsp_context_t *ctx) { + ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx; + deadbeef->mutex_lock (supereq->mutex); + equ_clearbuf(&supereq->state); + deadbeef->mutex_unlock (supereq->mutex); +} + +int +supereq_num_params (void) { + return 19; +} + +static const char *bandnames[] = { + "Preamp", + "55 Hz", + "77 Hz", + "110 Hz", + "156 Hz", + "220 Hz", + "311 Hz", + "440 Hz", + "622 Hz", + "880 Hz", + "1.2 kHz", + "1.8 kHz", + "2.5 kHz", + "3.5 kHz", + "5 kHz", + "7 kHz", + "10 kHz", + "14 kHz", + "20 kHz" +}; + +const char * +supereq_get_param_name (int p) { + return bandnames[p]; +} + + +static inline float +db_to_amp (float dB) { + const float ln10=2.3025850929940002f; + return exp(ln10*dB/20.f); +} + +static inline float 
+amp_to_db (float amp) { + return 20*log10 (amp); } void -supereq_enable (int e) { - if (e != enabled) { - deadbeef->conf_set_int ("supereq.enable", e); - if (e && !enabled) { - supereq_reset (); - } - enabled = e; +supereq_set_param (ddb_dsp_context_t *ctx, int p, const char *val) { + switch (p) { + case 0: + supereq_set_preamp (ctx, db_to_amp (atof (val))); + break; + case 1 ... 18: + supereq_set_band (ctx, p-1, db_to_amp (atof (val))); + break; + default: + fprintf (stderr, "supereq_set_param: invalid param index (%d)\n", p); } } -int -supereq_enabled (void) { - return enabled; -} - -static DB_supereq_dsp_t plugin = { - .dsp.plugin.api_vmajor = DB_API_VERSION_MAJOR, - .dsp.plugin.api_vminor = DB_API_VERSION_MINOR, - .dsp.plugin.type = DB_PLUGIN_DSP, - .dsp.plugin.id = "supereq", - .dsp.plugin.name = "SuperEQ", - .dsp.plugin.descr = "equalizer plugin using SuperEQ library by Naoki Shibata", - .dsp.plugin.author = "Alexey Yakovenko", - .dsp.plugin.email = "waker@users.sourceforge.net", - .dsp.plugin.website = "http://deadbeef.sf.net", - .dsp.plugin.start = supereq_plugin_start, - .dsp.plugin.stop = supereq_plugin_stop, - .dsp.process_int16 = supereq_process_int16, - .dsp.reset = supereq_reset, - .dsp.enable = supereq_enable, - .dsp.enabled = supereq_enabled, - .get_band = supereq_get_band, - .set_band = supereq_set_band, - .get_preamp = supereq_get_preamp, - .set_preamp = supereq_set_preamp, +void +supereq_get_param (ddb_dsp_context_t *ctx, int p, char *v, int sz) { + switch (p) { + case 0: + snprintf (v, sz, "%f", amp_to_db (supereq_get_preamp (ctx))); + break; + case 1 ... 
18: + snprintf (v, sz, "%f", amp_to_db (supereq_get_band (ctx, p-1))); + break; + default: + fprintf (stderr, "supereq_get_param: invalid param index (%d)\n", p); + } +} + + +ddb_dsp_context_t* +supereq_open (void) { + ddb_supereq_ctx_t *supereq = malloc (sizeof (ddb_supereq_ctx_t)); + DDB_INIT_DSP_CONTEXT (supereq,ddb_supereq_ctx_t,&plugin); + + equ_init (&supereq->state, 10, 2); + supereq->paramsroot = paramlist_alloc (); + supereq->last_srate = 44100; + supereq->last_nch = 2; + supereq->mutex = deadbeef->mutex_create (); + supereq->preamp = 1; + for (int i = 0; i < 18; i++) { + supereq->bands[i] = 1; + } + recalc_table (supereq); + equ_clearbuf (&supereq->state); + + return (ddb_dsp_context_t*)supereq; +} + +void +supereq_close (ddb_dsp_context_t *ctx) { + ddb_supereq_ctx_t *supereq = (ddb_supereq_ctx_t *)ctx; + if (supereq->mutex) { + deadbeef->mutex_free (supereq->mutex); + supereq->mutex = 0; + } + equ_quit (&supereq->state); + paramlist_free (supereq->paramsroot); + free (ctx); +} + +static const char settings_dlg[] = + "property \"\" hbox[19] hmg fill expand border=0 spacing=8 height=200;\n" + "property \"Preamp\" vscale[20,-20,1] vert 0 0;\n" + "property \"55 Hz\" vscale[20,-20,1] vert 1 0;\n" + "property \"77 Hz\" vscale[20,-20,1] vert 2 0;\n" + "property \"110 Hz\" vscale[20,-20,1] vert 3 0;\n" + "property \"156 Hz\" vscale[20,-20,1] vert 4 0;\n" + "property \"220 Hz\" vscale[20,-20,1] vert 5 0;\n" + "property \"311 Hz\" vscale[20,-20,1] vert 6 0;\n" + "property \"440 Hz\" vscale[20,-20,1] vert 7 0;\n" + "property \"622 Hz\" vscale[20,-20,1] vert 8 0;\n" + "property \"880 Hz\" vscale[20,-20,1] vert 9 0;\n" + "property \"1.2 kHz\" vscale[20,-20,1] vert 10 0;\n" + "property \"1.8 kHz\" vscale[20,-20,1] vert 11 0;\n" + "property \"2.5 kHz\" vscale[20,-20,1] vert 12 0;\n" + "property \"3.5 kHz\" vscale[20,-20,1] vert 13 0;\n" + "property \"5 kHz\" vscale[20,-20,1] vert 14 0;\n" + "property \"7 kHz\" vscale[20,-20,1] vert 15 0;\n" + "property \"10 kHz\" 
vscale[20,-20,1] vert 16 0;\n" + "property \"14 kHz\" vscale[20,-20,1] vert 17 0;\n" + "property \"20 kHz\" vscale[20,-20,1] vert 18 0;\n" +; + +static DB_dsp_t plugin = { + .plugin.api_vmajor = DB_API_VERSION_MAJOR, + .plugin.api_vminor = DB_API_VERSION_MINOR, + .plugin.version_major = 1, + .plugin.version_minor = 0, + .plugin.type = DB_PLUGIN_DSP, + .plugin.id = "supereq", + .plugin.name = "SuperEQ", + .plugin.descr = "equalizer plugin using SuperEQ library", + .plugin.copyright = + "Copyright (C) 2009-2011 Alexey Yakovenko <waker@users.sourceforge.net>\n" + "\n" + "Uses supereq library by Naoki Shibata, http://shibatch.sourceforge.net\n" + "Uses FFT library by Takuya Ooura, http://www.kurims.kyoto-u.ac.jp/~ooura/\n" + "\n" + "This program is free software; you can redistribute it and/or\n" + "modify it under the terms of the GNU General Public License\n" + "as published by the Free Software Foundation; either version 2\n" + "of the License, or (at your option) any later version.\n" + "\n" + "This program is distributed in the hope that it will be useful,\n" + "but WITHOUT ANY WARRANTY; without even the implied warranty of\n" + "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the\n" + "GNU General Public License for more details.\n" + "\n" + "You should have received a copy of the GNU General Public License\n" + "along with this program; if not, write to the Free Software\n" + "Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.\n" + , + .plugin.website = "http://deadbeef.sf.net", + .plugin.start = supereq_plugin_start, + .plugin.stop = supereq_plugin_stop, + .open = supereq_open, + .close = supereq_close, + .process = supereq_process, + .reset = supereq_reset, + .num_params = supereq_num_params, + .get_param_name = supereq_get_param_name, + .set_param = supereq_set_param, + .get_param = supereq_get_param, + .configdialog = settings_dlg, }; DB_plugin_t * diff --git a/plugins/supereq/supereq.h b/plugins/supereq/supereq.h deleted file mode 100644 index 32298ef1..00000000 --- a/plugins/supereq/supereq.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - DeaDBeeF - ultimate music player for GNU/Linux systems with X11 - Copyright (C) 2009-2010 Alexey Yakovenko <waker@users.sourceforge.net> - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License - as published by the Free Software Foundation; either version 2 - of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
-*/ - -#ifndef __SUPEREQ_H -#define __SUPEREQ_H - -typedef struct DB_supereq_dsp_s { - DB_dsp_t dsp; - float (*get_band) (int band); - void (*set_band) (int band, float value); - float (*get_preamp) (void); - void (*set_preamp) (float value); -} DB_supereq_dsp_t; - -#endif |