Diffstat (limited to 'plugins/supereq/nsfft-1.00/dft')
-rw-r--r-- | plugins/supereq/nsfft-1.00/dft/DFT.c            |  327 |
-rw-r--r-- | plugins/supereq/nsfft-1.00/dft/DFT.h            |   56 |
-rw-r--r-- | plugins/supereq/nsfft-1.00/dft/DFTUndiff.c      | 1807 |
-rw-r--r-- | plugins/supereq/nsfft-1.00/dft/DFTUndiff.h      |  114 |
l--------- | plugins/supereq/nsfft-1.00/dft/Makefile         |    1 |
-rw-r--r-- | plugins/supereq/nsfft-1.00/dft/Makefile.altivec |   26 |
-rw-r--r-- | plugins/supereq/nsfft-1.00/dft/Makefile.neon    |   26 |
-rw-r--r-- | plugins/supereq/nsfft-1.00/dft/Makefile.purec   |   35 |
-rw-r--r-- | plugins/supereq/nsfft-1.00/dft/Makefile.x86     |   29 |
-rw-r--r-- | plugins/supereq/nsfft-1.00/dft/Makefile.x86avx  |   35 |
10 files changed, 2456 insertions, 0 deletions
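
For context on what these added files expose: DFT.h (in the diff below) declares a small plan-based API, DFT_init / DFT_execute / DFT_dispose, which dispatches on a numeric SIMD mode (1 = pure-C float, 4 = SSE float, 7 = AVX float, and so on, as compiled in). The following is only a usage sketch, not code from this commit; it assumes mode 1 (pure-C float), an interleaved re/im buffer of 2*n floats, and a +/-1 sign convention for dir, none of which is confirmed by the hunks shown here.

    /* Hypothetical sketch of calling the DFT API declared in DFT.h.
       Assumptions (not confirmed by this diff): mode 1 is the pure-C
       float backend, the buffer is interleaved complex (re, im) of
       length 2*n floats, and dir = -1 / +1 selects forward / backward.
       SIMD modes would additionally need an aligned buffer. */
    #include <stdio.h>
    #include <stdlib.h>
    #include "DFT.h"

    int main(void) {
        const uint64_t n = 256;   /* transform length in complex points (assumed) */
        const int32_t mode = 1;   /* assumed: ENABLE_PUREC_FLOAT backend */

        DFT *plan = DFT_init(mode, n, DFT_FLAG_NO_TEST_RUN);
        if (plan == NULL) {
            fprintf(stderr, "mode %d not compiled in or not available\n", mode);
            return 1;
        }

        float *buf = calloc(2 * n, sizeof(float));  /* interleaved re/im (assumed layout) */
        buf[0] = 1.0f;                              /* impulse -> flat spectrum */

        DFT_execute(plan, mode, buf, -1);           /* assumed: -1 = forward */
        DFT_execute(plan, mode, buf,  1);           /* assumed: +1 = backward */

        DFT_dispose(plan, mode);
        free(buf);
        return 0;
    }
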
diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.c b/plugins/supereq/nsfft-1.00/dft/DFT.c new file mode 100644 index 00000000..d59e6ab8 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFT.c @@ -0,0 +1,327 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <stdint.h> +#include <sys/time.h> + +#include "SIMDBase.h" +#include "DFT.h" +#include "DFTUndiff.h" + +int32_t getModeParamInt_purec_float(int32_t paramId); +int32_t getModeParamInt_purec_double(int32_t paramId); +int32_t getModeParamInt_purec_longdouble(int32_t paramId); +int32_t getModeParamInt_sse_float(int32_t paramId); +int32_t getModeParamInt_sse2_double(int32_t paramId); +int32_t getModeParamInt_neon_float(int32_t paramId); +int32_t getModeParamInt_avx_float(int32_t paramId); +int32_t getModeParamInt_avx_double(int32_t paramId); +int32_t getModeParamInt_altivec_float(int32_t paramId); + +char * getModeParamString_purec_float(int32_t paramId); +char * getModeParamString_purec_double(int32_t paramId); +char * getModeParamString_purec_longdouble(int32_t paramId); +char * getModeParamString_sse_float(int32_t paramId); +char * getModeParamString_sse2_double(int32_t paramId); +char * getModeParamString_neon_float(int32_t paramId); +char * getModeParamString_avx_float(int32_t paramId); +char * getModeParamString_avx_double(int32_t paramId); +char * getModeParamString_altivec_float(int32_t paramId); + +void *makePlan_purec_float(uint64_t n, uint64_t flags); +void *makePlan_purec_double(uint64_t n, uint64_t flags); +void *makePlan_purec_longdouble(uint64_t n, uint64_t flags); +void *makePlan_sse_float(uint64_t n, uint64_t flags); +void *makePlan_sse2_double(uint64_t n, uint64_t flags); +void *makePlan_neon_float(uint64_t n, uint64_t flags); +void *makePlan_avx_float(uint64_t n, uint64_t flags); +void *makePlan_avx_double(uint64_t n, uint64_t flags); +void *makePlan_altivec_float(uint64_t n, uint64_t flags); + +void *makePlanSub_purec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_purec_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_purec_longdouble(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_sse_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_sse2_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_neon_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_avx_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_avx_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_altivec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); + +void destroyPlan_purec_float(void *p); +void destroyPlan_purec_double(void *p); +void destroyPlan_purec_longdouble(void *p); +void destroyPlan_sse_float(void *p); +void destroyPlan_sse2_double(void *p); +void destroyPlan_neon_float(void *p); +void destroyPlan_avx_float(void *p); +void destroyPlan_avx_double(void *p); +void destroyPlan_altivec_float(void *p); + +void execute_purec_float(void *p, void *s, int32_t dir); +void execute_purec_double(void *p, void *s, int32_t dir); +void execute_purec_longdouble(void *p, void *s, int32_t dir); +void execute_sse_float(void *p, void *s, int32_t dir); +void execute_sse2_double(void *p, void *s, int32_t dir); +void execute_neon_float(void *p, void *s, int32_t dir); +void 
execute_avx_float(void *p, void *s, int32_t dir); +void execute_avx_double(void *p, void *s, int32_t dir); +void execute_altivec_float(void *p, void *s, int32_t dir); + +void *DFT_init(int32_t mode, uint64_t n, uint64_t flags) { + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: return makePlan_purec_float(n, flags); break; +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: return makePlan_purec_double(n, flags); break; +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: return makePlan_purec_longdouble(n, flags); break; +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: return makePlan_sse_float(n, flags); break; +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: return makePlan_sse2_double(n, flags); break; +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: return makePlan_neon_float(n, flags); break; +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: return makePlan_avx_float(n, flags); break; +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: return makePlan_avx_double(n, flags); break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: return makePlan_altivec_float(n, flags); break; +#endif + default: break; + } + + return NULL; +} + +void DFT_dispose(void *p, int32_t mode) { + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: destroyPlan_purec_float(p); break; +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: destroyPlan_purec_double(p); break; +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: destroyPlan_purec_longdouble(p); break; +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: destroyPlan_sse_float(p); break; +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: destroyPlan_sse2_double(p); break; +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: destroyPlan_neon_float(p); break; +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: destroyPlan_avx_float(p); break; +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: destroyPlan_avx_double(p); break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: destroyPlan_altivec_float(p); break; +#endif + default: break; + } +} + +void DFT_execute(void *p, int32_t mode, void *s, int32_t dir) { + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: return execute_purec_float(p, s, dir); break; +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: return execute_purec_double(p, s, dir); break; +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: return execute_purec_longdouble(p, s, dir); break; +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: return execute_sse_float(p, s, dir); break; +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: return execute_sse2_double(p, s, dir); break; +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: return execute_neon_float(p, s, dir); break; +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: return execute_avx_float(p, s, dir); break; +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: return execute_avx_double(p, s, dir); break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: return execute_altivec_float(p, s, dir); break; +#endif + default: break; + } +} + +#define FILE_FORMAT_VERSION 0 + +int32_t DFT_fwrite(void *p2, FILE *fp) { + DFTUndiff *p = (DFTUndiff *)p2; + if (p->magic != MAGIC_DFT) abort(); + + if (fprintf(fp, "nsfft file format : %d\n", FILE_FORMAT_VERSION) <= 0) return 0; + if (fprintf(fp, "arch : %s\n", SIMDBase_getProcessorNameString()) <= 0) return 0; + if (fprintf(fp, "computation mode : %d\n", p->mode) <= 0) return 0; + if (fprintf(fp, "length : %d\n", ((p->flags & DFT_FLAG_REAL) != 0 || (p->flags & DFT_FLAG_ALT_REAL) != 0)? 
p->length * 2 : p->length) <= 0) return 0; + if (fprintf(fp, "radix2 threshold : %d\n", p->radix2thres) <= 0) return 0; + if (fprintf(fp, "transpose : %d\n", p->flagTrans) <= 0) return 0; + if (fprintf(fp, "bit reversal : %d\n", p->useCobra) <= 0) return 0; + if (fprintf(fp, "flags : %llx\n", (unsigned long long int)p->flags) <= 0) return 0; + if (fprintf(fp, "%s\n", "end :") <= 0) return 0; + + return 1; +} + +static char *startsWith(char *str1, char *str2) { + if (strncmp(str1, str2, strlen(str2)) == 0) { + return str1 + strlen(str2); + } + + return NULL; +} + +DFT *DFT_fread(FILE *fp, int32_t *errcode) { + int length = -1, radix2thres = -1, flagTrans = -1, useCobra = -1; + int mode = -1, formatver = -1; + unsigned long long int flags = (1ULL << 63); + + if (errcode != NULL) *errcode = DFT_ERROR_NOERROR; + + for(;;) { + char buf[256], *q; + if (fgets(buf, 255, fp) == NULL) { if (errcode != NULL) *errcode = DFT_ERROR_UNEXPECTED_EOF; return NULL; } + + if ((q = startsWith(buf, "nsfft file format :")) != NULL) { + if (1 != sscanf(q, "%d", &formatver)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "computation mode :")) != NULL) { + if (1 != sscanf(q, "%d", &mode)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "length :")) != NULL) { + if (1 != sscanf(q, "%d", &length)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "radix2 threshold :")) != NULL) { + if (1 != sscanf(q, "%d", &radix2thres)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "transpose :")) != NULL) { + if (1 != sscanf(q, "%d", &flagTrans)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "bit reversal :")) != NULL) { + if (1 != sscanf(q, "%d", &useCobra)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "flags :")) != NULL) { + if (1 != sscanf(q, "%llx", &flags)) { if (errcode != NULL) *errcode = DFT_ERROR_FILE_IO; return NULL; } + } else if ((q = startsWith(buf, "end :")) != NULL) { + break; + } + } + + if (formatver > FILE_FORMAT_VERSION) { + if (errcode != NULL) *errcode = DFT_ERROR_FILE_VERSION; + return NULL; + } + + switch(SIMDBase_detect(mode)) { + case 1: + break; + case 0: + if (errcode != NULL) *errcode = DFT_ERROR_MODE_NOT_AVAILABLE; + return NULL; + case -1: + if (errcode != NULL) *errcode = DFT_ERROR_MODE_NOT_COMPILED_IN; + return NULL; + } + + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: return makePlanSub_purec_float(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: return makePlanSub_purec_double(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: return makePlanSub_purec_longdouble(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: return makePlanSub_sse_float(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: return makePlanSub_sse2_double(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: return makePlanSub_neon_float(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: return makePlanSub_avx_float(length, radix2thres, useCobra, flags); +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: return makePlanSub_avx_double(length, radix2thres, useCobra, 
flags); +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: return makePlanSub_altivec_float(length, radix2thres, useCobra, flags); +#endif + } + + if (errcode != NULL) *errcode = DFT_ERROR_UNKNOWN_MODE; + + return NULL; +} + +int32_t DFT_getPlanParamInt(int32_t paramId, void *p2) { + DFTUndiff *p = (DFTUndiff *)p2; + if (p->magic != MAGIC_DFT) abort(); + + switch(paramId) { + case DFT_PARAMID_MODE: return p->mode; + case DFT_PARAMID_FFT_LENGTH: + if ((p->flags & DFT_FLAG_REAL) != 0) return p->length * 2; + if ((p->flags & DFT_FLAG_ALT_REAL) != 0) return p->length * 2; + return p->length; + case DFT_PARAMID_IS_REAL_TRANSFORM: return (p->flags & DFT_FLAG_REAL) ? 1 : 0; + case DFT_PARAMID_IS_ALT_REAL_TRANSFORM: return (p->flags & DFT_FLAG_ALT_REAL) ? 1 : 0; + case DFT_PARAMID_NO_BIT_REVERSAL: return (p->flags & DFT_FLAG_NO_BITREVERSAL) ? 1 : 0; + case DFT_PARAMID_TEST_RUN: return p->flags & 3; + } + + return -1; +} + +#if 0 +char *DFT_getPlanParamString(int32_t paramId, void *p2) { + dft_t *p = (dft_t *)p2; + if (p->magic != MAGIC_NSDFT) abort(); + + return NULL; +} +#endif + +uint32_t DFT_ilog2(uint32_t q) { + static const uint32_t tab[] = {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4}; + uint32_t r = 0,qq; + + if (q & 0xffff0000) r = 16; + + q >>= r; + qq = q | (q >> 1); + qq |= (qq >> 2); + qq = ((qq & 0x10) >> 4) | ((qq & 0x100) >> 7) | ((qq & 0x1000) >> 10); + + return r + tab[qq] * 4 + tab[q >> (tab[qq] * 4)] - 1; +} + +double DFT_timeofday(void) { + struct timeval tp; + gettimeofday(&tp, NULL); + return (double)tp.tv_sec+(1e-6)*tp.tv_usec; +} diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.h b/plugins/supereq/nsfft-1.00/dft/DFT.h new file mode 100644 index 00000000..facb701a --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFT.h @@ -0,0 +1,56 @@ +#ifndef __DFT_H__ +#define __DFT_H__ + +#include <stdio.h> +#include <stdint.h> + +typedef void DFT; + +int32_t DFT_getParamInt(int32_t paramId); +char *DFT_getParamString(int32_t paramId); + +int32_t DFT_getModeParamInt(int32_t paramId, int32_t mode); +char *DFT_getModeParamString(int32_t paramId, int32_t mode); + +DFT *DFT_init(int32_t mode, uint64_t n, uint64_t flags); +void DFT_dispose(DFT *p, int32_t mode); + +int32_t DFT_fwrite(DFT *p, FILE *fp); +DFT *DFT_fread(FILE *fp, int32_t *errcode); + +int32_t DFT_getPlanParamInt(int32_t paramId, void *p); + +void DFT_execute(DFT *p, int32_t mode, void *s, int32_t dir); + +uint32_t DFT_ilog2(uint32_t q); +double DFT_timeofday(void); + +#define DFT_FLAG_NO_TEST_RUN ( 0ULL << 0) +#define DFT_FLAG_LIGHT_TEST_RUN ( 1ULL << 0) +#define DFT_FLAG_HEAVY_TEST_RUN ( 2ULL << 0) +#define DFT_FLAG_EXHAUSTIVE_TEST_RUN ( 3ULL << 0) + +#define DFT_FLAG_REAL (1ULL << 2) +#define DFT_FLAG_ALT_REAL (1ULL << 3) +#define DFT_FLAG_VERBOSE (1ULL << 4) +#define DFT_FLAG_NO_BITREVERSAL (1ULL << 5) +#define DFT_FLAG_FORCE_RECURSIVE (1ULL << 6) +#define DFT_FLAG_FORCE_COBRA (1ULL << 7) + +#define DFT_PARAMID_TYPE ( 1 | ( 3 << 24 )) +#define DFT_PARAMID_MODE ( 2 | ( 3 << 24 )) +#define DFT_PARAMID_FFT_LENGTH ( 3 | ( 3 << 24 )) +#define DFT_PARAMID_IS_REAL_TRANSFORM ( 4 | ( 3 << 24 )) +#define DFT_PARAMID_IS_ALT_REAL_TRANSFORM ( 5 | ( 3 << 24 )) +#define DFT_PARAMID_NO_BIT_REVERSAL ( 6 | ( 3 << 24 )) +#define DFT_PARAMID_TEST_RUN ( 7 | ( 3 << 24 )) + +#define DFT_ERROR_NOERROR 0 +#define DFT_ERROR_FILE_VERSION 1 +#define DFT_ERROR_FILE_IO 2 +#define DFT_ERROR_UNEXPECTED_EOF 3 +#define DFT_ERROR_MODE_NOT_COMPILED_IN 4 +#define DFT_ERROR_MODE_NOT_AVAILABLE 5 +#define DFT_ERROR_UNKNOWN_MODE 6 + +#endif diff --git 
a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c new file mode 100644 index 00000000..4985da33 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c @@ -0,0 +1,1807 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> + +#include "SIMDBase.h" +#include "SIMDBaseUndiff.h" +#include "DFT.h" +#include "DFTUndiff.h" + +// + +#define SIN(x) sin(x) +#define COS(x) cos(x) + +#define SQRT2_2 .7071067811865475244008443621048490392848359376884740365883398689953L + +#ifndef M_PIl +#define M_PIl 3.141592653589793238462643383279502884197169399375105820974944592307L +#endif + +// + +static inline void srBut2(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0, t1; + + t0 = SIMDBase_ADDm(&s[o ], &s[o+2]); t1 = SIMDBase_SUBm(&s[o ], &s[o+2]); + SIMDBase_STOR(&s[o ], t0); SIMDBase_STOR(&s[o+2], t1); + t0 = SIMDBase_ADDm(&s[o+1], &s[o+3]); t1 = SIMDBase_SUBm(&s[o+1], &s[o+3]); + SIMDBase_STOR(&s[o+1], t0); SIMDBase_STOR(&s[o+3], t1); +} + +static inline void srButForward4(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i; + + t0r = SIMDBase_ADDm(&s[o+0], &s[o+4]); t2r = SIMDBase_SUBm(&s[o+0], &s[o+4]); + t0i = SIMDBase_ADDm(&s[o+1], &s[o+5]); t2i = SIMDBase_SUBm(&s[o+1], &s[o+5]); + t1r = SIMDBase_ADDm(&s[o+2], &s[o+6]); t3i = SIMDBase_SUBm(&s[o+2], &s[o+6]); + t1i = SIMDBase_ADDm(&s[o+7], &s[o+3]); t3r = SIMDBase_SUBm(&s[o+7], &s[o+3]); + + SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(t0r, t1r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(t0i, t1i)); + SIMDBase_STOR(&s[o+2], SIMDBase_SUBi(t0r, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_SUBi(t0i, t1i)); + SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(t2r, t3r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(t2i, t3i)); + SIMDBase_STOR(&s[o+6], SIMDBase_ADDi(t2r, t3r)); SIMDBase_STOR(&s[o+7], SIMDBase_ADDi(t2i, t3i)); +} + +static inline void srButBackward4(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + + SIMDBase_VECT t0r, t0i, t1r, t1i; + SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+0]), s1 = SIMDBase_LOAD(&s[o+1]), s2 = SIMDBase_LOAD(&s[o+2]), s3 = SIMDBase_LOAD(&s[o+3]); + + t0r = SIMDBase_ADDi(s0, s2); t0i = SIMDBase_SUBi(s0, s2); s0 = t0r; s2 = t0i; + t0r = SIMDBase_ADDi(s1, s3); t0i = SIMDBase_SUBi(s1, s3); s1 = t0r; s3 = t0i; + t0r = SIMDBase_ADDm(&s[o+4], &s[o+6]); t1i = SIMDBase_SUBm(&s[o+4], &s[o+6]); + t0i = SIMDBase_ADDm(&s[o+7], &s[o+5]); t1r = SIMDBase_SUBm(&s[o+7], &s[o+5]); + + SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(s1, t0i)); + SIMDBase_STOR(&s[o+6], SIMDBase_SUBi(s2, t1r)); SIMDBase_STOR(&s[o+7], SIMDBase_SUBi(s3, t1i)); + SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(s0, t0r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(s1, t0i)); + SIMDBase_STOR(&s[o+2], SIMDBase_ADDi(s2, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_ADDi(s3, t1i)); +} + +static inline void srButForward8(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i; + + SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]); + SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]); + SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]); + SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = 
SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]); + + t2r = SIMDBase_SUBi(s0, s8); t2i = SIMDBase_SUBi(s1, s9); + t3r = SIMDBase_SUBi(sd, s5); t3i = SIMDBase_SUBi(s4, sc); + + s0 = SIMDBase_ADDi(s0, s8); s1 = SIMDBase_ADDi(s1, s9); + s4 = SIMDBase_ADDi(s4, sc); s5 = SIMDBase_ADDi(s5, sd); + + s8 = SIMDBase_SUBi(t2r, t3r); s9 = SIMDBase_SUBi(t2i, t3i); + sc = SIMDBase_ADDi(t2r, t3r); sd = SIMDBase_ADDi(t2i, t3i); + + t2r = SIMDBase_SUBi(s2, sa); t2i = SIMDBase_SUBi(s3, sb); + t3r = SIMDBase_SUBi(sf, s7); t3i = SIMDBase_SUBi(s6, se); + + s2 = SIMDBase_ADDi(s2, sa); s3 = SIMDBase_ADDi(s3, sb); + s6 = SIMDBase_ADDi(s6, se); s7 = SIMDBase_ADDi(s7, sf); + + t0r = SIMDBase_SUBi(t2r, t3r); t1r = SIMDBase_ADDi(t2r, t3r); + t0i = SIMDBase_SUBi(t2i, t3i); t1i = SIMDBase_ADDi(t2i, t3i); + + sa = SIMDBase_MULi(SIMDBase_ADDi(t0r, t0i), SIMDBase_SET1( SQRT2_2)); + sb = SIMDBase_MULi(SIMDBase_SUBi(t0i, t0r), SIMDBase_SET1( SQRT2_2)); + se = SIMDBase_MULi(SIMDBase_SUBi(t1i, t1r), SIMDBase_SET1( SQRT2_2)); + sf = SIMDBase_MULi(SIMDBase_ADDi(t1r, t1i), SIMDBase_SET1(-SQRT2_2)); + + SIMDBase_STOR(&s[o+ 8], SIMDBase_ADDi(s8, sa)); SIMDBase_STOR(&s[o+ 9], SIMDBase_ADDi(s9, sb)); + SIMDBase_STOR(&s[o+10], SIMDBase_SUBi(s8, sa)); SIMDBase_STOR(&s[o+11], SIMDBase_SUBi(s9, sb)); + + SIMDBase_STOR(&s[o+12], SIMDBase_ADDi(sc, se)); SIMDBase_STOR(&s[o+13], SIMDBase_ADDi(sd, sf)); + SIMDBase_STOR(&s[o+14], SIMDBase_SUBi(sc, se)); SIMDBase_STOR(&s[o+15], SIMDBase_SUBi(sd, sf)); + + t0r = SIMDBase_ADDi(s0, s4); t2r = SIMDBase_SUBi(s0, s4); + t0i = SIMDBase_ADDi(s1, s5); t2i = SIMDBase_SUBi(s1, s5); + + t1r = SIMDBase_ADDi(s2, s6); t3i = SIMDBase_SUBi(s2, s6); + t1i = SIMDBase_ADDi(s3, s7); t3r = SIMDBase_SUBi(s7, s3); + + SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(t0r, t1r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(t0i, t1i)); + SIMDBase_STOR(&s[o+2], SIMDBase_SUBi(t0r, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_SUBi(t0i, t1i)); + SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(t2r, t3r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(t2i, t3i)); + SIMDBase_STOR(&s[o+6], SIMDBase_ADDi(t2r, t3r)); SIMDBase_STOR(&s[o+7], SIMDBase_ADDi(t2i, t3i)); +} + +static void srButBackward8(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0r, t0i, t1r, t1i; + + SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]); + SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]); + SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]); + SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]); + + t0r = SIMDBase_ADDi(s8, sa); t0i = SIMDBase_SUBi(s8, sa); s8 = t0r; sa = t0i; + t0r = SIMDBase_ADDi(s9, sb); t0i = SIMDBase_SUBi(s9, sb); s9 = t0r; sb = t0i; + t0r = SIMDBase_ADDi(sc, se); t0i = SIMDBase_SUBi(sc, se); sc = t0r; se = t0i; + t0r = SIMDBase_ADDi(sd, sf); t0i = SIMDBase_SUBi(sd, sf); sd = t0r; sf = t0i; + t0r = SIMDBase_ADDi(s0, s2); t0i = SIMDBase_SUBi(s0, s2); s0 = t0r; s2 = t0i; + t0r = SIMDBase_ADDi(s1, s3); t0i = SIMDBase_SUBi(s1, s3); s1 = t0r; s3 = t0i; + + t0r = SIMDBase_ADDi(s4, s6); t0i = SIMDBase_ADDi(s7, s5); + t1r = SIMDBase_SUBi(s7, s5); t1i = SIMDBase_SUBi(s4, s6); + + s4 = SIMDBase_SUBi(s0, t0r); s5 = SIMDBase_SUBi(s1, t0i); + s6 = SIMDBase_SUBi(s2, t1r); s7 = 
SIMDBase_SUBi(s3, t1i); + s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i); + s2 = SIMDBase_ADDi(s2, t1r); s3 = SIMDBase_ADDi(s3, t1i); + + t0r = SIMDBase_ADDi(s8, sc); t0i = SIMDBase_ADDi(s9, sd); + t1r = SIMDBase_SUBi(sd, s9); t1i = SIMDBase_SUBi(s8, sc); + + s8 = SIMDBase_SUBi(s0, t0r); s9 = SIMDBase_SUBi(s1, t0i); + sc = SIMDBase_SUBi(s4, t1r); sd = SIMDBase_SUBi(s5, t1i); + s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i); + s4 = SIMDBase_ADDi(s4, t1r); s5 = SIMDBase_ADDi(s5, t1i); + + t0r = SIMDBase_MULi(SIMDBase_SUBi(sa, sb), SIMDBase_SET1( SQRT2_2)); + t0i = SIMDBase_MULi(SIMDBase_ADDi(sa, sb), SIMDBase_SET1( SQRT2_2)); + t1r = SIMDBase_MULi(SIMDBase_ADDi(se, sf), SIMDBase_SET1(-SQRT2_2)); + t1i = SIMDBase_MULi(SIMDBase_SUBi(se, sf), SIMDBase_SET1( SQRT2_2)); + + sa = t0r; sb = t0i; se = t1r; sf = t1i; + + t0r = SIMDBase_ADDi(sa, se); t0i = SIMDBase_ADDi(sb, sf); + t1r = SIMDBase_SUBi(sf, sb); t1i = SIMDBase_SUBi(sa, se); + + sa = SIMDBase_SUBi(s2, t0r); sb = SIMDBase_SUBi(s3, t0i); + se = SIMDBase_SUBi(s6, t1r); sf = SIMDBase_SUBi(s7, t1i); + s2 = SIMDBase_ADDi(s2, t0r); s3 = SIMDBase_ADDi(s3, t0i); + s6 = SIMDBase_ADDi(s6, t1r); s7 = SIMDBase_ADDi(s7, t1i); + + SIMDBase_STOR(&s[o+ 0], s0); SIMDBase_STOR(&s[o+ 1], s1); SIMDBase_STOR(&s[o+ 2], s2); SIMDBase_STOR(&s[o+ 3], s3); + SIMDBase_STOR(&s[o+ 4], s4); SIMDBase_STOR(&s[o+ 5], s5); SIMDBase_STOR(&s[o+ 6], s6); SIMDBase_STOR(&s[o+ 7], s7); + SIMDBase_STOR(&s[o+ 8], s8); SIMDBase_STOR(&s[o+ 9], s9); SIMDBase_STOR(&s[o+10], sa); SIMDBase_STOR(&s[o+11], sb); + SIMDBase_STOR(&s[o+12], sc); SIMDBase_STOR(&s[o+13], sd); SIMDBase_STOR(&s[o+14], se); SIMDBase_STOR(&s[o+15], sf); +} + +#if 0 +static inline void srButForwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + s00 = SIMDBase_LOAD(&s[i0+0]), s01 = SIMDBase_LOAD(&s[i0+1]); + s10 = SIMDBase_LOAD(&s[i1+0]), s11 = SIMDBase_LOAD(&s[i1+1]); + s20 = SIMDBase_LOAD(&s[i2+0]), s21 = SIMDBase_LOAD(&s[i2+1]); + s30 = SIMDBase_LOAD(&s[i3+0]), s31 = SIMDBase_LOAD(&s[i3+1]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]); + a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]); + + SIMDBase_STOR(&s[i0 ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1 ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2 ], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2 ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+1], 
SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + i0 += 2; i1 += 2; i2 += 2; i3 += 2; + p0 += 4; + } +} +#endif + +#if 0 +static inline void srButBackwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i, u, v; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]); + a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]); + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); + + s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]); + a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]); + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]); + s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]); + + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+0], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, t1i)); + + i0 += 2; i1 += 2; i2 += 2; i3 += 2; + p0 += 4; + } +} + +static void srButBackwardSubUnrolled(DFTUndiff *p) { + srButBackwardSub(p); +} +#endif + +static inline void srButForwardSubUnrolled(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + // + + s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]); + s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]); + s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]); + s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]); + a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]); + + SIMDBase_STOR(&s[i0 ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1 ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2 ], 
SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2 ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+1], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + s00 = SIMDBase_LOAD(&s[i0+2]); s01 = SIMDBase_LOAD(&s[i0+3]); + s10 = SIMDBase_LOAD(&s[i1+2]); s11 = SIMDBase_LOAD(&s[i1+3]); + s20 = SIMDBase_LOAD(&s[i2+2]); s21 = SIMDBase_LOAD(&s[i2+3]); + s30 = SIMDBase_LOAD(&s[i3+2]); s31 = SIMDBase_LOAD(&s[i3+3]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+4]); a1 = SIMDBase_LOAD1(&tbl[p0+5]); + a2 = SIMDBase_LOAD1(&tbl[p0+6]); a3 = SIMDBase_LOAD1(&tbl[p0+7]); + + SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1+2], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+3], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+3], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+2], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+3], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2+2], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+3], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+2], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+3], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + s00 = SIMDBase_LOAD(&s[i0+4]); s01 = SIMDBase_LOAD(&s[i0+5]); + s10 = SIMDBase_LOAD(&s[i1+4]); s11 = SIMDBase_LOAD(&s[i1+5]); + s20 = SIMDBase_LOAD(&s[i2+4]); s21 = SIMDBase_LOAD(&s[i2+5]); + s30 = SIMDBase_LOAD(&s[i3+4]); s31 = SIMDBase_LOAD(&s[i3+5]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]); + a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]); + + SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1+4], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+5], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+5], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+4], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+5], SIMDBase_ADDi(SIMDBase_MULi(t1r, 
a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2+4], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+5], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+4], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+5], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + s00 = SIMDBase_LOAD(&s[i0+6]); s01 = SIMDBase_LOAD(&s[i0+7]); + s10 = SIMDBase_LOAD(&s[i1+6]); s11 = SIMDBase_LOAD(&s[i1+7]); + s20 = SIMDBase_LOAD(&s[i2+6]); s21 = SIMDBase_LOAD(&s[i2+7]); + s30 = SIMDBase_LOAD(&s[i3+6]); s31 = SIMDBase_LOAD(&s[i3+7]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]); + a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]); + + SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1+6], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+7], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+7], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+6], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+7], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2+6], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+7], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+6], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+7], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + i0 += 8; i1 += 8; i2 += 8; i3 += 8; + p0 += 16; + } +} + +#if 1 +static void srButBackwardSubUnrolled(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i, u, v; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + // + + s20 = SIMDBase_LOAD(&s[i2+ 0]); s21 = SIMDBase_LOAD(&s[i2+ 1]); + a0 = SIMDBase_LOAD1(&tbl[p0+ 0]); a1 = SIMDBase_LOAD1(&tbl[p0+ 1]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 0]); s31 = SIMDBase_LOAD(&s[i3+ 1]); + a2 = SIMDBase_LOAD1(&tbl[p0+ 2]); a3 = SIMDBase_LOAD1(&tbl[p0+ 3]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = 
SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 0]); s01 = SIMDBase_LOAD(&s[i0+ 1]); + s10 = SIMDBase_LOAD(&s[i1+ 0]); s11 = SIMDBase_LOAD(&s[i1+ 1]); + + SIMDBase_STOR(&s[i2+ 0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 0], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 1], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+ 0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 0], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 1], SIMDBase_ADDi(s11, t1i)); + + // + + s20 = SIMDBase_LOAD(&s[i2+ 2]); s21 = SIMDBase_LOAD(&s[i2+ 3]); + a0 = SIMDBase_LOAD1(&tbl[p0+ 4]); a1 = SIMDBase_LOAD1(&tbl[p0+ 5]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 2]); s31 = SIMDBase_LOAD(&s[i3+ 3]); + a2 = SIMDBase_LOAD1(&tbl[p0+ 6]); a3 = SIMDBase_LOAD1(&tbl[p0+ 7]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 2]); s01 = SIMDBase_LOAD(&s[i0+ 3]); + s10 = SIMDBase_LOAD(&s[i1+ 2]); s11 = SIMDBase_LOAD(&s[i1+ 3]); + + SIMDBase_STOR(&s[i2+ 2], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 2], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 3], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 3], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+ 2], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 2], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 3], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 3], SIMDBase_ADDi(s11, t1i)); + + // + + s20 = SIMDBase_LOAD(&s[i2+ 4]); s21 = SIMDBase_LOAD(&s[i2+ 5]); + a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 4]); s31 = SIMDBase_LOAD(&s[i3+ 5]); + a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 4]); s01 = SIMDBase_LOAD(&s[i0+ 5]); + s10 = SIMDBase_LOAD(&s[i1+ 4]); s11 = SIMDBase_LOAD(&s[i1+ 5]); + + SIMDBase_STOR(&s[i2+ 4], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 4], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 5], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 5], SIMDBase_ADDi(s01, 
t0i)); + SIMDBase_STOR(&s[i3+ 4], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 4], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 5], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 5], SIMDBase_ADDi(s11, t1i)); + + // + + s20 = SIMDBase_LOAD(&s[i2+ 6]); s21 = SIMDBase_LOAD(&s[i2+ 7]); + a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 6]); s31 = SIMDBase_LOAD(&s[i3+ 7]); + a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 6]); s01 = SIMDBase_LOAD(&s[i0+ 7]); + s10 = SIMDBase_LOAD(&s[i1+ 6]); s11 = SIMDBase_LOAD(&s[i1+ 7]); + + SIMDBase_STOR(&s[i2+ 6], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 6], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 7], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 7], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+ 6], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 6], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 7], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 7], SIMDBase_ADDi(s11, t1i)); + + // + + i0 += 8; i1 += 8; i2 += 8; i3 += 8; + p0 += 16; + } +} +#endif + +static void r2ButForwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i2 = i0 + p->stride*2; + int32_t cp = 0, sp = p->butlen/4; + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+0], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+2], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); 
SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+4], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = SIMDBase_LOAD1(&tbl[sp-3]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+6], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + // + + i0 += 8; i2 += 8; cp += 4; sp -= 4; + } while(sp > 0); + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+1], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = SIMDBase_LOAD1(&tbl[sp+1]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+3], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+5], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+7], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + // + + i0 += 8; i2 += 8; cp -= 4; sp += 4; + } while(cp > 0); +} + +static void r2ButBackwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int i0 = p->offset1; + int i2 = i0 + p->stride*2; + + int cp = 0, sp = p->butlen/4; + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = 
SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = SIMDBase_LOAD1(&tbl[sp-3]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i)); + + i0 += 8; i2 += 8; cp += 4; sp -= 4; + } while(sp > 0); + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = SIMDBase_LOAD1(&tbl[sp+1]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); 
+ SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i)); + + i0 += 8; i2 += 8; cp -= 4; sp += 4; + } while(cp > 0); +} + +static void srButForward16(DFTUndiff *p) { + int32_t o = p->offset1; + + p->butlen = 16; p->log2butlen = 4; p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + 16*6/4; + srButForward4(p); + + p->offset1 = o + 16*4/4; + srButForward4(p); + + p->offset1 = o; + srButForward8(p); +} + +static void srButBackward16(DFTUndiff *p) { + int32_t o = p->offset1; + + p->offset1 = o + 16*6/4; + srButBackward4(p); + + p->offset1 = o + 16*4/4; + srButBackward4(p); + + p->offset1 = o; + srButBackward8(p); + + p->butlen = 16; p->log2butlen = 4; p->stride = p->butlen/2; + srButBackwardSubUnrolled(p); +} + +static void srButForward32(DFTUndiff *p) { + int32_t o = p->offset1; + + p->butlen = 32; p->log2butlen = 5; p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + 32*6/4; + srButForward8 (p); + + p->offset1 = o + 32*4/4; + srButForward8 (p); + + p->offset1 = o; + srButForward16(p); +} + +static void srButBackward32(DFTUndiff *p) { + int32_t o = p->offset1; + + p->offset1 = o + 32*6/4; + srButBackward8 (p); + + p->offset1 = o + 32*4/4; + srButBackward8 (p); + + p->offset1 = o; + srButBackward16(p); + + p->butlen = 32; p->log2butlen = 5; p->stride = p->butlen/2; + srButBackwardSubUnrolled(p); +} + +// + +#if 1 +static inline void bitReversalUnit(SIMDBase_VECT *p, SIMDBase_VECT *q) { + SIMDBase_VECT w, x, y, z; + + w = SIMDBase_LOAD(p); x = SIMDBase_LOAD(p+1); + y = SIMDBase_LOAD(q); z = SIMDBase_LOAD(q+1); + + SIMDBase_STOR(q, w); SIMDBase_STOR(q+1, x); + SIMDBase_STOR(p, y); SIMDBase_STOR(p+1, z); +} +#else +#define bitReversalUnit(p0, q0) { \ + SIMDBase_VECT *px = (p0), *qx = (q0); \ + SIMDBase_VECT wx, xx, yx, zx; \ + \ + wx = SIMDBase_LOAD(px); xx = SIMDBase_LOAD(px+1); \ + yx = SIMDBase_LOAD(qx); zx = SIMDBase_LOAD(qx+1); \ + \ + SIMDBase_STOR(qx, wx); SIMDBase_STOR(qx+1, xx); \ + SIMDBase_STOR(px, yx); SIMDBase_STOR(px+1, zx); \ +} +#endif + +static inline void bitReversal4s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int b1 = sc*2*1, b2 = b1*2; + p += b1; q += b2; + bitReversalUnit(p, q); +} + +static inline void bitReversal8s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int b1 = sc*2*1, b2 = b1*2, b4 = b2*2; + p += b1; q += b4; + bitReversalUnit(p, q); p += b2; q += b2; + bitReversalUnit(p, q); +} + +static inline void bitReversal8d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2; + bitReversalUnit(p, q); p += b1; q += b4; + bitReversalUnit(p, q); p += b2; q += b2; + bitReversalUnit(p, q); p -= b1; q -= b4; + bitReversalUnit(p, q); p += b4; q += 
b1; + bitReversalUnit(p, q); p += b1; q += b4; + bitReversalUnit(p, q); p -= b2; q -= b2; + bitReversalUnit(p, q); p -= b1; q -= b4; + bitReversalUnit(p, q); +} + +static inline void bitReversal16s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2; + p += b1; q += b8; + bitReversalUnit(p, q); p += b2; q += b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p += b1 + b4; q += b2 + b8; + bitReversalUnit(p, q); p -= b2; q -= b4; + bitReversalUnit(p, q); p += b2 + b4; q += b1 + b2; + bitReversalUnit(p, q); +} + +static inline void bitReversal16d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p += b2; q += b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p += b4; q += b2; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p -= b2; q -= b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p += b8; q += b1; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p += b2; q += b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p -= b4; q -= b2; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p -= b2; q -= b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); +} + +static inline void bitReversal32s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2, b16 = b8*2; + p += b1; q += b16; + bitReversalUnit(p, q); p += b2; q += b8; + bitReversalUnit(p, q); p -= b1; q -= b16; + bitReversalUnit(p, q); p += b4; q += b4; + bitReversalUnit(p, q); p += b1; q += b16; + bitReversalUnit(p, q); p -= b2; q -= b8; + bitReversalUnit(p, q); p += b8; q += b2; + bitReversalUnit(p, q); p += b2; q += b8; + bitReversalUnit(p, q); p -= b4; q -= b4; + bitReversalUnit(p, q); p -= b2; q -= b8; + bitReversalUnit(p, q); p += b16 - b2; q += b1 + b2 + b8; + bitReversalUnit(p, q); p -= b4; q -= b4; + bitReversalUnit(p, q); +} + +static void bitReversal32d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + const int32_t k = 32; + + bitReversal8d(s,2*sc, sc*(k/2 )+o1, sc* 1 +o2); + bitReversal8d(s,2*sc, sc* 0 +o1, sc* 0 +o2); + bitReversal8d(s,2*sc, sc* 1 +o1, sc*(k/2 )+o2); + bitReversal8d(s,2*sc, sc*(k/2+1)+o1, sc*(k/2+1)+o2); +} + +static void bitReversalRecursive(SIMDBase_VECT *s, int32_t n, int32_t sc, int32_t o1, int32_t o2) { + if (n >= 64) { + if (o1 != o2) bitReversalRecursive(s, n/4, 2*sc, sc*(n/2)+o1, sc*1+o2); + + bitReversalRecursive(s, n/4, 2*sc, sc* 0 +o1, sc* 0 +o2); + bitReversalRecursive(s, n/4, 2*sc, sc* 1 +o1, sc*(n/2 )+o2); + bitReversalRecursive(s, n/4, 2*sc, sc*(n/2+1)+o1, sc*(n/2+1)+o2); + } else { + if (o1 == o2) { + switch(n) { + case 4: bitReversal4s (s,sc,o1,o2); return; + case 8: bitReversal8s (s,sc,o1,o2); return; + case 16: bitReversal16s(s,sc,o1,o2); return; + case 32: bitReversal32s(s,sc,o1,o2); return; + } + } else { + switch(n) { + case 8: bitReversal8d (s,sc,o1,o2); return; + case 16: bitReversal16d(s,sc,o1,o2); return; + case 32: bitReversal32d(s,sc,o1,o2); return; + } + } + } +} + +// + +static int bitR(int a, int logN) { + int ret = 0; + int i,j,k; + for(i=0,j=1,k=1<<(logN-1);i<logN;i++,j=j<<1,k=k>>1) { + if ((a & j) != 0) ret |= k; + } + return ret; +} + 
+static void bitReversalCobraInplace(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int cobraQ = p->cobraQ; + SIMDBase_VECT *cobraT = p->cobraT; + int *cobraR = p->cobraR; + int logN = p->log2len; + + int b; + + for(b=0;b<(1 << (logN-2*cobraQ));b++) { + int a,c; + int b2 = bitR(b, logN-2*cobraQ); + + if (b2 < b) continue; + + if (b2 == b) { + for(a=0;a<(1 << cobraQ);a++) { + int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1); + + int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2; + + while(a2c < a2cm) { + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + } + } + + for(c=0;c<(1 << cobraQ);c++) { + int c2 = cobraR[c]; + int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1); + + int a2c = c << 1; + int a2ci = 1 << (cobraQ+1); + int c2b2a2m = c2b2a2 + (1 << cobraQ)*2; + + while(c2b2a2 < c2b2a2m) { + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], 
SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + } + } + } else { + for(a=0;a<(1 << cobraQ);a++) { + int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2; + int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1); + + while(a2c < a2cm) { + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + } + } + + for(c=0;c<(1 << cobraQ);c++) { + int c2 = cobraR[c]; + int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1); + + int a2c = c << 1; + int a2ci = 1 << (cobraQ+1); + int c2b2a2m = c2b2a2 + (1 << cobraQ)*2; + + while(c2b2a2 < 
c2b2a2m) { + SIMDBase_VECT t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = 
SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + } + } + + for(a=0;a<(1 << cobraQ);a++) { + int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2; + int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1); + + while(a2c < a2cm) { + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + } + } + } + } +} + +// + +static void srForwardMain2(DFTUndiff *p) { + int32_t o = p->offset1; + int32_t butlen = p->butlen; + int32_t log2butlen = p->log2butlen; + + if (butlen >= p->radix2thres) { + p->stride = p->butlen/2; + r2ButForwardSub(p); + + p->offset1 = o + butlen*4/4; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2(p); + + return; + } + + if (butlen >= 256) { + p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + 
butlen*6/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srForwardMain2(p); + + p->offset1 = o + butlen*4/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srForwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2(p); + + return; + } + + if (butlen == 128) { + p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + butlen*6/4; + srButForward32(p); + + p->offset1 = o + butlen*4/4; + srButForward32(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2 (p); + + return; + } + + // butlen == 64 + + p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + butlen*6/4; + srButForward16(p); + + p->offset1 = o + butlen*4/4; + srButForward16(p); + + p->offset1 = o; + srButForward32(p); +} + +static void srBackwardMain2(DFTUndiff *p) { + int32_t o = p->offset1; + int32_t butlen = p->butlen; + int32_t log2butlen = p->log2butlen; + + if (butlen >= p->radix2thres) { + p->offset1 = o + butlen*4/4; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2(p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + r2ButBackwardSub(p); + + return; + } + + if (butlen >= 256) { + p->offset1 = o + butlen*6/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srBackwardMain2(p); + + p->offset1 = o + butlen*4/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srBackwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2(p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + srButBackwardSubUnrolled(p); + + return; + } + + if (butlen == 128) { + p->offset1 = o + butlen*6/4; + srButBackward32(p); + + p->offset1 = o + butlen*4/4; + srButBackward32(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2 (p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + srButBackwardSubUnrolled(p); + + return; + } + + // butlen == 64 + + p->offset1 = o + butlen*6/4; + srButBackward16(p); + + p->offset1 = o + butlen*4/4; + srButBackward16(p); + + p->offset1 = o; + srButBackward32(p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + srButBackwardSubUnrolled(p); +} + +static void srForwardMain(DFTUndiff *p) { + if (p->length >= 64) { + p->butlen = p->length; + p->log2butlen = p->log2len; + p->offset1 = p->offset2 = 0; + + srForwardMain2(p); + } else { + switch(p->length) { + case 32: + srButForward32(p); + break; + case 16: + srButForward16(p); + break; + case 8: + srButForward8(p); + break; + case 4: + srButForward4(p); + break; + case 2: + srBut2(p); + break; + } + } +} + +static void srBackwardMain(DFTUndiff *p) { + if (p->length >= 64) { + p->butlen = p->length; + p->log2butlen = p->log2len; + p->offset1 = p->offset2 = 0; + + srBackwardMain2(p); + } else { + switch(p->length) { + case 32: + srButBackward32(p); + break; + case 16: + srButBackward16(p); + break; + case 8: + srButBackward8(p); + break; + case 4: + srButBackward4(p); + break; + case 2: + srBut2(p); + break; + } + } +} + +static void realSub0(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) { + SIMDBase_VECT tr, ti, ur, ui, mr, mi; + int32_t n = p->length*2; + int32_t k; + + for(k=1;k<n/4;k++) { + SIMDBase_VECT s00 = SIMDBase_LOAD(&s[k*2+0]), s01 = 
SIMDBase_LOAD(&s[k*2+1]); + SIMDBase_VECT s10 = SIMDBase_LOAD(&s[(n/2-k)*2+0]), s11 = SIMDBase_LOAD(&s[(n/2-k)*2+1]); + + tr = SIMDBase_SUBi(s00, s10); ti = SIMDBase_ADDi(s01, s11); + ur = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+0])); + ui = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+1])); + mr = SIMDBase_SUBi(SIMDBase_MULi(tr, ur), SIMDBase_MULi(ti, ui)); + mi = SIMDBase_ADDi(SIMDBase_MULi(tr, ui), SIMDBase_MULi(ti, ur)); + SIMDBase_STOR(&s[k*2+0], SIMDBase_SUBi(s00, mr)); + SIMDBase_STOR(&s[k*2+1], SIMDBase_SUBi(s01, mi)); + SIMDBase_STOR(&s[(n/2-k)*2+0], SIMDBase_ADDi(s10, mr)); + SIMDBase_STOR(&s[(n/2-k)*2+1], SIMDBase_SUBi(s11, mi)); + } + + tr = SIMDBase_LOAD(&s[0]); ti = SIMDBase_LOAD(&s[1]); + SIMDBase_STOR(&s[0], SIMDBase_ADDi(tr, ti)); + SIMDBase_STOR(&s[1], SIMDBase_SUBi(tr, ti)); +} + +static void realSub1(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) { + SIMDBase_VECT tr, ti, ur, ui, mr, mi; + int32_t n = p->length*2; + int32_t k; + + tr = SIMDBase_LOAD(&s[0]); ti = SIMDBase_LOAD(&s[1]); + SIMDBase_STOR(&s[0], SIMDBase_MULi(SIMDBase_ADDi(tr, ti), SIMDBase_SET1(0.5))); + SIMDBase_STOR(&s[1], SIMDBase_MULi(SIMDBase_SUBi(tr, ti), SIMDBase_SET1(0.5))); + + for(k=1;k<n/4;k++) { + SIMDBase_VECT s00 = SIMDBase_LOAD(&s[k*2+0]), s01 = SIMDBase_LOAD(&s[k*2+1]); + SIMDBase_VECT s10 = SIMDBase_LOAD(&s[(n/2-k)*2+0]), s11 = SIMDBase_LOAD(&s[(n/2-k)*2+1]); + + tr = SIMDBase_SUBi(s00, s10); ti = SIMDBase_ADDi(s01, s11); + ur = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+0])); + ui = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+1])); + mr = SIMDBase_SUBi(SIMDBase_MULi(tr, ur), SIMDBase_MULi(ti, ui)); + mi = SIMDBase_ADDi(SIMDBase_MULi(tr, ui), SIMDBase_MULi(ti, ur)); + tr = SIMDBase_SUBi(s00, mr); ti = SIMDBase_SUBi(mi, s01); + SIMDBase_STOR(&s[k*2+0], SIMDBase_ADDi(mr, s10)); + SIMDBase_STOR(&s[k*2+1], SIMDBase_SUBi(mi, s11)); + SIMDBase_STOR(&s[(n/2-k)*2+0], tr); + SIMDBase_STOR(&s[(n/2-k)*2+1], ti); + } +} + +void DFTUndiff_EXECUTE(void *p2, void *s2, int32_t dir) { + DFTUndiff *p = (DFTUndiff *)p2; + SIMDBase_VECT *s = (SIMDBase_VECT *)s2; + + if (p->magic != MAGIC_DFT) abort(); + + p->s = s; + + if (dir == -1) { + if ((p->flags & DFT_FLAG_ALT_REAL) != 0) { + realSub1(p, s, 0); + } + + srForwardMain(p); + + if ((p->flags & DFT_FLAG_NO_BITREVERSAL) == 0) { + if (p->useCobra) { + bitReversalCobraInplace(p); + } else { + bitReversalRecursive(p->s, p->length, 1, 0, 0); + } + } + + if ((p->flags & DFT_FLAG_REAL) != 0) { + realSub0(p, s, 0); + s[p->length+1] = SIMDBase_NEGi(s[p->length+1]); + } + } else { + if ((p->flags & DFT_FLAG_REAL) != 0) { + s[p->length+1] = SIMDBase_NEGi(s[p->length+1]); + realSub1(p, s, 1); + } + + if ((p->flags & DFT_FLAG_NO_BITREVERSAL) == 0) { + if (p->useCobra) { + bitReversalCobraInplace(p); + } else { + bitReversalRecursive(p->s, p->length, 1, 0, 0); + } + } + + srBackwardMain(p); + + if ((p->flags & DFT_FLAG_ALT_REAL) != 0) { + realSub0(p, s, 1); + } + } +} + +void DFTUndiff_DESTROYPLAN(void *p2) { + DFTUndiff *plan = (DFTUndiff *)p2; + if (plan->magic != MAGIC_DFT) abort(); + + free(*(plan->ptTable)); + free(plan->ptTable); + free(plan->cobraT); + free(plan->cobraR); + //free(plan->t); + if (plan->rtTable != NULL) { + free(plan->rtTable[0]); + free(plan->rtTable[1]); + free(plan->rtTable); + } + + plan->magic = 0; + free(plan); +} + +DFTUndiff *DFTUndiff_MAKEPLANSUB(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags) { + int32_t i, j, k; + + uint32_t linesize = SIMDBase_sizeOfCachelineInByte(); + uint32_t cachesize = SIMDBase_sizeOfDataCacheInByte(); + + // + + if ((flags & 
DFT_FLAG_REAL) != 0 || (flags & DFT_FLAG_ALT_REAL) != 0) n /= 2; + + DFTUndiff *d = calloc(1, sizeof(DFTUndiff)); + + d->magic = MAGIC_DFT; + d->mode = SIMDBase_MODE; + d->flags = flags; + + d->radix2thres = radix2thres; + d->useCobra = useCobra; + + d->length = (uint32_t) n; + d->log2len = DFT_ilog2((uint32_t) n); + + // + + SIMDBase_REAL *trigTable = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*n*2); + d->ptTable = malloc(sizeof(SIMDBase_REAL *) * (d->log2len+1)); + + SIMDBase_REAL *p = trigTable, **pp = d->ptTable; + + for(j=0;j<(int32_t)d->log2len+1;j++) { + *pp++ = p; + + if ((1 << j) >= d->radix2thres) { + for(i=0;i<(1 << j)/4+1;i++) { + *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j)); + } + const int32_t step = linesize / sizeof(SIMDBase_REAL); + p += (step - (p - trigTable) % step) % step; + } else { + for(i=0;i<(1 << j)/4;i++) { + *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j)); + *p++ = (SIMDBase_REAL)SIN(-2*M_PIl*i/(1 << j)); + *p++ = (SIMDBase_REAL)COS(-6*M_PIl*i/(1 << j)); + *p++ = (SIMDBase_REAL)SIN(-6*M_PIl*i/(1 << j)); + } + } + } + + // + + int32_t cobraQ; + + cobraQ = linesize / (sizeof(SIMDBase_VECT) * 2); + + for(;;) { + if (1 << (cobraQ*2) > + (cachesize / (sizeof(SIMDBase_VECT) * 2)/2)) + break; + + cobraQ++; + } + cobraQ--; + + d->cobraQ = cobraQ; + + if (cobraQ >= 4 && d->log2len >= 2*cobraQ) { + SIMDBase_VECT *cobraT; + int32_t *cobraR; + + if (d->log2len <= 2*cobraQ) cobraQ = d->log2len / 2; + + cobraT = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*2 * (1 << (cobraQ*2))); + cobraR = (int32_t *)SIMDBase_alignedMalloc(sizeof(int32_t) * (1 << cobraQ)); + + for(i=0;i<(1 << cobraQ);i++) cobraR[i] = bitR(i, cobraQ); + + d->cobraT = cobraT; d->cobraR = cobraR; + } else { + d->useCobra = 0; + } + + // + + if ((d->flags & DFT_FLAG_REAL) != 0 || (d->flags & DFT_FLAG_ALT_REAL) != 0) { + int32_t m = n*2; + + d->rtTable = malloc(sizeof(SIMDBase_REAL *)*2); + d->rtTable[0] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2); + d->rtTable[1] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2); + + for(k=0;k<m/4;k++) { + d->rtTable[0][k*2+0] = 0.5-0.5*SIN(-2*M_PIl*k/m); + d->rtTable[0][k*2+1] = 0.5*COS(-2*M_PIl*k/m); + d->rtTable[1][k*2+0] = 0.5-0.5*SIN( 2*M_PIl*k/m); + d->rtTable[1][k*2+1] = 0.5*COS( 2*M_PIl*k/m); + } + } + + // + + return (void *)d; +} + +void *DFTUndiff_MAKEPLAN(uint64_t n, uint64_t flags) { + if (flags & DFT_FLAG_VERBOSE) { + printf("\n--------------------------------\n"); + printf("Making plan, mode = %s, dft length = %d\n", SIMDBase_NAME, (int)n); + printf("Processor : %s\n", SIMDBase_getProcessorNameString()); + printf("Cache size (L2 + L3) : %d kbytes / thread\n", SIMDBase_sizeOfDataCacheInByte() / 1024); + printf("Cache Line Size : %d bytes\n", SIMDBase_sizeOfCachelineInByte()); + } + + if (n <= 256 || (flags & 3) == 0) { + return DFTUndiff_MAKEPLANSUB(n, n*2, (flags & DFT_FLAG_FORCE_COBRA) != 0, flags); + } + + SIMDBase_REAL *s1 = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*n*2); + + int32_t i, j, ts, tsbest, useCobra = 0; + double tick, tickmin; + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nWarming up before calibration ..."); + fflush(stdout); + } + + // warming up + tick = DFT_timeofday(); + while(DFT_timeofday() - tick < 0.5) + ; + + if (flags & DFT_FLAG_VERBOSE) { + printf(" done\n"); + } + + int32_t ntimes = 20000000.0 / n / DFT_ilog2(n); + if (ntimes == 0) ntimes = 1; + + if (flags & DFT_FLAG_VERBOSE) { + printf("nTimes = %d\n", ntimes); + } + + // + + DFTUndiff *plan = DFTUndiff_MAKEPLANSUB(n, n*2, 0, flags); + + 
for(i=0;i<n*2*SIMDBase_VECTLEN;i++) { + s1[i] = 0; + } + + plan->s = (SIMDBase_VECT *)s1; + + if (plan->cobraT != NULL) { + double tcobra = 0, trecur = 0; + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nChecking which bit-reversal method is faster\n"); + } + + // + + bitReversalCobraInplace(plan); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalCobraInplace(plan); + } + + tcobra += DFT_timeofday() - tick; + + // + + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + } + + trecur += DFT_timeofday() - tick; + + // + + bitReversalCobraInplace(plan); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalCobraInplace(plan); + } + + tcobra += DFT_timeofday() - tick; + + // + + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + } + + trecur += DFT_timeofday() - tick; + + // + + useCobra = tcobra < trecur; + + if ((flags & DFT_FLAG_FORCE_RECURSIVE) != 0) useCobra = 0; + if ((flags & DFT_FLAG_FORCE_COBRA) != 0) useCobra = 1; + + if (flags & DFT_FLAG_VERBOSE) { + printf("cobra : %g\n", tcobra); + printf("recur : %g\n", trecur); + if (useCobra) { + printf("will use Cobra\n"); + } else { + printf("will use the recursive reverser\n"); + } + } + } + + DFTUndiff_DESTROYPLAN(plan); + + // + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nDetermining radix 2 threshold\n"); + } + + plan = DFTUndiff_MAKEPLANSUB(n, n*2, useCobra, flags); + + for(j=0;j<ntimes;j++) { + DFTUndiff_EXECUTE(plan, s1, -1); + DFTUndiff_EXECUTE(plan, s1, 1); + } + + DFTUndiff_DESTROYPLAN(plan); + + tsbest = -1; + tickmin = 0; + + for(ts = 1024;ts <= n*2;ts *= 2) { + plan = DFTUndiff_MAKEPLANSUB(n, ts, useCobra, flags); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes;j++) { + DFTUndiff_EXECUTE(plan, s1, -1); + DFTUndiff_EXECUTE(plan, s1, 1); + } + + tick = DFT_timeofday() - tick; + + DFTUndiff_DESTROYPLAN(plan); + + if (tickmin == 0) tickmin = tick; + + if (flags & DFT_FLAG_VERBOSE) { + printf("%d : %g\n",ts, (double)tick); + } + + if (tick < tickmin) { + tickmin = tick; + tsbest = ts; + } + } + + if (tsbest == -1) tsbest = n*2;; + + if (flags & DFT_FLAG_VERBOSE) { + //printf("forcing tsbest = 1024\n"); + //tsbest = 1024; + printf("radix 2 threshold : %d\n\n", tsbest); + + double t = tickmin / ntimes / 2; + double nf = 5 * n * log(n) / log(2) / (t * 1000000); + + printf("nFlops = %d x %g\n", SIMDBase_VECTLEN, nf); + } + + plan = DFTUndiff_MAKEPLANSUB(n, tsbest, useCobra, flags); + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nDone making plan\n--------------------------------\n"); + } + + return plan; +} diff --git a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h new file mode 100644 index 00000000..d26b0d9b --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h @@ -0,0 +1,114 @@ +#ifndef __DFTIMPL_H__ +#define __DFTIMPL_H__ + +#include "SIMDBaseUndiff.h" + +#define MAGIC_DFT 0x18839f6d82bb02b6ULL + +typedef struct { + uint64_t magic; + + SIMDBase_VECT *s; + uint32_t offset1, offset2; + uint32_t butlen, log2butlen; + uint32_t stride; + + SIMDBase_REAL **ptTable; + uint32_t length, log2len; + + int32_t radix2thres, flagTrans, useCobra; + + int32_t cobraQ; + SIMDBase_VECT *cobraT; + int32_t *cobraR; + + SIMDBase_REAL **rtTable; + + uint64_t flags; + int32_t mode; +} DFTUndiff; + +#if defined(ENABLE_PUREC_FLOAT) 
//////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_float +#define DFTUndiff_EXECUTE execute_purec_float +#define DFTUndiff_MAKEPLAN makePlan_purec_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_float +#define DFTUndiff_DESTROYPLAN destroyPlan_purec_float + +#elif defined(ENABLE_PUREC_DOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_double +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_double +#define DFTUndiff_EXECUTE execute_purec_double +#define DFTUndiff_MAKEPLAN makePlan_purec_double +#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_double +#define DFTUndiff_DESTROYPLAN destroyPlan_purec_double + +#elif defined(ENABLE_PUREC_LONGDOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_longdouble +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_longdouble +#define DFTUndiff_EXECUTE execute_purec_longdouble +#define DFTUndiff_MAKEPLAN makePlan_purec_longdouble +#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_longdouble +#define DFTUndiff_DESTROYPLAN destroyPlan_purec_longdouble + +#elif defined(ENABLE_SSE_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse_float +#define DFTUndiff_EXECUTE execute_sse_float +#define DFTUndiff_MAKEPLAN makePlan_sse_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_sse_float +#define DFTUndiff_DESTROYPLAN destroyPlan_sse_float + +#elif defined(ENABLE_SSE2_DOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse2_double +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse2_double +#define DFTUndiff_EXECUTE execute_sse2_double +#define DFTUndiff_MAKEPLAN makePlan_sse2_double +#define DFTUndiff_MAKEPLANSUB makePlanSub_sse2_double +#define DFTUndiff_DESTROYPLAN destroyPlan_sse2_double + +#elif defined(ENABLE_NEON_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_neon_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_neon_float +#define DFTUndiff_EXECUTE execute_neon_float +#define DFTUndiff_MAKEPLAN makePlan_neon_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_neon_float +#define DFTUndiff_DESTROYPLAN destroyPlan_neon_float + +#elif defined(ENABLE_AVX_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_float +#define DFTUndiff_EXECUTE execute_avx_float +#define DFTUndiff_MAKEPLAN makePlan_avx_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_float +#define DFTUndiff_DESTROYPLAN destroyPlan_avx_float + +#elif defined(ENABLE_AVX_DOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_double +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_double +#define DFTUndiff_EXECUTE execute_avx_double +#define DFTUndiff_MAKEPLAN makePlan_avx_double +#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_double +#define DFTUndiff_DESTROYPLAN destroyPlan_avx_double + +#elif defined(ENABLE_ALTIVEC_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_altivec_float +#define DFTUndiff_GETMODEPARAMSTRING 
getModeParamString_altivec_float +#define DFTUndiff_EXECUTE execute_altivec_float +#define DFTUndiff_MAKEPLAN makePlan_altivec_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_altivec_float +#define DFTUndiff_DESTROYPLAN destroyPlan_altivec_float + +#endif //////////////////////////////////////////////////////////////////// + +#endif diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile b/plugins/supereq/nsfft-1.00/dft/Makefile new file mode 120000 index 00000000..5d253498 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile @@ -0,0 +1 @@ +Makefile.x86avx
\ No newline at end of file diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.altivec b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec new file mode 100644 index 00000000..fe7fc993 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec @@ -0,0 +1,26 @@ +CC=gcc +BASEOPT=-Wall -I ../simd -maltivec -mabi=altivec +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTaltivecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_ALTIVEC_FLOAT DFTUndiff.c -c -o DFTaltivecfloat.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_ALTIVEC_FLOAT DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.neon b/plugins/supereq/nsfft-1.00/dft/Makefile.neon new file mode 100644 index 00000000..111a04ae --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.neon @@ -0,0 +1,26 @@ +CC=gcc +BASEOPT=-Wall -I ../simd -mfloat-abi=softfp +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTneonfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -mfpu=neon -DENABLE_NEON_FLOAT DFTUndiff.c -c -o DFTneonfloat.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_NEON_FLOAT DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.purec b/plugins/supereq/nsfft-1.00/dft/Makefile.purec new file mode 100644 index 00000000..2c8b04f1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.purec @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c DFT.h SIMDBase.h SIMDBaseUndiff.h 
+ $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o + +SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE SIMDBase.c -c -o SIMDBase.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86 b/plugins/supereq/nsfft-1.00/dft/Makefile.x86 new file mode 100644 index 00000000..6ecbacec --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86 @@ -0,0 +1,29 @@ +CC=gcc +BASEOPT=-Wall -I ../simd +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o + +DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE DFTUndiff.c -c -o DFTsse2double.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o + +clean : + rm -f *~ *.o *.s *.a a.out diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx new file mode 100644 index 00000000..b38909cb --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall -I ../simd +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o + +DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE 
DFTUndiff.c -c -o DFTsse2double.o + +DFTavxfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT DFTUndiff.c -c -o DFTavxfloat.o + +DFTavxdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE DFTUndiff.c -c -o DFTavxdouble.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE -DENABLE_AVX_FLOAT -DENABLE_AVX_DOUBLE DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o + +clean : + rm -f *~ *.o *.s *.a a.out
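
The diff above only adds sources and Makefiles, so for orientation here is a minimal, hypothetical driver showing how the plan/execute/destroy entry points defined in DFTUndiff.c might be exercised for the purec_float build. Everything in it is inferred from the code in this diff rather than from documentation: the per-mode function names come from the DFTUndiff_* macros in DFTUndiff.h, the dir convention (-1 forward, 1 backward) from DFTUndiff_EXECUTE, and the interleaved re/im buffer layout and use of SIMDBase_alignedMalloc are assumptions.

/*
 * Hypothetical stand-alone driver for the purec_float build of this library.
 * The buffer layout, the dir convention and the allocator are assumptions
 * inferred from DFTUndiff.c, not documented API guarantees.
 */
#include <stdint.h>

#include "SIMDBase.h"   /* assumed to declare SIMDBase_alignedMalloc(); build with -I ../simd */

/* Per-mode entry points; the DFTUndiff_* macros in DFTUndiff.h resolve to
   these names when ENABLE_PUREC_FLOAT is defined. */
void *makePlan_purec_float(uint64_t n, uint64_t flags);
void  execute_purec_float(void *p, void *s, int32_t dir);
void  destroyPlan_purec_float(void *p);

int main(void) {
    uint64_t n = 1024;   /* transform length, a power of two */

    /* flags = 0 here; flags such as DFT_FLAG_VERBOSE or DFT_FLAG_REAL from
       DFT.h could be OR'ed in instead. */
    void *plan = makePlan_purec_float(n, 0);

    /* n complex values as 2*n interleaved re/im floats (assumed layout). */
    float *buf = SIMDBase_alignedMalloc(sizeof(float) * n * 2);
    for (uint64_t i = 0; i < n * 2; i++) buf[i] = 0;

    execute_purec_float(plan, buf, -1);   /* dir == -1 : forward transform  */
    execute_purec_float(plan, buf,  1);   /* dir ==  1 : backward transform */

    destroyPlan_purec_float(plan);
    /* releasing buf with the allocator's matching free routine is omitted */
    return 0;
}

Such a program would be linked against the libDFT.a produced by one of the Makefiles above (Makefile.purec, Makefile.x86, Makefile.x86avx, etc.), which is also where the ENABLE_* mode selection is made.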