diff options
Diffstat (limited to 'plugins/supereq/nsfft-1.00')
32 files changed, 6716 insertions, 0 deletions
diff --git a/plugins/supereq/nsfft-1.00/README b/plugins/supereq/nsfft-1.00/README new file mode 100644 index 00000000..1ca873b1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/README @@ -0,0 +1,15 @@ + +NSFFT (Nonrestrictive SIMD FFT) is yet another FFT library for +performing 1-dimensional fast Fourier transforms. NSFFT is a simple, +small and portable library, and it is efficient since it can utilize +SIMD instruction sets in modern processors. It performs multiple +transforms simultaneously, and thus it is especially suitable for +digital signal processing. It does not need much computation to +make a good execution plan. This library is in the public domain, so +you can incorporate it into your product without any +obligation. + +Visit http://shibatch.sourceforge.net/ to get the latest version of +this library. + +Contact : Naoki Shibata shibatch@users.sourceforge.net diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.c b/plugins/supereq/nsfft-1.00/dft/DFT.c new file mode 100644 index 00000000..d59e6ab8 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFT.c @@ -0,0 +1,327 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <stdint.h> +#include <sys/time.h> + +#include "SIMDBase.h" +#include "DFT.h" +#include "DFTUndiff.h" + +int32_t getModeParamInt_purec_float(int32_t paramId); +int32_t getModeParamInt_purec_double(int32_t paramId); +int32_t getModeParamInt_purec_longdouble(int32_t paramId); +int32_t getModeParamInt_sse_float(int32_t paramId); +int32_t getModeParamInt_sse2_double(int32_t paramId); +int32_t getModeParamInt_neon_float(int32_t paramId); +int32_t getModeParamInt_avx_float(int32_t paramId); +int32_t getModeParamInt_avx_double(int32_t paramId); +int32_t getModeParamInt_altivec_float(int32_t paramId); + +char * getModeParamString_purec_float(int32_t paramId); +char * getModeParamString_purec_double(int32_t paramId); +char * getModeParamString_purec_longdouble(int32_t paramId); +char * 
getModeParamString_sse_float(int32_t paramId); +char * getModeParamString_sse2_double(int32_t paramId); +char * getModeParamString_neon_float(int32_t paramId); +char * getModeParamString_avx_float(int32_t paramId); +char * getModeParamString_avx_double(int32_t paramId); +char * getModeParamString_altivec_float(int32_t paramId); + +void *makePlan_purec_float(uint64_t n, uint64_t flags); +void *makePlan_purec_double(uint64_t n, uint64_t flags); +void *makePlan_purec_longdouble(uint64_t n, uint64_t flags); +void *makePlan_sse_float(uint64_t n, uint64_t flags); +void *makePlan_sse2_double(uint64_t n, uint64_t flags); +void *makePlan_neon_float(uint64_t n, uint64_t flags); +void *makePlan_avx_float(uint64_t n, uint64_t flags); +void *makePlan_avx_double(uint64_t n, uint64_t flags); +void *makePlan_altivec_float(uint64_t n, uint64_t flags); + +void *makePlanSub_purec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_purec_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_purec_longdouble(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_sse_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_sse2_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_neon_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_avx_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_avx_double(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); +void *makePlanSub_altivec_float(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags); + +void destroyPlan_purec_float(void *p); +void destroyPlan_purec_double(void *p); +void destroyPlan_purec_longdouble(void *p); +void destroyPlan_sse_float(void *p); +void destroyPlan_sse2_double(void *p); +void destroyPlan_neon_float(void *p); +void 
destroyPlan_avx_float(void *p); +void destroyPlan_avx_double(void *p); +void destroyPlan_altivec_float(void *p); + +void execute_purec_float(void *p, void *s, int32_t dir); +void execute_purec_double(void *p, void *s, int32_t dir); +void execute_purec_longdouble(void *p, void *s, int32_t dir); +void execute_sse_float(void *p, void *s, int32_t dir); +void execute_sse2_double(void *p, void *s, int32_t dir); +void execute_neon_float(void *p, void *s, int32_t dir); +void execute_avx_float(void *p, void *s, int32_t dir); +void execute_avx_double(void *p, void *s, int32_t dir); +void execute_altivec_float(void *p, void *s, int32_t dir);
+/*
 * Creates a plan by dispatching to the makePlan_* function of the backend
 * selected by mode, passing n and flags through unchanged:
 *   1 = purec_float, 2 = purec_double, 3 = purec_longdouble, 4 = sse_float,
 *   5 = sse2_double, 6 = neon_float, 7 = avx_float, 8 = avx_double,
 *   9 = altivec_float.
 * A case is compiled in only when its ENABLE_* macro is defined; for any
 * other mode the function returns NULL.
 * The `break` after each `return` is never reached.
 */
+void *DFT_init(int32_t mode, uint64_t n, uint64_t flags) {
+ switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+ case 1: return makePlan_purec_float(n, flags); break;
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+ case 2: return makePlan_purec_double(n, flags); break;
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+ case 3: return makePlan_purec_longdouble(n, flags); break;
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+ case 4: return makePlan_sse_float(n, flags); break;
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+ case 5: return makePlan_sse2_double(n, flags); break;
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+ case 6: return makePlan_neon_float(n, flags); break;
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+ case 7: return makePlan_avx_float(n, flags); break;
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+ case 8: return makePlan_avx_double(n, flags); break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+ case 9: return makePlan_altivec_float(n, flags); break;
+#endif
+ default: break;
+ }
+
+ return NULL;
+}
+/*
 * Hands plan p to the destroyPlan_* function of the backend selected by mode
 * (same numbering as DFT_init). A mode without a compiled-in case does
 * nothing.
 */
+void DFT_dispose(void *p, int32_t mode) { + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: destroyPlan_purec_float(p); break; +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: destroyPlan_purec_double(p); break; +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: destroyPlan_purec_longdouble(p); break; +#endif +#if defined(ENABLE_SSE_FLOAT) 
+ case 4: destroyPlan_sse_float(p); break; +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: destroyPlan_sse2_double(p); break; +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: destroyPlan_neon_float(p); break; +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: destroyPlan_avx_float(p); break; +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: destroyPlan_avx_double(p); break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: destroyPlan_altivec_float(p); break; +#endif + default: break; + } +} +
+/*
+ * Runs plan p on the data in s with the execute_* function of the backend
+ * selected by mode (1 = purec_float, 2 = purec_double, 3 = purec_longdouble,
+ * 4 = sse_float, 5 = sse2_double, 6 = neon_float, 7 = avx_float,
+ * 8 = avx_double, 9 = altivec_float); dir is passed through unchanged.
+ * A case exists only when its ENABLE_* macro is defined; any other mode is
+ * ignored.
+ *
+ * The backends return void, so they are called as plain statements: ISO C
+ * does not allow "return expr;" inside a function whose return type is void.
+ */
+void DFT_execute(void *p, int32_t mode, void *s, int32_t dir) {
+ switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+ case 1: execute_purec_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+ case 2: execute_purec_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+ case 3: execute_purec_longdouble(p, s, dir); break;
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+ case 4: execute_sse_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+ case 5: execute_sse2_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+ case 6: execute_neon_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+ case 7: execute_avx_float(p, s, dir); break;
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+ case 8: execute_avx_double(p, s, dir); break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+ case 9: execute_altivec_float(p, s, dir); break;
+#endif
+ default: break;
+ }
+}
+ +#define FILE_FORMAT_VERSION 0 + +int32_t DFT_fwrite(void *p2, FILE *fp) { + DFTUndiff *p = (DFTUndiff *)p2; + if (p->magic != MAGIC_DFT) abort(); + + if (fprintf(fp, "nsfft file format : %d\n", FILE_FORMAT_VERSION) <= 0) return 0; + if (fprintf(fp, "arch : %s\n", SIMDBase_getProcessorNameString()) <= 0) return 0; + if (fprintf(fp, "computation mode : %d\n", p->mode) <= 0) return 0; + if (fprintf(fp, "length : %d\n", ((p->flags & DFT_FLAG_REAL) != 0 || (p->flags & DFT_FLAG_ALT_REAL) != 0)? 
p->length * 2 : p->length) <= 0) return 0; + if (fprintf(fp, "radix2 threshold : %d\n", p->radix2thres) <= 0) return 0; + if (fprintf(fp, "transpose : %d\n", p->flagTrans) <= 0) return 0; + if (fprintf(fp, "bit reversal : %d\n", p->useCobra) <= 0) return 0; + if (fprintf(fp, "flags : %llx\n", (unsigned long long int)p->flags) <= 0) return 0; + if (fprintf(fp, "%s\n", "end :") <= 0) return 0; + + return 1; +} + +static char *startsWith(char *str1, char *str2) { + if (strncmp(str1, str2, strlen(str2)) == 0) { + return str1 + strlen(str2); + } + + return NULL; +} +
+/*
+ * Records code in *errcode (when the caller supplied one) and returns NULL,
+ * so DFT_fread can bail out with a single statement.
+ */
+static DFT *DFT_freadFail(int32_t *errcode, int32_t code) {
+ if (errcode != NULL) *errcode = code;
+ return NULL;
+}
+
+/*
+ * Reads a plan description in the "key : value" text format written by
+ * DFT_fwrite and builds the corresponding plan.
+ *
+ * fp      - stream to read from; lines are consumed up to and including
+ *           the "end :" line. Lines with an unrecognised key are skipped.
+ * errcode - optional (may be NULL); set to DFT_ERROR_NOERROR on entry and
+ *           to one of the DFT_ERROR_* codes on failure.
+ *
+ * Returns the plan created by the makePlanSub_* function of the stored
+ * computation mode, or NULL on failure:
+ *   DFT_ERROR_UNEXPECTED_EOF       - stream ended before "end :"
+ *   DFT_ERROR_FILE_IO              - a value could not be parsed, or no
+ *                                    positive "length :" value was read
+ *   DFT_ERROR_FILE_VERSION         - format version newer than this reader
+ *   DFT_ERROR_MODE_NOT_AVAILABLE   - SIMDBase_detect(mode) returned 0
+ *   DFT_ERROR_MODE_NOT_COMPILED_IN - SIMDBase_detect(mode) returned -1
+ *   DFT_ERROR_UNKNOWN_MODE         - no makePlanSub_* compiled in for mode
+ */
+DFT *DFT_fread(FILE *fp, int32_t *errcode) {
+ int length = -1, radix2thres = -1, flagTrans = -1, useCobra = -1;
+ int mode = -1, formatver = -1;
+ unsigned long long int flags = (1ULL << 63);
+ void *(*makePlanSub)(uint64_t, int32_t, int32_t, uint64_t) = NULL;
+
+ /* Keys whose value is a decimal int, and the variable that receives it. */
+ struct { char *key; int *dst; } intFields[] = {
+ { "nsfft file format :", &formatver },
+ { "computation mode :", &mode },
+ { "length :", &length },
+ { "radix2 threshold :", &radix2thres },
+ { "transpose :", &flagTrans }, /* read, but not used when rebuilding the plan */
+ { "bit reversal :", &useCobra },
+ };
+
+ if (errcode != NULL) *errcode = DFT_ERROR_NOERROR;
+
+ for(;;) {
+ char buf[256], *q = NULL;
+ size_t i;
+
+ if (fgets(buf, sizeof(buf), fp) == NULL) return DFT_freadFail(errcode, DFT_ERROR_UNEXPECTED_EOF);
+
+ /* q is left non-NULL (pointing just past the key) only if entry i matched. */
+ for(i = 0;i < sizeof(intFields) / sizeof(intFields[0]);i++) {
+ if ((q = startsWith(buf, intFields[i].key)) != NULL) break;
+ }
+
+ if (q != NULL) {
+ if (1 != sscanf(q, "%d", intFields[i].dst)) return DFT_freadFail(errcode, DFT_ERROR_FILE_IO);
+ } else if ((q = startsWith(buf, "flags :")) != NULL) {
+ if (1 != sscanf(q, "%llx", &flags)) return DFT_freadFail(errcode, DFT_ERROR_FILE_IO);
+ } else if (startsWith(buf, "end :") != NULL) {
+ break;
+ }
+ }
+
+ if (formatver > FILE_FORMAT_VERSION) return DFT_freadFail(errcode, DFT_ERROR_FILE_VERSION);
+
+ switch(SIMDBase_detect(mode)) {
+ case 1:
+ break;
+ case 0:
+ return DFT_freadFail(errcode, DFT_ERROR_MODE_NOT_AVAILABLE);
+ case -1:
+ return DFT_freadFail(errcode, DFT_ERROR_MODE_NOT_COMPILED_IN);
+ default:
+ break;
+ }
+
+ switch(mode) {
+#if defined(ENABLE_PUREC_FLOAT)
+ case 1: makePlanSub = makePlanSub_purec_float; break;
+#endif
+#if defined(ENABLE_PUREC_DOUBLE)
+ case 2: makePlanSub = makePlanSub_purec_double; break;
+#endif
+#if defined(ENABLE_PUREC_LONGDOUBLE)
+ case 3: makePlanSub = makePlanSub_purec_longdouble; break;
+#endif
+#if defined(ENABLE_SSE_FLOAT)
+ case 4: makePlanSub = makePlanSub_sse_float; break;
+#endif
+#if defined(ENABLE_SSE2_DOUBLE)
+ case 5: makePlanSub = makePlanSub_sse2_double; break;
+#endif
+#if defined(ENABLE_NEON_FLOAT)
+ case 6: makePlanSub = makePlanSub_neon_float; break;
+#endif
+#if defined(ENABLE_AVX_FLOAT)
+ case 7: makePlanSub = makePlanSub_avx_float; break;
+#endif
+#if defined(ENABLE_AVX_DOUBLE)
+ case 8: makePlanSub = makePlanSub_avx_double; break;
+#endif
+#if defined(ENABLE_ALTIVEC_FLOAT)
+ case 9: makePlanSub = makePlanSub_altivec_float; break;
+#endif
+ default: break;
+ }
+
+ if (makePlanSub == NULL) return DFT_freadFail(errcode, DFT_ERROR_UNKNOWN_MODE);
+
+ /* Without this check a missing "length :" line would reach the backend */
+ /* as (uint64_t)-1, because length still holds its -1 sentinel. */
+ if (length <= 0) return DFT_freadFail(errcode, DFT_ERROR_FILE_IO);
+
+ return makePlanSub(length, radix2thres, useCobra, flags);
+}
+ +int32_t DFT_getPlanParamInt(int32_t paramId, void *p2) { + DFTUndiff *p = (DFTUndiff *)p2; + if (p->magic != MAGIC_DFT) abort(); + + switch(paramId) { + case DFT_PARAMID_MODE: return p->mode; + case 
DFT_PARAMID_FFT_LENGTH: + if ((p->flags & DFT_FLAG_REAL) != 0) return p->length * 2; + if ((p->flags & DFT_FLAG_ALT_REAL) != 0) return p->length * 2; + return p->length; + case DFT_PARAMID_IS_REAL_TRANSFORM: return (p->flags & DFT_FLAG_REAL) ? 1 : 0; + case DFT_PARAMID_IS_ALT_REAL_TRANSFORM: return (p->flags & DFT_FLAG_ALT_REAL) ? 1 : 0; + case DFT_PARAMID_NO_BIT_REVERSAL: return (p->flags & DFT_FLAG_NO_BITREVERSAL) ? 1 : 0; + case DFT_PARAMID_TEST_RUN: return p->flags & 3; + } + + return -1; +} + +#if 0 +char *DFT_getPlanParamString(int32_t paramId, void *p2) { + dft_t *p = (dft_t *)p2; + if (p->magic != MAGIC_NSDFT) abort(); + + return NULL; +} +#endif
+/*
 * Table-driven integer base-2 logarithm: for q >= 1 returns the index of the
 * highest set bit of q, i.e. floor(log2(q)). For q == 0 the final "- 1" is
 * applied to an unsigned zero, so the result wraps to 0xffffffff.
 *
 * tab[v] is the number of significant bits of the 4-bit value v (0 for 0).
 */
+uint32_t DFT_ilog2(uint32_t q) {
+ static const uint32_t tab[] = {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4};
+ uint32_t r = 0,qq;
+
+ if (q & 0xffff0000) r = 16; /* highest set bit is in the upper half */
+
+ q >>= r; /* from here on only a 16-bit value is examined */
+ qq = q | (q >> 1);
+ qq |= (qq >> 2); /* bit k of qq is now the OR of bits k..k+3 of q */
+ /* Bits 4, 8 and 12 of qq say whether nibbles 1, 2 and 3 of q are non-zero; */
+ /* pack them into a 3-bit value so that tab[qq] is the index of the highest */
+ /* non-zero nibble. */
+ qq = ((qq & 0x10) >> 4) | ((qq & 0x100) >> 7) | ((qq & 0x1000) >> 10);
+
+ /* 16-bit half offset + 4 * nibble index + bit length of that nibble - 1 */
+ return r + tab[qq] * 4 + tab[q >> (tab[qq] * 4)] - 1;
+}
+/*
 * Returns the time reported by gettimeofday() as seconds in a double, with
 * the microsecond field added as the fractional part. The return value of
 * gettimeofday() is not checked.
 */
+double DFT_timeofday(void) {
+ struct timeval tp;
+ gettimeofday(&tp, NULL);
+ return (double)tp.tv_sec+(1e-6)*tp.tv_usec;
+} diff --git a/plugins/supereq/nsfft-1.00/dft/DFT.h b/plugins/supereq/nsfft-1.00/dft/DFT.h new file mode 100644 index 00000000..facb701a --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFT.h @@ -0,0 +1,56 @@ +#ifndef __DFT_H__ +#define __DFT_H__ + +#include <stdio.h> +#include <stdint.h> + +typedef void DFT; + +int32_t DFT_getParamInt(int32_t paramId); +char *DFT_getParamString(int32_t paramId); + +int32_t DFT_getModeParamInt(int32_t paramId, int32_t mode); +char *DFT_getModeParamString(int32_t paramId, int32_t mode); + +DFT *DFT_init(int32_t mode, uint64_t n, uint64_t flags); +void DFT_dispose(DFT *p, int32_t mode); + +int32_t DFT_fwrite(DFT *p, FILE *fp); +DFT *DFT_fread(FILE *fp, int32_t *errcode); + +int32_t DFT_getPlanParamInt(int32_t paramId, void *p); + +void DFT_execute(DFT *p, int32_t mode, void *s, int32_t dir); + +uint32_t 
DFT_ilog2(uint32_t q); +double DFT_timeofday(void); + +#define DFT_FLAG_NO_TEST_RUN ( 0ULL << 0) +#define DFT_FLAG_LIGHT_TEST_RUN ( 1ULL << 0) +#define DFT_FLAG_HEAVY_TEST_RUN ( 2ULL << 0) +#define DFT_FLAG_EXHAUSTIVE_TEST_RUN ( 3ULL << 0) + +#define DFT_FLAG_REAL (1ULL << 2) +#define DFT_FLAG_ALT_REAL (1ULL << 3) +#define DFT_FLAG_VERBOSE (1ULL << 4) +#define DFT_FLAG_NO_BITREVERSAL (1ULL << 5) +#define DFT_FLAG_FORCE_RECURSIVE (1ULL << 6) +#define DFT_FLAG_FORCE_COBRA (1ULL << 7) + +#define DFT_PARAMID_TYPE ( 1 | ( 3 << 24 )) +#define DFT_PARAMID_MODE ( 2 | ( 3 << 24 )) +#define DFT_PARAMID_FFT_LENGTH ( 3 | ( 3 << 24 )) +#define DFT_PARAMID_IS_REAL_TRANSFORM ( 4 | ( 3 << 24 )) +#define DFT_PARAMID_IS_ALT_REAL_TRANSFORM ( 5 | ( 3 << 24 )) +#define DFT_PARAMID_NO_BIT_REVERSAL ( 6 | ( 3 << 24 )) +#define DFT_PARAMID_TEST_RUN ( 7 | ( 3 << 24 )) + +#define DFT_ERROR_NOERROR 0 +#define DFT_ERROR_FILE_VERSION 1 +#define DFT_ERROR_FILE_IO 2 +#define DFT_ERROR_UNEXPECTED_EOF 3 +#define DFT_ERROR_MODE_NOT_COMPILED_IN 4 +#define DFT_ERROR_MODE_NOT_AVAILABLE 5 +#define DFT_ERROR_UNKNOWN_MODE 6 + +#endif diff --git a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c new file mode 100644 index 00000000..4985da33 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.c @@ -0,0 +1,1807 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> + +#include "SIMDBase.h" +#include "SIMDBaseUndiff.h" +#include "DFT.h" +#include "DFTUndiff.h" + +// + +#define SIN(x) sin(x) +#define COS(x) cos(x) + +#define SQRT2_2 .7071067811865475244008443621048490392848359376884740365883398689953L + +#ifndef M_PIl +#define M_PIl 3.141592653589793238462643383279502884197169399375105820974944592307L +#endif + +// + +static inline void srBut2(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0, t1; + + t0 = SIMDBase_ADDm(&s[o ], &s[o+2]); t1 = SIMDBase_SUBm(&s[o ], &s[o+2]); + SIMDBase_STOR(&s[o ], 
t0); SIMDBase_STOR(&s[o+2], t1); + t0 = SIMDBase_ADDm(&s[o+1], &s[o+3]); t1 = SIMDBase_SUBm(&s[o+1], &s[o+3]); + SIMDBase_STOR(&s[o+1], t0); SIMDBase_STOR(&s[o+3], t1); +} + +static inline void srButForward4(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i; + + t0r = SIMDBase_ADDm(&s[o+0], &s[o+4]); t2r = SIMDBase_SUBm(&s[o+0], &s[o+4]); + t0i = SIMDBase_ADDm(&s[o+1], &s[o+5]); t2i = SIMDBase_SUBm(&s[o+1], &s[o+5]); + t1r = SIMDBase_ADDm(&s[o+2], &s[o+6]); t3i = SIMDBase_SUBm(&s[o+2], &s[o+6]); + t1i = SIMDBase_ADDm(&s[o+7], &s[o+3]); t3r = SIMDBase_SUBm(&s[o+7], &s[o+3]); + + SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(t0r, t1r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(t0i, t1i)); + SIMDBase_STOR(&s[o+2], SIMDBase_SUBi(t0r, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_SUBi(t0i, t1i)); + SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(t2r, t3r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(t2i, t3i)); + SIMDBase_STOR(&s[o+6], SIMDBase_ADDi(t2r, t3r)); SIMDBase_STOR(&s[o+7], SIMDBase_ADDi(t2i, t3i)); +} + +static inline void srButBackward4(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + + SIMDBase_VECT t0r, t0i, t1r, t1i; + SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+0]), s1 = SIMDBase_LOAD(&s[o+1]), s2 = SIMDBase_LOAD(&s[o+2]), s3 = SIMDBase_LOAD(&s[o+3]); + + t0r = SIMDBase_ADDi(s0, s2); t0i = SIMDBase_SUBi(s0, s2); s0 = t0r; s2 = t0i; + t0r = SIMDBase_ADDi(s1, s3); t0i = SIMDBase_SUBi(s1, s3); s1 = t0r; s3 = t0i; + t0r = SIMDBase_ADDm(&s[o+4], &s[o+6]); t1i = SIMDBase_SUBm(&s[o+4], &s[o+6]); + t0i = SIMDBase_ADDm(&s[o+7], &s[o+5]); t1r = SIMDBase_SUBm(&s[o+7], &s[o+5]); + + SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(s1, t0i)); + SIMDBase_STOR(&s[o+6], SIMDBase_SUBi(s2, t1r)); SIMDBase_STOR(&s[o+7], SIMDBase_SUBi(s3, t1i)); + SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(s0, t0r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(s1, t0i)); + SIMDBase_STOR(&s[o+2], SIMDBase_ADDi(s2, 
t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_ADDi(s3, t1i)); +} + +static inline void srButForward8(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0r, t0i, t1r, t1i, t2r, t2i, t3r, t3i; + + SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]); + SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]); + SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]); + SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]); + + t2r = SIMDBase_SUBi(s0, s8); t2i = SIMDBase_SUBi(s1, s9); + t3r = SIMDBase_SUBi(sd, s5); t3i = SIMDBase_SUBi(s4, sc); + + s0 = SIMDBase_ADDi(s0, s8); s1 = SIMDBase_ADDi(s1, s9); + s4 = SIMDBase_ADDi(s4, sc); s5 = SIMDBase_ADDi(s5, sd); + + s8 = SIMDBase_SUBi(t2r, t3r); s9 = SIMDBase_SUBi(t2i, t3i); + sc = SIMDBase_ADDi(t2r, t3r); sd = SIMDBase_ADDi(t2i, t3i); + + t2r = SIMDBase_SUBi(s2, sa); t2i = SIMDBase_SUBi(s3, sb); + t3r = SIMDBase_SUBi(sf, s7); t3i = SIMDBase_SUBi(s6, se); + + s2 = SIMDBase_ADDi(s2, sa); s3 = SIMDBase_ADDi(s3, sb); + s6 = SIMDBase_ADDi(s6, se); s7 = SIMDBase_ADDi(s7, sf); + + t0r = SIMDBase_SUBi(t2r, t3r); t1r = SIMDBase_ADDi(t2r, t3r); + t0i = SIMDBase_SUBi(t2i, t3i); t1i = SIMDBase_ADDi(t2i, t3i); + + sa = SIMDBase_MULi(SIMDBase_ADDi(t0r, t0i), SIMDBase_SET1( SQRT2_2)); + sb = SIMDBase_MULi(SIMDBase_SUBi(t0i, t0r), SIMDBase_SET1( SQRT2_2)); + se = SIMDBase_MULi(SIMDBase_SUBi(t1i, t1r), SIMDBase_SET1( SQRT2_2)); + sf = SIMDBase_MULi(SIMDBase_ADDi(t1r, t1i), SIMDBase_SET1(-SQRT2_2)); + + SIMDBase_STOR(&s[o+ 8], SIMDBase_ADDi(s8, sa)); SIMDBase_STOR(&s[o+ 9], SIMDBase_ADDi(s9, sb)); + SIMDBase_STOR(&s[o+10], SIMDBase_SUBi(s8, sa)); SIMDBase_STOR(&s[o+11], SIMDBase_SUBi(s9, sb)); + + 
SIMDBase_STOR(&s[o+12], SIMDBase_ADDi(sc, se)); SIMDBase_STOR(&s[o+13], SIMDBase_ADDi(sd, sf)); + SIMDBase_STOR(&s[o+14], SIMDBase_SUBi(sc, se)); SIMDBase_STOR(&s[o+15], SIMDBase_SUBi(sd, sf)); + + t0r = SIMDBase_ADDi(s0, s4); t2r = SIMDBase_SUBi(s0, s4); + t0i = SIMDBase_ADDi(s1, s5); t2i = SIMDBase_SUBi(s1, s5); + + t1r = SIMDBase_ADDi(s2, s6); t3i = SIMDBase_SUBi(s2, s6); + t1i = SIMDBase_ADDi(s3, s7); t3r = SIMDBase_SUBi(s7, s3); + + SIMDBase_STOR(&s[o+0], SIMDBase_ADDi(t0r, t1r)); SIMDBase_STOR(&s[o+1], SIMDBase_ADDi(t0i, t1i)); + SIMDBase_STOR(&s[o+2], SIMDBase_SUBi(t0r, t1r)); SIMDBase_STOR(&s[o+3], SIMDBase_SUBi(t0i, t1i)); + SIMDBase_STOR(&s[o+4], SIMDBase_SUBi(t2r, t3r)); SIMDBase_STOR(&s[o+5], SIMDBase_SUBi(t2i, t3i)); + SIMDBase_STOR(&s[o+6], SIMDBase_ADDi(t2r, t3r)); SIMDBase_STOR(&s[o+7], SIMDBase_ADDi(t2i, t3i)); +} + +static void srButBackward8(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int32_t o = p->offset1; + SIMDBase_VECT t0r, t0i, t1r, t1i; + + SIMDBase_VECT s0 = SIMDBase_LOAD(&s[o+ 0]), s1 = SIMDBase_LOAD(&s[o+ 1]), s2 = SIMDBase_LOAD(&s[o+ 2]), s3 = SIMDBase_LOAD(&s[o+ 3]); + SIMDBase_VECT s4 = SIMDBase_LOAD(&s[o+ 4]), s5 = SIMDBase_LOAD(&s[o+ 5]), s6 = SIMDBase_LOAD(&s[o+ 6]), s7 = SIMDBase_LOAD(&s[o+ 7]); + SIMDBase_VECT s8 = SIMDBase_LOAD(&s[o+ 8]), s9 = SIMDBase_LOAD(&s[o+ 9]), sa = SIMDBase_LOAD(&s[o+10]) ,sb = SIMDBase_LOAD(&s[o+11]); + SIMDBase_VECT sc = SIMDBase_LOAD(&s[o+12]), sd = SIMDBase_LOAD(&s[o+13]), se = SIMDBase_LOAD(&s[o+14]), sf = SIMDBase_LOAD(&s[o+15]); + + t0r = SIMDBase_ADDi(s8, sa); t0i = SIMDBase_SUBi(s8, sa); s8 = t0r; sa = t0i; + t0r = SIMDBase_ADDi(s9, sb); t0i = SIMDBase_SUBi(s9, sb); s9 = t0r; sb = t0i; + t0r = SIMDBase_ADDi(sc, se); t0i = SIMDBase_SUBi(sc, se); sc = t0r; se = t0i; + t0r = SIMDBase_ADDi(sd, sf); t0i = SIMDBase_SUBi(sd, sf); sd = t0r; sf = t0i; + t0r = SIMDBase_ADDi(s0, s2); t0i = SIMDBase_SUBi(s0, s2); s0 = t0r; s2 = t0i; + t0r = SIMDBase_ADDi(s1, s3); t0i = SIMDBase_SUBi(s1, s3); s1 = t0r; s3 
= t0i; + + t0r = SIMDBase_ADDi(s4, s6); t0i = SIMDBase_ADDi(s7, s5); + t1r = SIMDBase_SUBi(s7, s5); t1i = SIMDBase_SUBi(s4, s6); + + s4 = SIMDBase_SUBi(s0, t0r); s5 = SIMDBase_SUBi(s1, t0i); + s6 = SIMDBase_SUBi(s2, t1r); s7 = SIMDBase_SUBi(s3, t1i); + s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i); + s2 = SIMDBase_ADDi(s2, t1r); s3 = SIMDBase_ADDi(s3, t1i); + + t0r = SIMDBase_ADDi(s8, sc); t0i = SIMDBase_ADDi(s9, sd); + t1r = SIMDBase_SUBi(sd, s9); t1i = SIMDBase_SUBi(s8, sc); + + s8 = SIMDBase_SUBi(s0, t0r); s9 = SIMDBase_SUBi(s1, t0i); + sc = SIMDBase_SUBi(s4, t1r); sd = SIMDBase_SUBi(s5, t1i); + s0 = SIMDBase_ADDi(s0, t0r); s1 = SIMDBase_ADDi(s1, t0i); + s4 = SIMDBase_ADDi(s4, t1r); s5 = SIMDBase_ADDi(s5, t1i); + + t0r = SIMDBase_MULi(SIMDBase_SUBi(sa, sb), SIMDBase_SET1( SQRT2_2)); + t0i = SIMDBase_MULi(SIMDBase_ADDi(sa, sb), SIMDBase_SET1( SQRT2_2)); + t1r = SIMDBase_MULi(SIMDBase_ADDi(se, sf), SIMDBase_SET1(-SQRT2_2)); + t1i = SIMDBase_MULi(SIMDBase_SUBi(se, sf), SIMDBase_SET1( SQRT2_2)); + + sa = t0r; sb = t0i; se = t1r; sf = t1i; + + t0r = SIMDBase_ADDi(sa, se); t0i = SIMDBase_ADDi(sb, sf); + t1r = SIMDBase_SUBi(sf, sb); t1i = SIMDBase_SUBi(sa, se); + + sa = SIMDBase_SUBi(s2, t0r); sb = SIMDBase_SUBi(s3, t0i); + se = SIMDBase_SUBi(s6, t1r); sf = SIMDBase_SUBi(s7, t1i); + s2 = SIMDBase_ADDi(s2, t0r); s3 = SIMDBase_ADDi(s3, t0i); + s6 = SIMDBase_ADDi(s6, t1r); s7 = SIMDBase_ADDi(s7, t1i); + + SIMDBase_STOR(&s[o+ 0], s0); SIMDBase_STOR(&s[o+ 1], s1); SIMDBase_STOR(&s[o+ 2], s2); SIMDBase_STOR(&s[o+ 3], s3); + SIMDBase_STOR(&s[o+ 4], s4); SIMDBase_STOR(&s[o+ 5], s5); SIMDBase_STOR(&s[o+ 6], s6); SIMDBase_STOR(&s[o+ 7], s7); + SIMDBase_STOR(&s[o+ 8], s8); SIMDBase_STOR(&s[o+ 9], s9); SIMDBase_STOR(&s[o+10], sa); SIMDBase_STOR(&s[o+11], sb); + SIMDBase_STOR(&s[o+12], sc); SIMDBase_STOR(&s[o+13], sd); SIMDBase_STOR(&s[o+14], se); SIMDBase_STOR(&s[o+15], sf); +} + +#if 0 +static inline void srButForwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + 
SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + s00 = SIMDBase_LOAD(&s[i0+0]), s01 = SIMDBase_LOAD(&s[i0+1]); + s10 = SIMDBase_LOAD(&s[i1+0]), s11 = SIMDBase_LOAD(&s[i1+1]); + s20 = SIMDBase_LOAD(&s[i2+0]), s21 = SIMDBase_LOAD(&s[i2+1]); + s30 = SIMDBase_LOAD(&s[i3+0]), s31 = SIMDBase_LOAD(&s[i3+1]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]); + a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]); + + SIMDBase_STOR(&s[i0 ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1 ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2 ], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2 ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+1], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + i0 
+= 2; i1 += 2; i2 += 2; i3 += 2; + p0 += 4; + } +} +#endif + +#if 0 +static inline void srButBackwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i, u, v; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]); + a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]); + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); + + s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]); + a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]); + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]); + s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]); + + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+0], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, t1i)); + + i0 += 2; i1 += 2; i2 += 2; i3 += 2; + p0 += 4; + } +} + +static void srButBackwardSubUnrolled(DFTUndiff *p) { + srButBackwardSub(p); +} +#endif + +static inline void srButForwardSubUnrolled(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + SIMDBase_REAL *tbl = 
p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + // + + s00 = SIMDBase_LOAD(&s[i0+0]); s01 = SIMDBase_LOAD(&s[i0+1]); + s10 = SIMDBase_LOAD(&s[i1+0]); s11 = SIMDBase_LOAD(&s[i1+1]); + s20 = SIMDBase_LOAD(&s[i2+0]); s21 = SIMDBase_LOAD(&s[i2+1]); + s30 = SIMDBase_LOAD(&s[i3+0]); s31 = SIMDBase_LOAD(&s[i3+1]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+0]); a1 = SIMDBase_LOAD1(&tbl[p0+1]); + a2 = SIMDBase_LOAD1(&tbl[p0+2]); a3 = SIMDBase_LOAD1(&tbl[p0+3]); + + SIMDBase_STOR(&s[i0 ], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1 ], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+1], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2 ], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+1], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+1], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2 ], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+1], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3 ], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+1], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + s00 = 
SIMDBase_LOAD(&s[i0+2]); s01 = SIMDBase_LOAD(&s[i0+3]); + s10 = SIMDBase_LOAD(&s[i1+2]); s11 = SIMDBase_LOAD(&s[i1+3]); + s20 = SIMDBase_LOAD(&s[i2+2]); s21 = SIMDBase_LOAD(&s[i2+3]); + s30 = SIMDBase_LOAD(&s[i3+2]); s31 = SIMDBase_LOAD(&s[i3+3]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+4]); a1 = SIMDBase_LOAD1(&tbl[p0+5]); + a2 = SIMDBase_LOAD1(&tbl[p0+6]); a3 = SIMDBase_LOAD1(&tbl[p0+7]); + + SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1+2], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+3], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+3], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+2], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+3], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2+2], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+3], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+2], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+3], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + s00 = SIMDBase_LOAD(&s[i0+4]); s01 = SIMDBase_LOAD(&s[i0+5]); + s10 = SIMDBase_LOAD(&s[i1+4]); s11 = SIMDBase_LOAD(&s[i1+5]); + s20 = SIMDBase_LOAD(&s[i2+4]); s21 = SIMDBase_LOAD(&s[i2+5]); + s30 = SIMDBase_LOAD(&s[i3+4]); s31 = SIMDBase_LOAD(&s[i3+5]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), 
SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]); + a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]); + + SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s00, s20)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1+4], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+5], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+5], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+4], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+5], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2+4], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+5], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+4], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+5], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + s00 = SIMDBase_LOAD(&s[i0+6]); s01 = SIMDBase_LOAD(&s[i0+7]); + s10 = SIMDBase_LOAD(&s[i1+6]); s11 = SIMDBase_LOAD(&s[i1+7]); + s20 = SIMDBase_LOAD(&s[i2+6]); s21 = SIMDBase_LOAD(&s[i2+7]); + s30 = SIMDBase_LOAD(&s[i3+6]); s31 = SIMDBase_LOAD(&s[i3+7]); + + t0r = SIMDBase_SUBi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t0i = SIMDBase_SUBi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + t1r = SIMDBase_ADDi(SIMDBase_SUBi(s00, s20), SIMDBase_SUBi(s31, s11)); + t1i = SIMDBase_ADDi(SIMDBase_SUBi(s01, s21), SIMDBase_SUBi(s10, s30)); + + a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]); + a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]); + + SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s00, s20)); 
SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s01, s21)); + SIMDBase_STOR(&s[i1+6], SIMDBase_ADDi(s10, s30)); SIMDBase_STOR(&s[i1+7], SIMDBase_ADDi(s11, s31)); + +#ifndef SIMDBase_FMADD_AVAILABLE + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0r, a0), SIMDBase_MULi(t0i, a1))); + SIMDBase_STOR(&s[i2+7], SIMDBase_ADDi(SIMDBase_MULi(t0r, a1), SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+6], SIMDBase_SUBi(SIMDBase_MULi(t1r, a2), SIMDBase_MULi(t1i, a3))); + SIMDBase_STOR(&s[i3+7], SIMDBase_ADDi(SIMDBase_MULi(t1r, a3), SIMDBase_MULi(t1i, a2))); +#else + SIMDBase_STOR(&s[i2+6], SIMDBase_FMSUBi(t0i, a1, SIMDBase_MULi(t0r, a0))); + SIMDBase_STOR(&s[i2+7], SIMDBase_FMADDi(t0r, a1, SIMDBase_MULi(t0i, a0))); + SIMDBase_STOR(&s[i3+6], SIMDBase_FMSUBi(t1i, a3, SIMDBase_MULi(t1r, a2))); + SIMDBase_STOR(&s[i3+7], SIMDBase_FMADDi(t1r, a3, SIMDBase_MULi(t1i, a2))); +#endif + + // + + i0 += 8; i1 += 8; i2 += 8; i3 += 8; + p0 += 16; + } +} + +#if 1 +static void srButBackwardSubUnrolled(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i1 = i0 + p->stride; + int32_t i2 = i1 + p->stride; + int32_t i3 = i2 + p->stride; + int32_t im = i1; + + int32_t p0 = p->offset2 & (p->butlen*4-1); + + while(i0 < im) { + SIMDBase_VECT t0r, t0i, t1r, t1i, u, v; + SIMDBase_VECT s00, s01, s10, s11, s20, s21, s30, s31; + SIMDBase_VECT a0, a1, a2, a3; + + // + + s20 = SIMDBase_LOAD(&s[i2+ 0]); s21 = SIMDBase_LOAD(&s[i2+ 1]); + a0 = SIMDBase_LOAD1(&tbl[p0+ 0]); a1 = SIMDBase_LOAD1(&tbl[p0+ 1]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 0]); s31 = SIMDBase_LOAD(&s[i3+ 1]); + a2 = SIMDBase_LOAD1(&tbl[p0+ 2]); a3 = SIMDBase_LOAD1(&tbl[p0+ 3]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = 
SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 0]); s01 = SIMDBase_LOAD(&s[i0+ 1]); + s10 = SIMDBase_LOAD(&s[i1+ 0]); s11 = SIMDBase_LOAD(&s[i1+ 1]); + + SIMDBase_STOR(&s[i2+ 0], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 0], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 1], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 1], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+ 0], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 0], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 1], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 1], SIMDBase_ADDi(s11, t1i)); + + // + + s20 = SIMDBase_LOAD(&s[i2+ 2]); s21 = SIMDBase_LOAD(&s[i2+ 3]); + a0 = SIMDBase_LOAD1(&tbl[p0+ 4]); a1 = SIMDBase_LOAD1(&tbl[p0+ 5]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 2]); s31 = SIMDBase_LOAD(&s[i3+ 3]); + a2 = SIMDBase_LOAD1(&tbl[p0+ 6]); a3 = SIMDBase_LOAD1(&tbl[p0+ 7]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, 
SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 2]); s01 = SIMDBase_LOAD(&s[i0+ 3]); + s10 = SIMDBase_LOAD(&s[i1+ 2]); s11 = SIMDBase_LOAD(&s[i1+ 3]); + + SIMDBase_STOR(&s[i2+ 2], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 2], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 3], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 3], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+ 2], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 2], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 3], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 3], SIMDBase_ADDi(s11, t1i)); + + // + + s20 = SIMDBase_LOAD(&s[i2+ 4]); s21 = SIMDBase_LOAD(&s[i2+ 5]); + a0 = SIMDBase_LOAD1(&tbl[p0+ 8]); a1 = SIMDBase_LOAD1(&tbl[p0+ 9]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 4]); s31 = SIMDBase_LOAD(&s[i3+ 5]); + a2 = SIMDBase_LOAD1(&tbl[p0+10]); a3 = SIMDBase_LOAD1(&tbl[p0+11]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 4]); s01 = SIMDBase_LOAD(&s[i0+ 5]); + s10 = SIMDBase_LOAD(&s[i1+ 4]); s11 = SIMDBase_LOAD(&s[i1+ 5]); + + SIMDBase_STOR(&s[i2+ 4], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 4], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 5], SIMDBase_SUBi(s01, t0i)); 
SIMDBase_STOR(&s[i0+ 5], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+ 4], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 4], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 5], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 5], SIMDBase_ADDi(s11, t1i)); + + // + + s20 = SIMDBase_LOAD(&s[i2+ 6]); s21 = SIMDBase_LOAD(&s[i2+ 7]); + a0 = SIMDBase_LOAD1(&tbl[p0+12]); a1 = SIMDBase_LOAD1(&tbl[p0+13]); +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_ADDi(SIMDBase_MULi(s20, a0), SIMDBase_MULi(s21, a1)); +#else + u = SIMDBase_FMADDi(s20, a0, SIMDBase_MULi(s21, a1)); +#endif + + s30 = SIMDBase_LOAD(&s[i3+ 6]); s31 = SIMDBase_LOAD(&s[i3+ 7]); + a2 = SIMDBase_LOAD1(&tbl[p0+14]); a3 = SIMDBase_LOAD1(&tbl[p0+15]); +#ifndef SIMDBase_FMADD_AVAILABLE + v = SIMDBase_ADDi(SIMDBase_MULi(s30, a2), SIMDBase_MULi(s31, a3)); +#else + v = SIMDBase_FMADDi(s30, a2, SIMDBase_MULi(s31, a3)); +#endif + + t0r = SIMDBase_ADDi(u, v); t1i = SIMDBase_SUBi(u, v); + +#ifndef SIMDBase_FMADD_AVAILABLE + u = SIMDBase_SUBi(SIMDBase_MULi(s31, a2), SIMDBase_MULi(s30, a3)); + v = SIMDBase_SUBi(SIMDBase_MULi(s21, a0), SIMDBase_MULi(s20, a1)); +#else + u = SIMDBase_FMSUBi(s30, a3, SIMDBase_MULi(s31, a2)); + v = SIMDBase_FMSUBi(s20, a1, SIMDBase_MULi(s21, a0)); +#endif + t0i = SIMDBase_ADDi(u, v); t1r = SIMDBase_SUBi(u, v); + + s00 = SIMDBase_LOAD(&s[i0+ 6]); s01 = SIMDBase_LOAD(&s[i0+ 7]); + s10 = SIMDBase_LOAD(&s[i1+ 6]); s11 = SIMDBase_LOAD(&s[i1+ 7]); + + SIMDBase_STOR(&s[i2+ 6], SIMDBase_SUBi(s00, t0r)); SIMDBase_STOR(&s[i0+ 6], SIMDBase_ADDi(s00, t0r)); + SIMDBase_STOR(&s[i2+ 7], SIMDBase_SUBi(s01, t0i)); SIMDBase_STOR(&s[i0+ 7], SIMDBase_ADDi(s01, t0i)); + SIMDBase_STOR(&s[i3+ 6], SIMDBase_SUBi(s10, t1r)); SIMDBase_STOR(&s[i1+ 6], SIMDBase_ADDi(s10, t1r)); + SIMDBase_STOR(&s[i3+ 7], SIMDBase_SUBi(s11, t1i)); SIMDBase_STOR(&s[i1+ 7], SIMDBase_ADDi(s11, t1i)); + + // + + i0 += 8; i1 += 8; i2 += 8; i3 += 8; + p0 += 16; + } +} +#endif + +static void r2ButForwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = 
p->s; + + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int32_t i0 = p->offset1; + int32_t i2 = i0 + p->stride*2; + int32_t cp = 0, sp = p->butlen/4; + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+0], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+2], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+4], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = 
SIMDBase_LOAD1(&tbl[sp-3]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+6], SIMDBase_ADDi(SIMDBase_MULi(t0r, t0), SIMDBase_MULi(t0i, t1))); + SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1))); + + // + + i0 += 8; i2 += 8; cp += 4; sp -= 4; + } while(sp > 0); + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+1], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = SIMDBase_LOAD1(&tbl[sp+1]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+3], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(SIMDBase_MULi(t0i, 
t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+5], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]); + t0r = SIMDBase_SUBi(s0, s1); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, s1)); + t0i = SIMDBase_SUBi(s2, s3); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, s3)); + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(SIMDBase_MULi(t0i, t1), SIMDBase_MULi(t0r, t0))); + SIMDBase_STOR(&s[i2+7], SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(t0i, t0), SIMDBase_MULi(t0r, t1)))); + + // + + i0 += 8; i2 += 8; cp -= 4; sp += 4; + } while(cp > 0); +} + +static void r2ButBackwardSub(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + + SIMDBase_REAL *tbl = p->ptTable[p->log2butlen]; + + int i0 = p->offset1; + int i2 = i0 + p->stride*2; + + int cp = 0, sp = p->butlen/4; + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = SIMDBase_LOAD1(&tbl[cp+0]); t1 = SIMDBase_LOAD1(&tbl[sp-0]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp+1]); t1 = SIMDBase_LOAD1(&tbl[sp-1]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r)); + 
SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp+2]); t1 = SIMDBase_LOAD1(&tbl[sp-2]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp+3]); t1 = SIMDBase_LOAD1(&tbl[sp-3]); + t0r = SIMDBase_SUBi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1)); + t0i = SIMDBase_ADDi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i)); + + i0 += 8; i2 += 8; cp += 4; sp -= 4; + } while(sp > 0); + + do { + SIMDBase_VECT t0r, t0i, s0, s1, s2, s3, t0, t1; + + s0 = SIMDBase_LOAD(&s[i0+0]); s2 = SIMDBase_LOAD(&s[i0+1]); + s1 = SIMDBase_LOAD(&s[i2+0]); s3 = SIMDBase_LOAD(&s[i2+1]); + t0 = SIMDBase_LOAD1(&tbl[cp-0]); t1 = SIMDBase_LOAD1(&tbl[sp+0]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+0], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+0], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+1], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+1], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+2]); s2 = SIMDBase_LOAD(&s[i0+3]); + s1 = SIMDBase_LOAD(&s[i2+2]); s3 = SIMDBase_LOAD(&s[i2+3]); + t0 = SIMDBase_LOAD1(&tbl[cp-1]); t1 = 
SIMDBase_LOAD1(&tbl[sp+1]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+2], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+2], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+3], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+3], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+4]); s2 = SIMDBase_LOAD(&s[i0+5]); + s1 = SIMDBase_LOAD(&s[i2+4]); s3 = SIMDBase_LOAD(&s[i2+5]); + t0 = SIMDBase_LOAD1(&tbl[cp-2]); t1 = SIMDBase_LOAD1(&tbl[sp+2]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+4], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+4], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+5], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+5], SIMDBase_ADDi(s2, t0i)); + + s0 = SIMDBase_LOAD(&s[i0+6]); s2 = SIMDBase_LOAD(&s[i0+7]); + s1 = SIMDBase_LOAD(&s[i2+6]); s3 = SIMDBase_LOAD(&s[i2+7]); + t0 = SIMDBase_LOAD1(&tbl[cp-3]); t1 = SIMDBase_LOAD1(&tbl[sp+3]); + t0r = SIMDBase_NEGi(SIMDBase_ADDi(SIMDBase_MULi(s1, t0), SIMDBase_MULi(s3, t1))); + t0i = SIMDBase_SUBi(SIMDBase_MULi(s1, t1), SIMDBase_MULi(s3, t0)); + SIMDBase_STOR(&s[i2+6], SIMDBase_SUBi(s0, t0r)); SIMDBase_STOR(&s[i0+6], SIMDBase_ADDi(s0, t0r)); + SIMDBase_STOR(&s[i2+7], SIMDBase_SUBi(s2, t0i)); SIMDBase_STOR(&s[i0+7], SIMDBase_ADDi(s2, t0i)); + + i0 += 8; i2 += 8; cp -= 4; sp += 4; + } while(cp > 0); +} + +static void srButForward16(DFTUndiff *p) { + int32_t o = p->offset1; + + p->butlen = 16; p->log2butlen = 4; p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + 16*6/4; + srButForward4(p); + + p->offset1 = o + 16*4/4; + srButForward4(p); + + p->offset1 = o; + srButForward8(p); +} + +static void srButBackward16(DFTUndiff *p) { + int32_t o = p->offset1; + + p->offset1 = o + 16*6/4; + srButBackward4(p); + + p->offset1 = o + 16*4/4; + 
srButBackward4(p); + + p->offset1 = o; + srButBackward8(p); + + p->butlen = 16; p->log2butlen = 4; p->stride = p->butlen/2; + srButBackwardSubUnrolled(p); +} + +static void srButForward32(DFTUndiff *p) { + int32_t o = p->offset1; + + p->butlen = 32; p->log2butlen = 5; p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + 32*6/4; + srButForward8 (p); + + p->offset1 = o + 32*4/4; + srButForward8 (p); + + p->offset1 = o; + srButForward16(p); +} + +static void srButBackward32(DFTUndiff *p) { + int32_t o = p->offset1; + + p->offset1 = o + 32*6/4; + srButBackward8 (p); + + p->offset1 = o + 32*4/4; + srButBackward8 (p); + + p->offset1 = o; + srButBackward16(p); + + p->butlen = 32; p->log2butlen = 5; p->stride = p->butlen/2; + srButBackwardSubUnrolled(p); +} + +// + +#if 1 +static inline void bitReversalUnit(SIMDBase_VECT *p, SIMDBase_VECT *q) { + SIMDBase_VECT w, x, y, z; + + w = SIMDBase_LOAD(p); x = SIMDBase_LOAD(p+1); + y = SIMDBase_LOAD(q); z = SIMDBase_LOAD(q+1); + + SIMDBase_STOR(q, w); SIMDBase_STOR(q+1, x); + SIMDBase_STOR(p, y); SIMDBase_STOR(p+1, z); +} +#else +#define bitReversalUnit(p0, q0) { \ + SIMDBase_VECT *px = (p0), *qx = (q0); \ + SIMDBase_VECT wx, xx, yx, zx; \ + \ + wx = SIMDBase_LOAD(px); xx = SIMDBase_LOAD(px+1); \ + yx = SIMDBase_LOAD(qx); zx = SIMDBase_LOAD(qx+1); \ + \ + SIMDBase_STOR(qx, wx); SIMDBase_STOR(qx+1, xx); \ + SIMDBase_STOR(px, yx); SIMDBase_STOR(px+1, zx); \ +} +#endif + +static inline void bitReversal4s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int b1 = sc*2*1, b2 = b1*2; + p += b1; q += b2; + bitReversalUnit(p, q); +} + +static inline void bitReversal8s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int b1 = sc*2*1, b2 = b1*2, b4 = b2*2; + p += b1; q += b4; + bitReversalUnit(p, q); p += b2; q += b2; + bitReversalUnit(p, q); +} + +static inline void bitReversal8d(SIMDBase_VECT *s, int32_t sc, 
int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2; + bitReversalUnit(p, q); p += b1; q += b4; + bitReversalUnit(p, q); p += b2; q += b2; + bitReversalUnit(p, q); p -= b1; q -= b4; + bitReversalUnit(p, q); p += b4; q += b1; + bitReversalUnit(p, q); p += b1; q += b4; + bitReversalUnit(p, q); p -= b2; q -= b2; + bitReversalUnit(p, q); p -= b1; q -= b4; + bitReversalUnit(p, q); +} + +static inline void bitReversal16s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2; + p += b1; q += b8; + bitReversalUnit(p, q); p += b2; q += b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p += b1 + b4; q += b2 + b8; + bitReversalUnit(p, q); p -= b2; q -= b4; + bitReversalUnit(p, q); p += b2 + b4; q += b1 + b2; + bitReversalUnit(p, q); +} + +static inline void bitReversal16d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p += b2; q += b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p += b4; q += b2; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p -= b2; q -= b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p += b8; q += b1; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p += b2; q += b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); p -= b4; q -= b2; + bitReversalUnit(p, q); p += b1; q += b8; + bitReversalUnit(p, q); p -= b2; q -= b4; + bitReversalUnit(p, q); p -= b1; q -= b8; + bitReversalUnit(p, q); +} + +static inline void bitReversal32s(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + SIMDBase_VECT *p = &s[o1*2], *q = &s[o2*2]; + int32_t b1 = sc*2*1, b2 = b1*2, b4 = b2*2, b8 = b4*2, b16 = b8*2; 
+ p += b1; q += b16; + bitReversalUnit(p, q); p += b2; q += b8; + bitReversalUnit(p, q); p -= b1; q -= b16; + bitReversalUnit(p, q); p += b4; q += b4; + bitReversalUnit(p, q); p += b1; q += b16; + bitReversalUnit(p, q); p -= b2; q -= b8; + bitReversalUnit(p, q); p += b8; q += b2; + bitReversalUnit(p, q); p += b2; q += b8; + bitReversalUnit(p, q); p -= b4; q -= b4; + bitReversalUnit(p, q); p -= b2; q -= b8; + bitReversalUnit(p, q); p += b16 - b2; q += b1 + b2 + b8; + bitReversalUnit(p, q); p -= b4; q -= b4; + bitReversalUnit(p, q); +} + +static void bitReversal32d(SIMDBase_VECT *s, int32_t sc, int32_t o1, int32_t o2) { + const int32_t k = 32; + + bitReversal8d(s,2*sc, sc*(k/2 )+o1, sc* 1 +o2); + bitReversal8d(s,2*sc, sc* 0 +o1, sc* 0 +o2); + bitReversal8d(s,2*sc, sc* 1 +o1, sc*(k/2 )+o2); + bitReversal8d(s,2*sc, sc*(k/2+1)+o1, sc*(k/2+1)+o2); +} + +static void bitReversalRecursive(SIMDBase_VECT *s, int32_t n, int32_t sc, int32_t o1, int32_t o2) { + if (n >= 64) { + if (o1 != o2) bitReversalRecursive(s, n/4, 2*sc, sc*(n/2)+o1, sc*1+o2); + + bitReversalRecursive(s, n/4, 2*sc, sc* 0 +o1, sc* 0 +o2); + bitReversalRecursive(s, n/4, 2*sc, sc* 1 +o1, sc*(n/2 )+o2); + bitReversalRecursive(s, n/4, 2*sc, sc*(n/2+1)+o1, sc*(n/2+1)+o2); + } else { + if (o1 == o2) { + switch(n) { + case 4: bitReversal4s (s,sc,o1,o2); return; + case 8: bitReversal8s (s,sc,o1,o2); return; + case 16: bitReversal16s(s,sc,o1,o2); return; + case 32: bitReversal32s(s,sc,o1,o2); return; + } + } else { + switch(n) { + case 8: bitReversal8d (s,sc,o1,o2); return; + case 16: bitReversal16d(s,sc,o1,o2); return; + case 32: bitReversal32d(s,sc,o1,o2); return; + } + } + } +} + +// + +static int bitR(int a, int logN) { + int ret = 0; + int i,j,k; + for(i=0,j=1,k=1<<(logN-1);i<logN;i++,j=j<<1,k=k>>1) { + if ((a & j) != 0) ret |= k; + } + return ret; +} + +static void bitReversalCobraInplace(DFTUndiff *p) { + SIMDBase_VECT *s = p->s; + int cobraQ = p->cobraQ; + SIMDBase_VECT *cobraT = p->cobraT; + int *cobraR = 
p->cobraR; + int logN = p->log2len; + + int b; + + for(b=0;b<(1 << (logN-2*cobraQ));b++) { + int a,c; + int b2 = bitR(b, logN-2*cobraQ); + + if (b2 < b) continue; + + if (b2 == b) { + for(a=0;a<(1 << cobraQ);a++) { + int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1); + + int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2; + + while(a2c < a2cm) { + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + 
SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + } + } + + for(c=0;c<(1 << cobraQ);c++) { + int c2 = cobraR[c]; + int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1); + + int a2c = c << 1; + int a2ci = 1 << (cobraQ+1); + int c2b2a2m = c2b2a2 + (1 << cobraQ)*2; + + while(c2b2a2 < c2b2a2m) { + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], 
SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); a2c += a2ci; + } + } + } else { + for(a=0;a<(1 << cobraQ);a++) { + int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2; + int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1); + + while(a2c < a2cm) { + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], 
SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); SIMDBase_STOR(&cobraT[a2c++], SIMDBase_LOAD(&s[abc++])); + } + } + + for(c=0;c<(1 << cobraQ);c++) { + int c2 = cobraR[c]; + int c2b2a2 = ((c2 << (logN-2*cobraQ)) | b2) << (cobraQ+1); + + int a2c = c << 1; + int a2ci = 1 << (cobraQ+1); + int c2b2a2m = c2b2a2 + (1 << cobraQ)*2; + + while(c2b2a2 < c2b2a2m) { + SIMDBase_VECT t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); 
SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c 
])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + + t0 = SIMDBase_LOAD(&s[c2b2a2+0]); t1 = SIMDBase_LOAD(&s[c2b2a2+1]); + t2 = SIMDBase_LOAD(&s[c2b2a2+2]); t3 = SIMDBase_LOAD(&s[c2b2a2+3]); + t4 = SIMDBase_LOAD(&s[c2b2a2+4]); t5 = SIMDBase_LOAD(&s[c2b2a2+5]); + t6 = SIMDBase_LOAD(&s[c2b2a2+6]); t7 = SIMDBase_LOAD(&s[c2b2a2+7]); + + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t0); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t1); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t2); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t3); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t4); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t5); a2c += a2ci; + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c ])); SIMDBase_STOR(&cobraT[a2c ], t6); + SIMDBase_STOR(&s[c2b2a2++], SIMDBase_LOAD(&cobraT[a2c+1])); SIMDBase_STOR(&cobraT[a2c+1], t7); a2c += a2ci; + } + } + + for(a=0;a<(1 << cobraQ);a++) { + int a2c = (cobraR[a] << cobraQ) << 1, a2cm = a2c+(1 << cobraQ)*2; + int abc = ((a << (logN-2*cobraQ)) | b) << (cobraQ + 1); + + while(a2c < a2cm) { + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); 
+ SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); SIMDBase_STOR(&s[abc++], SIMDBase_LOAD(&cobraT[a2c++])); + } + } + } + } +} + +// + +static void srForwardMain2(DFTUndiff *p) { + int32_t o = p->offset1; + int32_t butlen = p->butlen; + int32_t log2butlen = p->log2butlen; + + if (butlen >= p->radix2thres) { + p->stride = p->butlen/2; + r2ButForwardSub(p); + + p->offset1 = o + butlen*4/4; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2(p); + + return; + } + + if (butlen >= 
256) { + p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + butlen*6/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srForwardMain2(p); + + p->offset1 = o + butlen*4/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srForwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2(p); + + return; + } + + if (butlen == 128) { + p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + butlen*6/4; + srButForward32(p); + + p->offset1 = o + butlen*4/4; + srButForward32(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srForwardMain2 (p); + + return; + } + + // butlen == 64 + + p->stride = p->butlen/2; + srButForwardSubUnrolled(p); + + p->offset1 = o + butlen*6/4; + srButForward16(p); + + p->offset1 = o + butlen*4/4; + srButForward16(p); + + p->offset1 = o; + srButForward32(p); +} + +static void srBackwardMain2(DFTUndiff *p) { + int32_t o = p->offset1; + int32_t butlen = p->butlen; + int32_t log2butlen = p->log2butlen; + + if (butlen >= p->radix2thres) { + p->offset1 = o + butlen*4/4; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2(p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + r2ButBackwardSub(p); + + return; + } + + if (butlen >= 256) { + p->offset1 = o + butlen*6/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srBackwardMain2(p); + + p->offset1 = o + butlen*4/4; + p->butlen = butlen/4; + p->log2butlen = log2butlen-2; + srBackwardMain2(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2(p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + srButBackwardSubUnrolled(p); + + return; + } + + if (butlen == 128) { + p->offset1 = o + butlen*6/4; + srButBackward32(p); + + p->offset1 
= o + butlen*4/4; + srButBackward32(p); + + p->offset1 = o; + p->butlen = butlen/2; + p->log2butlen = log2butlen-1; + srBackwardMain2 (p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + srButBackwardSubUnrolled(p); + + return; + } + + // butlen == 64 + + p->offset1 = o + butlen*6/4; + srButBackward16(p); + + p->offset1 = o + butlen*4/4; + srButBackward16(p); + + p->offset1 = o; + srButBackward32(p); + + p->butlen = butlen; + p->stride = p->butlen/2; + p->log2butlen = log2butlen; + srButBackwardSubUnrolled(p); +} + +static void srForwardMain(DFTUndiff *p) { + if (p->length >= 64) { + p->butlen = p->length; + p->log2butlen = p->log2len; + p->offset1 = p->offset2 = 0; + + srForwardMain2(p); + } else { + switch(p->length) { + case 32: + srButForward32(p); + break; + case 16: + srButForward16(p); + break; + case 8: + srButForward8(p); + break; + case 4: + srButForward4(p); + break; + case 2: + srBut2(p); + break; + } + } +} + +static void srBackwardMain(DFTUndiff *p) { + if (p->length >= 64) { + p->butlen = p->length; + p->log2butlen = p->log2len; + p->offset1 = p->offset2 = 0; + + srBackwardMain2(p); + } else { + switch(p->length) { + case 32: + srButBackward32(p); + break; + case 16: + srButBackward16(p); + break; + case 8: + srButBackward8(p); + break; + case 4: + srButBackward4(p); + break; + case 2: + srBut2(p); + break; + } + } +} + +static void realSub0(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) { + SIMDBase_VECT tr, ti, ur, ui, mr, mi; + int32_t n = p->length*2; + int32_t k; + + for(k=1;k<n/4;k++) { + SIMDBase_VECT s00 = SIMDBase_LOAD(&s[k*2+0]), s01 = SIMDBase_LOAD(&s[k*2+1]); + SIMDBase_VECT s10 = SIMDBase_LOAD(&s[(n/2-k)*2+0]), s11 = SIMDBase_LOAD(&s[(n/2-k)*2+1]); + + tr = SIMDBase_SUBi(s00, s10); ti = SIMDBase_ADDi(s01, s11); + ur = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+0])); + ui = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+1])); + mr = SIMDBase_SUBi(SIMDBase_MULi(tr, ur), SIMDBase_MULi(ti, ui)); + mi = 
SIMDBase_ADDi(SIMDBase_MULi(tr, ui), SIMDBase_MULi(ti, ur)); + SIMDBase_STOR(&s[k*2+0], SIMDBase_SUBi(s00, mr)); + SIMDBase_STOR(&s[k*2+1], SIMDBase_SUBi(s01, mi)); + SIMDBase_STOR(&s[(n/2-k)*2+0], SIMDBase_ADDi(s10, mr)); + SIMDBase_STOR(&s[(n/2-k)*2+1], SIMDBase_SUBi(s11, mi)); + } + + tr = SIMDBase_LOAD(&s[0]); ti = SIMDBase_LOAD(&s[1]); + SIMDBase_STOR(&s[0], SIMDBase_ADDi(tr, ti)); + SIMDBase_STOR(&s[1], SIMDBase_SUBi(tr, ti)); +} + +static void realSub1(DFTUndiff *p, SIMDBase_VECT *s, int32_t ts) { + SIMDBase_VECT tr, ti, ur, ui, mr, mi; + int32_t n = p->length*2; + int32_t k; + + tr = SIMDBase_LOAD(&s[0]); ti = SIMDBase_LOAD(&s[1]); + SIMDBase_STOR(&s[0], SIMDBase_MULi(SIMDBase_ADDi(tr, ti), SIMDBase_SET1(0.5))); + SIMDBase_STOR(&s[1], SIMDBase_MULi(SIMDBase_SUBi(tr, ti), SIMDBase_SET1(0.5))); + + for(k=1;k<n/4;k++) { + SIMDBase_VECT s00 = SIMDBase_LOAD(&s[k*2+0]), s01 = SIMDBase_LOAD(&s[k*2+1]); + SIMDBase_VECT s10 = SIMDBase_LOAD(&s[(n/2-k)*2+0]), s11 = SIMDBase_LOAD(&s[(n/2-k)*2+1]); + + tr = SIMDBase_SUBi(s00, s10); ti = SIMDBase_ADDi(s01, s11); + ur = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+0])); + ui = SIMDBase_LOAD1(&(p->rtTable[ts][k*2+1])); + mr = SIMDBase_SUBi(SIMDBase_MULi(tr, ur), SIMDBase_MULi(ti, ui)); + mi = SIMDBase_ADDi(SIMDBase_MULi(tr, ui), SIMDBase_MULi(ti, ur)); + tr = SIMDBase_SUBi(s00, mr); ti = SIMDBase_SUBi(mi, s01); + SIMDBase_STOR(&s[k*2+0], SIMDBase_ADDi(mr, s10)); + SIMDBase_STOR(&s[k*2+1], SIMDBase_SUBi(mi, s11)); + SIMDBase_STOR(&s[(n/2-k)*2+0], tr); + SIMDBase_STOR(&s[(n/2-k)*2+1], ti); + } +} + +void DFTUndiff_EXECUTE(void *p2, void *s2, int32_t dir) { + DFTUndiff *p = (DFTUndiff *)p2; + SIMDBase_VECT *s = (SIMDBase_VECT *)s2; + + if (p->magic != MAGIC_DFT) abort(); + + p->s = s; + + if (dir == -1) { + if ((p->flags & DFT_FLAG_ALT_REAL) != 0) { + realSub1(p, s, 0); + } + + srForwardMain(p); + + if ((p->flags & DFT_FLAG_NO_BITREVERSAL) == 0) { + if (p->useCobra) { + bitReversalCobraInplace(p); + } else { + 
bitReversalRecursive(p->s, p->length, 1, 0, 0); + } + } + + if ((p->flags & DFT_FLAG_REAL) != 0) { + realSub0(p, s, 0); + s[p->length+1] = SIMDBase_NEGi(s[p->length+1]); + } + } else { + if ((p->flags & DFT_FLAG_REAL) != 0) { + s[p->length+1] = SIMDBase_NEGi(s[p->length+1]); + realSub1(p, s, 1); + } + + if ((p->flags & DFT_FLAG_NO_BITREVERSAL) == 0) { + if (p->useCobra) { + bitReversalCobraInplace(p); + } else { + bitReversalRecursive(p->s, p->length, 1, 0, 0); + } + } + + srBackwardMain(p); + + if ((p->flags & DFT_FLAG_ALT_REAL) != 0) { + realSub0(p, s, 1); + } + } +} + +void DFTUndiff_DESTROYPLAN(void *p2) { + DFTUndiff *plan = (DFTUndiff *)p2; + if (plan->magic != MAGIC_DFT) abort(); + + free(*(plan->ptTable)); + free(plan->ptTable); + free(plan->cobraT); + free(plan->cobraR); + //free(plan->t); + if (plan->rtTable != NULL) { + free(plan->rtTable[0]); + free(plan->rtTable[1]); + free(plan->rtTable); + } + + plan->magic = 0; + free(plan); +} + +DFTUndiff *DFTUndiff_MAKEPLANSUB(uint64_t n, int32_t radix2thres, int32_t useCobra, uint64_t flags) { + int32_t i, j, k; + + uint32_t linesize = SIMDBase_sizeOfCachelineInByte(); + uint32_t cachesize = SIMDBase_sizeOfDataCacheInByte(); + + // + + if ((flags & DFT_FLAG_REAL) != 0 || (flags & DFT_FLAG_ALT_REAL) != 0) n /= 2; + + DFTUndiff *d = calloc(1, sizeof(DFTUndiff)); + + d->magic = MAGIC_DFT; + d->mode = SIMDBase_MODE; + d->flags = flags; + + d->radix2thres = radix2thres; + d->useCobra = useCobra; + + d->length = (uint32_t) n; + d->log2len = DFT_ilog2((uint32_t) n); + + // + + SIMDBase_REAL *trigTable = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*n*2); + d->ptTable = malloc(sizeof(SIMDBase_REAL *) * (d->log2len+1)); + + SIMDBase_REAL *p = trigTable, **pp = d->ptTable; + + for(j=0;j<(int32_t)d->log2len+1;j++) { + *pp++ = p; + + if ((1 << j) >= d->radix2thres) { + for(i=0;i<(1 << j)/4+1;i++) { + *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j)); + } + const int32_t step = linesize / sizeof(SIMDBase_REAL); + p += (step - (p 
- trigTable) % step) % step; + } else { + for(i=0;i<(1 << j)/4;i++) { + *p++ = (SIMDBase_REAL)COS(-2*M_PIl*i/(1 << j)); + *p++ = (SIMDBase_REAL)SIN(-2*M_PIl*i/(1 << j)); + *p++ = (SIMDBase_REAL)COS(-6*M_PIl*i/(1 << j)); + *p++ = (SIMDBase_REAL)SIN(-6*M_PIl*i/(1 << j)); + } + } + } + + // + + int32_t cobraQ; + + cobraQ = linesize / (sizeof(SIMDBase_VECT) * 2); + + for(;;) { + if (1 << (cobraQ*2) > + (cachesize / (sizeof(SIMDBase_VECT) * 2)/2)) + break; + + cobraQ++; + } + cobraQ--; + + d->cobraQ = cobraQ; + + if (cobraQ >= 4 && d->log2len >= 2*cobraQ) { + SIMDBase_VECT *cobraT; + int32_t *cobraR; + + if (d->log2len <= 2*cobraQ) cobraQ = d->log2len / 2; + + cobraT = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*2 * (1 << (cobraQ*2))); + cobraR = (int32_t *)SIMDBase_alignedMalloc(sizeof(int32_t) * (1 << cobraQ)); + + for(i=0;i<(1 << cobraQ);i++) cobraR[i] = bitR(i, cobraQ); + + d->cobraT = cobraT; d->cobraR = cobraR; + } else { + d->useCobra = 0; + } + + // + + if ((d->flags & DFT_FLAG_REAL) != 0 || (d->flags & DFT_FLAG_ALT_REAL) != 0) { + int32_t m = n*2; + + d->rtTable = malloc(sizeof(SIMDBase_REAL *)*2); + d->rtTable[0] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2); + d->rtTable[1] = SIMDBase_alignedMalloc(sizeof(SIMDBase_REAL)*m/2); + + for(k=0;k<m/4;k++) { + d->rtTable[0][k*2+0] = 0.5-0.5*SIN(-2*M_PIl*k/m); + d->rtTable[0][k*2+1] = 0.5*COS(-2*M_PIl*k/m); + d->rtTable[1][k*2+0] = 0.5-0.5*SIN( 2*M_PIl*k/m); + d->rtTable[1][k*2+1] = 0.5*COS( 2*M_PIl*k/m); + } + } + + // + + return (void *)d; +} + +void *DFTUndiff_MAKEPLAN(uint64_t n, uint64_t flags) { + if (flags & DFT_FLAG_VERBOSE) { + printf("\n--------------------------------\n"); + printf("Making plan, mode = %s, dft length = %d\n", SIMDBase_NAME, (int)n); + printf("Processor : %s\n", SIMDBase_getProcessorNameString()); + printf("Cache size (L2 + L3) : %d kbytes / thread\n", SIMDBase_sizeOfDataCacheInByte() / 1024); + printf("Cache Line Size : %d bytes\n", SIMDBase_sizeOfCachelineInByte()); + } + + if (n <= 
256 || (flags & 3) == 0) { + return DFTUndiff_MAKEPLANSUB(n, n*2, (flags & DFT_FLAG_FORCE_COBRA) != 0, flags); + } + + SIMDBase_REAL *s1 = SIMDBase_alignedMalloc(sizeof(SIMDBase_VECT)*n*2); + + int32_t i, j, ts, tsbest, useCobra = 0; + double tick, tickmin; + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nWarming up before calibration ..."); + fflush(stdout); + } + + // warming up + tick = DFT_timeofday(); + while(DFT_timeofday() - tick < 0.5) + ; + + if (flags & DFT_FLAG_VERBOSE) { + printf(" done\n"); + } + + int32_t ntimes = 20000000.0 / n / DFT_ilog2(n); + if (ntimes == 0) ntimes = 1; + + if (flags & DFT_FLAG_VERBOSE) { + printf("nTimes = %d\n", ntimes); + } + + // + + DFTUndiff *plan = DFTUndiff_MAKEPLANSUB(n, n*2, 0, flags); + + for(i=0;i<n*2*SIMDBase_VECTLEN;i++) { + s1[i] = 0; + } + + plan->s = (SIMDBase_VECT *)s1; + + if (plan->cobraT != NULL) { + double tcobra = 0, trecur = 0; + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nChecking which bit-reversal method is faster\n"); + } + + // + + bitReversalCobraInplace(plan); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalCobraInplace(plan); + } + + tcobra += DFT_timeofday() - tick; + + // + + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + } + + trecur += DFT_timeofday() - tick; + + // + + bitReversalCobraInplace(plan); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalCobraInplace(plan); + } + + tcobra += DFT_timeofday() - tick; + + // + + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes*4;j++) { + bitReversalRecursive(plan->s, plan->length, 1, 0, 0); + } + + trecur += DFT_timeofday() - tick; + + // + + useCobra = tcobra < trecur; + + if ((flags & DFT_FLAG_FORCE_RECURSIVE) != 0) useCobra = 0; + if ((flags & DFT_FLAG_FORCE_COBRA) != 0) useCobra = 1; + + if (flags & DFT_FLAG_VERBOSE) { + 
printf("cobra : %g\n", tcobra); + printf("recur : %g\n", trecur); + if (useCobra) { + printf("will use Cobra\n"); + } else { + printf("will use the recursive reverser\n"); + } + } + } + + DFTUndiff_DESTROYPLAN(plan); + + // + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nDetermining radix 2 threshold\n"); + } + + plan = DFTUndiff_MAKEPLANSUB(n, n*2, useCobra, flags); + + for(j=0;j<ntimes;j++) { + DFTUndiff_EXECUTE(plan, s1, -1); + DFTUndiff_EXECUTE(plan, s1, 1); + } + + DFTUndiff_DESTROYPLAN(plan); + + tsbest = -1; + tickmin = 0; + + for(ts = 1024;ts <= n*2;ts *= 2) { + plan = DFTUndiff_MAKEPLANSUB(n, ts, useCobra, flags); + + tick = DFT_timeofday(); + + for(j=0;j<ntimes;j++) { + DFTUndiff_EXECUTE(plan, s1, -1); + DFTUndiff_EXECUTE(plan, s1, 1); + } + + tick = DFT_timeofday() - tick; + + DFTUndiff_DESTROYPLAN(plan); + + if (tickmin == 0) tickmin = tick; + + if (flags & DFT_FLAG_VERBOSE) { + printf("%d : %g\n",ts, (double)tick); + } + + if (tick < tickmin) { + tickmin = tick; + tsbest = ts; + } + } + + if (tsbest == -1) tsbest = n*2;; + + if (flags & DFT_FLAG_VERBOSE) { + //printf("forcing tsbest = 1024\n"); + //tsbest = 1024; + printf("radix 2 threshold : %d\n\n", tsbest); + + double t = tickmin / ntimes / 2; + double nf = 5 * n * log(n) / log(2) / (t * 1000000); + + printf("nFlops = %d x %g\n", SIMDBase_VECTLEN, nf); + } + + plan = DFTUndiff_MAKEPLANSUB(n, tsbest, useCobra, flags); + + if (flags & DFT_FLAG_VERBOSE) { + printf("\nDone making plan\n--------------------------------\n"); + } + + return plan; +} diff --git a/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h new file mode 100644 index 00000000..d26b0d9b --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/DFTUndiff.h @@ -0,0 +1,114 @@ +#ifndef __DFTIMPL_H__ +#define __DFTIMPL_H__ + +#include "SIMDBaseUndiff.h" + +#define MAGIC_DFT 0x18839f6d82bb02b6ULL + +typedef struct { + uint64_t magic; + + SIMDBase_VECT *s; + uint32_t offset1, offset2; + uint32_t butlen, 
log2butlen; + uint32_t stride; + + SIMDBase_REAL **ptTable; + uint32_t length, log2len; + + int32_t radix2thres, flagTrans, useCobra; + + int32_t cobraQ; + SIMDBase_VECT *cobraT; + int32_t *cobraR; + + SIMDBase_REAL **rtTable; + + uint64_t flags; + int32_t mode; +} DFTUndiff; + +#if defined(ENABLE_PUREC_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_float +#define DFTUndiff_EXECUTE execute_purec_float +#define DFTUndiff_MAKEPLAN makePlan_purec_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_float +#define DFTUndiff_DESTROYPLAN destroyPlan_purec_float + +#elif defined(ENABLE_PUREC_DOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_double +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_double +#define DFTUndiff_EXECUTE execute_purec_double +#define DFTUndiff_MAKEPLAN makePlan_purec_double +#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_double +#define DFTUndiff_DESTROYPLAN destroyPlan_purec_double + +#elif defined(ENABLE_PUREC_LONGDOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_purec_longdouble +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_purec_longdouble +#define DFTUndiff_EXECUTE execute_purec_longdouble +#define DFTUndiff_MAKEPLAN makePlan_purec_longdouble +#define DFTUndiff_MAKEPLANSUB makePlanSub_purec_longdouble +#define DFTUndiff_DESTROYPLAN destroyPlan_purec_longdouble + +#elif defined(ENABLE_SSE_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse_float +#define DFTUndiff_EXECUTE execute_sse_float +#define DFTUndiff_MAKEPLAN makePlan_sse_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_sse_float +#define DFTUndiff_DESTROYPLAN destroyPlan_sse_float + +#elif 
defined(ENABLE_SSE2_DOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_sse2_double +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_sse2_double +#define DFTUndiff_EXECUTE execute_sse2_double +#define DFTUndiff_MAKEPLAN makePlan_sse2_double +#define DFTUndiff_MAKEPLANSUB makePlanSub_sse2_double +#define DFTUndiff_DESTROYPLAN destroyPlan_sse2_double + +#elif defined(ENABLE_NEON_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_neon_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_neon_float +#define DFTUndiff_EXECUTE execute_neon_float +#define DFTUndiff_MAKEPLAN makePlan_neon_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_neon_float +#define DFTUndiff_DESTROYPLAN destroyPlan_neon_float + +#elif defined(ENABLE_AVX_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_float +#define DFTUndiff_EXECUTE execute_avx_float +#define DFTUndiff_MAKEPLAN makePlan_avx_float +#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_float +#define DFTUndiff_DESTROYPLAN destroyPlan_avx_float + +#elif defined(ENABLE_AVX_DOUBLE) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_avx_double +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_avx_double +#define DFTUndiff_EXECUTE execute_avx_double +#define DFTUndiff_MAKEPLAN makePlan_avx_double +#define DFTUndiff_MAKEPLANSUB makePlanSub_avx_double +#define DFTUndiff_DESTROYPLAN destroyPlan_avx_double + +#elif defined(ENABLE_ALTIVEC_FLOAT) //////////////////////////////////////////// + +#define DFTUndiff_GETMODEPARAMINT getModeParamInt_altivec_float +#define DFTUndiff_GETMODEPARAMSTRING getModeParamString_altivec_float +#define DFTUndiff_EXECUTE execute_altivec_float +#define DFTUndiff_MAKEPLAN makePlan_altivec_float +#define 
DFTUndiff_MAKEPLANSUB makePlanSub_altivec_float +#define DFTUndiff_DESTROYPLAN destroyPlan_altivec_float + +#endif //////////////////////////////////////////////////////////////////// + +#endif diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile b/plugins/supereq/nsfft-1.00/dft/Makefile new file mode 120000 index 00000000..5d253498 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile @@ -0,0 +1 @@ +Makefile.x86avx
\ No newline at end of file diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.altivec b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec new file mode 100644 index 00000000..fe7fc993 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.altivec @@ -0,0 +1,26 @@ +CC=gcc +BASEOPT=-Wall -I ../simd -maltivec -mabi=altivec +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTaltivecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_ALTIVEC_FLOAT DFTUndiff.c -c -o DFTaltivecfloat.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_ALTIVEC_FLOAT DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTaltivecfloat.o DFT.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.neon b/plugins/supereq/nsfft-1.00/dft/Makefile.neon new file mode 100644 index 00000000..111a04ae --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.neon @@ -0,0 +1,26 @@ +CC=gcc +BASEOPT=-Wall -I ../simd -mfloat-abi=softfp +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : 
DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTneonfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h + $(CC) $(OPT) -mfpu=neon -DENABLE_NEON_FLOAT DFTUndiff.c -c -o DFTneonfloat.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_NEON_FLOAT DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTneonfloat.o DFT.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.purec b/plugins/supereq/nsfft-1.00/dft/Makefile.purec new file mode 100644 index 00000000..2c8b04f1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.purec @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c DFT.h SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o + +SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE SIMDBase.c -c -o SIMDBase.o + +DFT.o : DFT.c 
DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86 b/plugins/supereq/nsfft-1.00/dft/Makefile.x86 new file mode 100644 index 00000000..6ecbacec --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86 @@ -0,0 +1,29 @@ +CC=gcc +BASEOPT=-Wall -I ../simd +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o + +DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE DFTUndiff.c -c -o DFTsse2double.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFT.o 
+ +clean : + rm -f *~ *.o *.s *.a a.out diff --git a/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx new file mode 100644 index 00000000..b38909cb --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dft/Makefile.x86avx @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall -I ../simd +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +DFTssefloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT DFTUndiff.c -c -o DFTssefloat.o + +DFTsse2double.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE DFTUndiff.c -c -o DFTsse2double.o + +DFTavxfloat.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT DFTUndiff.c -c -o DFTavxfloat.o + +DFTavxdouble.o : DFTUndiff.c DFT.h ../simd/SIMDBase.h ../simd/SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE DFTUndiff.c -c -o DFTavxdouble.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE -DENABLE_AVX_FLOAT -DENABLE_AVX_DOUBLE DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFTssefloat.o DFTsse2double.o DFTavxfloat.o DFTavxdouble.o DFT.o + +clean : + rm -f *~ *.o *.s *.a a.out diff --git 
a/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c b/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c new file mode 100644 index 00000000..78ff14dc --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/DFTExample.c @@ -0,0 +1,88 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <complex.h> + +#include "SIMDBase.h" +#include "DFT.h" + +typedef float REAL; +#define TYPE SIMDBase_TYPE_FLOAT + +#define THRES 1e-3 + +double complex omega(double n, double kn) { + return cexp((-2 * M_PI * _Complex_I / n) * kn); +} + +void forward(double complex *ts, double complex *fs, int len) { + int k, n; + + for(k=0;k<len;k++) { + fs[k] = 0; + + for(n=0;n<len;n++) { + fs[k] += ts[n] * omega(len, n*k); + } + } +} + +int main(int argc, char **argv) { + const int n = 256; + + int mode = SIMDBase_chooseBestMode(TYPE); + printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode)); + + int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + + // + + int i, j; + + DFT *p = DFT_init(mode, n, 0); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + double complex ts[veclen][n], fs[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + sx[(i*2+0)*veclen+j] = creal(ts[j][i]); + sx[(i*2+1)*veclen+j] = cimag(ts[j][i]); + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + forward(ts[j], fs[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if ((fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) > THRES) || + (fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) > THRES)) { + success = 0; + } + } + } + + printf("%s\n", success ? 
"OK" : "NG"); + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + exit(0); +} diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c new file mode 100644 index 00000000..42825ed9 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestFFTW.c @@ -0,0 +1,317 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <time.h> +#include <complex.h> + +#include <fftw3.h> + +#include "SIMDBase.h" +#include "DFT.h" + +#if 1 +typedef float REAL; +#define TYPE SIMDBase_TYPE_FLOAT +#else +typedef double REAL; +#define TYPE SIMDBase_TYPE_DOUBLE +#endif + +#define THRES 1e-3 + +// complex forward +int check_cf(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, 0); + fftw_plan w[n]; + + fftw_complex *in[sizeOfVect], *out[sizeOfVect]; + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + for(j=0;j<veclen;j++) { + in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); + out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); + w[j] = fftw_plan_dft_1d(n, in[j], out[j], FFTW_FORWARD, FFTW_ESTIMATE); + + for(i=0;i<n;i++) { + double re = random() / (double)RAND_MAX; + double im = random() / (double)RAND_MAX; + sx[(i*2+0)*veclen+j] = re; + sx[(i*2+1)*veclen+j] = im; + in[j][i] = re + im * _Complex_I; + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + fftw_execute(w[j]); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0; + if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0; + } + } + + // + + for(j=0;j<veclen;j++) { + fftw_destroy_plan(w[j]); + fftw_free(in[j]); + fftw_free(out[j]); + } + + SIMDBase_alignedFree(sx); + + DFT_dispose(p, mode); + + // + + return success; +} + +// complex backward +int check_cb(int n, int mode, int veclen, int sizeOfVect) { + int 
i, j; + + DFT *p = DFT_init(mode, n, 0); + fftw_plan w[n]; + + fftw_complex *in[sizeOfVect], *out[sizeOfVect]; + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + for(j=0;j<veclen;j++) { + in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); + out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n); + w[j] = fftw_plan_dft_1d(n, in[j], out[j], FFTW_BACKWARD, FFTW_ESTIMATE); + + for(i=0;i<n;i++) { + double re = random() / (double)RAND_MAX; + double im = random() / (double)RAND_MAX; + sx[(i*2+0)*veclen+j] = re; + sx[(i*2+1)*veclen+j] = im; + in[j][i] = re + im * _Complex_I; + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + fftw_execute(w[j]); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0; + if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0; + } + } + + // + + for(j=0;j<veclen;j++) { + fftw_destroy_plan(w[j]); + fftw_free(in[j]); + fftw_free(out[j]); + } + + SIMDBase_alignedFree(sx); + + DFT_dispose(p, mode); + + // + + return success; +} + +// real forward +int check_rf(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_REAL); + fftw_plan w[n]; + + double *in[sizeOfVect]; + fftw_complex *out[sizeOfVect]; + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + for(j=0;j<veclen;j++) { + in[j] = (double *) fftw_malloc(sizeof(double) * n); + out[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1)); + w[j] = fftw_plan_dft_r2c_1d(n, in[j], out[j], FFTW_ESTIMATE); + + for(i=0;i<n;i++) { + double re = random() / (double)RAND_MAX; + sx[i*veclen+j] = re; + in[j][i] = re; + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + fftw_execute(w[j]); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][0])) > THRES) 
success = 0; + if (fabs(sx[(i*2+1)*veclen+j] - creal(out[j][n/2])) > THRES) success = 0; + } else { + if (fabs(sx[(i*2+0)*veclen+j] - creal(out[j][i])) > THRES) success = 0; + if (fabs(sx[(i*2+1)*veclen+j] - cimag(out[j][i])) > THRES) success = 0; + } + } + } + + // + + for(j=0;j<veclen;j++) { + fftw_destroy_plan(w[j]); + fftw_free(in[j]); + fftw_free(out[j]); + } + + SIMDBase_alignedFree(sx); + + DFT_dispose(p, mode); + + // + + return success; +} + +// real backward +int check_rb(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_REAL); + fftw_plan w[n]; + + fftw_complex *in[sizeOfVect]; + double *out[sizeOfVect]; + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + for(j=0;j<veclen;j++) { + in[j] = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1)); + out[j] = (double *) fftw_malloc(sizeof(double) * n); + w[j] = fftw_plan_dft_c2r_1d(n, in[j], out[j], FFTW_ESTIMATE); + + for(i=0;i<n/2;i++) { + if (i == 0) { + in[j][0 ] = (random() / (double)RAND_MAX); + in[j][n/2] = (random() / (double)RAND_MAX); + } else { + in[j][i ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + } + } + + for(i=0;i<n/2;i++) { + if (i == 0) { + sx[(2*0+0) * veclen + j] = creal(in[j][0 ]); + sx[(2*0+1) * veclen + j] = creal(in[j][n/2]); + } else { + sx[(2*i+0) * veclen + j] = creal(in[j][i]); + sx[(2*i+1) * veclen + j] = cimag(in[j][i]); + } + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + fftw_execute(w[j]); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if ((fabs(sx[i * veclen + j]*2 - out[j][i]) > THRES)) { + success = 0; + } + } + } + + // + + for(j=0;j<veclen;j++) { + fftw_destroy_plan(w[j]); + fftw_free(in[j]); + fftw_free(out[j]); + } + + SIMDBase_alignedFree(sx); + + DFT_dispose(p, mode); + + // + + return success; +} + +int main(int argc, char **argv) { + if (argc != 2) { + fprintf(stderr, "%s <log2n>\n", argv[0]); + 
exit(-1); + } + + const int n = 1 << atoi(argv[1]); + + srandom(time(NULL)); + + // + + int mode = SIMDBase_chooseBestMode(TYPE); + + printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode)); + + int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + + printf("complex forward : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("complex backward : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real forward : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real backward : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + + exit(0); +} diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c new file mode 100644 index 00000000..9d4bdaae --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestNaive.c @@ -0,0 +1,419 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <time.h> +#include <complex.h> + +#include "SIMDBase.h" +#include "DFT.h" + +#if 1 +typedef float REAL; +#define TYPE SIMDBase_TYPE_FLOAT +#else +typedef double REAL; +#define TYPE SIMDBase_TYPE_DOUBLE +#endif + +#define THRES 1e-3 + +double complex omega(double n, double kn) { + return cexp((-2 * M_PI * _Complex_I / n) * kn); +} + +void forward(double complex *ts, double complex *fs, int len) { + int k, n; + + for(k=0;k<len;k++) { + fs[k] = 0; + + for(n=0;n<len;n++) { + fs[k] += ts[n] * omega(len, n*k); + } + } +} + +void backward(double complex *fs, double complex *ts, int len) { + int k, n; + + for(k=0;k<len;k++) { + ts[k] = 0; + + for(n=0;n<len;n++) { + ts[k] += fs[n] * omega(-len, n*k); + } + } +} + +// complex forward +int check_cf(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, 0); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + 
// + + double complex ts[veclen][n], fs[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + sx[(i*2+0)*veclen+j] = creal(ts[j][i]); + sx[(i*2+1)*veclen+j] = cimag(ts[j][i]); + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + forward(ts[j], fs[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if ((fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) > THRES) || + (fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) > THRES)) { + success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +// complex backward +int check_cb(int n, int mode, int veclen, int sizeOfVect) { + int i,j; + + DFT *p = DFT_init(mode, n, 0); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + double complex fs[veclen][n], ts[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + fs[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + + sx[(i*2+0)*veclen+j] = creal(fs[j][i]); + sx[(i*2+1)*veclen+j] = cimag(fs[j][i]); + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + backward(fs[j], ts[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if ((fabs(sx[(i*2+0)*veclen+j] - creal(ts[j][i])) > THRES) || + (fabs(sx[(i*2+1)*veclen+j] - cimag(ts[j][i])) > THRES)) { + success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +// real forward +int check_rf(int n, int mode, int veclen, int sizeOfVect) { + int i,j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_REAL); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + + // + + double complex ts[veclen][n], fs[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + ts[j][i] = (random() / (double)RAND_MAX); + sx[i*veclen+j] = creal(ts[j][i]); + } + } + + // + + 
DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + forward(ts[j], fs[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + if (fabs(sx[(2*0+0) * veclen + j] - creal(fs[j][0 ])) > THRES) success = 0; + if (fabs(sx[(2*0+1) * veclen + j] - creal(fs[j][n/2])) > THRES) success = 0; + } else { + if (fabs(sx[(2*i+0) * veclen + j] - creal(fs[j][i])) > THRES) success = 0; + if (fabs(sx[(2*i+1) * veclen + j] - cimag(fs[j][i])) > THRES) success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +// real backward +int check_rb(int n, int mode, int veclen, int sizeOfVect) { + int i,j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_REAL); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + + // + + double complex fs[veclen][n], ts[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + fs[j][0 ] = (random() / (double)RAND_MAX); + fs[j][n/2] = (random() / (double)RAND_MAX); + } else { + fs[j][i ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + fs[j][n-i] = conj(fs[j][i]); + } + } + } + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + sx[(2*0+0) * veclen + j] = creal(fs[j][0 ]); + sx[(2*0+1) * veclen + j] = creal(fs[j][n/2]); + } else { + sx[(2*i+0) * veclen + j] = creal(fs[j][i]); + sx[(2*i+1) * veclen + j] = cimag(fs[j][i]); + } + } + } + + // + + for(j=0;j<veclen;j++) { + backward(fs[j], ts[j], n); + } + + DFT_execute(p, mode, sx, 1); + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(cimag(ts[j][i])) > THRES) { + success = 0; + } + + if ((fabs(sx[i * veclen + j]*2 - creal(ts[j][i])) > THRES)) { + success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +// alt real forward +int check_arf(int n, int mode, int veclen, int sizeOfVect) { + int i,j; + + DFT *p = DFT_init(mode, n, 
DFT_FLAG_ALT_REAL); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + + // + + double complex ts[veclen][n], fs[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + ts[j][i] = (random() / (double)RAND_MAX); + sx[i*veclen+j] = creal(ts[j][i]); + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + backward(ts[j], fs[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + if (fabs(sx[(2*0+0) * veclen + j] - creal(fs[j][0 ])) > THRES) success = 0; + if (fabs(sx[(2*0+1) * veclen + j] - creal(fs[j][n/2])) > THRES) success = 0; + } else { + if (fabs(sx[(2*i+0) * veclen + j] - creal(fs[j][i])) > THRES) success = 0; + if (fabs(sx[(2*i+1) * veclen + j] - cimag(fs[j][i])) > THRES) success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +// alt real backward +int check_arb(int n, int mode, int veclen, int sizeOfVect) { + int i,j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + + // + + double complex fs[veclen][n], ts[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + fs[j][0 ] = (random() / (double)RAND_MAX); + fs[j][n/2] = (random() / (double)RAND_MAX); + } else { + fs[j][i ] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + fs[j][n-i] = conj(fs[j][i]); + } + } + } + + for(j=0;j<veclen;j++) { + for(i=0;i<n/2;i++) { + if (i == 0) { + sx[(2*0+0) * veclen + j] = creal(fs[j][0 ]); + sx[(2*0+1) * veclen + j] = creal(fs[j][n/2]); + } else { + sx[(2*i+0) * veclen + j] = creal(fs[j][i]); + sx[(2*i+1) * veclen + j] = cimag(fs[j][i]); + } + } + } + + // + + for(j=0;j<veclen;j++) { + forward(fs[j], ts[j], n); + } + + DFT_execute(p, mode, sx, -1); + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(cimag(ts[j][i])) > THRES) { + success = 0; + } + + if ((fabs(sx[i * veclen + j]*2 - 
creal(ts[j][i])) > THRES)) { + success = 0; + } + } + } + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + // + + return success; +} + +int main(int argc, char **argv) { + if (argc != 2) { + fprintf(stderr, "%s <log2n>\n", argv[0]); + exit(-1); + } + + const int n = 1 << atoi(argv[1]); + + srandom(time(NULL)); + + // + + int mode = SIMDBase_chooseBestMode(TYPE); + + printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode)); + + int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + + printf("complex forward : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("complex backward : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real forward : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real backward : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("alt real forward : %s\n", check_arf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("alt real backward : %s\n", check_arb(n, mode, veclen, sizeOfVect) ? 
"OK" : "NG"); + + exit(0); +} diff --git a/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c b/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c new file mode 100644 index 00000000..08c8315f --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/DFTTestOoura.c @@ -0,0 +1,260 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <time.h> + +#include "SIMDBase.h" +#include "DFT.h" + +void cdft(int, int, double *, int *, double *); +void rdft(int, int, double *, int *, double *); + +#if 1 +typedef float REAL; +#define TYPE SIMDBase_TYPE_FLOAT +#else +typedef double REAL; +#define TYPE SIMDBase_TYPE_DOUBLE +#endif + +#define THRES 1e-3 + +// complex forward +int check_cf(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, 0); + + int *ip = calloc(n, sizeof(int)); + double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2); + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n*2); + + // + + for(j=0;j<veclen;j++) { + for(i=0;i<n*2;i++) { + sx[i*veclen + j] = random() / (double)RAND_MAX; + sy[j*n*2 + i] = sx[i*veclen + j]; + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + cdft(n*2, -1, &sy[j*n*2], ip, trigTable); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n*2;i++) { + if (fabs(sx[i*veclen+j] - sy[j*n*2 + i]) > THRES) success = 0; + } + } + + // + + SIMDBase_alignedFree(sy); + SIMDBase_alignedFree(sx); + SIMDBase_alignedFree(trigTable); + free(ip); + + DFT_dispose(p, mode); + + // + + return success; +} + +// complex backward +int check_cb(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, 0); + + int *ip = calloc(n, sizeof(int)); + double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2); + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n*2); + + // + + 
for(j=0;j<veclen;j++) { + for(i=0;i<n*2;i++) { + sx[i*veclen + j] = random() / (double)RAND_MAX; + sy[j*n*2 + i] = sx[i*veclen + j]; + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + cdft(n*2, 1, &sy[j*n*2], ip, trigTable); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n*2;i++) { + if (fabs(sx[i*veclen+j] - sy[j*n*2 + i]) > THRES) success = 0; + } + } + + // + + SIMDBase_alignedFree(sy); + SIMDBase_alignedFree(sx); + SIMDBase_alignedFree(trigTable); + free(ip); + + DFT_dispose(p, mode); + + // + + return success; +} + +// real forward +int check_rf(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL); + + int *ip = calloc(n, sizeof(int)); + double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2); + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n); + + // + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + sx[i*veclen + j] = random() / (double)RAND_MAX; + sy[j*n + i] = sx[i*veclen + j]; + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + rdft(n, -1, &sy[j*n], ip, trigTable); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(sx[i*veclen+j] - sy[j*n + i]) > THRES) success = 0; + } + } + + // + + SIMDBase_alignedFree(sy); + SIMDBase_alignedFree(sx); + SIMDBase_alignedFree(trigTable); + free(ip); + + DFT_dispose(p, mode); + + // + + return success; +} + +// real backward +int check_rb(int n, int mode, int veclen, int sizeOfVect) { + int i, j; + + DFT *p = DFT_init(mode, n, DFT_FLAG_ALT_REAL); + + int *ip = calloc(n, sizeof(int)); + double *trigTable = SIMDBase_alignedMalloc(sizeof(double)*n/2); + + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n); + double *sy = SIMDBase_alignedMalloc(veclen * sizeof(double) *n); + + // + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + sx[i*veclen + j] = random() / (double)RAND_MAX; + 
sy[j*n + i] = sx[i*veclen + j]; + } + } + + // + + DFT_execute(p, mode, sx, 1); + + for(j=0;j<veclen;j++) { + rdft(n, 1, &sy[j*n], ip, trigTable); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if (fabs(sx[i*veclen+j] - sy[j*n + i]) > THRES) success = 0; + } + } + + // + + SIMDBase_alignedFree(sy); + SIMDBase_alignedFree(sx); + SIMDBase_alignedFree(trigTable); + free(ip); + + DFT_dispose(p, mode); + + // + + return success; +} + +int main(int argc, char **argv) { + if (argc != 2) { + fprintf(stderr, "%s <log2n>\n", argv[0]); + exit(-1); + } + + const int n = 1 << atoi(argv[1]); + + srandom(time(NULL)); + + // + + int mode = SIMDBase_chooseBestMode(TYPE); + + printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode)); + + int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + + printf("complex forward : %s\n", check_cf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("complex backward : %s\n", check_cb(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real forward : %s\n", check_rf(n, mode, veclen, sizeOfVect) ? "OK" : "NG"); + printf("real backward : %s\n", check_rb(n, mode, veclen, sizeOfVect) ? 
"OK" : "NG"); + + exit(0); +} diff --git a/plugins/supereq/nsfft-1.00/dfttest/Makefile b/plugins/supereq/nsfft-1.00/dfttest/Makefile new file mode 100644 index 00000000..924b8656 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/Makefile @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall -g -I ../simd -I ../dft -L../simd -L../dft +OPT=$(BASEOPT) -O + +all : DFTExample DFTTestNaive + +clean : + rm -f *~ *.o nsfftplan.*.txt *.log *.dat a.out DFTExample DFTTestNaive DFTTestOoura DFTTestFFTW pi_fft_mod pi_fft_mod.c + +../simd/libSIMD.a : + @cd ../simd; make + +../dft/libDFT.a : + @cd ../dft; make + +../ooura/fftsg.o : + @cd ../ooura; make + +DFTExample : DFTExample.c ../simd/libSIMD.a ../dft/libDFT.a + $(CC) $(OPT) DFTExample.c -lDFT -lSIMD -lm -o DFTExample + +DFTTestNaive : DFTTestNaive.c ../simd/libSIMD.a ../dft/libDFT.a + $(CC) $(OPT) DFTTestNaive.c -lDFT -lSIMD -lm -o DFTTestNaive + +DFTTestOoura : DFTTestOoura.c ../ooura/fftsg.o ../simd/libSIMD.a ../dft/libDFT.a + $(CC) $(OPT) DFTTestOoura.c ../ooura/fftsg.o -lDFT -lSIMD -lm -o DFTTestOoura + +DFTTestFFTW : DFTTestFFTW.c ../simd/libSIMD.a ../dft/libDFT.a + $(CC) $(OPT) DFTTestFFTW.c -lDFT -lSIMD -lfftw3 -lm -o DFTTestFFTW + +pi_fft_mod.c : ../ooura/pi_fft.c pi_fft.c.patch + patch -o pi_fft_mod.c ../ooura/pi_fft.c pi_fft.c.patch + +pi_fft_mod : ../simd/libSIMD.a ../dft/libDFT.a pi_fft_mod.c + $(CC) $(OPT) pi_fft_mod.c -I ../dft -I ../simd -L../dft -L../simd -lm -lDFT -lSIMD -o pi_fft_mod diff --git a/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch b/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch new file mode 100644 index 00000000..c50133cc --- /dev/null +++ b/plugins/supereq/nsfft-1.00/dfttest/pi_fft.c.patch @@ -0,0 +1,131 @@ +--- pi_fft.c 2010-07-30 13:04:25.000000000 +0900 ++++ pi_fft_mod.c 2010-07-31 20:50:11.000000000 +0900 +@@ -25,7 +25,75 @@ + #include <stdio.h> + #include <stdlib.h> + #include <time.h> ++#include <sys/time.h> ++#include <unistd.h> + ++/****/ ++ ++#include <stdint.h> ++#include "SIMDBase.h" 
++#include "DFT.h" ++ ++DFT* dft[64]; ++ ++void initdft(int n) { ++ int i, logn = 31 - __builtin_clz(n), writeflag = 0; ++ char buf[20], fn[256]; ++ gethostname(buf, 19); ++ sprintf(fn, "nsfftplan.%s.txt", buf); ++ FILE *fp = fopen(fn, "r"); ++ if (fp != NULL) { ++ for(i=1;i<=logn;i++) { ++ int err; ++ dft[i] = DFT_fread(fp, &err); ++ if (err != DFT_ERROR_NOERROR) { ++ printf("error when reading plan %d : %d\n", i, err); ++ break; ++ } ++ if (DFT_getPlanParamInt(DFT_PARAMID_MODE, dft[i]) != SIMDBase_MODE_PUREC_DOUBLE || ++ DFT_getPlanParamInt(DFT_PARAMID_FFT_LENGTH, dft[i]) != (1 << i) || ++ DFT_getPlanParamInt(DFT_PARAMID_IS_ALT_REAL_TRANSFORM, dft[i]) != 1) { ++ fprintf(stderr, "plan not compatible : %d\n", i); ++ break; ++ } ++ } ++ } ++ if (fp != NULL) fclose(fp); ++ ++ for(i=1;i<=logn;i++) { ++ if (dft[i] == NULL) { ++ dft[i] = DFT_init(SIMDBase_MODE_PUREC_DOUBLE, 1 << i, DFT_FLAG_ALT_REAL | DFT_FLAG_LIGHT_TEST_RUN | DFT_FLAG_VERBOSE); ++ if (dft[i] == NULL) { ++ printf("dft[%d] == NULL\n", i); ++ exit(-1); ++ } ++ writeflag = 1; ++ } ++ } ++ ++ if (writeflag) { ++ fp = fopen(fn, "w"); ++ if (fp != NULL) { ++ for(i=1;i<=logn;i++) { ++ DFT_fwrite(dft[i], fp); ++ } ++ fclose(fp); ++ } ++ } ++} ++ ++void rdft(int n, int isgn, double *a, int *ip, double *w) { ++ int logn = 31 - __builtin_clz(n); ++ DFT_execute(dft[logn], SIMDBase_MODE_PUREC_DOUBLE, a, isgn); ++} ++ ++double timeofday(void) { ++ struct timeval tp; ++ gettimeofday(&tp, NULL); ++ return (double)tp.tv_sec+(1e-6)*tp.tv_usec; ++} ++ ++/****/ + + void mp_load_0(int n, int radix, int out[]); + void mp_load_1(int n, int radix, int out[]); +@@ -67,7 +135,7 @@ + double err, d_time, n_op; + int *a, *b, *c, *e, *i1, *i2, *ip; + double *d1, *d2, *d3, *w; +- time_t t_1, t_2; ++ double t_1, t_2; + FILE *f_log, *f_out; + + f_log = fopen("pi.log", "w"); +@@ -96,6 +164,8 @@ + exit(1); + } + ip[0] = 0; ++ ++ initdft(nfft); + /* ---- radix test ---- */ + log10_radix = 1; + radix = 10; +@@ -111,7 +181,7 @@ + 
printf("calculating %d digits of PI...\n", log10_radix * (n - 2)); + fprintf(f_log, "calculating %d digits of PI...\n", log10_radix * (n - 2)); + /* ---- time check ---- */ +- time(&t_1); ++ t_1 = timeofday(); + /* + * ---- a formula based on the AGM (Arithmetic-Geometric Mean) ---- + * c = sqrt(0.125); +@@ -216,10 +286,10 @@ + mp_mul(n, radix, a, b, a, i1, nfft, d1, d2, d3, ip, w); + mp_idiv(n, radix, a, npow, a); + /* ---- time check ---- */ +- time(&t_2); ++ t_2 = timeofday(); + /* ---- output ---- */ + f_out = fopen("pi_mod.dat", "w"); +- printf("writing pi.dat...\n"); ++ printf("writing pi_mod.dat...\n"); + mp_fprintf(n - 1, log10_radix, a, f_out); + fclose(f_out); + free(d3); +@@ -238,9 +308,9 @@ + printf("floating point operation: %g op.\n", n_op); + fprintf(f_log, "floating point operation: %g op.\n", n_op); + /* ---- difftime ---- */ +- d_time = difftime(t_2, t_1); +- printf("execution time: %g sec. (real time)\n", d_time); +- fprintf(f_log, "execution time: %g sec. (real time)\n", d_time); ++ d_time = t_2 - t_1; ++ printf("execution time: %.5g sec. (real time)\n", d_time); ++ fprintf(f_log, "execution time: %.5g sec. 
(real time)\n", d_time); + fclose(f_log); + return 0; + } diff --git a/plugins/supereq/nsfft-1.00/doc/default.css b/plugins/supereq/nsfft-1.00/doc/default.css new file mode 100644 index 00000000..09721163 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/doc/default.css @@ -0,0 +1,34 @@ +body {margin-left: 1.5cm; padding-left: 0.1cm; margin-right: 1.5cm; padding-right: 0.1cm; margin-top: 2.5cm; padding-top: 0.5cm; margin-bottom: 1cm; padding-bottom: 1.0cm; border-top-style:solid; border-bottom-style:solid; } +h1 {font-family: arial, sansserif; font-weight: bold; font-style: italic; margin-top: 0.8cm; } +h2 {font-family: arial, sansserif; font-weight: bold; font-style: italic; margin-top: 0.8cm; } +h3 {font-family: arial, sansserif; font-weight: bold; margin-top: 1.2cm; margin-bottom: 0.8cm; } +h4 {font-family: arial, sansserif; font-weight: bold; margin-top: 1.2cm; margin-bottom: 0.8cm; } +p {font-family: Georgia, "Times New Roman", times, serif; margin-top: 0.3cm; margin-left: 0.5cm; margin-bottom: 0.3cm;} +p.dir {font-family: arial, sansserif; margin-top: 0cm; margin-bottom: 0cm;} +dl { margin-left: 0.5cm; } +dt { font-weight: bold; } +a:link {color: black;} +a:visited {color: black;} +ul.disc {list-style-type: disc; font-family: times, serif;} +ul.circle {list-style-type: circle; font-family: times, serif;} +ul.square {list-style-type: square; font-family: times, serif;} +ul.none {list-style-type: none; font-family: times, serif;} +pre.code { margin-top: 1.0cm; margin-bottom: 1.0cm; margin-left: 1.0cm; margin-right: 1.0cm; border:3px solid #c0c0c0; padding: 0.5cm; font-family: tahoma, sansserif; font-weight: normal; background-color:#f8f8f8; } +pre.command { margin-top: 1.0cm; margin-bottom: 1.0cm; margin-left: 1.5cm; margin-right: 0.0cm; border:0px; padding:0.0cm; font-family: tahoma, sansserif; font-weight: bold; background-color:#f8fffc; } +ol.level1 { font-family: arial, sansserif; font-weight: bold; font-style: italic; font-size:1.5em; } +ol.level2 { 
font-family: "Times New Roman", serif; font-weight: normal; font-style: normal; font-size:0.85em; margin-top: 0.2cm; margin-bottom: 0.5cm; } +table.figure { margin-left:auto; margin-right:auto; margin-top:1.0cm; margin-bottom:1.0cm; } + +td.caption { font-family: arial, sansserif; font-size: 75%; color: black; } +td { font-family: times, serif; } + +table.lt { border-collapse: collapse; border-style: none; } +td.lt- { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-width: 1px; border-style: none; padding-left=0.2cm; padding-right=0.2cm; } +td.lt-r { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-right-style: solid; border-width: 1px; border-color: black; } +td.lt-l { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-left-style: solid; border-width: 1px; border-color: black; } +td.lt-lr { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-right-style: solid; border-left-style: solid; border-width: 1px; border-color: black; } +td.lt-b { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-width: 1px; border-color: black; } +td.lt-hl { margin: 0px; border-style: none; border-bottom-style: solid; border-width: 1px; border-color: black; height: 2px; } +td.lt-bl { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-left-style: solid; border-width: 1px; border-color: black; } +td.lt-br { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-right-style: solid; border-width: 1px; border-color: black; } +td.lt-blr { margin: 0px; padding: 4px; padding-left:0.3cm; padding-right:0.3cm; border-style: none; border-bottom-style: solid; border-left-style: solid; border-right-style: solid; border-width: 1px; 
border-color: black; } diff --git a/plugins/supereq/nsfft-1.00/doc/index.xhtml b/plugins/supereq/nsfft-1.00/doc/index.xhtml new file mode 100644 index 00000000..8b7e2c97 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/doc/index.xhtml @@ -0,0 +1,2016 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> +<link rel="stylesheet" type="text/css" href="default.css"/> +<title>NSFFT Reference Manual</title> +</head> +<body> +<h1>NSFFT Reference Manual</h1> + +<h3>Introduction</h3> + +<p> +This is a library for performing 1-dimensional discrete Fourier +transforms. NSDFT is a simple, small and portable library, and it is +efficient since it can utilize SIMD instruction sets in modern +processors. It performs multiple transforms simultaneously, and thus +it is especially suitable for digital signal processing. It does not +need so much computation to make a good execution plan. This library +is in public domain, so that you can incorporate this library into +your product without any obligation. +</p> + +<h3>API Reference</h3> + +<p> +In this section, the API functions are explained. +</p> + +<h4>Include files</h4> + +<p> +You have to include two include files in dft directory. +</p> + +<pre class="code"> +#include <stdint.h> +#include "SIMDBase.h" +#include "DFT.h" +</pre> + +<h4>Data types</h4> + +<p> +First, you have to choose a data type to represent an element in the +input and output sequence of numbers. You can choose from the +following three types. 
+</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-b" align="center">Data Type</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_TYPE_FLOAT</td> + <td class="lt-" align="left">float type in C language</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_TYPE_DOUBLE</td> + <td class="lt-" align="left">double type in C language</td> + </tr> + <tr> + <td class="lt-br" align="left">SIMDBase_TYPE_LONGDOUBLE</td> + <td class="lt-b" align="left">long double type in C language</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 1 Data types</td> + </tr> +</table> + + +<h4>Computation modes</h4> + +<p> +Next, a computation mode has to be chosen. You can choose from the +following modes. +</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-br" align="center">Type</td> + <td class="lt-br" align="center">Vector Length</td> + <td class="lt-b" align="center">Computation Mode</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_PUREC_FLOAT</td> + <td class="lt-r" align="center">float</td> + <td class="lt-r" align="center">1</td> + <td class="lt-" align="center">Scalar float</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_PUREC_DOUBLE</td> + <td class="lt-r" align="center">double</td> + <td class="lt-r" align="center">1</td> + <td class="lt-" align="center">Scalar double</td> + </tr> + <tr> + <td class="lt-r" 
align="left">SIMDBase_MODE_PUREC_LONGDOUBLE</td> + <td class="lt-r" align="center">long double</td> + <td class="lt-r" align="center">1</td> + <td class="lt-" align="center">Scalar long double</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_SSE_FLOAT</td> + <td class="lt-r" align="center">float</td> + <td class="lt-r" align="center">4</td> + <td class="lt-" align="center">x86 SSE</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_SSE2_DOUBLE</td> + <td class="lt-r" align="center">double</td> + <td class="lt-r" align="center">2</td> + <td class="lt-" align="center">x86 SSE2</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_NEON_FLOAT</td> + <td class="lt-r" align="center">float</td> + <td class="lt-r" align="center">4</td> + <td class="lt-" align="center">ARM NEON</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_AVX_FLOAT</td> + <td class="lt-r" align="center">float</td> + <td class="lt-r" align="center">8</td> + <td class="lt-" align="center">x86 AVX (float)</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_MODE_AVX_DOUBLE</td> + <td class="lt-r" align="center">double</td> + <td class="lt-r" align="center">4</td> + <td class="lt-" align="center">x86 AVX (double)</td> + </tr> + <tr> + <td class="lt-br" align="left">SIMDBase_MODE_ALTIVEC_FLOAT</td> + <td class="lt-br" align="center">float</td> + <td class="lt-br" align="center">4</td> + <td class="lt-b" align="center">PowerPC Altivec</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 2 Computation modes</td> + </tr> +</table> + +<p> +The following function automatically checks the availability of each +instruction set on your computer, and chooses the best computation +mode. +</p> + +<pre class="code"> +int32_t SIMDBase_chooseBestMode(int32_t type); +</pre> + +<p> +The return value is the best mode chosen by this routine. +<i>type</i> is the data type you chose. 
+</p> + + +<h4>Retrieving parameters</h4> + +<p> +You can make queries for any mode using the following function. +</p> + +<pre class="code"> +int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode); +</pre> + +<p> +<i>mode</i> is the computation mode you chose. <i>paramId</i> is one +of the following. +</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-b" align="center">Meaning</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_PARAMID_SIZE_OF_REAL</td> + <td class="lt-" align="left">Size of an element in a vector in bytes</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_PARAMID_SIZE_OF_VECT</td> + <td class="lt-" align="left">Size of the vector in bytes</td> + </tr> + <tr> + <td class="lt-r" align="left">SIMDBase_PARAMID_VECTOR_LEN</td> + <td class="lt-" align="left">Number of elements in a vector</td> + </tr> + <tr> + <td class="lt-br" align="left">SIMDBase_PARAMID_MODE_AVAILABILITY</td> + <td class="lt-b" align="left">Whether the given mode is available or not</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 3 Querying parameter for computation mode</td> + </tr> +</table> + +<p> +Here, a vector is a set of multiple primitive data elements (single or +double precision FP numbers) which can be stored in one SIMD register, +and can be processed by one SIMD instruction at the same time. +</p> + +<p> +You can get the mode name as a string. In this +case, <i>paramId</i> must be SIMDBase_PARAMID_MODE_NAME. +</p> + +<pre class="code"> +char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode); +</pre> + +<p> +You should not modify the data returned by the above function. 
+</p> + + +<h4>Making and destroying execution plan</h4> + +<p> +An execution plan can be made by the following function. +</p> + +<pre class="code"> +DFT *DFT_init(int32_t mode, int32_t n, int32_t flags); +</pre> + +<p> +The return value is a pointer to a newly made plan. +<i>mode</i> is the mode you chose above. <i>n</i> is the length of a +transform. You can specify a bitwise OR of the following symbols +as <i>flags</i>. You should not specify more than one of the flags +regarding test runs. You should not specify DFT_FLAG_FORCE_RECURSIVE and +DFT_FLAG_FORCE_COBRA at the same time. If neither DFT_FLAG_REAL nor +DFT_FLAG_ALT_REAL is specified, an execution plan for complex +transforms is made. +</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-b" align="center">Meaning</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_NO_TEST_RUN</td> + <td class="lt-" align="left">Make execution plan without performing a test run</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_LIGHT_TEST_RUN</td> + <td class="lt-" align="left">Perform small amount of test run to make an execution plan</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_HEAVY_TEST_RUN</td> + <td class="lt-" align="left">Perform large amount of test run to make an execution plan</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_EXHAUSTIVE_TEST_RUN</td> + <td class="lt-" align="left">Perform exhaustive search of parameters and find the optimal execution plan</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_REAL</td> + <td class="lt-" align="left">Make an execution plan for a real transform</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_ALT_REAL</td> + <td class="lt-" align="left">Make an execution plan for an 
alternative real transform</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_VERBOSE</td> + <td class="lt-" align="left">Make some noise during making an execution plan</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_NOBITREVERSAL</td> + <td class="lt-" align="left">Do not perform the bit-reversal operation during a transform</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_FLAG_FORCE_RECURSIVE</td> + <td class="lt-" align="left">Force using the recursive bit-reversal routine. This routine is suited for small transforms.</td> + </tr> + <tr> + <td class="lt-br" align="left">DFT_FLAG_FORCE_COBRA</td> + <td class="lt-b" align="left">Force using the Cobra bit-reversal routine. This routine is suited for large transforms.</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 4 Options for making execution plan</td> + </tr> +</table> + +<p> +You can destroy the plan you made by the following function. +</p> + +<pre class="code"> +void DFT_dispose(DFT *p, int32_t mode); +</pre> + +<p> +<i>p</i> is a pointer to the execution plan. <i>mode</i> is the +corresponding execution mode. +</p> + +<p> +You can retrieve parameters of a plan using the following function. +</p> + +<pre class="code"> +int32_t DFT_getPlanParamInt(int32_t paramId, void *p); +</pre> + +<p> +<i>p</i> is a pointer to an execution plan. <i>paramId</i> is one +of the following. 
+</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-b" align="center">Meaning</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_PARAMID_TYPE</td> + <td class="lt-" align="left">Data type</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_PARAMID_MODE</td> + <td class="lt-" align="left">Computation mode</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_PARAMID_FFT_LENGTH</td> + <td class="lt-" align="left">Length of the transform</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_PARAMID_IS_REAL_TRANSFORM</td> + <td class="lt-" align="left">Whether the plan is for real transforms</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_PARAMID_NO_BIT_REVERSAL</td> + <td class="lt-" align="left">Whether the plan does not perform bit reversal operation</td> + </tr> + <tr> + <td class="lt-br" align="left">DFT_PARAMID_TEST_RUN</td> + <td class="lt-b" align="left">How much test run is performed when making this plan</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 5 Querying parameter for execution plan</td> + </tr> +</table> + +<h4>Writing and reading execution plan to/from file</h4> + +<p> +You can write or read an execution plan to/from a file using the following functions. +</p> + +<pre class="code"> +int32_t DFT_fwrite(DFT *p, FILE *fp); +DFT *DFT_fread(FILE *fp, int32_t *errcode); +</pre> + +<p> +<i>p</i> is a pointer to a plan. <i>fp</i> is a file +pointer. DFT_fwrite returns 1 if the plan is successfully written, and +0 if an error occurs. DFT_fread returns the pointer to the read plan +if the plan is successfully read, and NULL if an error occurs. If an +error occurs, an error code is returned to a variable whose pointer is +specified by <i>errcode</i>. 
The interpretation of error codes is +given below. +</p> + +<table class="figure"> + <tr align="center"> + <td> + <table class="lt"> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-br" align="center">Symbol</td> + <td class="lt-b" align="center">Meaning</td> + </tr> + <tr> + <td class="lt-hl"></td> + <td class="lt-hl"></td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_NOERROR</td> + <td class="lt-" align="left">No error</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_FILE_VERSION</td> + <td class="lt-" align="left">File format version mismatch</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_FILE_IO</td> + <td class="lt-" align="left">I/O error</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_UNEXPECTED_EOF</td> + <td class="lt-" align="left">Unexpected EOF</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_MODE_NOT_COMPILED_IN</td> + <td class="lt-" align="left">Tried to read a plan with mode that is not compiled in</td> + </tr> + <tr> + <td class="lt-r" align="left">DFT_ERROR_MODE_NOT_AVAILABLE</td> + <td class="lt-" align="left">Tried to read a plan with mode that is not supported by hardware</td> + </tr> + <tr> + <td class="lt-br" align="left">DFT_ERROR_UNKNOWN_MODE</td> + <td class="lt-b" align="left">Tried to read a plan with mode that is unknown by library</td> + </tr> + </table> + </td> + </tr> + <tr align="center"> + <td class="caption">Table 6 Errors that may happen during file I/O</td> + </tr> +</table> + + +<h4>Allocating and freeing buffers for transforms</h4> + +<p> +In order to allocate word-aligned buffers for storing data which is +fed to the FFT routine, you have to use the following function. +</p> + +<pre class="code"> +void *DFT_alignedMalloc(uint64_t size); +</pre> + +<p> +This function allocates <i>size</i> bytes of word-aligned memory and +returns the pointer. In order to free this memory, you have to use the +following function. 
+</p> + +<pre class="code"> +void DFT_alignedFree(void *ptr); +</pre> + +<p> +<i>ptr</i> is the pointer returned from DFT_alignedMalloc function. +</p> + +<h4>Executing transform</h4> + +<p> +By the following function, the planned transform can be executed. +</p> + +<pre class="code"> +void DFT_execute(DFT *p, int32_t mode, void *s, int32_t dir); +</pre> + +<p> +<i>p</i> is a pointer to the plan. <i>mode</i> is the computation +mode. <i>s</i> is the pointer to the buffer in which the sequence of +input values is stored. This pointer must be a pointer returned from +DFT_alignedMalloc function. +<i>dir</i> specifies the direction of transform. +</p> + +<p> +The forward and backward discrete Fourier transforms are defined by +the following formula (1) and (2), respectively. +</p> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <msub><mi>X</mi><mi>k</mi></msub> + <mo>=</mo> + <munderover> + <mo style="font-size:140%;">∑</mo> + <mrow><mi>n</mi><mo>=</mo><mn>0</mn></mrow> + <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow> + </munderover> + <msub><mi>x</mi><mi>n</mi></msub> + <msup> + <mi>e</mi> + <mrow> + <mo>-</mo> + <mfrac> + <mrow><mn>2</mn><mi>π</mi><mi>i</mi></mrow> + <mi>N</mi> + </mfrac> + <mi>k</mi><mi>n</mi> + </mrow> + </msup> + + <mo> </mo> + + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </math> + </td> + <td> + <p>(1)</p> + </td> + </tr> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <msub><mi>x</mi><mi>n</mi></msub> + <mo>=</mo> + <mfrac> + <mn>1</mn> + <mi>N</mi> + </mfrac> + <munderover> + <mo style="font-size:140%;">∑</mo> + 
<mrow><mi>k</mi><mo>=</mo><mn>0</mn></mrow> + <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow> + </munderover> + <msub><mi>X</mi><mi>k</mi></msub> + <msup> + <mi>e</mi> + <mrow> + <mfrac> + <mrow><mn>2</mn><mi>π</mi><mi>i</mi></mrow> + <mi>N</mi> + </mfrac> + <mi>k</mi><mi>n</mi> + </mrow> + </msup> + + <mo> </mo> + + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + </mrow> + </math> + </td> + <td> + <p>(2)</p> + </td> + </tr> +</table> + +<p> +The complex forward and backward transforms perform the transforms +defined by the following formula (3) and (4), respectively. <i>V</i> +is the vector length mentioned above. Again, calling DFT_execute once +performs <i>V</i> forward or backward transforms at a time. Please +note that (4) gives values multiplied by <i>N</i> compared to +(2). Specifying -1 as the direction of transform performs the +transform defined by (3). In this case, the input should be given as +in (5) , and the output is given as in (6). Specifying 1 as the +direction of transform performs the transform defined by (4), and in +this case, the input should be given as in (6) , and the output is +given as in (5). 
+</p> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>=</mo> + <munderover> + <mo style="font-size:140%;">∑</mo> + <mrow><mi>n</mi><mo>=</mo><mn>0</mn></mrow> + <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow> + </munderover> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <msup> + <mi>e</mi> + <mrow> + <mo>-</mo> + <mfrac> + <mrow><mn>2</mn><mi>π</mi><mi>i</mi></mrow> + <mi>N</mi> + </mfrac> + <mi>k</mi><mi>n</mi> + </mrow> + </msup> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(3)</p> + </td> + </tr> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>=</mo> + <munderover> + <mo style="font-size:140%;">∑</mo> + <mrow><mi>k</mi><mo>=</mo><mn>0</mn></mrow> + <mrow><mi>N</mi><mo>-</mo><mn>1</mn></mrow> + </munderover> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <msup> + <mi>e</mi> + <mrow> + <mfrac> + <mrow><mn>2</mn><mi>π</mi><mi>i</mi></mrow> + <mi>N</mi> + </mfrac> + <mi>k</mi><mi>n</mi> + </mrow> + </msup> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + 
<mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(4)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>n</mi> + <mo>+</mo> + <mn>0</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>n</mi> + <mo>+</mo> + <mn>1</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Im</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(5)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> 
+ <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>k</mi> + <mo>+</mo> + <mn>0</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>k</mi> + <mo>+</mo> + <mn>1</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Im</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(6)</p> + </td> + </tr> +</table> + +<p> +The real forward transform performs the transform defined by (3) when +the condition (7) is satisfied. In this case, the output satisfies +(8). You should specify -1 as the direction of transform, and the +input should be given as in (9), and the output is given as in (10). +The real backward transform is the opposite of the real forward +transform. The input should satisfy (8) and the output satisfies (7). +You should specify 1 as the direction of transform, and the input +should be given as in (10), and the output is given as in (11). 
+</p> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + <mo>=</mo> + <mn>0</mn> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + + </mrow> + </math> + </td> + <td> + <p>(7)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>=</mo> + <msubsup> + <mi>X</mi> + <mrow><mi>N</mi><mo>-</mo><mi>k</mi><mo>,</mo><mi>v</mi></mrow> + <mo>*</mo> + </msubsup> + </mrow> + </mtd> + + <mtd> + <mo> </mo> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>1</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + <mo>=</mo> + <mn>0</mn> + </mrow> + </mtd> + + <mtd> + <mo> </mo> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + 
<mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(8)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mi>n</mi> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(9)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>k</mi> + <mo>+</mo> + <mn>0</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Re</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + 
</mrow> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Re</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>N</mi><mo>/</mo><mn>2</mn><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + + <mtd> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>k</mi> + <mo>+</mo> + <mn>1</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>1</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(10)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mrow> + <mn>2</mn> + <mo> </mo> + <mi>s</mi> + <mo>[</mo> + <mi>n</mi> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + 
<msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(11)</p> + </td> + </tr> +</table> + +<p> +The alternative real transforms are defined by (12) to (16), similarly +to the real transforms. The alternative transforms are handy if you +are migrating from the FFT library made by Prof. Takuya Ooura. You +should specify 1 as the direction in order to perform a forward +transform, and -1 when you perform a backward transform. +</p> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + <mo>=</mo> + <mn>0</mn> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + + </mrow> + </math> + </td> + <td> + <p>(12)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" 
xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>=</mo> + <msubsup> + <mi>x</mi> + <mrow><mi>N</mi><mo>-</mo><mi>n</mi><mo>,</mo><mi>v</mi></mrow> + <mo>*</mo> + </msubsup> + </mrow> + </mtd> + + <mtd> + <mo> </mo> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>1</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + <mo>=</mo> + <mn>0</mn> + </mrow> + </mtd> + + <mtd> + <mo> </mo> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(13)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mi>n</mi> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + 
<mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(14)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mfenced open="{" close=""> + <mtable> + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>n</mi> + <mo>+</mo> + <mn>0</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Re</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Re</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>N</mi><mo>/</mo><mn>2</mn><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + + <mtd> + </mtd> + </mtr> + + <mtr> + <mtd> + <mrow> + <mi>s</mi> + <mo>[</mo> + <mo>(</mo> + <mn>2</mn> + <mi>n</mi> + <mo>+</mo> + <mn>1</mn> + <mo>)</mo> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + </mrow> + </mtd> + + <mtd> + <mo>=</mo> + </mtd> + + <mtd> + <mrow> + <mi>Im</mi> + <mo>(</mo> + <msub><mi>x</mi><mrow><mi>n</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + </mtd> + + <mtd> + <mrow style="font-size:100%;"> + <mi>n</mi> + <mo>=</mo> + 
<mn>1</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mfrac> + <mi>N</mi> + <mn>2</mn> + </mfrac> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mtd> + </mtr> + + </mtable> + </mfenced> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(15)</p> + </td> + </tr> +</table> + +<table border="0" style="margin-right:1.0cm; margin-left:1.0cm; margin-top:0.5cm; margin-bottom:0.5cm;"> + <tr> + <td align="center" style="width:100%;"> + <math mode="display" style="font-size:1.2em;" xmlns="http://www.w3.org/1998/Math/MathML"> + <mrow> + <mrow> + <mn>2</mn> + <mo> </mo> + <mi>s</mi> + <mo>[</mo> + <mi>n</mi> + <mi>V</mi> + <mo>+</mo> + <mi>v</mi> + <mo>]</mo> + + <mo>=</mo> + + <mi>Re</mi> + <mo>(</mo> + <msub><mi>X</mi><mrow><mi>k</mi><mo>,</mo><mi>v</mi></mrow></msub> + <mo>)</mo> + </mrow> + + <mo> </mo> + + <mrow style="font-size:100%;"> + <mi>k</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>N</mi> + <mo>-</mo> + <mn>1</mn> + + <mo> </mo> + <mo>,</mo> + <mo> </mo> + + <mi>v</mi> + <mo>=</mo> + <mn>0</mn> + <mo>,</mo> + <mo>·</mo> + <mo>·</mo> + <mo>·</mo> + <mo>,</mo> + <mi>V</mi> + <mo>-</mo> + <mn>1</mn> + </mrow> + </mrow> + </math> + </td> + <td> + <p>(16)</p> + </td> + </tr> +</table> + + +<h3>Examples</h3> + +<p> +Below is an example code using nsfft library. 
+</p> + +<pre class="code"> +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <complex.h> + +#include "SIMDBase.h" +#include "DFT.h" + +typedef float REAL; +#define TYPE SIMDBase_TYPE_FLOAT + +#define THRES 1e-3 + +double complex omega(double n, double kn) { + return cexp((-2 * M_PI * _Complex_I / n) * kn); +} + +void forward(double complex *ts, double complex *fs, int len) { + int k, n; + + for(k=0;k<len;k++) { + fs[k] = 0; + + for(n=0;n<len;n++) { + fs[k] += ts[n] * omega(len, n*k); + } + } +} + +int main(int argc, char **argv) { + const int n = 256; + + int mode = SIMDBase_chooseBestMode(TYPE); + printf("mode : %d, %s\n", mode, SIMDBase_getModeParamString(SIMDBase_PARAMID_MODE_NAME, mode)); + + int veclen = SIMDBase_getModeParamInt(SIMDBase_PARAMID_VECTOR_LEN, mode); + int sizeOfVect = SIMDBase_getModeParamInt(SIMDBase_PARAMID_SIZE_OF_VECT, mode); + + // + + int i, j; + + DFT *p = DFT_init(mode, n, 0); + REAL *sx = SIMDBase_alignedMalloc(sizeOfVect*n*2); + + // + + double complex ts[veclen][n], fs[veclen][n]; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + ts[j][i] = (random() / (double)RAND_MAX) + (random() / (double)RAND_MAX) * _Complex_I; + sx[(i*2+0)*veclen+j] = creal(ts[j][i]); + sx[(i*2+1)*veclen+j] = cimag(ts[j][i]); + } + } + + // + + DFT_execute(p, mode, sx, -1); + + for(j=0;j<veclen;j++) { + forward(ts[j], fs[j], n); + } + + // + + int success = 1; + + for(j=0;j<veclen;j++) { + for(i=0;i<n;i++) { + if ((fabs(sx[(i*2+0)*veclen+j] - creal(fs[j][i])) > THRES) || + (fabs(sx[(i*2+1)*veclen+j] - cimag(fs[j][i])) > THRES)) { + success = 0; + } + } + } + + printf("%s\n", success ? "OK" : "NG"); + + // + + SIMDBase_alignedFree(sx); + DFT_dispose(p, mode); + + exit(0); +} +</pre> + +<p> +You should put this code under a directory in the root directory of +the library, and then you can compile this code with the following +command. 
+</p> + +<pre class="code"> +gcc -Wall -g -I ../simd -I ../dft -L../simd -L../dft -O DFTExample.c -lDFT -lSIMD -lm -o DFTExample +</pre> + +<h3>Compilation</h3> + +<p> +The nsfft source package includes a few makefiles for various +architectures. You should make symbolic links to the makefiles suited to +your computer under the <i>dft</i> and <i>simd</i> directories. +</p> + +</body> +</html> diff --git a/plugins/supereq/nsfft-1.00/doc/nsfft.pdf b/plugins/supereq/nsfft-1.00/doc/nsfft.pdf Binary files differnew file mode 100644 index 00000000..ed4ad5db --- /dev/null +++ b/plugins/supereq/nsfft-1.00/doc/nsfft.pdf diff --git a/plugins/supereq/nsfft-1.00/ooura/Makefile b/plugins/supereq/nsfft-1.00/ooura/Makefile new file mode 100644 index 00000000..bad1679e --- /dev/null +++ b/plugins/supereq/nsfft-1.00/ooura/Makefile @@ -0,0 +1,11 @@ +CC=gcc +BASEOPT=-Wall -g +OPT=$(BASEOPT) -O3 + +all : fftsg.o + +clean : + rm -f *~ *.o a.out + +fftsg.o : fftsg.c + $(CC) $(OPT) -c fftsg.c diff --git a/plugins/supereq/nsfft-1.00/ooura/README b/plugins/supereq/nsfft-1.00/ooura/README new file mode 100644 index 00000000..d7ddefc2 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/ooura/README @@ -0,0 +1,2 @@ +Please put fftsg.c and pi_fft.c, which are included in Prof. Takuya +Ooura's FFT package, in this directory. diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile b/plugins/supereq/nsfft-1.00/simd/Makefile new file mode 120000 index 00000000..5d253498 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile @@ -0,0 +1 @@ +Makefile.x86avx
\ No newline at end of file diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.altivec b/plugins/supereq/nsfft-1.00/simd/Makefile.altivec new file mode 100644 index 00000000..eeaed6a1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile.altivec @@ -0,0 +1,26 @@ +CC=gcc +BASEOPT=-Wall -maltivec -mabi=altivec +OPT=$(BASEOPT) -O3 + +all : libSIMD.a + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o + +SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o + +SIMDBaseUndiff_altivecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_ALTIVEC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_altivecfloat.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_ALTIVEC_FLOAT SIMDBase.c -c -o SIMDBase.o + +libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_altivecfloat.o + rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_altivecfloat.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.neon b/plugins/supereq/nsfft-1.00/simd/Makefile.neon new file mode 100644 index 00000000..ace704f1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile.neon @@ -0,0 +1,26 @@ +CC=gcc +BASEOPT=-Wall -mfloat-abi=softfp +OPT=$(BASEOPT) -O3 + +all : libSIMD.a + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) 
-DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o + +SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o + +SIMDBaseUndiff_neonfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -mfpu=neon -DENABLE_NEON_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_neonfloat.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_NEON_FLOAT SIMDBase.c -c -o SIMDBase.o + +libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_neonfloat.o + rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_neonfloat.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.purec b/plugins/supereq/nsfft-1.00/simd/Makefile.purec new file mode 100644 index 00000000..2c8b04f1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile.purec @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall +OPT=$(BASEOPT) -O3 + +all : libDFT.a + +DFTpurecfloat.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT DFTUndiff.c -c -o DFTpurecfloat.o + +DFTpurecdouble.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE DFTUndiff.c -c -o DFTpurecdouble.o + +DFTpureclongdouble.o : DFTUndiff.c DFT.h SIMDBase.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE DFTUndiff.c -c -o DFTpureclongdouble.o + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBaseUndiff_purecdouble.o : 
SIMDBaseUndiff.c DFT.h SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o + +SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE SIMDBase.c -c -o SIMDBase.o + +DFT.o : DFT.c DFT.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE DFT.c -c -o DFT.o + +libDFT.a : DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o + rm -f libDFT.a; ar -cvq libDFT.a DFTpurecfloat.o DFTpurecdouble.o DFTpureclongdouble.o DFT.o SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o + +clean : + rm -f *~ *.o *.s *.a diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.x86 b/plugins/supereq/nsfft-1.00/simd/Makefile.x86 new file mode 100644 index 00000000..02f49610 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile.x86 @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall +OPT=$(BASEOPT) -O3 + +all : libSIMD.a + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBase_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_purecdouble.o + +SIMDBase_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBase_pureclongdouble.o + +SIMDBase_ssefloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT SIMDBaseUndiff.c -c -o SIMDBase_ssefloat.o + +SIMDBase_sse2double.o : SIMDBaseUndiff.c SIMDBase.h 
SIMDBaseUndiff.h + $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_sse2double.o + +SIMDBase_avxfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT SIMDBaseUndiff.c -c -o SIMDBase_avxfloat.o + +SIMDBase_avxdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE SIMDBaseUndiff.c -c -o SIMDBase_avxdouble.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE SIMDBase.c -c -o SIMDBase.o + +libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBase_purecdouble.o SIMDBase_pureclongdouble.o SIMDBase_ssefloat.o SIMDBase_sse2double.o + rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBase_purecdouble.o SIMDBase_pureclongdouble.o SIMDBase_ssefloat.o SIMDBase_sse2double.o + +clean : + rm -f *~ *.o *.s *.a a.out diff --git a/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx b/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx new file mode 100644 index 00000000..d9d27a2e --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/Makefile.x86avx @@ -0,0 +1,35 @@ +CC=gcc +BASEOPT=-Wall +OPT=$(BASEOPT) -O3 + +all : libSIMD.a + +SIMDBaseUndiff_purecfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecfloat.o + +SIMDBaseUndiff_purecdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_purecdouble.o + +SIMDBaseUndiff_pureclongdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -DENABLE_PUREC_LONGDOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_pureclongdouble.o + +SIMDBaseUndiff_ssefloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -msse -DENABLE_SSE_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_ssefloat.o + +SIMDBaseUndiff_sse2double.o : SIMDBaseUndiff.c 
SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -msse2 -DENABLE_SSE2_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_sse2double.o + +SIMDBaseUndiff_avxfloat.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_FLOAT SIMDBaseUndiff.c -c -o SIMDBaseUndiff_avxfloat.o + +SIMDBaseUndiff_avxdouble.o : SIMDBaseUndiff.c SIMDBase.h SIMDBaseUndiff.h + $(CC) $(OPT) -mavx -DENABLE_AVX_DOUBLE SIMDBaseUndiff.c -c -o SIMDBaseUndiff_avxdouble.o + +SIMDBase.o : SIMDBase.c SIMDBase.h + $(CC) $(BASEOPT) -O -DENABLE_PUREC_FLOAT -DENABLE_PUREC_DOUBLE -DENABLE_PUREC_LONGDOUBLE -DENABLE_SSE_FLOAT -DENABLE_SSE2_DOUBLE -DENABLE_AVX_FLOAT -DENABLE_AVX_DOUBLE SIMDBase.c -c -o SIMDBase.o + +libSIMD.a : SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_ssefloat.o SIMDBaseUndiff_sse2double.o SIMDBaseUndiff_avxfloat.o SIMDBaseUndiff_avxdouble.o + rm -f libSIMD.a; ar -cvq libSIMD.a SIMDBase.o SIMDBaseUndiff_purecfloat.o SIMDBaseUndiff_purecdouble.o SIMDBaseUndiff_pureclongdouble.o SIMDBaseUndiff_ssefloat.o SIMDBaseUndiff_sse2double.o SIMDBaseUndiff_avxfloat.o SIMDBaseUndiff_avxdouble.o + +clean : + rm -f *~ *.o *.s *.a a.out diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBase.c b/plugins/supereq/nsfft-1.00/simd/SIMDBase.c new file mode 100644 index 00000000..eb51ee10 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/SIMDBase.c @@ -0,0 +1,454 @@ +#include <stdio.h> +#include <stdlib.h> +#include <math.h> +#include <stdint.h> +#include <signal.h> +#include <setjmp.h> +#include <string.h> + +#include "SIMDBase.h" + +void detect_purec_float(void); +void detect_purec_double(void); +void detect_purec_longdouble(void); +void detect_sse_float(void); +void detect_sse2_double(void); +void detect_neon_float(void); +void detect_avx_float(void); +void detect_avx_double(void); +void detect_altivec_float(void); + +int32_t getModeParamInt_purec_float(int32_t paramId); +int32_t getModeParamInt_purec_double(int32_t 
/* Scratch bytes that the per-mode detect_*() probes load from and store to. */
uint8_t detectBuffer[256];

/* Shared buffer: holds the line read from /proc/cpuinfo, and is the storage
 * behind the string returned by SIMDBase_getProcessorNameString() on x86. */
char SIMDBase_processorNameString[256];

/*
 * Returns a pointer to the first character of str1 that follows the prefix
 * str2, or NULL when str1 does not begin with str2.  An empty str2 matches
 * and yields str1 itself.
 */
static char *startsWith(char *str1, const char *str2) {
  const size_t prefixLen = strlen(str2);

  if (strncmp(str1, str2, prefixLen) != 0) return NULL;
  return str1 + prefixLen;
}

#if defined(__linux__)
/*
 * Looks for the first line of /proc/cpuinfo that begins with `entry` and
 * returns a pointer to the text following ": " on that line.
 *
 * Only the first 100 lines are examined.  The returned pointer refers into the
 * global SIMDBase_processorNameString buffer, so it is overwritten by the
 * next call and must not be freed.  Newlines in the line are replaced by
 * spaces.
 *
 * Returns NULL when the file cannot be opened, no line matches, the colon is
 * not found within the first 200 bytes of the line, or the colon is not
 * followed by a space.
 */
static char *tryReadingProcCpuinfo(const char *entry) {
  enum { MAX_LINES = 100, MAX_COLON_OFFSET = 200 };
  char *const buf = SIMDBase_processorNameString;
  const size_t bufSize = sizeof(SIMDBase_processorNameString);
  char *value = NULL;
  int i;

  FILE *fp = fopen("/proc/cpuinfo", "r");
  if (fp == NULL) return NULL;

  for (i = 0; i < MAX_LINES; i++) {
    char *q;
    size_t j;

    /* memset instead of the legacy bzero(), which needs <strings.h>. */
    memset(buf, 0, bufSize);
    if (fgets(buf, (int)bufSize, fp) == NULL) break;

    q = startsWith(buf, entry);
    if (q == NULL) continue;

    for (j = 0; j < bufSize; j++) {
      if (buf[j] == '\n') buf[j] = ' ';
    }

    /* Skip the rest of the key (and padding) up to the ':' separator. */
    while (*q != '\0' && *q != ':' && q - buf < MAX_COLON_OFFSET) q++;

    if (q - buf < MAX_COLON_OFFSET && q[0] == ':' && q[1] == ' ') {
      value = q + 2;
    }
    break;  /* only the first matching line is considered */
  }

  fclose(fp);
  return value;
}
#else
/* /proc/cpuinfo is only consulted on Linux; elsewhere nothing is found. */
static char *tryReadingProcCpuinfo(const char *entry) {
  (void)entry;
  return NULL;
}
#endif
defined(__i386__) +static void SIMDBase_x86cpuid(uint32_t out[4], uint32_t eax, uint32_t ecx) { + uint32_t a, b, c, d; + __asm__ __volatile__("pushl %%eax; \n\t" + "pushl %%ebx; \n\t" + "pushl %%ecx; \n\t" + "pushl %%edx; \n\t" + "cpuid; \n\t" + "movl %%eax, %0; \n\t" + "movl %%ebx, %1; \n\t" + "movl %%ecx, %2; \n\t" + "movl %%edx, %3; \n\t" + "popl %%edx; \n\t" + "popl %%ecx; \n\t" + "popl %%ebx; \n\t" + "popl %%eax; \n\t" + : "=m"(a), "=m"(b), "=m"(c), "=m"(d) + : "a"(eax), "c"(ecx) + : "cc"); + out[0] = a; out[1] = b; out[2] = c; out[3] = d; +} +#endif + +#if defined(__x86_64__) +static void SIMDBase_x86cpuid(uint32_t out[4], uint32_t eax, uint32_t ecx) { + uint32_t a, b, c, d; + __asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx)); + out[0] = a; out[1] = b; out[2] = c; out[3] = d; +} +#endif + +#if defined(__i386__) || defined(__x86_64__) +static void getCacheParam(CacheParam *p) { + static int l2assoc[] = {0,1,2,0,4,0,8,0,16,0,32,48,64,96,128,-1}; + int32_t i; + uint32_t out[4]; + + for(i=0;i<8;i++) { + p->size[i] = p->assoc[i] = 0; + } + + SIMDBase_x86cpuid(out, 4, 0); + + if ((out[0] & 0xf) != 0) { + p->linesize = ((out[1] >> 0) & 2047)+1; + for(i=0;i<8;i++) { + SIMDBase_x86cpuid(out, 4, i); + if ((out[0] & 0xf) == 0) break; + int level = (out[0] >> 5) & 0x7; + int type = (out[0] >> 0) & 0xf; + int assoc = ((out[1] >> 22) & 1023)+1; + int part = ((out[1] >> 12) & 1023)+1; + int lsize = ((out[1] >> 0) & 2047)+1; + int nsets = ((out[2] >> 0))+1; + int nthre = ((out[0] >> 14) & 1023)+1; + + if (type != 1 && type != 3) continue; + p->assoc[level-1] = assoc; + p->size[level-1] = (uint64_t)assoc * part * lsize * nsets / nthre; + } + } else { + SIMDBase_x86cpuid(out, 0x80000008U, 0); + int ncores = (out[2] & 0xff) + 1; + + SIMDBase_x86cpuid(out, 0x80000005U, 0); + p->linesize = out[2] & 255; + p->size[0] = (out[2] >> 24) * 1024 / ncores; + p->assoc[0] = (out[2] >> 16) & 0xff; + + SIMDBase_x86cpuid(out, 0x80000006U, 0); + 
p->size[1] = (out[2] >> 16) * 1024 / ncores; + p->assoc[1] = l2assoc[(out[2] >> 12) & 0xf]; + p->size[2] = (out[3] >> 18) * 512 * 1024 / ncores; + p->assoc[2] = l2assoc[(out[3] >> 12) & 0xf]; + } + + if (p->size[0] == 0) { + p->size[0] = 16 * 1024; + p->assoc[0] = 4; + } + + if (p->size[1] == 0) { + p->size[1] = 256 * 1024; + p->assoc[1] = 4; + } +} + +char *SIMDBase_getProcessorNameString() { + union { + uint32_t info[4]; + uint8_t str[16]; + } u; + int i,j; + char *p; + + p = SIMDBase_processorNameString; + + SIMDBase_x86cpuid(u.info, 0, 0); + + for(i=0;i<4;i++) *p++ = u.str[i+4]; + for(i=0;i<4;i++) *p++ = u.str[i+12]; + for(i=0;i<4;i++) *p++ = u.str[i+8]; + + *p++ = ' '; + + for(i=0;i<3;i++) { + SIMDBase_x86cpuid(u.info, i + 0x80000002, 0); + + for(j=0;j<16;j++) { + *p++ = u.str[j]; + } + } + + *p++ = '\n'; + + return SIMDBase_processorNameString; +} +#else +char *SIMDBase_getProcessorNameString() { + char *p = "Unknown"; +#if defined(__powerpc__) + if ((p = tryReadingProcCpuinfo("cpu")) == NULL) p = "PowerPC"; +#elif defined(__arm__) + if ((p = tryReadingProcCpuinfo("Processor")) == NULL) p = "ARM"; +#endif + + return p; +} +#endif + +int32_t SIMDBase_sizeOfCachelineInByte() { +#if defined(__i386__) || defined(__x86_64__) + CacheParam p; + getCacheParam(&p); + return p.linesize; +#else + return 64; +#endif +} + +int32_t SIMDBase_sizeOfDataCacheInByte() { +#if defined(__i386__) || defined(__x86_64__) + CacheParam p; + getCacheParam(&p); + return p.size[1] + p.size[2]; // L2 + L3 +#else + return 256 * 1024; +#endif +} + +static jmp_buf sigjmp; + +static void sighandler(int signum) { + longjmp(sigjmp, 1); +} + +int32_t SIMDBase_detect(int32_t paramId) { +#if defined(__i386__) || defined(__x86_64__) + uint32_t reg[4]; +#endif + + switch(paramId) { + case SIMDBase_MODE_PUREC_FLOAT: +#if defined(ENABLE_PUREC_FLOAT) + return 1; +#else + return -1; +#endif + case SIMDBase_MODE_PUREC_DOUBLE: +#if defined(ENABLE_PUREC_DOUBLE) + return 1; +#else + return -1; +#endif + 
case SIMDBase_MODE_PUREC_LONGDOUBLE: +#if defined(ENABLE_PUREC_LONGDOUBLE) + return 1; +#else + return -1; +#endif + case SIMDBase_MODE_SSE_FLOAT: +#if defined(ENABLE_SSE_FLOAT) + SIMDBase_x86cpuid(reg, 1, 0); + return (reg[3] & (1 << 25)) != 0; +#else + return -1; +#endif + case SIMDBase_MODE_SSE2_DOUBLE: +#if defined(ENABLE_SSE2_DOUBLE) + SIMDBase_x86cpuid(reg, 1, 0); + return (reg[3] & (1 << 26)) != 0; +#else + return -1; +#endif + case SIMDBase_MODE_AVX_FLOAT: +#if defined(ENABLE_AVX_FLOAT) + SIMDBase_x86cpuid(reg, 1, 0); + return (reg[2] & (1 << 28)) != 0; +#else + return -1; +#endif + case SIMDBase_MODE_AVX_DOUBLE: +#if defined(ENABLE_AVX_DOUBLE) + SIMDBase_x86cpuid(reg, 1, 0); + return (reg[2] & (1 << 28)) != 0; +#else + return -1; +#endif + default: + break; + } + + signal(SIGILL, sighandler); + + if (setjmp(sigjmp) == 0) { + switch(paramId) { +#if defined(ENABLE_NEON_FLOAT) + case SIMDBase_MODE_NEON_FLOAT: + detect_neon_float(); + break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case SIMDBase_MODE_ALTIVEC_FLOAT: + detect_altivec_float(); + break; +#endif + default: + signal(SIGILL, SIG_DFL); + return -1; + } + signal(SIGILL, SIG_DFL); + return 1; + } else { + signal(SIGILL, SIG_DFL); + return 0; + } +} + +int32_t SIMDBase_chooseBestMode(int32_t typeId) { + switch(typeId) { + case SIMDBase_TYPE_HALF: + break; + case SIMDBase_TYPE_FLOAT: + if (SIMDBase_detect(SIMDBase_MODE_AVX_FLOAT) == 1) return SIMDBase_MODE_AVX_FLOAT; + if (SIMDBase_detect(SIMDBase_MODE_SSE_FLOAT) == 1) return SIMDBase_MODE_SSE_FLOAT; + if (SIMDBase_detect(SIMDBase_MODE_NEON_FLOAT) == 1) return SIMDBase_MODE_NEON_FLOAT; + if (SIMDBase_detect(SIMDBase_MODE_ALTIVEC_FLOAT) == 1) return SIMDBase_MODE_ALTIVEC_FLOAT; + if (SIMDBase_detect(SIMDBase_MODE_PUREC_FLOAT) == 1) return SIMDBase_MODE_PUREC_FLOAT; + break; + + case SIMDBase_TYPE_DOUBLE: + if (SIMDBase_detect(SIMDBase_MODE_AVX_DOUBLE) == 1) return SIMDBase_MODE_AVX_DOUBLE; + if (SIMDBase_detect(SIMDBase_MODE_SSE2_DOUBLE) == 1) return 
SIMDBase_MODE_SSE2_DOUBLE; + if (SIMDBase_detect(SIMDBase_MODE_PUREC_DOUBLE) == 1) return SIMDBase_MODE_PUREC_DOUBLE; + break; + + case SIMDBase_TYPE_LONGDOUBLE: + if (SIMDBase_detect(SIMDBase_MODE_PUREC_LONGDOUBLE) == 1) return SIMDBase_MODE_PUREC_LONGDOUBLE; + break; + + case SIMDBase_TYPE_EXTENDED: + break; + + case SIMDBase_TYPE_QUAD: + break; + } + + return SIMDBase_MODE_NONE; +} + +int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode) { + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: return getModeParamInt_purec_float(paramId); break; +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: return getModeParamInt_purec_double(paramId); break; +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: return getModeParamInt_purec_longdouble(paramId); break; +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: return getModeParamInt_sse_float(paramId); break; +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: return getModeParamInt_sse2_double(paramId); break; +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: return getModeParamInt_neon_float(paramId); break; +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: return getModeParamInt_avx_float(paramId); break; +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: return getModeParamInt_avx_double(paramId); break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: return getModeParamInt_altivec_float(paramId); break; +#endif + } + + return -1; +} + +char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode) { + switch(mode) { +#if defined(ENABLE_PUREC_FLOAT) + case 1: return getModeParamString_purec_float(paramId); break; +#endif +#if defined(ENABLE_PUREC_DOUBLE) + case 2: return getModeParamString_purec_double(paramId); break; +#endif +#if defined(ENABLE_PUREC_LONGDOUBLE) + case 3: return getModeParamString_purec_longdouble(paramId); break; +#endif +#if defined(ENABLE_SSE_FLOAT) + case 4: return getModeParamString_sse_float(paramId); break; +#endif +#if defined(ENABLE_SSE2_DOUBLE) + case 5: return 
getModeParamString_sse2_double(paramId); break; +#endif +#if defined(ENABLE_NEON_FLOAT) + case 6: return getModeParamString_neon_float(paramId); break; +#endif +#if defined(ENABLE_AVX_FLOAT) + case 7: return getModeParamString_avx_float(paramId); break; +#endif +#if defined(ENABLE_AVX_DOUBLE) + case 8: return getModeParamString_avx_double(paramId); break; +#endif +#if defined(ENABLE_ALTIVEC_FLOAT) + case 9: return getModeParamString_altivec_float(paramId); break; +#endif + } + + return NULL; +} + +#ifdef ANDROID +int posix_memalign (void **memptr, size_t alignment, size_t size) { + *memptr = malloc (size); + return *memptr ? 0 : -1; +} +#endif + +void *SIMDBase_alignedMalloc(uint64_t size) { + void *p; + if (posix_memalign(&p, SIMDBase_sizeOfCachelineInByte(), size) != 0) abort(); + return p; +} + +void SIMDBase_alignedFree(void *ptr) { + free(ptr); +} + +int32_t SIMDBase_getParamInt(int32_t paramId) { + switch(paramId) { + case SIMDBase_PARAMID_MODE_MAX: + return SIMDBase_LAST_MODE + 1; + } + + return -1; +} + +int32_t SIMDBase_getTypeParamInt(int32_t paramId, int32_t typeId) { + switch(typeId) { + } + + return -1; +} diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBase.h b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h new file mode 100644 index 00000000..5382b4d1 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/SIMDBase.h @@ -0,0 +1,51 @@ +#ifndef _SIMDBase_H_ +#define _SIMDBase_H_ + +#define SIMDBase_TYPE_FLOAT ( 1 | ( 1 << 24 )) +#define SIMDBase_TYPE_DOUBLE ( 2 | ( 1 << 24 )) +#define SIMDBase_TYPE_LONGDOUBLE ( 3 | ( 1 << 24 )) +#define SIMDBase_TYPE_EXTENDED ( 4 | ( 1 << 24 )) +#define SIMDBase_TYPE_QUAD ( 5 | ( 1 << 24 )) +#define SIMDBase_TYPE_HALF ( 6 | ( 1 << 24 )) + +#define SIMDBase_MODE_NONE 0 +#define SIMDBase_MODE_PUREC_FLOAT 1 +#define SIMDBase_MODE_PUREC_DOUBLE 2 +#define SIMDBase_MODE_PUREC_LONGDOUBLE 3 +#define SIMDBase_MODE_SSE_FLOAT 4 +#define SIMDBase_MODE_SSE2_DOUBLE 5 +#define SIMDBase_MODE_NEON_FLOAT 6 +#define SIMDBase_MODE_AVX_FLOAT 7 
+#define SIMDBase_MODE_AVX_DOUBLE 8 +#define SIMDBase_MODE_ALTIVEC_FLOAT 9 + +#define SIMDBase_LAST_MODE SIMDBase_MODE_ALTIVEC_FLOAT + +#define SIMDBase_PARAMID_MODE_MAX ( 1 | ( 2 << 24 )) +#define SIMDBase_PARAMID_TYPE_AVAILABILITY ( 2 | ( 2 << 24 )) +#define SIMDBase_PARAMID_SIZE_OF_REAL ( 3 | ( 2 << 24 )) +#define SIMDBase_PARAMID_SIZE_OF_VECT ( 4 | ( 2 << 24 )) +#define SIMDBase_PARAMID_VECTOR_LEN ( 5 | ( 2 << 24 )) +#define SIMDBase_PARAMID_MODE_AVAILABILITY ( 6 | ( 2 << 24 )) +#define SIMDBase_PARAMID_MODE_NAME ( 7 | ( 2 << 24 )) + +// + +typedef struct { + uint32_t linesize; + uint32_t size[8], assoc[8]; +} CacheParam; + +void *SIMDBase_alignedMalloc(uint64_t size); +void SIMDBase_alignedFree(void *ptr); +int32_t SIMDBase_sizeOfCachelineInByte(); +int32_t SIMDBase_sizeOfDataCacheInByte(); +int32_t SIMDBase_chooseBestMode(int32_t typeId); +char *SIMDBase_getProcessorNameString(); +int32_t SIMDBase_detect(int32_t paramId); +int32_t SIMDBase_getParamInt(int32_t paramId); +int32_t SIMDBase_getTypeParamInt(int32_t paramId, int32_t typeId); +int32_t SIMDBase_getModeParamInt(int32_t paramId, int32_t mode); +char *SIMDBase_getModeParamString(int32_t paramId, int32_t mode); + +#endif diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c new file mode 100644 index 00000000..257a5ff0 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.c @@ -0,0 +1,38 @@ +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> + +#include "SIMDBase.h" +#include "SIMDBaseUndiff.h" + +void SIMDBaseUndiff_DETECT() { + extern uint8_t detectBuffer[256]; + SIMDBase_VECT a = SIMDBase_LOAD((SIMDBase_VECT *)&detectBuffer[0]); + SIMDBase_VECT b = SIMDBase_LOAD((SIMDBase_VECT *)&detectBuffer[64]); + SIMDBase_VECT c = SIMDBase_ADDi(a, b); + SIMDBase_STOR((SIMDBase_VECT *)&detectBuffer[128], c); +} + +int32_t SIMDBaseUndiff_GETMODEPARAMINT(int32_t paramId) { + switch(paramId) { + case SIMDBase_PARAMID_SIZE_OF_REAL: + 
return sizeof(SIMDBase_REAL); + case SIMDBase_PARAMID_SIZE_OF_VECT: + return sizeof(SIMDBase_VECT); + case SIMDBase_PARAMID_VECTOR_LEN: + return SIMDBase_VECTLEN; + case SIMDBase_PARAMID_MODE_AVAILABILITY: + return SIMDBase_detect(paramId); + } + + return -1; +} + +char * SIMDBaseUndiff_GETMODEPARAMSTRING(int32_t paramId) { + switch(paramId) { + case SIMDBase_PARAMID_MODE_NAME: + return SIMDBase_NAME; + } + + return NULL; +} diff --git a/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h new file mode 100644 index 00000000..1af849a8 --- /dev/null +++ b/plugins/supereq/nsfft-1.00/simd/SIMDBaseUndiff.h @@ -0,0 +1,231 @@ +#ifndef _SIMDBaseUndiff_H_ +#define _SIMDBaseUndiff_H_ + +#if defined(ENABLE_PUREC_FLOAT) //////////////////////////////////////////// + +typedef float SIMDBase_REAL; +typedef float SIMDBase_VECT; + +#define SIMDBase_MODE 1 +#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT +#define SIMDBase_VECTLEN 1 +#define SIMDBase_NAME "Pure C float" +#define SIMDBaseUndiff_DETECT detect_purec_float +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_float +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_float + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; } + +#elif defined(ENABLE_PUREC_DOUBLE) //////////////////////////////////////////// + +typedef double SIMDBase_REAL; +typedef 
double SIMDBase_VECT; + +#define SIMDBase_MODE 2 +#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE +#define SIMDBase_VECTLEN 1 +#define SIMDBase_NAME "Pure C double" +#define SIMDBaseUndiff_DETECT detect_purec_double +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_double +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_double + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; } + +#elif defined(ENABLE_PUREC_LONGDOUBLE) //////////////////////////////////////////// + +typedef long double SIMDBase_REAL; +typedef long double SIMDBase_VECT; + +#define SIMDBase_MODE 3 +#define SIMDBase_TYPE SIMDBase_TYPE_LONGDOUBLE +#define SIMDBase_VECTLEN 1 +#define SIMDBase_NAME "Pure C long double" +#define SIMDBaseUndiff_DETECT detect_purec_longdouble +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_purec_longdouble +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_purec_longdouble + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return *p; } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { *p = u; } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return f; } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return *p; } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return u + v; } +static inline SIMDBase_VECT 
SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return u - v; } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return u * v; } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return -u; } + +#elif defined(ENABLE_SSE_FLOAT) //////////////////////////////////////////// + +#include <xmmintrin.h> + +typedef float SIMDBase_REAL; +typedef __m128 SIMDBase_VECT; + +#define SIMDBase_MODE 4 +#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT +#define SIMDBase_VECTLEN 4 +#define SIMDBase_NAME "x86 SSE float" +#define SIMDBaseUndiff_DETECT detect_sse_float +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_sse_float +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_sse_float + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm_load_ps((float *)p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm_store_ps((float *)p, u); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm_set1_ps(f); } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm_load1_ps(p); } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_add_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_sub_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_mul_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm_xor_ps(u, _mm_set_ps(-0.0f, -0.0f, -0.0f, -0.0f)); } + +#elif defined(ENABLE_SSE2_DOUBLE) //////////////////////////////////////////// + +#include <emmintrin.h> + +typedef double SIMDBase_REAL; +typedef __m128d SIMDBase_VECT; + +#define SIMDBase_MODE 5 +#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE +#define SIMDBase_VECTLEN 2 +#define SIMDBase_NAME "x86 SSE2 double" +#define SIMDBaseUndiff_DETECT detect_sse2_double +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_sse2_double +#define 
SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_sse2_double + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm_load_pd((double *)p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm_store_pd((double *)p, u); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm_set1_pd(f); } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm_load1_pd(p); } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_add_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_sub_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm_mul_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm_xor_pd(u, _mm_set_pd(-0.0, -0.0)); } + +#elif defined(ENABLE_NEON_FLOAT) //////////////////////////////////////////// + +#include <arm_neon.h> + +typedef float32_t SIMDBase_REAL; +typedef float32x4_t SIMDBase_VECT; + +#define SIMDBase_MODE 6 +#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT +#define SIMDBase_VECTLEN 4 +#define SIMDBase_NAME "ARM NEON float" +#define SIMDBaseUndiff_DETECT detect_neon_float +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_neon_float +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_neon_float + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return vld1q_f32((float32_t *)p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { vst1q_f32((float32_t *)p, u); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return vdupq_n_f32(f); } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return vdupq_n_f32(*p); } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return vaddq_f32(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return vsubq_f32(u, v); } +static inline SIMDBase_VECT 
SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return vmulq_f32(u, v); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { + return vreinterpretq_f32_u32( veorq_u32(vreinterpretq_u32_f32(u), vdupq_n_u32(0x80000000U))); +} + +#define SIMDBase_FMADD_AVAILABLE + +static inline SIMDBase_VECT SIMDBase_FMADDi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vmlaq_f32(w, u, v); } // w + u * v +static inline SIMDBase_VECT SIMDBase_FMSUBi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vmlsq_f32(w, u, v); } // w - u * v + +#elif defined(ENABLE_AVX_FLOAT) //////////////////////////////////////////// + +#include <immintrin.h> + +typedef float SIMDBase_REAL; +typedef __m256 SIMDBase_VECT; + +#define SIMDBase_MODE 7 +#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT +#define SIMDBase_VECTLEN 8 +#define SIMDBase_NAME "x86 AVX float" +#define SIMDBaseUndiff_DETECT detect_avx_float +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_avx_float +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_avx_float + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm256_load_ps((float *)p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm256_store_ps((float *)p, u); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm256_set1_ps(f); } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm256_set1_ps(*p); } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_add_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_sub_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_mul_ps(u, v); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm256_xor_ps(u, _mm256_set1_ps(-0.0f)); } + +#elif defined(ENABLE_AVX_DOUBLE) //////////////////////////////////////////// + +#include <immintrin.h> + +typedef 
double SIMDBase_REAL; +typedef __m256d SIMDBase_VECT; + +#define SIMDBase_MODE 8 +#define SIMDBase_TYPE SIMDBase_TYPE_DOUBLE +#define SIMDBase_VECTLEN 4 +#define SIMDBase_NAME "x86 AVX double" +#define SIMDBaseUndiff_DETECT detect_avx_double +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_avx_double +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_avx_double + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return _mm256_load_pd((double *)p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { _mm256_store_pd((double *)p, u); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return _mm256_set1_pd(f); } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return _mm256_set1_pd(*p); } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_add_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_sub_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return _mm256_mul_pd(u, v); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return _mm256_xor_pd(u, _mm256_set1_pd(-0.0)); } + +#elif defined(ENABLE_ALTIVEC_FLOAT) //////////////////////////////////////////// + +#include <altivec.h> + +typedef float SIMDBase_REAL; +typedef vector float SIMDBase_VECT; + +#define SIMDBase_MODE 9 +#define SIMDBase_TYPE SIMDBase_TYPE_FLOAT +#define SIMDBase_VECTLEN 4 +#define SIMDBase_NAME "PowerPC AltiVec float" +#define SIMDBaseUndiff_DETECT detect_altivec_float +#define SIMDBaseUndiff_GETMODEPARAMINT getModeParamInt_altivec_float +#define SIMDBaseUndiff_GETMODEPARAMSTRING getModeParamString_altivec_float + +static inline SIMDBase_VECT SIMDBase_LOAD(SIMDBase_VECT *p) { return vec_ld(0, p); } +static inline void SIMDBase_STOR(SIMDBase_VECT *p, SIMDBase_VECT u) { vec_st(u, 0, p); } +static inline SIMDBase_VECT SIMDBase_SET1(SIMDBase_REAL f) { return (vector float){f, f, 
f, f}; } +static inline SIMDBase_VECT SIMDBase_LOAD1(SIMDBase_REAL *p) { return (vector float){*p, *p, *p, *p}; } +static inline SIMDBase_VECT SIMDBase_ADDi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_add(u, v); } +static inline SIMDBase_VECT SIMDBase_SUBi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_sub(u, v); } +static inline SIMDBase_VECT SIMDBase_MULi(SIMDBase_VECT u, SIMDBase_VECT v) { return vec_madd(u, v, (vector float){0, 0, 0, 0}); } +static inline SIMDBase_VECT SIMDBase_NEGi(SIMDBase_VECT u) { return vec_xor(u, (vector float){-0.0f, -0.0f, -0.0f, -0.0f}); } + +#define SIMDBase_FMADD_AVAILABLE + +static inline SIMDBase_VECT SIMDBase_FMADDi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vec_madd(u, v, w); } // w + u * v +static inline SIMDBase_VECT SIMDBase_FMSUBi(SIMDBase_VECT u, SIMDBase_VECT v, SIMDBase_VECT w) { return vec_nmsub(u, v, w); } // w - u * v + +#endif //////////////////////////////////////////////////////////////////// + +static inline SIMDBase_VECT SIMDBase_ADDm(SIMDBase_VECT *p, SIMDBase_VECT *q) { return SIMDBase_ADDi(SIMDBase_LOAD(p), SIMDBase_LOAD(q)); } +static inline SIMDBase_VECT SIMDBase_SUBm(SIMDBase_VECT *p, SIMDBase_VECT *q) { return SIMDBase_SUBi(SIMDBase_LOAD(p), SIMDBase_LOAD(q)); } + +#endif |