diff options
Diffstat (limited to 'libao2/fir.h')
-rw-r--r-- | libao2/fir.h | 164 |
1 files changed, 58 insertions, 106 deletions
diff --git a/libao2/fir.h b/libao2/fir.h index 8690824784..6c2d5e646c 100644 --- a/libao2/fir.h +++ b/libao2/fir.h @@ -11,123 +11,75 @@ #ifndef __FIR_H__ #define __FIR_H__ -/* 4, 8 and 16 tap FIR filters implemented using SSE instructions - int16_t* x Input data - int16_t* y Output value - int16_t* w Filter weights - - C function - for(int i = 0 ; i < L ; i++) - *y += w[i]*x[i]; -*/ +/* Fixed-point 16-bit FIR filter. The filter is implemented +both in C and MMX assembly. The filter consists of one macro +UPDATE_QUE and one inline function firn. The macro can be used for +adding new data to the circular buffer used by the filter firn. +Limitations: max length of n = 16*4 and n must be a multiple of 4 (pad +filter with zeros for other lengths). Sometimes it works with filters +longer than 4*16 (the problem is overshoot and the accumulated energy +in the filter taps). */ -#ifdef HAVE_SSE +#ifdef HAVE_MMX +inline int32_t firn(int16_t* x, int16_t* w, int16_t n) +{ + register int32_t y; // Output + // Prologue + asm volatile(" pxor %mm1, %mm1;\n" ); // Clear buffer yt + // Main loop + while((n-=4)>=0){ + asm volatile( + " movq (%1), %%mm0;\n" // Load x(n:n+4) + " pmaddwd (%0), %%mm0;\n" // yt(n:n+1)=sum(x(n:n+4).*w(n:n+4)) + " psrld $16, %%mm0;\n" // yt(n:n+1)=yt(n:n+1)>>16 + " paddd %%mm0, %%mm1;\n" // yt(n:n+1)=yt(n-2:n-1)+yt(n:n+1) + :: "r" (w), "r" (x)); + w+=4; x+=4; + } + // Epilogue + asm volatile( + " movq %%mm1, %%mm0;\n" + " punpckhdq %%mm1, %%mm0;\n" + " paddd %%mm0, %%mm1;\n" //yt(n)=yt(n)+yt(n+1) + " movd %%mm1, %0 ;\n" //y=yt + " emms ;\n" + : "=&r" (y)); + return y; +} -// This block should be MMX only compatible, but it isn't... 
-#ifdef L4 -#define LOAD_QUE(x) \ - __asm __volatile("movq %0, %%mm2\n\t" \ - : \ - :"m"((x)[0]) \ - :"memory"); -#define SAVE_QUE(x) \ - __asm __volatile("movq %%mm2, %0\n\t" \ - "emms \n\t" \ - :"=m"(x[0]) \ - : \ - :"memory"); -#define UPDATE_QUE(in) \ - __asm __volatile("psllq $16, %%mm2\n\t" \ - "pinsrw $0, %0,%%mm2\n\t" \ - : \ - :"m" ((in)[0]) \ - :"memory"); -#define FIR(x,w,y) \ - __asm __volatile("movq %%mm2, %%mm0\n\t" \ - "pmaddwd %1, %%mm0\n\t" \ - "movq %%mm0, %%mm1\n\t" \ - "psrlq $32, %%mm1\n\t" \ - "paddd %%mm0, %%mm1\n\t" \ - "movd %%mm1, %%esi\n\t" \ - "shrl $16, %%esi\n\t" \ - "movw %%si, %0\n\t" \ - : "=m" ((y)[0]) \ - : "m" ((w)[0]) \ - : "memory", "%esi"); -#endif /* L4 */ +#else /* HAVE_MMX */ -// It is possible to make the 8 bit filter a lot faster by using the -// 128 bit registers, feel free to optimize. -#ifdef L8 -#define LOAD_QUE(x) \ - __asm __volatile("movq %0, %%mm5\n\t" \ - "movq %1, %%mm4\n\t" \ - : \ - :"m"((x)[0]), \ - "m"((x)[4]) \ - :"memory"); -#define SAVE_QUE(x) \ - __asm __volatile("movq %%mm5, %0\n\t" \ - "movq %%mm4, %1\n\t" \ - "emms \n\t" \ - :"=m"((x)[0]), \ - "=m"((x)[4]) \ - : \ - :"memory"); - -// Below operation could replace line 2 to 5 in macro below but can -// not cause of compiler bug ??? 
-// "pextrw $3, %%mm5,%%eax\n\t" -#define UPDATE_QUE(in) \ - __asm __volatile("psllq $16, %%mm4\n\t" \ - "movq %%mm5, %%mm0\n\t" \ - "psrlq $48, %%mm0\n\t" \ - "movd %%mm0, %%eax\n\t" \ - "pinsrw $0, %%eax,%%mm4\n\t" \ - "psllq $16, %%mm5\n\t" \ - "pinsrw $0, %0,%%mm5\n\t" \ - : \ - :"m" ((in)[0]) \ - :"memory", "%eax"); -#define FIR(x,w,y) \ - __asm __volatile("movq %%mm5, %%mm0\n\t" \ - "pmaddwd %1, %%mm0\n\t" \ - "movq %%mm4, %%mm1\n\t" \ - "pmaddwd %2, %%mm1\n\t" \ - "paddd %%mm1, %%mm0\n\t" \ - "movq %%mm0, %%mm1\n\t" \ - "psrlq $32, %%mm1\n\t" \ - "paddd %%mm0, %%mm1\n\t" \ - "movd %%mm1, %%esi\n\t" \ - "shrl $16, %%esi\n\t" \ - "movw %%si, %0\n\t" \ - : "=m" ((y)[0]) \ - : "m" ((w)[0]), \ - "m" ((w)[4]) \ - : "memory", "%esi"); -#endif /* L8 */ +// Same thing as above but in C +inline int32_t firn(int16_t* x, int16_t* w, int16_t n) +{ + register int32_t y=0; + while((n-=4) >=0) + y+=w[n]*x[n]+w[n+1]*x[n+1]+w[n+2]*x[n+2]+w[n+3]*x[n+3] >> 16; + return y; +} -#else /* HAVE_SSE */ +#endif /* HAVE_MMX */ -#define LOAD_QUE(x) -#define SAVE_QUE(x) -#define UPDATE_QUE(inm) \ - xi=(--xi)&(L-1); \ - x[xi]=x[xi+L]=*(inm); +// Macro to add new data to circular queue +#define UPDATE_QUE(ind,xq,xid) \ + xid=(--xid)&(L-1); \ + xq[xid]=xq[xid+L]=*(ind); -#ifdef L4 -#define FIR(x,w,y) \ - y[0]=(w[0]*x[0]+w[1]*x[1]+w[2]*x[2]+w[3]*x[3]) >> 16; -#else +#ifdef L8 +#ifdef HAVE_MMX +#define FIR(x,w,y) *y=(int16_t)firn(x,w,8); +#else /* HAVE_MMX */ +// Unrolled loop to speed up execution #define FIR(x,w,y){ \ int16_t a = (w[0]*x[0]+w[1]*x[1]+w[2]*x[2]+w[3]*x[3]) >> 16; \ int16_t b = (w[4]*x[4]+w[5]*x[5]+w[6]*x[6]+w[7]*x[7]) >> 16; \ y[0] = a+b; \ } -#endif /* L4 */ +#endif /* HAVE_MMX */ +#endif /* L8 */ -#endif /* HAVE_SSE */ +#ifdef L16 +#define FIR(x,w,y) *y=(int16_t)firn(x,w,16); +#endif /* L16 */ #endif /* __FIR_H__ */ - - |