summary | refs | log | tree | commit | diff
path: root/plugins/ffap/ffap.c
diff options
context:
space:
mode:
author: waker <wakeroid@gmail.com> 2011-06-15 21:57:06 +0200
committer: waker <wakeroid@gmail.com> 2011-06-15 21:57:06 +0200
commit: f702e5270db5bd4576765c8415c456e443513006 (patch)
tree: c901fa7a600bc1fa1496b5af716b7d375be7000a /plugins/ffap/ffap.c
parent: 10deb233f02b3b3234156c6cf535d06115decd03 (diff)
added better sse2 filter implementation to ape plugin;
added yasm support
Diffstat (limited to 'plugins/ffap/ffap.c')
-rw-r--r-- plugins/ffap/ffap.c 132
1 files changed, 4 insertions, 128 deletions
diff --git a/plugins/ffap/ffap.c b/plugins/ffap/ffap.c
index 7f9022c9..ccfa2b2d 100644
--- a/plugins/ffap/ffap.c
+++ b/plugins/ffap/ffap.c
@@ -1275,89 +1275,8 @@ typedef int x86_reg;
typedef struct { uint64_t a, b; } xmm_reg;
#define DECLARE_ALIGNED(n,t,v) t v __attribute__ ((aligned (n)))
#define DECLARE_ALIGNED_16(t, v) DECLARE_ALIGNED(16, t, v)
-static int32_t scalarproduct_int16_sse2 (int16_t * v1, int16_t * v2, int order, int shift)
-{
- int res = 0;
- DECLARE_ALIGNED_16(xmm_reg, sh);
- x86_reg o = -(order << 1);
-
- v1 += order;
- v2 += order;
- sh.a = shift;
- __asm__ volatile(
- "pxor %%xmm7, %%xmm7 \n\t"
- "1: \n\t"
- "movdqu (%0,%3), %%xmm0 \n\t"
- "movdqu 16(%0,%3), %%xmm1 \n\t"
- "pmaddwd (%1,%3), %%xmm0 \n\t"
- "pmaddwd 16(%1,%3), %%xmm1 \n\t"
- "paddd %%xmm0, %%xmm7 \n\t"
- "paddd %%xmm1, %%xmm7 \n\t"
- "add $32, %3 \n\t"
- "js 1b \n\t"
- "movhlps %%xmm7, %%xmm2 \n\t"
- "paddd %%xmm2, %%xmm7 \n\t"
- "psrad %4, %%xmm7 \n\t"
- "pshuflw $0x4E, %%xmm7,%%xmm2 \n\t"
- "paddd %%xmm2, %%xmm7 \n\t"
- "movd %%xmm7, %2 \n\t"
- : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
- : "m"(sh)
- );
- return res;
-}
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
- x86_reg o = -(order << 1);
- v1 += order;
- v2 += order;
- __asm__ volatile(
- "1: \n\t"
- "movdqu (%1,%2), %%xmm0 \n\t"
- "movdqu 16(%1,%2), %%xmm1 \n\t"
- "paddw (%0,%2), %%xmm0 \n\t"
- "paddw 16(%0,%2), %%xmm1 \n\t"
- "movdqa %%xmm0, (%0,%2) \n\t"
- "movdqa %%xmm1, 16(%0,%2) \n\t"
- "add $32, %2 \n\t"
- "js 1b \n\t"
- : "+r"(v1), "+r"(v2), "+r"(o)
- );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
- x86_reg o = -(order << 1);
- v1 += order;
- v2 += order;
- __asm__ volatile(
- "1: \n\t"
- "movdqa (%0,%2), %%xmm0 \n\t"
- "movdqa 16(%0,%2), %%xmm2 \n\t"
- "movdqu (%1,%2), %%xmm1 \n\t"
- "movdqu 16(%1,%2), %%xmm3 \n\t"
- "psubw %%xmm1, %%xmm0 \n\t"
- "psubw %%xmm3, %%xmm2 \n\t"
- "movdqa %%xmm0, (%0,%2) \n\t"
- "movdqa %%xmm2, 16(%0,%2) \n\t"
- "add $32, %2 \n\t"
- "js 1b \n\t"
- : "+r"(v1), "+r"(v2), "+r"(o)
- );
-}
#endif
-static int32_t
-scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
-{
- int res = 0;
-
- while (order--)
- res += (*v1++ * *v2++) >> shift;
-
- return res;
-}
-
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
int res = 0;
@@ -1368,32 +1287,9 @@ static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, co
return res;
}
-static void
-add_int16_c (int16_t *v1/*align 16*/, int16_t *v2, int len) {
- while (len--) {
- *v1++ += *v2++;
- }
-}
-
-static void
-sub_int16_c (int16_t *v1/*align 16*/, int16_t *v2, int len) {
- while (len--) {
- *v1++ -= *v2++;
- }
-}
-
-static int32_t
-(*scalarproduct_int16)(int16_t * v1, int16_t * v2, int order, int shift);
-
static int32_t
(*scalarproduct_and_madd_int16)(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
-static void
-(*add_int16) (int16_t *v1/*align 16*/, int16_t *v2, int len);
-
-static void
-(*sub_int16) (int16_t *v1/*align 16*/, int16_t *v2, int len);
-
static inline int16_t clip_int16(int a)
{
if ((a+32768) & ~65535) return (a>>31) ^ 32767;
@@ -1425,17 +1321,6 @@ static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f,
int absres;
while (count--) {
-#if 0
- /* round fixedpoint scalar product */
- res = (scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits;
-
-
- if (*data < 0)
- add_int16(f->coeffs, f->adaptcoeffs - order, order);
- else if (*data > 0)
- sub_int16(f->coeffs, f->adaptcoeffs - order, order);
-#endif
-
res = scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data));
res = (res + (1 << (fracbits - 1))) >> fracbits;
res += *data;
@@ -2019,6 +1904,8 @@ static DB_decoder_t plugin = {
#if HAVE_SSE2 && !ARCH_UNKNOWN
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+
#define FF_MM_MMX 0x0001 ///< standard MMX
#define FF_MM_3DNOW 0x0004 ///< AMD 3DNOW
#define FF_MM_MMX2 0x0002 ///< SSE integer functions or AMD MMX ext
@@ -2131,32 +2018,21 @@ DB_plugin_t *
ffap_load (DB_functions_t *api) {
// detect sse2
#if ARCH_ARM
- scalarproduct_int16 = EXTERN_ASMff_scalarproduct_int16_neon;
scalarproduct_and_madd_int16 = EXTERN_ASMff_scalarproduct_and_madd_int16_neon;
- add_int16 = add_int16_c;
- sub_int16 = sub_int16_c;
#elif HAVE_SSE2 && !ARCH_UNKNOWN
-#error SSE2 version is broken in this branch, missing ff_scalarproduct_and_madd_int16_sse2
trace ("ffap: was compiled with sse2 support\n");
int mm_flags = mm_support ();
if (mm_flags & FF_MM_SSE2) {
trace ("ffap: sse2 support detected\n");
- scalarproduct_int16 = scalarproduct_int16_sse2;
- add_int16 = add_int16_sse2;
- sub_int16 = sub_int16_sse2;
+ scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
}
else {
trace ("ffap: sse2 is not supported by CPU\n");
- scalarproduct_int16 = scalarproduct_int16_c;
- add_int16 = add_int16_c;
- sub_int16 = sub_int16_c;
+ scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
}
#else
// trace ("ffap: sse2 support was not compiled in\n");
- scalarproduct_int16 = scalarproduct_int16_c;
scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
- add_int16 = add_int16_c;
- sub_int16 = sub_int16_c;
#endif
deadbeef = api;
return DB_PLUGIN (&plugin);