added better sse2 filter implementation to ape plugin;

added yasm support
author: waker <wakeroid@gmail.com> 2011-06-15 21:57:06 +0200
committer: waker <wakeroid@gmail.com> 2011-06-15 21:57:06 +0200
commit: f702e5270db5bd4576765c8415c456e443513006 (patch)
tree: c901fa7a600bc1fa1496b5af716b7d375be7000a /plugins/ffap/ffap.c
parent: 10deb233f02b3b3234156c6cf535d06115decd03 (diff)
1 files changed, 4 insertions, 128 deletions
diff --git a/plugins/ffap/ffap.c b/plugins/ffap/ffap.c
index 7f9022c9..ccfa2b2d 100644
--- a/plugins/ffap/ffap.c
+++ b/plugins/ffap/ffap.c
@@ -1275,89 +1275,8 @@ typedef int x86_reg;
 typedef struct { uint64_t a, b; } xmm_reg;
 #define DECLARE_ALIGNED(n,t,v)      t v __attribute__ ((aligned (n)))
 #define DECLARE_ALIGNED_16(t, v) DECLARE_ALIGNED(16, t, v)
-static int32_t scalarproduct_int16_sse2 (int16_t * v1, int16_t * v2, int order, int shift)
-{
-    int res = 0;
-    DECLARE_ALIGNED_16(xmm_reg, sh);
-    x86_reg o = -(order << 1);
-
-    v1 += order;
-    v2 += order;
-    sh.a = shift;
-    __asm__ volatile(
-        "pxor      %%xmm7,  %%xmm7        \n\t"
-        "1:                               \n\t"
-        "movdqu    (%0,%3), %%xmm0        \n\t"
-        "movdqu  16(%0,%3), %%xmm1        \n\t"
-        "pmaddwd   (%1,%3), %%xmm0        \n\t"
-        "pmaddwd 16(%1,%3), %%xmm1        \n\t"
-        "paddd     %%xmm0,  %%xmm7        \n\t"
-        "paddd     %%xmm1,  %%xmm7        \n\t"
-        "add       $32,     %3            \n\t"
-        "js        1b                     \n\t"
-        "movhlps   %%xmm7,  %%xmm2        \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "psrad     %4,      %%xmm7        \n\t"
-        "pshuflw   $0x4E,   %%xmm7,%%xmm2 \n\t"
-        "paddd     %%xmm2,  %%xmm7        \n\t"
-        "movd      %%xmm7,  %2            \n\t"
-        : "+r"(v1), "+r"(v2), "=r"(res), "+r"(o)
-        : "m"(sh)
-    );
-    return res;
-}
-static void add_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                          \n\t"
-        "movdqu   (%1,%2),   %%xmm0  \n\t"
-        "movdqu 16(%1,%2),   %%xmm1  \n\t"
-        "paddw    (%0,%2),   %%xmm0  \n\t"
-        "paddw  16(%0,%2),   %%xmm1  \n\t"
-        "movdqa   %%xmm0,    (%0,%2) \n\t"
-        "movdqa   %%xmm1,  16(%0,%2) \n\t"
-        "add      $32,       %2      \n\t"
-        "js       1b                 \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
-
-static void sub_int16_sse2(int16_t * v1, int16_t * v2, int order)
-{
-    x86_reg o = -(order << 1);
-    v1 += order;
-    v2 += order;
-    __asm__ volatile(
-        "1:                           \n\t"
-        "movdqa    (%0,%2),   %%xmm0  \n\t"
-        "movdqa  16(%0,%2),   %%xmm2  \n\t"
-        "movdqu    (%1,%2),   %%xmm1  \n\t"
-        "movdqu  16(%1,%2),   %%xmm3  \n\t"
-        "psubw     %%xmm1,    %%xmm0  \n\t"
-        "psubw     %%xmm3,    %%xmm2  \n\t"
-        "movdqa    %%xmm0,    (%0,%2) \n\t"
-        "movdqa    %%xmm2,  16(%0,%2) \n\t"
-        "add       $32,       %2      \n\t"
-        "js        1b                 \n\t"
-        : "+r"(v1), "+r"(v2), "+r"(o)
-    );
-}
 #endif
 
-static int32_t
-scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
-{
-    int res = 0;
-
-    while (order--)
-        res += (*v1++ * *v2++) >> shift;
-
-    return res;
-}
-
 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
 {
     int res = 0;
@@ -1368,32 +1287,9 @@ static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, co
     return res;
 }
 
-static void
-add_int16_c (int16_t *v1/*align 16*/, int16_t *v2, int len) {
-    while (len--) {
-        *v1++ += *v2++;
-    }
-}
-
-static void
-sub_int16_c (int16_t *v1/*align 16*/, int16_t *v2, int len) {
-    while (len--) {
-        *v1++ -= *v2++;
-    }
-}
-
-static int32_t
-(*scalarproduct_int16)(int16_t * v1, int16_t * v2, int order, int shift);
-
 static int32_t
 (*scalarproduct_and_madd_int16)(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
 
-static void
-(*add_int16) (int16_t *v1/*align 16*/, int16_t *v2, int len);
-
-static void
-(*sub_int16) (int16_t *v1/*align 16*/, int16_t *v2, int len);
-
 static inline int16_t clip_int16(int a)
 {
     if ((a+32768) & ~65535) return (a>>31) ^ 32767;
@@ -1425,17 +1321,6 @@ static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f,
     int absres;
 
     while (count--) {
-#if 0
-        /* round fixedpoint scalar product */
-        res = (scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits;
-
-
-        if (*data < 0)
-            add_int16(f->coeffs, f->adaptcoeffs - order, order);
-        else if (*data > 0)
-            sub_int16(f->coeffs, f->adaptcoeffs - order, order);
-#endif
-
         res = scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data));
         res = (res + (1 << (fracbits - 1))) >> fracbits;
         res += *data;
@@ -2019,6 +1904,8 @@ static DB_decoder_t plugin = {
 
 #if HAVE_SSE2 && !ARCH_UNKNOWN
 
+int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+
 #define FF_MM_MMX      0x0001 ///< standard MMX
 #define FF_MM_3DNOW    0x0004 ///< AMD 3DNOW
 #define FF_MM_MMX2     0x0002 ///< SSE integer functions or AMD MMX ext
@@ -2131,32 +2018,21 @@ DB_plugin_t *
 ffap_load (DB_functions_t *api) {
     // detect sse2
 #if ARCH_ARM
-        scalarproduct_int16 = EXTERN_ASMff_scalarproduct_int16_neon;
         scalarproduct_and_madd_int16 = EXTERN_ASMff_scalarproduct_and_madd_int16_neon;
-        add_int16 = add_int16_c;
-        sub_int16 = sub_int16_c;
 #elif HAVE_SSE2 && !ARCH_UNKNOWN
-#error SSE2 version is broken in this branch, missing ff_scalarproduct_and_madd_int16_sse2
     trace ("ffap: was compiled with sse2 support\n");
     int mm_flags = mm_support ();
     if (mm_flags & FF_MM_SSE2) {
         trace ("ffap: sse2 support detected\n");
-        scalarproduct_int16 = scalarproduct_int16_sse2;
-        add_int16 = add_int16_sse2;
-        sub_int16 = sub_int16_sse2;
+        scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
     }
     else {
         trace ("ffap: sse2 is not supported by CPU\n");
-        scalarproduct_int16 = scalarproduct_int16_c;
-        add_int16 = add_int16_c;
-        sub_int16 = sub_int16_c;
+        scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     }
 #else
 //    trace ("ffap: sse2 support was not compiled in\n");
-    scalarproduct_int16 = scalarproduct_int16_c;
     scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
-    add_int16 = add_int16_c;
-    sub_int16 = sub_int16_c;
 #endif
     deadbeef = api;
     return DB_PLUGIN (&plugin);
author	waker <wakeroid@gmail.com>	2011-06-15 21:57:06 +0200
committer	waker <wakeroid@gmail.com>	2011-06-15 21:57:06 +0200
commit	f702e5270db5bd4576765c8415c456e443513006 (patch)
tree	c901fa7a600bc1fa1496b5af716b7d375be7000a /plugins/ffap/ffap.c
parent	10deb233f02b3b3234156c6cf535d06115decd03 (diff)