author    waker <wakeroid@gmail.com>    2011-06-15 18:50:14 +0200
committer waker <wakeroid@gmail.com>    2011-06-15 18:50:14 +0200
commit    10deb233f02b3b3234156c6cf535d06115decd03 (patch)
tree      8facc956c5f9271a501cdc17687863fc7fd61a8c
parent    8130fa1771978168a150007cb99c9d35f96299c2 (diff)
ape: arm neon optimizations
-rw-r--r--  plugins/ffap/asm.S        84
-rw-r--r--  plugins/ffap/ffap.c       37
-rw-r--r--  plugins/ffap/int_neon.S  118
3 files changed, 236 insertions, 3 deletions
diff --git a/plugins/ffap/asm.S b/plugins/ffap/asm.S
new file mode 100644
index 00000000..e0507834
--- /dev/null
+++ b/plugins/ffap/asm.S
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#ifdef __ELF__
+# define ELF
+#else
+# define ELF @
+#endif
+
+.macro require8 val=1
+ELF .eabi_attribute 24, \val
+.endm
+
+.macro preserve8 val=1
+ELF .eabi_attribute 25, \val
+.endm
+
+.macro function name, export=0
+ .macro endfunc
+ELF .size \name, . - \name
+ .endfunc
+ .purgem endfunc
+ .endm
+ .text
+ .if \export
+ .global EXTERN_ASM\name
+EXTERN_ASM\name:
+ .endif
+ELF .type \name, %function
+ .func \name
+\name:
+.endm
+
+.macro mov32 rd, val
+#if HAVE_ARMV6T2
+ movw \rd, #(\val) & 0xffff
+ .if (\val) >> 16
+ movt \rd, #(\val) >> 16
+ .endif
+#else
+ ldr \rd, =\val
+#endif
+.endm
+
+.macro movrel rd, val
+#if HAVE_ARMV6T2 && !CONFIG_PIC && !defined(__APPLE__)
+ movw \rd, #:lower16:\val
+ movt \rd, #:upper16:\val
+#else
+ ldr \rd, =\val
+#endif
+.endm
+
+#if HAVE_VFP_ARGS
+ .eabi_attribute 28, 1
+# define VFP
+# define NOVFP @
+#else
+# define VFP @
+# define NOVFP
+#endif
+
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
diff --git a/plugins/ffap/ffap.c b/plugins/ffap/ffap.c
index 0b1e61d9..7f9022c9 100644
--- a/plugins/ffap/ffap.c
+++ b/plugins/ffap/ffap.c
@@ -41,7 +41,7 @@
#ifdef TARGET_ANDROID
int posix_memalign (void **memptr, size_t alignment, size_t size) {
- *memptr = malloc (size);
+ *memptr = memalign (alignment, size);
return *memptr ? 0 : -1;
}
#endif
@@ -1358,6 +1358,16 @@ scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
return res;
}
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
+{
+ int res = 0;
+ while (order--) {
+ res += *v1 * *v2++;
+ *v1++ += mul * *v3++;
+ }
+ return res;
+}
+
static void
add_int16_c (int16_t *v1/*align 16*/, int16_t *v2, int len) {
while (len--) {
@@ -1375,6 +1385,9 @@ sub_int16_c (int16_t *v1/*align 16*/, int16_t *v2, int len) {
static int32_t
(*scalarproduct_int16)(int16_t * v1, int16_t * v2, int order, int shift);
+static int32_t
+(*scalarproduct_and_madd_int16)(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+
static void
(*add_int16) (int16_t *v1/*align 16*/, int16_t *v2, int len);
@@ -1412,6 +1425,7 @@ static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f,
int absres;
while (count--) {
+#if 0
/* round fixedpoint scalar product */
res = (scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits;
@@ -1420,7 +1434,10 @@ static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f,
add_int16(f->coeffs, f->adaptcoeffs - order, order);
else if (*data > 0)
sub_int16(f->coeffs, f->adaptcoeffs - order, order);
+#endif
+ res = scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data));
+ res = (res + (1 << (fracbits - 1))) >> fracbits;
res += *data;
*data++ = res;
@@ -2103,10 +2120,23 @@ int mm_support(void)
}
#endif
+#if ARCH_ARM
+int32_t EXTERN_ASMff_scalarproduct_int16_neon(int16_t *v1, int16_t *v2, int len,
+ int shift);
+int32_t EXTERN_ASMff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+
+#endif
+
DB_plugin_t *
ffap_load (DB_functions_t *api) {
// detect sse2
-#if HAVE_SSE2 && !ARCH_UNKNOWN
+#if ARCH_ARM
+ scalarproduct_int16 = EXTERN_ASMff_scalarproduct_int16_neon;
+ scalarproduct_and_madd_int16 = EXTERN_ASMff_scalarproduct_and_madd_int16_neon;
+ add_int16 = add_int16_c;
+ sub_int16 = sub_int16_c;
+#elif HAVE_SSE2 && !ARCH_UNKNOWN
+#error SSE2 version is broken in this branch, missing ff_scalarproduct_and_madd_int16_sse2
trace ("ffap: was compiled with sse2 support\n");
int mm_flags = mm_support ();
if (mm_flags & FF_MM_SSE2) {
@@ -2122,8 +2152,9 @@ ffap_load (DB_functions_t *api) {
sub_int16 = sub_int16_c;
}
#else
- trace ("ffap: sse2 support was not compiled in\n");
+// trace ("ffap: sse2 support was not compiled in\n");
scalarproduct_int16 = scalarproduct_int16_c;
+ scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
add_int16 = add_int16_c;
sub_int16 = sub_int16_c;
#endif
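
Note on the ffap.c change above: the old per-sample sequence (scalarproduct_int16 followed by add_int16 or sub_int16, now under #if 0) is replaced by a single fused scalarproduct_and_madd_int16 call, so the filter coefficients are traversed once per sample instead of up to three times. The scalar sketch below spells out what one do_apply_filter() step computes with the fused primitive; it is illustrative only, not part of the patch, and it assumes APESIGN(x) evaluates to -1, 0 or +1 (its definition is not shown in this diff) and that coeffs, delay and adaptcoeffs are the already-offset int16_t pointers passed at the call site (f->coeffs, f->delay - order, f->adaptcoeffs - order).

    /* Illustrative scalar expansion of one do_apply_filter() step after this
     * change (not part of the patch). */
    static int32_t apply_filter_step(int16_t *coeffs, const int16_t *delay,
                                     const int16_t *adaptcoeffs, int order,
                                     int fracbits, int32_t data)
    {
        int mul = (data > 0) - (data < 0);   /* APESIGN(data): -1, 0 or +1 (assumed) */
        int32_t res = 0;
        for (int i = 0; i < order; i++) {
            res       += coeffs[i] * delay[i];        /* scalar-product part  */
            coeffs[i] += mul * adaptcoeffs[i];        /* sign-LMS "madd" part */
        }
        res = (res + (1 << (fracbits - 1))) >> fracbits;   /* fixed-point rounding */
        return res + data;                                  /* prediction + residual */
    }

Note that res is accumulated from the pre-update coefficient values, exactly as in scalarproduct_and_madd_int16_c above, so the fused loop matches the ordering of the two-pass code it replaces.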
diff --git a/plugins/ffap/int_neon.S b/plugins/ffap/int_neon.S
new file mode 100644
index 00000000..e8023e06
--- /dev/null
+++ b/plugins/ffap/int_neon.S
@@ -0,0 +1,118 @@
+/*
+ * ARM NEON optimised integer operations
+ * Copyright (c) 2009 Kostya Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+ preserve8
+ .fpu neon
+ .text
+
+function ff_scalarproduct_int16_neon, export=1
+ vmov.i16 q0, #0
+ vmov.i16 q1, #0
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+ negs r3, r3
+ beq 2f
+
+ vdup.s32 q12, r3
+1: vld1.16 {d16-d17}, [r0]!
+ vld1.16 {d20-d21}, [r1,:128]!
+ vmull.s16 q12, d16, d20
+ vld1.16 {d18-d19}, [r0]!
+ vmull.s16 q13, d17, d21
+ vld1.16 {d22-d23}, [r1,:128]!
+ vmull.s16 q14, d18, d22
+ vmull.s16 q15, d19, d23
+ vshl.s32 q8, q12, q12
+ vshl.s32 q9, q13, q12
+ vadd.s32 q0, q0, q8
+ vshl.s32 q10, q14, q12
+ vadd.s32 q1, q1, q9
+ vshl.s32 q11, q15, q12
+ vadd.s32 q2, q2, q10
+ vadd.s32 q3, q3, q11
+ subs r2, r2, #16
+ bne 1b
+ b 3f
+
+2: vld1.16 {d16-d17}, [r0]!
+ vld1.16 {d20-d21}, [r1,:128]!
+ vmlal.s16 q0, d16, d20
+ vld1.16 {d18-d19}, [r0]!
+ vmlal.s16 q1, d17, d21
+ vld1.16 {d22-d23}, [r1,:128]!
+ vmlal.s16 q2, d18, d22
+ vmlal.s16 q3, d19, d23
+ subs r2, r2, #16
+ bne 2b
+
+3: vpadd.s32 d16, d0, d1
+ vpadd.s32 d17, d2, d3
+ vpadd.s32 d10, d4, d5
+ vpadd.s32 d11, d6, d7
+ vpadd.s32 d0, d16, d17
+ vpadd.s32 d1, d10, d11
+ vpadd.s32 d2, d0, d1
+ vpaddl.s32 d3, d2
+ vmov.32 r0, d3[0]
+ bx lr
+endfunc
+
+@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
+function ff_scalarproduct_and_madd_int16_neon, export=1
+ vld1.16 {d28[],d29[]}, [sp]
+ vmov.i16 q0, #0
+ vmov.i16 q1, #0
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+ mov r12, r0
+
+1: vld1.16 {d16-d17}, [r0,:128]!
+ vld1.16 {d18-d19}, [r1]!
+ vld1.16 {d20-d21}, [r2]!
+ vld1.16 {d22-d23}, [r0,:128]!
+ vld1.16 {d24-d25}, [r1]!
+ vld1.16 {d26-d27}, [r2]!
+ vmul.s16 q10, q10, q14
+ vmul.s16 q13, q13, q14
+ vmlal.s16 q0, d16, d18
+ vmlal.s16 q1, d17, d19
+ vadd.s16 q10, q8, q10
+ vadd.s16 q13, q11, q13
+ vmlal.s16 q2, d22, d24
+ vmlal.s16 q3, d23, d25
+ vst1.16 {q10}, [r12,:128]!
+ subs r3, r3, #16
+ vst1.16 {q13}, [r12,:128]!
+ bne 1b
+
+ vpadd.s32 d16, d0, d1
+ vpadd.s32 d17, d2, d3
+ vpadd.s32 d10, d4, d5
+ vpadd.s32 d11, d6, d7
+ vpadd.s32 d0, d16, d17
+ vpadd.s32 d1, d10, d11
+ vpadd.s32 d2, d0, d1
+ vpaddl.s32 d3, d2
+ vmov.32 r0, d3[0]
+ bx lr
+endfunc
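
Both NEON loops above consume 16 int16_t elements per iteration and use ,:128 (16-byte aligned) loads/stores on the v1 pointer, so they assume the order/len argument is a multiple of 16 and that v1 comes from an aligned allocation such as the posix_memalign wrapper patched above. A minimal, hypothetical self-test along the following lines can be used on an ARM build to compare the fused NEON routine against the C reference from ffap.c; the harness, buffer sizes and value ranges are illustrative only, and the EXTERN_ASM-prefixed prototype is copied from the ffap.c hunk above.

    /* Hypothetical consistency check for ff_scalarproduct_and_madd_int16_neon
     * (not part of the patch).  Build on ARM together with asm.S/int_neon.S. */
    #define _POSIX_C_SOURCE 200112L
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Portable reference, as defined in plugins/ffap/ffap.c. */
    static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                                  const int16_t *v3, int order, int mul)
    {
        int res = 0;
        while (order--) {
            res += *v1 * *v2++;
            *v1++ += mul * *v3++;
        }
        return res;
    }

    /* NEON version from int_neon.S, prototype as declared in ffap.c. */
    int32_t EXTERN_ASMff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
                                                           const int16_t *v3, int order, int mul);

    int main(void)
    {
        enum { ORDER = 256 };                 /* must be a multiple of 16 */
        int16_t *a, *b, v2[ORDER], v3[ORDER];

        /* v1 must be 16-byte aligned for the ,:128 accesses. */
        if (posix_memalign((void **)&a, 16, ORDER * sizeof(int16_t)) ||
            posix_memalign((void **)&b, 16, ORDER * sizeof(int16_t)))
            return 1;

        for (int i = 0; i < ORDER; i++) {
            a[i]  = (rand() & 0xff) - 128;    /* small values: no int16/int32 overflow */
            v2[i] = (rand() & 0xff) - 128;
            v3[i] = (rand() & 0xff) - 128;
        }
        memcpy(b, a, ORDER * sizeof(int16_t));

        int32_t r_c    = scalarproduct_and_madd_int16_c(a, v2, v3, ORDER, 1);
        int32_t r_neon = EXTERN_ASMff_scalarproduct_and_madd_int16_neon(b, v2, v3, ORDER, 1);

        puts(r_c == r_neon && !memcmp(a, b, ORDER * sizeof(int16_t)) ? "match" : "MISMATCH");
        free(a);
        free(b);
        return 0;
    }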