author    waker <wakeroid@gmail.com>    2011-06-15 18:50:14 +0200
committer waker <wakeroid@gmail.com>    2011-06-15 18:50:14 +0200
commit    10deb233f02b3b3234156c6cf535d06115decd03 (patch)
tree      8facc956c5f9271a501cdc17687863fc7fd61a8c
parent    8130fa1771978168a150007cb99c9d35f96299c2 (diff)
ape: arm neon optimizations
-rw-r--r--  plugins/ffap/asm.S        84
-rw-r--r--  plugins/ffap/ffap.c       37
-rw-r--r--  plugins/ffap/int_neon.S  118
3 files changed, 236 insertions, 3 deletions
diff --git a/plugins/ffap/asm.S b/plugins/ffap/asm.S
new file mode 100644
index 00000000..e0507834
--- /dev/null
+++ b/plugins/ffap/asm.S
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#ifdef __ELF__
+# define ELF
+#else
+# define ELF @
+#endif
+
+.macro require8 val=1
+ELF .eabi_attribute 24, \val
+.endm
+
+.macro preserve8 val=1
+ELF .eabi_attribute 25, \val
+.endm
+
+.macro function name, export=0
+ .macro endfunc
+ELF .size \name, . - \name
+ .endfunc
+ .purgem endfunc
+ .endm
+ .text
+ .if \export
+ .global EXTERN_ASM\name
+EXTERN_ASM\name:
+ .endif
+ELF .type \name, %function
+ .func \name
+\name:
+.endm
+
+.macro mov32 rd, val
+#if HAVE_ARMV6T2
+ movw \rd, #(\val) & 0xffff
+ .if (\val) >> 16
+ movt \rd, #(\val) >> 16
+ .endif
+#else
+ ldr \rd, =\val
+#endif
+.endm
+
+.macro movrel rd, val
+#if HAVE_ARMV6T2 && !CONFIG_PIC && !defined(__APPLE__)
+ movw \rd, #:lower16:\val
+ movt \rd, #:upper16:\val
+#else
+ ldr \rd, =\val
+#endif
+.endm
+
+#if HAVE_VFP_ARGS
+ .eabi_attribute 28, 1
+# define VFP
+# define NOVFP @
+#else
+# define VFP @
+# define NOVFP
+#endif
+
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
diff --git a/plugins/ffap/ffap.c b/plugins/ffap/ffap.c
index 0b1e61d9..7f9022c9 100644
--- a/plugins/ffap/ffap.c
+++ b/plugins/ffap/ffap.c
@@ -41,7 +41,7 @@
#ifdef TARGET_ANDROID
int posix_memalign (void **memptr, size_t alignment, size_t size) {
- *memptr = malloc (size);
+ *memptr = memalign (alignment, size);
return *memptr ? 0 : -1;
}
#endif
@@ -1358,6 +1358,16 @@ scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
return res;
}
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
+{
+ int res = 0;
+ while (order--) {
+ res += *v1 * *v2++;
+ *v1++ += mul * *v3++;
+ }
+ return res;
+}
+
static void
add_int16_c (int16_t *v1/*align 16*/, int16_t *v2, int len) {
while (len--) {
@@ -1375,6 +1385,9 @@ sub_int16_c (int16_t *v1/*align 16*/, int16_t *v2, int len) {
static int32_t
(*scalarproduct_int16)(int16_t * v1, int16_t * v2, int order, int shift);
+static int32_t
+(*scalarproduct_and_madd_int16)(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+
static void
(*add_int16) (int16_t *v1/*align 16*/, int16_t *v2, int len);
@@ -1412,6 +1425,7 @@ static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f,
int absres;
while (count--) {
+#if 0
/* round fixedpoint scalar product */
res = (scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits;
@@ -1420,7 +1434,10 @@ static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f,
add_int16(f->coeffs, f->adaptcoeffs - order, order);
else if (*data > 0)
sub_int16(f->coeffs, f->adaptcoeffs - order, order);
+#endif
+ res = scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data));
+ res = (res + (1 << (fracbits - 1))) >> fracbits;
res += *data;
*data++ = res;
@@ -2103,10 +2120,23 @@ int mm_support(void)
}
#endif
+#if ARCH_ARM
+int32_t EXTERN_ASMff_scalarproduct_int16_neon(int16_t *v1, int16_t *v2, int len,
+ int shift);
+int32_t EXTERN_ASMff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+
+#endif
+
DB_plugin_t *
ffap_load (DB_functions_t *api) {
// detect sse2
-#if HAVE_SSE2 && !ARCH_UNKNOWN
+#if ARCH_ARM
+ scalarproduct_int16 = EXTERN_ASMff_scalarproduct_int16_neon;
+ scalarproduct_and_madd_int16 = EXTERN_ASMff_scalarproduct_and_madd_int16_neon;
+ add_int16 = add_int16_c;
+ sub_int16 = sub_int16_c;
+#elif HAVE_SSE2 && !ARCH_UNKNOWN
+#error SSE2 version is broken in this branch, missing ff_scalarproduct_and_madd_int16_sse2
trace ("ffap: was compiled with sse2 support\n");
int mm_flags = mm_support ();
if (mm_flags & FF_MM_SSE2) {
@@ -2122,8 +2152,9 @@ ffap_load (DB_functions_t *api) {
sub_int16 = sub_int16_c;
}
#else
- trace ("ffap: sse2 support was not compiled in\n");
+// trace ("ffap: sse2 support was not compiled in\n");
scalarproduct_int16 = scalarproduct_int16_c;
+ scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
add_int16 = add_int16_c;
sub_int16 = sub_int16_c;
#endif
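
Note on the ffap.c change above: the old per-sample sequence (scalarproduct_int16 followed by add_int16 or sub_int16, now under #if 0) is replaced by a single fused scalarproduct_and_madd_int16 call, so the filter coefficients are traversed once per sample instead of up to three times. The scalar sketch below spells out what one do_apply_filter() step computes with the fused primitive; it is illustrative only, not part of the patch, and it assumes APESIGN(x) evaluates to -1, 0 or +1 (its definition is not shown in this diff) and that coeffs, delay and adaptcoeffs are the already-offset int16_t pointers passed at the call site (f->coeffs, f->delay - order, f->adaptcoeffs - order).

    /* Illustrative scalar expansion of one do_apply_filter() step after this
     * change (not part of the patch). */
    static int32_t apply_filter_step(int16_t *coeffs, const int16_t *delay,
                                     const int16_t *adaptcoeffs, int order,
                                     int fracbits, int32_t data)
    {
        int mul = (data > 0) - (data < 0);   /* APESIGN(data): -1, 0 or +1 (assumed) */
        int32_t res = 0;
        for (int i = 0; i < order; i++) {
            res       += coeffs[i] * delay[i];        /* scalar-product part  */
            coeffs[i] += mul * adaptcoeffs[i];        /* sign-LMS "madd" part */
        }
        res = (res + (1 << (fracbits - 1))) >> fracbits;   /* fixed-point rounding */
        return res + data;                                  /* prediction + residual */
    }

Note that res is accumulated from the pre-update coefficient values, exactly as in scalarproduct_and_madd_int16_c above, so the fused loop matches the ordering of the two-pass code it replaces.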
diff --git a/plugins/ffap/int_neon.S b/plugins/ffap/int_neon.S
new file mode 100644
index 00000000..e8023e06
--- /dev/null
+++ b/plugins/ffap/int_neon.S
@@ -0,0 +1,118 @@
+/*
+ * ARM NEON optimised integer operations
+ * Copyright (c) 2009 Kostya Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+ preserve8
+ .fpu neon
+ .text
+
+function ff_scalarproduct_int16_neon, export=1
+ vmov.i16 q0, #0
+ vmov.i16 q1, #0
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+ negs r3, r3
+ beq 2f
+
+ vdup.s32 q12, r3
+1: vld1.16 {d16-d17}, [r0]!
+ vld1.16 {d20-d21}, [r1,:128]!
+ vmull.s16 q12, d16, d20
+ vld1.16 {d18-d19}, [r0]!
+ vmull.s16 q13, d17, d21
+ vld1.16 {d22-d23}, [r1,:128]!
+ vmull.s16 q14, d18, d22
+ vmull.s16 q15, d19, d23
+ vshl.s32 q8, q12, q12
+ vshl.s32 q9, q13, q12
+ vadd.s32 q0, q0, q8
+ vshl.s32 q10, q14, q12
+ vadd.s32 q1, q1, q9
+ vshl.s32 q11, q15, q12
+ vadd.s32 q2, q2, q10
+ vadd.s32 q3, q3, q11
+ subs r2, r2, #16
+ bne 1b
+ b 3f
+
+2: vld1.16 {d16-d17}, [r0]!
+ vld1.16 {d20-d21}, [r1,:128]!
+ vmlal.s16 q0, d16, d20
+ vld1.16 {d18-d19}, [r0]!
+ vmlal.s16 q1, d17, d21
+ vld1.16 {d22-d23}, [r1,:128]!
+ vmlal.s16 q2, d18, d22
+ vmlal.s16 q3, d19, d23
+ subs r2, r2, #16
+ bne 2b
+
+3: vpadd.s32 d16, d0, d1
+ vpadd.s32 d17, d2, d3
+ vpadd.s32 d10, d4, d5
+ vpadd.s32 d11, d6, d7
+ vpadd.s32 d0, d16, d17
+ vpadd.s32 d1, d10, d11
+ vpadd.s32 d2, d0, d1
+ vpaddl.s32 d3, d2
+ vmov.32 r0, d3[0]
+ bx lr
+endfunc
+
+@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
+function ff_scalarproduct_and_madd_int16_neon, export=1
+ vld1.16 {d28[],d29[]}, [sp]
+ vmov.i16 q0, #0
+ vmov.i16 q1, #0
+ vmov.i16 q2, #0
+ vmov.i16 q3, #0
+ mov r12, r0
+
+1: vld1.16 {d16-d17}, [r0,:128]!
+ vld1.16 {d18-d19}, [r1]!
+ vld1.16 {d20-d21}, [r2]!
+ vld1.16 {d22-d23}, [r0,:128]!
+ vld1.16 {d24-d25}, [r1]!
+ vld1.16 {d26-d27}, [r2]!
+ vmul.s16 q10, q10, q14
+ vmul.s16 q13, q13, q14
+ vmlal.s16 q0, d16, d18
+ vmlal.s16 q1, d17, d19
+ vadd.s16 q10, q8, q10
+ vadd.s16 q13, q11, q13
+ vmlal.s16 q2, d22, d24
+ vmlal.s16 q3, d23, d25
+ vst1.16 {q10}, [r12,:128]!
+ subs r3, r3, #16
+ vst1.16 {q13}, [r12,:128]!
+ bne 1b
+
+ vpadd.s32 d16, d0, d1
+ vpadd.s32 d17, d2, d3
+ vpadd.s32 d10, d4, d5
+ vpadd.s32 d11, d6, d7
+ vpadd.s32 d0, d16, d17
+ vpadd.s32 d1, d10, d11
+ vpadd.s32 d2, d0, d1
+ vpaddl.s32 d3, d2
+ vmov.32 r0, d3[0]
+ bx lr
+endfunc
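
Both NEON loops above consume 16 int16_t elements per iteration and use ,:128 (16-byte aligned) loads/stores on the v1 pointer, so they assume the order/len argument is a multiple of 16 and that v1 comes from an aligned allocation such as the posix_memalign wrapper patched above. A minimal, hypothetical self-test along the following lines can be used on an ARM build to compare the fused NEON routine against the C reference from ffap.c; the harness, buffer sizes and value ranges are illustrative only, and the EXTERN_ASM-prefixed prototype is copied from the ffap.c hunk above.

    /* Hypothetical consistency check for ff_scalarproduct_and_madd_int16_neon
     * (not part of the patch).  Build on ARM together with asm.S/int_neon.S. */
    #define _POSIX_C_SOURCE 200112L
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Portable reference, as defined in plugins/ffap/ffap.c. */
    static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                                  const int16_t *v3, int order, int mul)
    {
        int res = 0;
        while (order--) {
            res += *v1 * *v2++;
            *v1++ += mul * *v3++;
        }
        return res;
    }

    /* NEON version from int_neon.S, prototype as declared in ffap.c. */
    int32_t EXTERN_ASMff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2,
                                                           const int16_t *v3, int order, int mul);

    int main(void)
    {
        enum { ORDER = 256 };                 /* must be a multiple of 16 */
        int16_t *a, *b, v2[ORDER], v3[ORDER];

        /* v1 must be 16-byte aligned for the ,:128 accesses. */
        if (posix_memalign((void **)&a, 16, ORDER * sizeof(int16_t)) ||
            posix_memalign((void **)&b, 16, ORDER * sizeof(int16_t)))
            return 1;

        for (int i = 0; i < ORDER; i++) {
            a[i]  = (rand() & 0xff) - 128;    /* small values: no int16/int32 overflow */
            v2[i] = (rand() & 0xff) - 128;
            v3[i] = (rand() & 0xff) - 128;
        }
        memcpy(b, a, ORDER * sizeof(int16_t));

        int32_t r_c    = scalarproduct_and_madd_int16_c(a, v2, v3, ORDER, 1);
        int32_t r_neon = EXTERN_ASMff_scalarproduct_and_madd_int16_neon(b, v2, v3, ORDER, 1);

        puts(r_c == r_neon && !memcmp(a, b, ORDER * sizeof(int16_t)) ? "match" : "MISMATCH");
        free(a);
        free(b);
        return 0;
    }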