author     waker <wakeroid@gmail.com>                  2011-06-15 18:50:14 +0200
committer  waker <wakeroid@gmail.com>                  2011-06-15 18:50:14 +0200
commit     10deb233f02b3b3234156c6cf535d06115decd03 (patch)
tree       8facc956c5f9271a501cdc17687863fc7fd61a8c /plugins/ffap
parent     8130fa1771978168a150007cb99c9d35f96299c2 (diff)
ape: arm neon optimizations
Diffstat (limited to 'plugins/ffap')
-rw-r--r--  plugins/ffap/asm.S        84
-rw-r--r--  plugins/ffap/ffap.c       37
-rw-r--r--  plugins/ffap/int_neon.S  118
3 files changed, 236 insertions, 3 deletions
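
The substance of this commit is a fused kernel: the APE adaptive filter previously walked the coefficient vector twice per sample (scalarproduct_int16, then a sign-conditional add_int16/sub_int16 against the adaptation coefficients), and the new scalarproduct_and_madd_int16 does both in one pass, which is what the NEON routine accelerates. As a reading aid, here is a minimal C sketch of that equivalence; it is illustrative, not code from the patch, though the fused loop body matches scalarproduct_and_madd_int16_c in the diff below:

    #include <stdint.h>

    /* Old shape: two passes over the coefficients per sample. */
    static int32_t two_pass (int16_t *coeffs, const int16_t *delay,
                             const int16_t *adapt, int order, int sign) {
        int32_t res = 0;
        for (int i = 0; i < order; i++)
            res += coeffs[i] * delay[i];        /* scalarproduct_int16 */
        for (int i = 0; i < order; i++)
            coeffs[i] += sign * adapt[i];       /* add_int16 / sub_int16 */
        return res;
    }

    /* New shape: one fused pass. mul is the sign of the current sample
       (APESIGN in the diff), so it is always -1, 0 or +1. */
    static int32_t fused (int16_t *coeffs, const int16_t *delay,
                          const int16_t *adapt, int order, int mul) {
        int32_t res = 0;
        while (order--) {
            res += *coeffs * *delay++;          /* reads the old coefficient */
            *coeffs++ += mul * *adapt++;        /* adaptation folded in */
        }
        return res;
    }

Both forms read each coefficient before updating it, so they return the same result; the fused form halves the passes over f->coeffs, which is the point of the change.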
diff --git a/plugins/ffap/asm.S b/plugins/ffap/asm.S
new file mode 100644
index 00000000..e0507834
--- /dev/null
+++ b/plugins/ffap/asm.S
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#ifdef __ELF__
+#   define ELF
+#else
+#   define ELF @
+#endif
+
+.macro require8 val=1
+ELF .eabi_attribute 24, \val
+.endm
+
+.macro preserve8 val=1
+ELF .eabi_attribute 25, \val
+.endm
+
+.macro function name, export=0
+        .macro endfunc
+ELF     .size   \name, . - \name
+        .endfunc
+        .purgem endfunc
+        .endm
+        .text
+        .if \export
+        .global EXTERN_ASM\name
+EXTERN_ASM\name:
+        .endif
+ELF     .type   \name, %function
+        .func   \name
+\name:
+.endm
+
+.macro mov32 rd, val
+#if HAVE_ARMV6T2
+        movw            \rd, #(\val) & 0xffff
+        .if (\val) >> 16
+        movt            \rd, #(\val) >> 16
+        .endif
+#else
+        ldr             \rd, =\val
+#endif
+.endm
+
+.macro movrel rd, val
+#if HAVE_ARMV6T2 && !CONFIG_PIC && !defined(__APPLE__)
+        movw            \rd, #:lower16:\val
+        movt            \rd, #:upper16:\val
+#else
+        ldr             \rd, =\val
+#endif
+.endm
+
+#if HAVE_VFP_ARGS
+        .eabi_attribute 28, 1
+#   define VFP
+#   define NOVFP @
+#else
+#   define VFP   @
+#   define NOVFP
+#endif
+
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+#define X(s) JOIN(EXTERN_ASM, s)
diff --git a/plugins/ffap/ffap.c b/plugins/ffap/ffap.c
index 0b1e61d9..7f9022c9 100644
--- a/plugins/ffap/ffap.c
+++ b/plugins/ffap/ffap.c
@@ -41,7 +41,7 @@
 #ifdef TARGET_ANDROID
 int
 posix_memalign (void **memptr, size_t alignment, size_t size) {
-    *memptr = malloc (size);
+    *memptr = memalign (alignment, size);
     return *memptr ? 0 : -1;
 }
 #endif
@@ -1358,6 +1358,16 @@ scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
     return res;
 }
 
+static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
+{
+    int res = 0;
+    while (order--) {
+        res += *v1 * *v2++;
+        *v1++ += mul * *v3++;
+    }
+    return res;
+}
+
 static void
 add_int16_c (int16_t *v1/*align 16*/, int16_t *v2, int len) {
     while (len--) {
@@ -1375,6 +1385,9 @@ sub_int16_c (int16_t *v1/*align 16*/, int16_t *v2, int len) {
 
 static int32_t
 (*scalarproduct_int16)(int16_t * v1, int16_t * v2, int order, int shift);
 
+static int32_t
+(*scalarproduct_and_madd_int16)(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+
 static void
 (*add_int16) (int16_t *v1/*align 16*/, int16_t *v2, int len);
 
@@ -1412,6 +1425,7 @@ static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f,
     int absres;
 
     while (count--) {
+#if 0
         /* round fixedpoint scalar product */
         res = (scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits;
 
@@ -1420,7 +1434,10 @@ static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f,
             add_int16(f->coeffs, f->adaptcoeffs - order, order);
         else if (*data > 0)
             sub_int16(f->coeffs, f->adaptcoeffs - order, order);
+#endif
+        res = scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data));
 
+        res = (res + (1 << (fracbits - 1))) >> fracbits;
         res += *data;
         *data++ = res;
 
@@ -2103,10 +2120,23 @@ int mm_support(void)
 }
 #endif
 
+#if ARCH_ARM
+int32_t EXTERN_ASMff_scalarproduct_int16_neon(int16_t *v1, int16_t *v2, int len,
+                                    int shift);
+int32_t EXTERN_ASMff_scalarproduct_and_madd_int16_neon(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
+
+#endif
+
 DB_plugin_t *
 ffap_load (DB_functions_t *api) {
     // detect sse2
-#if HAVE_SSE2 && !ARCH_UNKNOWN
+#if ARCH_ARM
+    scalarproduct_int16 = EXTERN_ASMff_scalarproduct_int16_neon;
+    scalarproduct_and_madd_int16 = EXTERN_ASMff_scalarproduct_and_madd_int16_neon;
+    add_int16 = add_int16_c;
+    sub_int16 = sub_int16_c;
+#elif HAVE_SSE2 && !ARCH_UNKNOWN
+#error SSE2 version is broken in this branch, missing ff_scalarproduct_and_madd_int16_sse2
     trace ("ffap: was compiled with sse2 support\n");
     int mm_flags = mm_support ();
     if (mm_flags & FF_MM_SSE2) {
@@ -2122,8 +2152,9 @@ ffap_load (DB_functions_t *api) {
         sub_int16 = sub_int16_c;
     }
 #else
-    trace ("ffap: sse2 support was not compiled in\n");
+//    trace ("ffap: sse2 support was not compiled in\n");
     scalarproduct_int16 = scalarproduct_int16_c;
+    scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
     add_int16 = add_int16_c;
     sub_int16 = sub_int16_c;
 #endif
diff --git a/plugins/ffap/int_neon.S b/plugins/ffap/int_neon.S
new file mode 100644
index 00000000..e8023e06
--- /dev/null
+++ b/plugins/ffap/int_neon.S
@@ -0,0 +1,118 @@
+/*
+ * ARM NEON optimised integer operations
+ * Copyright (c) 2009 Kostya Shishkov
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+        preserve8
+        .fpu neon
+        .text
+
+function ff_scalarproduct_int16_neon, export=1
+        vmov.i16        q0,  #0
+        vmov.i16        q1,  #0
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+        negs            r3,  r3
+        beq             2f
+
+        vdup.s32        q12, r3
+1:      vld1.16         {d16-d17}, [r0]!
+        vld1.16         {d20-d21}, [r1,:128]!
+        vmull.s16       q12, d16, d20
+        vld1.16         {d18-d19}, [r0]!
+        vmull.s16       q13, d17, d21
+        vld1.16         {d22-d23}, [r1,:128]!
+        vmull.s16       q14, d18, d22
+        vmull.s16       q15, d19, d23
+        vshl.s32        q8,  q12, q12
+        vshl.s32        q9,  q13, q12
+        vadd.s32        q0,  q0,  q8
+        vshl.s32        q10, q14, q12
+        vadd.s32        q1,  q1,  q9
+        vshl.s32        q11, q15, q12
+        vadd.s32        q2,  q2,  q10
+        vadd.s32        q3,  q3,  q11
+        subs            r2,  r2,  #16
+        bne             1b
+        b               3f
+
+2:      vld1.16         {d16-d17}, [r0]!
+        vld1.16         {d20-d21}, [r1,:128]!
+        vmlal.s16       q0,  d16, d20
+        vld1.16         {d18-d19}, [r0]!
+        vmlal.s16       q1,  d17, d21
+        vld1.16         {d22-d23}, [r1,:128]!
+        vmlal.s16       q2,  d18, d22
+        vmlal.s16       q3,  d19, d23
+        subs            r2,  r2,  #16
+        bne             2b
+
+3:      vpadd.s32       d16, d0,  d1
+        vpadd.s32       d17, d2,  d3
+        vpadd.s32       d10, d4,  d5
+        vpadd.s32       d11, d6,  d7
+        vpadd.s32       d0,  d16, d17
+        vpadd.s32       d1,  d10, d11
+        vpadd.s32       d2,  d0,  d1
+        vpaddl.s32      d3,  d2
+        vmov.32         r0,  d3[0]
+        bx              lr
+endfunc
+
+@ scalarproduct_and_madd_int16(/*aligned*/v0,v1,v2,order,mul)
+function ff_scalarproduct_and_madd_int16_neon, export=1
+        vld1.16         {d28[],d29[]}, [sp]
+        vmov.i16        q0,  #0
+        vmov.i16        q1,  #0
+        vmov.i16        q2,  #0
+        vmov.i16        q3,  #0
+        mov             r12, r0
+
+1:      vld1.16         {d16-d17}, [r0,:128]!
+        vld1.16         {d18-d19}, [r1]!
+        vld1.16         {d20-d21}, [r2]!
+        vld1.16         {d22-d23}, [r0,:128]!
+        vld1.16         {d24-d25}, [r1]!
+        vld1.16         {d26-d27}, [r2]!
+        vmul.s16        q10, q10, q14
+        vmul.s16        q13, q13, q14
+        vmlal.s16       q0,  d16, d18
+        vmlal.s16       q1,  d17, d19
+        vadd.s16        q10, q8,  q10
+        vadd.s16        q13, q11, q13
+        vmlal.s16       q2,  d22, d24
+        vmlal.s16       q3,  d23, d25
+        vst1.16         {q10}, [r12,:128]!
+        subs            r3,  r3,  #16
+        vst1.16         {q13}, [r12,:128]!
+        bne             1b
+
+        vpadd.s32       d16, d0,  d1
+        vpadd.s32       d17, d2,  d3
+        vpadd.s32       d10, d4,  d5
+        vpadd.s32       d11, d6,  d7
+        vpadd.s32       d0,  d16, d17
+        vpadd.s32       d1,  d10, d11
+        vpadd.s32       d2,  d0,  d1
+        vpaddl.s32      d3,  d2
+        vmov.32         r0,  d3[0]
+        bx              lr
+endfunc
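
For readers who do not speak NEON assembly, the following C-intrinsics sketch mirrors what ff_scalarproduct_and_madd_int16_neon computes. It is an illustrative rendering under stated assumptions, not code from the commit: it processes 8 elements per iteration instead of the assembly's 16, assumes order is a multiple of 8, and omits the :128 alignment hints:

    #include <arm_neon.h>
    #include <stdint.h>

    static int32_t scalarproduct_and_madd_int16_sketch (int16_t *v1,
                                                        const int16_t *v2,
                                                        const int16_t *v3,
                                                        int order, int mul) {
        int32x4_t acc  = vdupq_n_s32 (0);             /* role of q0..q3 */
        int16x8_t vmul = vdupq_n_s16 ((int16_t)mul);  /* role of d28[]/d29[] */

        while (order > 0) {
            int16x8_t a = vld1q_s16 (v1);   /* coefficients, updated in place */
            int16x8_t b = vld1q_s16 (v2);   /* delay line */
            int16x8_t c = vld1q_s16 (v3);   /* adaptation vector */

            /* dot product accumulates with the pre-update coefficients */
            acc = vmlal_s16 (acc, vget_low_s16 (a),  vget_low_s16 (b));
            acc = vmlal_s16 (acc, vget_high_s16 (a), vget_high_s16 (b));

            /* the madd half of the kernel: v1 += mul * v3 */
            vst1q_s16 (v1, vaddq_s16 (a, vmulq_s16 (c, vmul)));

            v1 += 8; v2 += 8; v3 += 8;
            order -= 8;
        }

        /* horizontal reduction, the same job as the vpadd ladder in the asm */
        int32x2_t sum2 = vadd_s32 (vget_low_s32 (acc), vget_high_s32 (acc));
        sum2 = vpadd_s32 (sum2, sum2);
        return vget_lane_s32 (sum2, 0);
    }

The assembly additionally spreads the accumulation across four q registers (q0-q3) to break the dependency chain between multiply-accumulates, which is why its epilogue needs the longer vpadd/vpaddl reduction.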
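
One easy-to-miss hunk above is the Android posix_memalign shim in ffap.c: the old stub called plain malloc and ignored the alignment argument, while NEON loads and stores carrying a :128 alignment hint (the vld1/vst1 ...,:128 accesses in int_neon.S) fault on addresses that are not 16-byte aligned, so the shim now delegates to memalign. A small standalone sketch of the corrected contract; only the shim body comes from the diff, the test program around it is hypothetical:

    #include <malloc.h>    /* memalign, available on Android/bionic */
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Corrected shim, as in the diff: honor the alignment argument. */
    static int posix_memalign_shim (void **memptr, size_t alignment, size_t size) {
        *memptr = memalign (alignment, size);
        return *memptr ? 0 : -1;
    }

    int main (void) {
        void *buf;
        /* buffers handed to the NEON kernels need 16-byte alignment */
        if (posix_memalign_shim (&buf, 16, 1024) != 0)
            return 1;
        printf ("16-byte aligned: %s\n", ((uintptr_t)buf % 16 == 0) ? "yes" : "no");
        free (buf);
        return 0;
    }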