diff options
Diffstat (limited to 'plugins/supereq/ffmpeg_fft/libavcodec')
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/arm/asm.S | 104 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_init_arm.c | 71 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_neon.S | 372 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/arm/rdft_neon.S | 151 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/arm/simple_idct_neon.S | 372 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/avfft.c | 142 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/avfft.h | 103 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/dct.c | 228 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/dct32.c | 262 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/dct32.h | 10 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/fft.c | 300 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/fft.h | 244 | ||||
-rw-r--r-- | plugins/supereq/ffmpeg_fft/libavcodec/rdft.c | 137 |
13 files changed, 2496 insertions, 0 deletions
diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/asm.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/asm.S new file mode 100644 index 00000000..6860f1cf --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/asm.S @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#ifdef __ELF__ +# define ELF +#else +# define ELF @ +#endif + +.macro require8 val=1 +ELF .eabi_attribute 24, \val +.endm + +.macro preserve8 val=1 +ELF .eabi_attribute 25, \val +.endm + +/* +.macro function name, export=0 + .macro endfunc +ELF .size \name, . - \name + .endfunc + .purgem endfunc + .endm + .text + .if \export + .global EXTERN_ASM\name +EXTERN_ASM\name: + .endif +ELF .type \name, %function + .func \name +\name: +.endm +*/ + +.macro function name, export=0 + .macro endfunc +ELF .size \name, . 
- \name + .endfunc + .purgem endfunc + .endm + .text + .if \export + .hidden EXTERN_ASM\name + .global EXTERN_ASM\name +EXTERN_ASM\name: + .endif +ELF .type \name, %function + .func \name +\name: +.endm + +.macro mov32 rd, val +#if HAVE_ARMV6T2 + movw \rd, #(\val) & 0xffff + .if (\val) >> 16 + movt \rd, #(\val) >> 16 + .endif +#else + ldr \rd, =\val +#endif +.endm + +.macro movrel rd, val +#if HAVE_ARMV6T2 && !CONFIG_PIC + movw \rd, #:lower16:\val + movt \rd, #:upper16:\val +#else + ldr \rd, =\val +#endif +.endm + +#if HAVE_VFP_ARGS + .eabi_attribute 28, 1 +# define VFP +# define NOVFP @ +#else +# define VFP @ +# define NOVFP +#endif + +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_init_arm.c b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_init_arm.c new file mode 100644 index 00000000..28148e92 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_init_arm.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavcodec/fft.h" +#if CONFIG_DCA_DECODER +#include "libavcodec/synth_filter.h" +#endif + +void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); +void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); + +#if 0 +void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input); +#endif + +void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z); + +void ff_synth_filter_float_neon(FFTContext *imdct, + float *synth_buf_ptr, int *synth_buf_offset, + float synth_buf2[32], const float window[512], + float out[32], const float in[32], + float scale, float bias); + +av_cold void ff_fft_init_arm(FFTContext *s) +{ + if (HAVE_NEON) { + s->fft_permute = ff_fft_permute_neon; + s->fft_calc = ff_fft_calc_neon; +#if 0 + s->imdct_calc = ff_imdct_calc_neon; + s->imdct_half = ff_imdct_half_neon; + s->mdct_calc = ff_mdct_calc_neon; + s->permutation = FF_MDCT_PERM_INTERLEAVE; +#endif + } +} + +#if CONFIG_RDFT +av_cold void ff_rdft_init_arm(RDFTContext *s) +{ + if (HAVE_NEON) + s->rdft_calc = ff_rdft_calc_neon; +} +#endif + +#if CONFIG_DCA_DECODER +av_cold void ff_synth_filter_init_arm(SynthFilterContext *s) +{ + if (HAVE_NEON) + s->synth_filter_float = ff_synth_filter_float_neon; +} +#endif diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_neon.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_neon.S new file mode 100644 index 00000000..117f4fee --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/fft_neon.S @@ -0,0 +1,372 @@ +/* + * ARM NEON optimised FFT + * + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * Copyright (c) 2009 Naotoshi Nojiri 
+ * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +#define M_SQRT1_2 0.70710678118654752440 + + .text + +function fft4_neon + vld1.32 {d0-d3}, [r0,:128] + + vext.32 q8, q1, q1, #1 @ i2,r3 d3=i3,r2 + vsub.f32 d6, d0, d1 @ r0-r1,i0-i1 + vsub.f32 d7, d16, d17 @ r3-r2,i2-i3 + vadd.f32 d4, d0, d1 @ r0+r1,i0+i1 + vadd.f32 d5, d2, d3 @ i2+i3,r2+r3 + vadd.f32 d1, d6, d7 + vsub.f32 d3, d6, d7 + vadd.f32 d0, d4, d5 + vsub.f32 d2, d4, d5 + + vst1.32 {d0-d3}, [r0,:128] + + bx lr +endfunc + +function fft8_neon + mov r1, r0 + vld1.32 {d0-d3}, [r1,:128]! 
+ vld1.32 {d16-d19}, [r1,:128] + + movw r2, #0x04f3 @ sqrt(1/2) + movt r2, #0x3f35 + eor r3, r2, #1<<31 + vdup.32 d31, r2 + + vext.32 q11, q1, q1, #1 @ i2,r3,i3,r2 + vadd.f32 d4, d16, d17 @ r4+r5,i4+i5 + vmov d28, r3, r2 + vadd.f32 d5, d18, d19 @ r6+r7,i6+i7 + vsub.f32 d17, d16, d17 @ r4-r5,i4-i5 + vsub.f32 d19, d18, d19 @ r6-r7,i6-i7 + vrev64.32 d29, d28 + vadd.f32 d20, d0, d1 @ r0+r1,i0+i1 + vadd.f32 d21, d2, d3 @ r2+r3,i2+i3 + vmul.f32 d26, d17, d28 @ -a2r*w,a2i*w + vext.32 q3, q2, q2, #1 + vmul.f32 d27, d19, d29 @ a3r*w,-a3i*w + vsub.f32 d23, d22, d23 @ i2-i3,r3-r2 + vsub.f32 d22, d0, d1 @ r0-r1,i0-i1 + vmul.f32 d24, d17, d31 @ a2r*w,a2i*w + vmul.f32 d25, d19, d31 @ a3r*w,a3i*w + vadd.f32 d0, d20, d21 + vsub.f32 d2, d20, d21 + vadd.f32 d1, d22, d23 + vrev64.32 q13, q13 + vsub.f32 d3, d22, d23 + vsub.f32 d6, d6, d7 + vadd.f32 d24, d24, d26 @ a2r+a2i,a2i-a2r t1,t2 + vadd.f32 d25, d25, d27 @ a3r-a3i,a3i+a3r t5,t6 + vadd.f32 d7, d4, d5 + vsub.f32 d18, d2, d6 + vext.32 q13, q12, q12, #1 + vadd.f32 d2, d2, d6 + vsub.f32 d16, d0, d7 + vadd.f32 d5, d25, d24 + vsub.f32 d4, d26, d27 + vadd.f32 d0, d0, d7 + vsub.f32 d17, d1, d5 + vsub.f32 d19, d3, d4 + vadd.f32 d3, d3, d4 + vadd.f32 d1, d1, d5 + + vst1.32 {d16-d19}, [r1,:128] + vst1.32 {d0-d3}, [r0,:128] + + bx lr +endfunc + +function fft16_neon + movrel r1, mppm + vld1.32 {d16-d19}, [r0,:128]! @ q8{r0,i0,r1,i1} q9{r2,i2,r3,i3} + pld [r0, #32] + vld1.32 {d2-d3}, [r1,:128] + vext.32 q13, q9, q9, #1 + vld1.32 {d22-d25}, [r0,:128]! 
@ q11{r4,i4,r5,i5} q12{r6,i5,r7,i7} + vadd.f32 d4, d16, d17 + vsub.f32 d5, d16, d17 + vadd.f32 d18, d18, d19 + vsub.f32 d19, d26, d27 + + vadd.f32 d20, d22, d23 + vsub.f32 d22, d22, d23 + vsub.f32 d23, d24, d25 + vadd.f32 q8, q2, q9 @ {r0,i0,r1,i1} + vadd.f32 d21, d24, d25 + vmul.f32 d24, d22, d2 + vsub.f32 q9, q2, q9 @ {r2,i2,r3,i3} + vmul.f32 d25, d23, d3 + vuzp.32 d16, d17 @ {r0,r1,i0,i1} + vmul.f32 q1, q11, d2[1] + vuzp.32 d18, d19 @ {r2,r3,i2,i3} + vrev64.32 q12, q12 + vadd.f32 q11, q12, q1 @ {t1a,t2a,t5,t6} + vld1.32 {d24-d27}, [r0,:128]! @ q12{r8,i8,r9,i9} q13{r10,i10,r11,i11} + vzip.32 q10, q11 + vld1.32 {d28-d31}, [r0,:128] @ q14{r12,i12,r13,i13} q15{r14,i14,r15,i15} + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + sub r0, r0, #96 + vext.32 q13, q13, q13, #1 + vsub.f32 q10, q8, q0 @ {r4,r5,i4,i5} + vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} + vext.32 q15, q15, q15, #1 + vsub.f32 q11, q9, q1 @ {r6,r7,i6,i7} + vswp d25, d26 @ q12{r8,i8,i10,r11} q13{r9,i9,i11,r10} + vadd.f32 q9, q9, q1 @ {r2,r3,i2,i3} + vswp d29, d30 @ q14{r12,i12,i14,r15} q15{r13,i13,i15,r14} + vadd.f32 q0, q12, q13 @ {t1,t2,t5,t6} + vadd.f32 q1, q14, q15 @ {t1a,t2a,t5a,t6a} + movrel r2, X(ff_cos_16) + vsub.f32 q13, q12, q13 @ {t3,t4,t7,t8} + vrev64.32 d1, d1 + vsub.f32 q15, q14, q15 @ {t3a,t4a,t7a,t8a} + vrev64.32 d3, d3 + movrel r3, pmmp + vswp d1, d26 @ q0{t1,t2,t3,t4} q13{t6,t5,t7,t8} + vswp d3, d30 @ q1{t1a,t2a,t3a,t4a} q15{t6a,t5a,t7a,t8a} + vadd.f32 q12, q0, q13 @ {r8,i8,r9,i9} + vadd.f32 q14, q1, q15 @ {r12,i12,r13,i13} + vld1.32 {d4-d5}, [r2,:64] + vsub.f32 q13, q0, q13 @ {r10,i10,r11,i11} + vsub.f32 q15, q1, q15 @ {r14,i14,r15,i15} + vswp d25, d28 @ q12{r8,i8,r12,i12} q14{r9,i9,r13,i13} + vld1.32 {d6-d7}, [r3,:128] + vrev64.32 q1, q14 + vmul.f32 q14, q14, d4[1] + vmul.f32 q1, q1, q3 + vmla.f32 q14, q1, d5[1] @ {t1a,t2a,t5a,t6a} + vswp d27, d30 @ q13{r10,i10,r14,i14} q15{r11,i11,r15,i15} + vzip.32 q12, q14 + vadd.f32 d0, d28, d24 + vadd.f32 
d1, d25, d29 + vsub.f32 d2, d25, d29 + vsub.f32 d3, d28, d24 + vsub.f32 q12, q8, q0 @ {r8,r9,i8,i9} + vadd.f32 q8, q8, q0 @ {r0,r1,i0,i1} + vsub.f32 q14, q10, q1 @ {r12,r13,i12,i13} + mov r1, #32 + vadd.f32 q10, q10, q1 @ {r4,r5,i4,i5} + vrev64.32 q0, q13 + vmul.f32 q13, q13, d5[0] + vrev64.32 q1, q15 + vmul.f32 q15, q15, d5[1] + vst2.32 {d16-d17},[r0,:128], r1 + vmul.f32 q0, q0, q3 + vst2.32 {d20-d21},[r0,:128], r1 + vmul.f32 q1, q1, q3 + vmla.f32 q13, q0, d5[0] @ {t1,t2,t5,t6} + vmla.f32 q15, q1, d4[1] @ {t1a,t2a,t5a,t6a} + vst2.32 {d24-d25},[r0,:128], r1 + vst2.32 {d28-d29},[r0,:128] + vzip.32 q13, q15 + sub r0, r0, #80 + vadd.f32 d0, d30, d26 + vadd.f32 d1, d27, d31 + vsub.f32 d2, d27, d31 + vsub.f32 d3, d30, d26 + vsub.f32 q13, q9, q0 @ {r10,r11,i10,i11} + vadd.f32 q9, q9, q0 @ {r2,r3,i2,i3} + vsub.f32 q15, q11, q1 @ {r14,r15,i14,i15} + vadd.f32 q11, q11, q1 @ {r6,r7,i6,i7} + vst2.32 {d18-d19},[r0,:128], r1 + vst2.32 {d22-d23},[r0,:128], r1 + vst2.32 {d26-d27},[r0,:128], r1 + vst2.32 {d30-d31},[r0,:128] + bx lr +endfunc + +function fft_pass_neon + push {r4-r6,lr} + mov r6, r2 @ n + lsl r5, r2, #3 @ 2 * n * sizeof FFTSample + lsl r4, r2, #4 @ 2 * n * sizeof FFTComplex + lsl r2, r2, #5 @ 4 * n * sizeof FFTComplex + add r3, r2, r4 + add r4, r4, r0 @ &z[o1] + add r2, r2, r0 @ &z[o2] + add r3, r3, r0 @ &z[o3] + vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} + movrel r12, pmmp + vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} + add r5, r5, r1 @ wim + vld1.32 {d6-d7}, [r12,:128] @ pmmp + vswp d21, d22 + vld1.32 {d4}, [r1,:64]! 
@ {wre[0],wre[1]} + sub r5, r5, #4 @ wim-- + vrev64.32 q1, q11 + vmul.f32 q11, q11, d4[1] + vmul.f32 q1, q1, q3 + vld1.32 {d5[0]}, [r5,:32] @ d5[0] = wim[-1] + vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} + vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} + sub r6, r6, #1 @ n-- + vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} + vzip.32 q10, q11 + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + vsub.f32 q10, q8, q0 + vadd.f32 q8, q8, q0 + vsub.f32 q11, q9, q1 + vadd.f32 q9, q9, q1 + vst2.32 {d20-d21},[r2,:128]! @ {z[o2],z[o2+1]} + vst2.32 {d16-d17},[r0,:128]! @ {z[0],z[1]} + vst2.32 {d22-d23},[r3,:128]! @ {z[o3],z[o3+1]} + vst2.32 {d18-d19},[r4,:128]! @ {z[o1],z[o1+1]} + sub r5, r5, #8 @ wim -= 2 +1: + vld1.32 {d20-d21},[r2,:128] @ {z[o2],z[o2+1]} + vld1.32 {d22-d23},[r3,:128] @ {z[o3],z[o3+1]} + vswp d21, d22 + vld1.32 {d4}, [r1]! @ {wre[0],wre[1]} + vrev64.32 q0, q10 + vmul.f32 q10, q10, d4[0] + vrev64.32 q1, q11 + vmul.f32 q11, q11, d4[1] + vld1.32 {d5}, [r5] @ {wim[-1],wim[0]} + vmul.f32 q0, q0, q3 + sub r5, r5, #8 @ wim -= 2 + vmul.f32 q1, q1, q3 + vmla.f32 q10, q0, d5[1] @ {t1,t2,t5,t6} + vmla.f32 q11, q1, d5[0] @ {t1a,t2a,t5a,t6a} + vld2.32 {d16-d17},[r0,:128] @ {z[0],z[1]} + subs r6, r6, #1 @ n-- + vld2.32 {d18-d19},[r4,:128] @ {z[o1],z[o1+1]} + vzip.32 q10, q11 + vadd.f32 d0, d22, d20 + vadd.f32 d1, d21, d23 + vsub.f32 d2, d21, d23 + vsub.f32 d3, d22, d20 + vsub.f32 q10, q8, q0 + vadd.f32 q8, q8, q0 + vsub.f32 q11, q9, q1 + vadd.f32 q9, q9, q1 + vst2.32 {d20-d21}, [r2,:128]! @ {z[o2],z[o2+1]} + vst2.32 {d16-d17}, [r0,:128]! @ {z[0],z[1]} + vst2.32 {d22-d23}, [r3,:128]! @ {z[o3],z[o3+1]} + vst2.32 {d18-d19}, [r4,:128]! 
@ {z[o1],z[o1+1]} + bne 1b + + pop {r4-r6,pc} +endfunc + +.macro def_fft n, n2, n4 + .align 6 +function fft\n\()_neon + push {r4, lr} + mov r4, r0 + bl fft\n2\()_neon + add r0, r4, #\n4*2*8 + bl fft\n4\()_neon + add r0, r4, #\n4*3*8 + bl fft\n4\()_neon + mov r0, r4 + pop {r4, lr} + movrel r1, X(ff_cos_\n) + mov r2, #\n4/2 + b fft_pass_neon +endfunc +.endm + + def_fft 32, 16, 8 + def_fft 64, 32, 16 + def_fft 128, 64, 32 + def_fft 256, 128, 64 + def_fft 512, 256, 128 + def_fft 1024, 512, 256 + def_fft 2048, 1024, 512 + def_fft 4096, 2048, 1024 + def_fft 8192, 4096, 2048 + def_fft 16384, 8192, 4096 + def_fft 32768, 16384, 8192 + def_fft 65536, 32768, 16384 + +function ff_fft_calc_neon, export=1 + ldr r2, [r0] + sub r2, r2, #2 + movrel r3, fft_tab_neon + ldr r3, [r3, r2, lsl #2] + mov r0, r1 + bx r3 +endfunc + +function ff_fft_permute_neon, export=1 + push {r4,lr} + mov r12, #1 + ldr r2, [r0] @ nbits + ldr r3, [r0, #12] @ tmp_buf + ldr r0, [r0, #8] @ revtab + lsl r12, r12, r2 + mov r2, r12 +1: + vld1.32 {d0-d1}, [r1,:128]! + ldr r4, [r0], #4 + uxth lr, r4 + uxth r4, r4, ror #16 + add lr, r3, lr, lsl #3 + add r4, r3, r4, lsl #3 + vst1.32 {d0}, [lr,:64] + vst1.32 {d1}, [r4,:64] + subs r12, r12, #2 + bgt 1b + + sub r1, r1, r2, lsl #3 +1: + vld1.32 {d0-d3}, [r3,:128]! + vst1.32 {d0-d3}, [r1,:128]! + subs r2, r2, #4 + bgt 1b + + pop {r4,pc} +endfunc + + .section .rodata + .align 4 +fft_tab_neon: + .word fft4_neon + .word fft8_neon + .word fft16_neon + .word fft32_neon + .word fft64_neon + .word fft128_neon + .word fft256_neon + .word fft512_neon + .word fft1024_neon + .word fft2048_neon + .word fft4096_neon + .word fft8192_neon + .word fft16384_neon + .word fft32768_neon + .word fft65536_neon +ELF .size fft_tab_neon, . 
- fft_tab_neon + + .align 4 +pmmp: .float +1.0, -1.0, -1.0, +1.0 +mppm: .float -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 + diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/rdft_neon.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/rdft_neon.S new file mode 100644 index 00000000..4f8a1032 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/rdft_neon.S @@ -0,0 +1,151 @@ +/* + * ARM NEON optimised RDFT + * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + + preserve8 + +function ff_rdft_calc_neon, export=1 + push {r4-r8,lr} + + ldr r6, [r0, #4] @ inverse + mov r4, r0 + mov r5, r1 + + lsls r6, r6, #31 + bne 1f + add r0, r4, #20 + bl X(ff_fft_permute_neon) + add r0, r4, #20 + mov r1, r5 + bl X(ff_fft_calc_neon) +1: + ldr r12, [r4, #0] @ nbits + mov r2, #1 + lsl r12, r2, r12 + add r0, r5, #8 + add r1, r5, r12, lsl #2 + lsr r12, r12, #2 + ldr r2, [r4, #12] @ tcos + sub r12, r12, #2 + ldr r3, [r4, #16] @ tsin + mov r7, r0 + sub r1, r1, #8 + mov lr, r1 + mov r8, #-8 + vld1.32 {d0}, [r0,:64]! @ d1[0,1] + vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] + vld1.32 {d4}, [r2,:64]! @ tcos[i] + vld1.32 {d5}, [r3,:64]! 
@ tsin[i] + vmov.f32 d18, #0.5 @ k1 + vdup.32 d19, r6 + pld [r0, #32] + veor d19, d18, d19 @ k2 + vmov.i32 d16, #0 + vmov.i32 d17, #1<<31 + pld [r1, #-32] + vtrn.32 d16, d17 + pld [r2, #32] + vrev64.32 d16, d16 @ d16=1,0 d17=0,1 + pld [r3, #32] +2: + veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] + vld1.32 {d24}, [r0,:64]! @ d1[0,1] + vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] + vld1.32 {d25}, [r1,:64], r8 @ d2[0,1] + vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] + veor q3, q12, q8 @ -d1[0],d1[1], d2[0],-d2[1] + pld [r0, #32] + vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re + pld [r1, #-32] + vadd.f32 d0, d24, d7 @ d1[0]+d2[0], d1[1]-d2[1] + vadd.f32 d1, d6, d25 @ -d1[0]+d2[0], d1[1]+d2[1] + vmul.f32 q11, q0, q9 @ ev.re, ev.im, od.im, od.re + veor d7, d21, d16 @ -od.im, od.re + vrev64.32 d3, d21 @ od.re, od.im + veor d6, d20, d17 @ ev.re,-ev.im + veor d2, d3, d16 @ -od.re, od.im + vmla.f32 d20, d3, d4[1] + vmla.f32 d20, d7, d5[1] + vmla.f32 d6, d2, d4[1] + vmla.f32 d6, d21, d5[1] + vld1.32 {d4}, [r2,:64]! @ tcos[i] + veor d7, d23, d16 @ -od.im, od.re + vld1.32 {d5}, [r3,:64]! @ tsin[i] + veor d24, d22, d17 @ ev.re,-ev.im + vrev64.32 d3, d23 @ od.re, od.im + pld [r2, #32] + veor d2, d3, d16 @ -od.re, od.im + pld [r3, #32] + vmla.f32 d22, d3, d4[0] + vmla.f32 d22, d7, d5[0] + vmla.f32 d24, d2, d4[0] + vmla.f32 d24, d23, d5[0] + vld1.32 {d0}, [r0,:64]! @ d1[0,1] + vld1.32 {d1}, [r1,:64], r8 @ d2[0,1] + vst1.32 {d20}, [r7,:64]! + vst1.32 {d6}, [lr,:64], r8 + vst1.32 {d22}, [r7,:64]! 
+ vst1.32 {d24}, [lr,:64], r8 + subs r12, r12, #2 + bgt 2b + + veor q1, q0, q8 @ -d1[0],d1[1], d2[0],-d2[1] + vadd.f32 d0, d0, d3 @ d1[0]+d2[0], d1[1]-d2[1] + vadd.f32 d1, d2, d1 @ -d1[0]+d2[0], d1[1]+d2[1] + ldr r2, [r4, #8] @ sign_convention + vmul.f32 q10, q0, q9 @ ev.re, ev.im, od.im, od.re + add r0, r0, #4 + bfc r2, #0, #31 + vld1.32 {d0[0]}, [r0,:32] + veor d7, d21, d16 @ -od.im, od.re + vrev64.32 d3, d21 @ od.re, od.im + veor d6, d20, d17 @ ev.re,-ev.im + vld1.32 {d22}, [r5,:64] + vdup.32 d1, r2 + vmov d23, d22 + veor d2, d3, d16 @ -od.re, od.im + vtrn.32 d22, d23 + veor d0, d0, d1 + veor d23, d23, d17 + vmla.f32 d20, d3, d4[1] + vmla.f32 d20, d7, d5[1] + vmla.f32 d6, d2, d4[1] + vmla.f32 d6, d21, d5[1] + vadd.f32 d22, d22, d23 + vst1.32 {d20}, [r7,:64] + vst1.32 {d6}, [lr,:64] + vst1.32 {d0[0]}, [r0,:32] + vst1.32 {d22}, [r5,:64] + + cmp r6, #0 + popeq {r4-r8,pc} + + vmul.f32 d22, d22, d18 + vst1.32 {d22}, [r5,:64] + add r0, r4, #20 + mov r1, r5 + bl X(ff_fft_permute_neon) + add r0, r4, #20 + mov r1, r5 + pop {r4-r8,lr} + b X(ff_fft_calc_neon) +endfunc diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/arm/simple_idct_neon.S b/plugins/supereq/ffmpeg_fft/libavcodec/arm/simple_idct_neon.S new file mode 100644 index 00000000..17cde583 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/arm/simple_idct_neon.S @@ -0,0 +1,372 @@ +/* + * ARM NEON IDCT + * + * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> + * + * Based on Simple IDCT + * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "asm.S" + +#define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 +#define W4c ((1<<(COL_SHIFT-1))/W4) +#define ROW_SHIFT 11 +#define COL_SHIFT 20 + +#define w1 d0[0] +#define w2 d0[1] +#define w3 d0[2] +#define w4 d0[3] +#define w5 d1[0] +#define w6 d1[1] +#define w7 d1[2] +#define w4c d1[3] + + .macro idct_col4_top /* in: d4, d6, d8 = col[1], col[2], col[3]; q15 = col[0] term. out: q11-q14 = q15 +/- the col[2] products; q9, q10, q5, q6 = col[1]/col[3] products */ + vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */ + vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */ + vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ + vadd.i32 q11, q15, q7 + vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ + vadd.i32 q12, q15, q8 + vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ + vsub.i32 q13, q15, q8 + vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ + vsub.i32 q14, q15, q7 + + vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ + vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ + vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ + vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ + .endm + + .text + .align 6 + +function idct_row4_pld_neon + pld [r0] + add r3, r0, r1, lsl #2 + pld [r0, r1] + pld [r0, r1, lsl #1] + pld [r3, -r1] + pld [r3] + pld [r3, r1] + add r3, r3, r1, lsl #1 + pld [r3] + pld [r3, r1] /* no return instruction here: execution continues into idct_row4_neon, which follows directly */ +endfunc + +function idct_row4_neon + vmov.i32 q15, 
#(1<<(ROW_SHIFT-1)) + vld1.64 {d2-d5}, [r2,:128]! + vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ + vld1.64 {d6,d7}, [r2,:128]! + vorr d10, d3, d5 + vld1.64 {d8,d9}, [r2,:128]! + add r2, r2, #-64 + + vorr d11, d7, d9 + vorr d10, d10, d11 + vmov r3, r4, d10 + + idct_col4_top + + orrs r3, r3, r4 + beq 1f + + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q7 + vsub.i32 q13, q13, q7 + vadd.i32 q14, q14, q7 + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ + vmlal.s16 q9, d9, w7 + vmlsl.s16 q10, d9, w5 + vmlal.s16 q5, d9, w3 + vmlsl.s16 q6, d9, w1 + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q8 + vadd.i32 q13, q13, q8 + vsub.i32 q14, q14, q7 + +1: vadd.i32 q3, q11, q9 + vadd.i32 q4, q12, q10 + vshrn.i32 d2, q3, #ROW_SHIFT + vshrn.i32 d4, q4, #ROW_SHIFT + vadd.i32 q7, q13, q5 + vadd.i32 q8, q14, q6 + vtrn.16 d2, d4 + vshrn.i32 d6, q7, #ROW_SHIFT + vshrn.i32 d8, q8, #ROW_SHIFT + vsub.i32 q14, q14, q6 + vsub.i32 q11, q11, q9 + vtrn.16 d6, d8 + vsub.i32 q13, q13, q5 + vshrn.i32 d3, q14, #ROW_SHIFT + vtrn.32 d2, d6 + vsub.i32 q12, q12, q10 + vtrn.32 d4, d8 + vshrn.i32 d5, q13, #ROW_SHIFT + vshrn.i32 d7, q12, #ROW_SHIFT + vshrn.i32 d9, q11, #ROW_SHIFT + + vtrn.16 d3, d5 + vtrn.16 d7, d9 + vtrn.32 d3, d7 + vtrn.32 d5, d9 + + vst1.64 {d2-d5}, [r2,:128]! + vst1.64 {d6-d9}, [r2,:128]! 
+ + bx lr +endfunc + +function idct_col4_neon + mov ip, #16 + vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ + vdup.16 d30, w4c + vld1.64 {d4}, [r2,:64], ip /* d4 = col[1] */ + vadd.i16 d30, d30, d2 + vld1.64 {d6}, [r2,:64], ip /* d6 = col[2] */ + vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/ + vld1.64 {d8}, [r2,:64], ip /* d8 = col[3] */ + + ldrd r4, [r2] + ldrd r6, [r2, #16] + orrs r4, r4, r5 + + idct_col4_top + addeq r2, r2, #16 /* the 8 bytes tested by orrs were all zero: skip the load and its products, only advance r2 */ + beq 1f + + vld1.64 {d3}, [r2,:64], ip /* d3 = col[4] */ + vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ + vadd.i32 q11, q11, q7 + vsub.i32 q12, q12, q7 + vsub.i32 q13, q13, q7 + vadd.i32 q14, q14, q7 + +1: orrs r6, r6, r7 + ldrd r4, [r2, #16] + addeq r2, r2, #16 + beq 2f + + vld1.64 {d5}, [r2,:64], ip /* d5 = col[5] */ + vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ + vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ + vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ + vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ + +2: orrs r4, r4, r5 + ldrd r4, [r2, #16] + addeq r2, r2, #16 + beq 3f + + vld1.64 {d7}, [r2,:64], ip /* d7 = col[6] */ + vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ + vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ + vadd.i32 q11, q11, q7 + vsub.i32 q14, q14, q7 + vsub.i32 q12, q12, q8 + vadd.i32 q13, q13, q8 + +3: orrs r4, r4, r5 + addeq r2, r2, #16 + beq 4f + + vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ + vmlal.s16 q9, d9, w7 + vmlsl.s16 q10, d9, w5 + vmlal.s16 q5, d9, w3 + vmlsl.s16 q6, d9, w1 + +4: vaddhn.i32 d2, q11, q9 + vaddhn.i32 d3, q12, q10 + vaddhn.i32 d4, q13, q5 + vaddhn.i32 d5, q14, q6 + vsubhn.i32 d9, q11, q9 + vsubhn.i32 d8, q12, q10 + vsubhn.i32 d7, q13, q5 + vsubhn.i32 d6, q14, q6 + + bx lr +endfunc + + .align 6 + +function idct_col4_st8_neon + vqshrun.s16 d2, q1, #COL_SHIFT-16 + vqshrun.s16 d3, q2, #COL_SHIFT-16 + vqshrun.s16 d4, q3, #COL_SHIFT-16 + vqshrun.s16 d5, q4, #COL_SHIFT-16 + vst1.32 {d2[0]}, [r0,:32], r1 + vst1.32 {d2[1]}, [r0,:32], r1 + vst1.32 {d3[0]}, [r0,:32], r1 + vst1.32 {d3[1]}, 
[r0,:32], r1 + vst1.32 {d4[0]}, [r0,:32], r1 + vst1.32 {d4[1]}, [r0,:32], r1 + vst1.32 {d5[0]}, [r0,:32], r1 + vst1.32 {d5[1]}, [r0,:32], r1 + + bx lr +endfunc + + .section .rodata + .align 4 +idct_coeff_neon: + .short W1, W2, W3, W4, W5, W6, W7, W4c + + .macro idct_start data + push {r4-r7, lr} + pld [\data] + pld [\data, #64] + vpush {d8-d15} + movrel r3, idct_coeff_neon + vld1.64 {d0,d1}, [r3,:128] + .endm + + .macro idct_end + vpop {d8-d15} + pop {r4-r7, pc} + .endm + +/* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */ +function ff_simple_idct_put_neon, export=1 + idct_start r2 + + bl idct_row4_pld_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + bl idct_col4_st8_neon + sub r0, r0, r1, lsl #3 + add r0, r0, #4 + add r2, r2, #-120 + bl idct_col4_neon + bl idct_col4_st8_neon + + idct_end +endfunc + + .align 6 + +function idct_col4_add8_neon + mov ip, r0 + + vld1.32 {d10[0]}, [r0,:32], r1 + vshr.s16 q1, q1, #COL_SHIFT-16 + vld1.32 {d10[1]}, [r0,:32], r1 + vshr.s16 q2, q2, #COL_SHIFT-16 + vld1.32 {d11[0]}, [r0,:32], r1 + vshr.s16 q3, q3, #COL_SHIFT-16 + vld1.32 {d11[1]}, [r0,:32], r1 + vshr.s16 q4, q4, #COL_SHIFT-16 + vld1.32 {d12[0]}, [r0,:32], r1 + vaddw.u8 q1, q1, d10 + vld1.32 {d12[1]}, [r0,:32], r1 + vaddw.u8 q2, q2, d11 + vld1.32 {d13[0]}, [r0,:32], r1 + vqmovun.s16 d2, q1 + vld1.32 {d13[1]}, [r0,:32], r1 + vaddw.u8 q3, q3, d12 + vst1.32 {d2[0]}, [ip,:32], r1 + vqmovun.s16 d3, q2 + vst1.32 {d2[1]}, [ip,:32], r1 + vaddw.u8 q4, q4, d13 + vst1.32 {d3[0]}, [ip,:32], r1 + vqmovun.s16 d4, q3 + vst1.32 {d3[1]}, [ip,:32], r1 + vqmovun.s16 d5, q4 + vst1.32 {d4[0]}, [ip,:32], r1 + vst1.32 {d4[1]}, [ip,:32], r1 + vst1.32 {d5[0]}, [ip,:32], r1 + vst1.32 {d5[1]}, [ip,:32], r1 + + bx lr +endfunc + +/* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */ +function ff_simple_idct_add_neon, export=1 + idct_start r2 + + bl idct_row4_pld_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + bl 
idct_col4_add8_neon + sub r0, r0, r1, lsl #3 + add r0, r0, #4 + add r2, r2, #-120 + bl idct_col4_neon + bl idct_col4_add8_neon + + idct_end +endfunc + + .align 6 + +function idct_col4_st16_neon + mov ip, #16 + + vshr.s16 q1, q1, #COL_SHIFT-16 + vshr.s16 q2, q2, #COL_SHIFT-16 + vst1.64 {d2}, [r2,:64], ip + vshr.s16 q3, q3, #COL_SHIFT-16 + vst1.64 {d3}, [r2,:64], ip + vshr.s16 q4, q4, #COL_SHIFT-16 + vst1.64 {d4}, [r2,:64], ip + vst1.64 {d5}, [r2,:64], ip + vst1.64 {d6}, [r2,:64], ip + vst1.64 {d7}, [r2,:64], ip + vst1.64 {d8}, [r2,:64], ip + vst1.64 {d9}, [r2,:64], ip + + bx lr +endfunc + +/* void ff_simple_idct_neon(DCTELEM *data); */ +function ff_simple_idct_neon, export=1 + idct_start r0 + + mov r2, r0 + bl idct_row4_neon + bl idct_row4_neon + add r2, r2, #-128 + bl idct_col4_neon + add r2, r2, #-128 + bl idct_col4_st16_neon + add r2, r2, #-120 + bl idct_col4_neon + add r2, r2, #-128 + bl idct_col4_st16_neon + + idct_end +endfunc diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/avfft.c b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.c new file mode 100644 index 00000000..25fc4e09 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.c @@ -0,0 +1,142 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/mem.h" +#include "avfft.h" +#include "fft.h" + +/* FFT */ + +FFTContext *av_fft_init(int nbits, int inverse) +{ + FFTContext *s = av_malloc(sizeof(*s)); + + if (s) + ff_fft_init(s, nbits, inverse); + + return s; +} + +void av_fft_permute(FFTContext *s, FFTComplex *z) +{ + s->fft_permute(s, z); +} + +void av_fft_calc(FFTContext *s, FFTComplex *z) +{ + s->fft_calc(s, z); +} + +void av_fft_end(FFTContext *s) +{ + if (s) { + ff_fft_end(s); + av_free(s); + } +} + +#if CONFIG_MDCT + +FFTContext *av_mdct_init(int nbits, int inverse, double scale) +{ + FFTContext *s = av_malloc(sizeof(*s)); + + if (s) + ff_mdct_init(s, nbits, inverse, scale); + + return s; +} + +void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_calc(s, output, input); +} + +void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_half(s, output, input); +} + +void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->mdct_calc(s, output, input); +} + +void av_mdct_end(FFTContext *s) +{ + if (s) { + ff_mdct_end(s); + av_free(s); + } +} + +#endif /* CONFIG_MDCT */ + +#if CONFIG_RDFT + +RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans) +{ + RDFTContext *s = av_malloc(sizeof(*s)); + + if (s) + ff_rdft_init(s, nbits, trans); + + return s; +} + +void av_rdft_calc(RDFTContext *s, FFTSample *data) +{ + ff_rdft_calc(s, data); +} + +void av_rdft_end(RDFTContext *s) +{ + if (s) { + ff_rdft_end(s); + av_free(s); + } +} + +#endif /* CONFIG_RDFT */ + +#if CONFIG_DCT + +DCTContext *av_dct_init(int nbits, enum DCTTransformType inverse) +{ + DCTContext *s = av_malloc(sizeof(*s)); + + if (s) + ff_dct_init(s, nbits, inverse); + + return s; +} + +void 
av_dct_calc(DCTContext *s, FFTSample *data) +{ + ff_dct_calc(s, data); +} + +void av_dct_end(DCTContext *s) +{ + if (s) { + ff_dct_end(s); + av_free(s); + } +} + +#endif /* CONFIG_DCT */ diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/avfft.h b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.h new file mode 100644 index 00000000..fdf30237 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/avfft.h @@ -0,0 +1,103 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_AVFFT_H +#define AVCODEC_AVFFT_H + +#include "publik.h" + +typedef float FFTSample; + +typedef struct FFTComplex { + FFTSample re, im; +} FFTComplex; + +typedef struct FFTContext FFTContext; + +/** + * Set up a complex FFT. + * @param nbits log2 of the length of the input array + * @param inverse if 0 perform the forward transform, if 1 perform the inverse + */ +PUBLIK FFTContext *av_fft_init(int nbits, int inverse); + +/** + * Do the permutation needed BEFORE calling ff_fft_calc(). + */ +PUBLIK void av_fft_permute(FFTContext *s, FFTComplex *z); + +/** + * Do a complex FFT with the parameters defined in av_fft_init(). The + * input data must be permuted before. No 1.0/sqrt(n) normalization is done. 
+ */ +PUBLIK void av_fft_calc(FFTContext *s, FFTComplex *z); + +PUBLIK void av_fft_end(FFTContext *s); + +#if 0 +FFTContext *av_mdct_init(int nbits, int inverse, double scale); +void av_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_mdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input); +void av_mdct_end(FFTContext *s); +#endif + +/* Real Discrete Fourier Transform */ + +enum RDFTransformType { + DFT_R2C, + IDFT_C2R, + IDFT_R2C, + DFT_C2R, +}; + +typedef struct RDFTContext RDFTContext; + +/** + * Set up a real FFT. + * @param nbits log2 of the length of the input array + * @param trans the type of transform + */ +PUBLIK RDFTContext *av_rdft_init(int nbits, enum RDFTransformType trans); +PUBLIK void av_rdft_calc(RDFTContext *s, FFTSample *data); +PUBLIK void av_rdft_end(RDFTContext *s); + +/* Discrete Cosine Transform */ + +typedef struct DCTContext DCTContext; + +enum DCTTransformType { + DCT_II = 0, + DCT_III, + DCT_I, + DST_I, +}; + +/** + * Set up DCT. + * @param nbits size of the input array: + * (1 << nbits) for DCT-II, DCT-III and DST-I + * (1 << nbits) + 1 for DCT-I + * + * @note the first element of the input of DST-I is ignored + */ +PUBLIK DCTContext *av_dct_init(int nbits, enum DCTTransformType type); +PUBLIK void av_dct_calc(DCTContext *s, FFTSample *data); +PUBLIK void av_dct_end (DCTContext *s); + +#endif /* AVCODEC_AVFFT_H */ diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/dct.c b/plugins/supereq/ffmpeg_fft/libavcodec/dct.c new file mode 100644 index 00000000..6ea1936e --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/dct.c @@ -0,0 +1,228 @@ +/* + * (I)DCT Transforms + * Copyright (c) 2009 Peter Ross <pross@xvid.org> + * Copyright (c) 2010 Alex Converse <alex.converse@gmail.com> + * Copyright (c) 2010 Vitor Sessak + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * (Inverse) Discrete Cosine Transforms. These are also known as the + * type II and type III DCTs respectively. + */ + +#include <math.h> +#include "libavutil/mathematics.h" +#include "fft.h" +#ifndef ARCH_ARM +#include "x86/fft.h" +#endif + +#define DCT32_FLOAT +#include "dct32.h" + +/* sin((M_PI * x / (2*n)) */ +#define SIN(s,n,x) (s->costab[(n) - (x)]) + +/* cos((M_PI * x / (2*n)) */ +#define COS(s,n,x) (s->costab[x]) + +static void ff_dst_calc_I_c(DCTContext *ctx, FFTSample *data) +{ + int n = 1 << ctx->nbits; + int i; + + data[0] = 0; + for(i = 1; i < n/2; i++) { + float tmp1 = data[i ]; + float tmp2 = data[n - i]; + float s = SIN(ctx, n, 2*i); + + s *= tmp1 + tmp2; + tmp1 = (tmp1 - tmp2) * 0.5f; + data[i ] = s + tmp1; + data[n - i] = s - tmp1; + } + + data[n/2] *= 2; + ff_rdft_calc(&ctx->rdft, data); + + data[0] *= 0.5f; + + for(i = 1; i < n-2; i += 2) { + data[i + 1] += data[i - 1]; + data[i ] = -data[i + 2]; + } + + data[n-1] = 0; +} + +static void ff_dct_calc_I_c(DCTContext *ctx, FFTSample *data) +{ + int n = 1 << ctx->nbits; + int i; + float next = -0.5f * (data[0] - data[n]); + + for(i = 0; i < n/2; i++) { + float tmp1 = data[i ]; + float tmp2 = data[n - i]; + float s = SIN(ctx, n, 2*i); + float c = COS(ctx, n, 
2*i); + + c *= tmp1 - tmp2; + s *= tmp1 - tmp2; + + next += c; + + tmp1 = (tmp1 + tmp2) * 0.5f; + data[i ] = tmp1 - s; + data[n - i] = tmp1 + s; + } + + ff_rdft_calc(&ctx->rdft, data); + data[n] = data[1]; + data[1] = next; + + for(i = 3; i <= n; i += 2) + data[i] = data[i - 2] - data[i]; +} + +static void ff_dct_calc_III_c(DCTContext *ctx, FFTSample *data) +{ + int n = 1 << ctx->nbits; + int i; + + float next = data[n - 1]; + float inv_n = 1.0f / n; + + for (i = n - 2; i >= 2; i -= 2) { + float val1 = data[i ]; + float val2 = data[i - 1] - data[i + 1]; + float c = COS(ctx, n, i); + float s = SIN(ctx, n, i); + + data[i ] = c * val1 + s * val2; + data[i + 1] = s * val1 - c * val2; + } + + data[1] = 2 * next; + + ff_rdft_calc(&ctx->rdft, data); + + for (i = 0; i < n / 2; i++) { + float tmp1 = data[i ] * inv_n; + float tmp2 = data[n - i - 1] * inv_n; + float csc = ctx->csc2[i] * (tmp1 - tmp2); + + tmp1 += tmp2; + data[i ] = tmp1 + csc; + data[n - i - 1] = tmp1 - csc; + } +} + +static void ff_dct_calc_II_c(DCTContext *ctx, FFTSample *data) +{ + int n = 1 << ctx->nbits; + int i; + float next; + + for (i=0; i < n/2; i++) { + float tmp1 = data[i ]; + float tmp2 = data[n - i - 1]; + float s = SIN(ctx, n, 2*i + 1); + + s *= tmp1 - tmp2; + tmp1 = (tmp1 + tmp2) * 0.5f; + + data[i ] = tmp1 + s; + data[n-i-1] = tmp1 - s; + } + + ff_rdft_calc(&ctx->rdft, data); + + next = data[1] * 0.5; + data[1] *= -1; + + for (i = n - 2; i >= 0; i -= 2) { + float inr = data[i ]; + float ini = data[i + 1]; + float c = COS(ctx, n, i); + float s = SIN(ctx, n, i); + + data[i ] = c * inr + s * ini; + + data[i+1] = next; + + next += s * inr - c * ini; + } +} + +static void dct32_func(DCTContext *ctx, FFTSample *data) +{ + ctx->dct32(data, data); +} + +void ff_dct_calc(DCTContext *s, FFTSample *data) +{ + s->dct_calc(s, data); +} + +av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse) +{ + int n = 1 << nbits; + int i; + + s->nbits = nbits; + s->inverse = inverse; + + 
ff_init_ff_cos_tabs(nbits+2); + + s->costab = ff_cos_tabs[nbits+2]; + + s->csc2 = av_malloc(n/2 * sizeof(FFTSample)); + + if (ff_rdft_init(&s->rdft, nbits, inverse == DCT_III) < 0) { + av_free(s->csc2); + return -1; + } + + for (i = 0; i < n/2; i++) + s->csc2[i] = 0.5 / sin((M_PI / (2*n) * (2*i + 1))); + + switch(inverse) { + case DCT_I : s->dct_calc = ff_dct_calc_I_c; break; + case DCT_II : s->dct_calc = ff_dct_calc_II_c ; break; + case DCT_III: s->dct_calc = ff_dct_calc_III_c; break; + case DST_I : s->dct_calc = ff_dst_calc_I_c; break; + } + + if (inverse == DCT_II && nbits == 5) + s->dct_calc = dct32_func; + + s->dct32 = dct32; + if (HAVE_MMX) ff_dct_init_mmx(s); + + return 0; +} + +av_cold void ff_dct_end(DCTContext *s) +{ + ff_rdft_end(&s->rdft); + av_free(s->csc2); +} diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/dct32.c b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.c new file mode 100644 index 00000000..3e6ad78d --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.c @@ -0,0 +1,262 @@ +/* + * Template for the Discrete Cosine Transform for 32 samples + * Copyright (c) 2001, 2002 Fabrice Bellard + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "dct32.h" + +/* tab[i][j] = 1.0 / (2.0 * cos(pi*(2*k+1) / 2^(6 - j))) */ + +/* cos(i*pi/64) */ + +#define COS0_0 FIXHR(0.50060299823519630134/2) +#define COS0_1 FIXHR(0.50547095989754365998/2) +#define COS0_2 FIXHR(0.51544730992262454697/2) +#define COS0_3 FIXHR(0.53104259108978417447/2) +#define COS0_4 FIXHR(0.55310389603444452782/2) +#define COS0_5 FIXHR(0.58293496820613387367/2) +#define COS0_6 FIXHR(0.62250412303566481615/2) +#define COS0_7 FIXHR(0.67480834145500574602/2) +#define COS0_8 FIXHR(0.74453627100229844977/2) +#define COS0_9 FIXHR(0.83934964541552703873/2) +#define COS0_10 FIXHR(0.97256823786196069369/2) +#define COS0_11 FIXHR(1.16943993343288495515/4) +#define COS0_12 FIXHR(1.48416461631416627724/4) +#define COS0_13 FIXHR(2.05778100995341155085/8) +#define COS0_14 FIXHR(3.40760841846871878570/8) +#define COS0_15 FIXHR(10.19000812354805681150/32) + +#define COS1_0 FIXHR(0.50241928618815570551/2) +#define COS1_1 FIXHR(0.52249861493968888062/2) +#define COS1_2 FIXHR(0.56694403481635770368/2) +#define COS1_3 FIXHR(0.64682178335999012954/2) +#define COS1_4 FIXHR(0.78815462345125022473/2) +#define COS1_5 FIXHR(1.06067768599034747134/4) +#define COS1_6 FIXHR(1.72244709823833392782/4) +#define COS1_7 FIXHR(5.10114861868916385802/16) + +#define COS2_0 FIXHR(0.50979557910415916894/2) +#define COS2_1 FIXHR(0.60134488693504528054/2) +#define COS2_2 FIXHR(0.89997622313641570463/2) +#define COS2_3 FIXHR(2.56291544774150617881/8) + +#define COS3_0 FIXHR(0.54119610014619698439/2) +#define COS3_1 FIXHR(1.30656296487637652785/4) + +#define COS4_0 FIXHR(0.70710678118654752439/2) + +/* butterfly operator */ +#define BF(a, b, c, s)\ +{\ + tmp0 = val##a + val##b;\ + tmp1 = val##a - val##b;\ + val##a = tmp0;\ + val##b = MULH3(tmp1, c, 
1<<(s));\ +} + +#define BF0(a, b, c, s)\ +{\ + tmp0 = tab[a] + tab[b];\ + tmp1 = tab[a] - tab[b];\ + val##a = tmp0;\ + val##b = MULH3(tmp1, c, 1<<(s));\ +} + +#define BF1(a, b, c, d)\ +{\ + BF(a, b, COS4_0, 1);\ + BF(c, d,-COS4_0, 1);\ + val##c += val##d;\ +} + +#define BF2(a, b, c, d)\ +{\ + BF(a, b, COS4_0, 1);\ + BF(c, d,-COS4_0, 1);\ + val##c += val##d;\ + val##a += val##c;\ + val##c += val##b;\ + val##b += val##d;\ +} + +#define ADD(a, b) val##a += val##b + +/* DCT32 without 1/sqrt(2) coef zero scaling. */ +void dct32(INTFLOAT *out, const INTFLOAT *tab) +{ + INTFLOAT tmp0, tmp1; + + INTFLOAT val0 , val1 , val2 , val3 , val4 , val5 , val6 , val7 , + val8 , val9 , val10, val11, val12, val13, val14, val15, + val16, val17, val18, val19, val20, val21, val22, val23, + val24, val25, val26, val27, val28, val29, val30, val31; + + /* pass 1 */ + BF0( 0, 31, COS0_0 , 1); + BF0(15, 16, COS0_15, 5); + /* pass 2 */ + BF( 0, 15, COS1_0 , 1); + BF(16, 31,-COS1_0 , 1); + /* pass 1 */ + BF0( 7, 24, COS0_7 , 1); + BF0( 8, 23, COS0_8 , 1); + /* pass 2 */ + BF( 7, 8, COS1_7 , 4); + BF(23, 24,-COS1_7 , 4); + /* pass 3 */ + BF( 0, 7, COS2_0 , 1); + BF( 8, 15,-COS2_0 , 1); + BF(16, 23, COS2_0 , 1); + BF(24, 31,-COS2_0 , 1); + /* pass 1 */ + BF0( 3, 28, COS0_3 , 1); + BF0(12, 19, COS0_12, 2); + /* pass 2 */ + BF( 3, 12, COS1_3 , 1); + BF(19, 28,-COS1_3 , 1); + /* pass 1 */ + BF0( 4, 27, COS0_4 , 1); + BF0(11, 20, COS0_11, 2); + /* pass 2 */ + BF( 4, 11, COS1_4 , 1); + BF(20, 27,-COS1_4 , 1); + /* pass 3 */ + BF( 3, 4, COS2_3 , 3); + BF(11, 12,-COS2_3 , 3); + BF(19, 20, COS2_3 , 3); + BF(27, 28,-COS2_3 , 3); + /* pass 4 */ + BF( 0, 3, COS3_0 , 1); + BF( 4, 7,-COS3_0 , 1); + BF( 8, 11, COS3_0 , 1); + BF(12, 15,-COS3_0 , 1); + BF(16, 19, COS3_0 , 1); + BF(20, 23,-COS3_0 , 1); + BF(24, 27, COS3_0 , 1); + BF(28, 31,-COS3_0 , 1); + + + + /* pass 1 */ + BF0( 1, 30, COS0_1 , 1); + BF0(14, 17, COS0_14, 3); + /* pass 2 */ + BF( 1, 14, COS1_1 , 1); + BF(17, 30,-COS1_1 , 1); + /* pass 1 */ + BF0( 
6, 25, COS0_6 , 1); + BF0( 9, 22, COS0_9 , 1); + /* pass 2 */ + BF( 6, 9, COS1_6 , 2); + BF(22, 25,-COS1_6 , 2); + /* pass 3 */ + BF( 1, 6, COS2_1 , 1); + BF( 9, 14,-COS2_1 , 1); + BF(17, 22, COS2_1 , 1); + BF(25, 30,-COS2_1 , 1); + + /* pass 1 */ + BF0( 2, 29, COS0_2 , 1); + BF0(13, 18, COS0_13, 3); + /* pass 2 */ + BF( 2, 13, COS1_2 , 1); + BF(18, 29,-COS1_2 , 1); + /* pass 1 */ + BF0( 5, 26, COS0_5 , 1); + BF0(10, 21, COS0_10, 1); + /* pass 2 */ + BF( 5, 10, COS1_5 , 2); + BF(21, 26,-COS1_5 , 2); + /* pass 3 */ + BF( 2, 5, COS2_2 , 1); + BF(10, 13,-COS2_2 , 1); + BF(18, 21, COS2_2 , 1); + BF(26, 29,-COS2_2 , 1); + /* pass 4 */ + BF( 1, 2, COS3_1 , 2); + BF( 5, 6,-COS3_1 , 2); + BF( 9, 10, COS3_1 , 2); + BF(13, 14,-COS3_1 , 2); + BF(17, 18, COS3_1 , 2); + BF(21, 22,-COS3_1 , 2); + BF(25, 26, COS3_1 , 2); + BF(29, 30,-COS3_1 , 2); + + /* pass 5 */ + BF1( 0, 1, 2, 3); + BF2( 4, 5, 6, 7); + BF1( 8, 9, 10, 11); + BF2(12, 13, 14, 15); + BF1(16, 17, 18, 19); + BF2(20, 21, 22, 23); + BF1(24, 25, 26, 27); + BF2(28, 29, 30, 31); + + /* pass 6 */ + + ADD( 8, 12); + ADD(12, 10); + ADD(10, 14); + ADD(14, 9); + ADD( 9, 13); + ADD(13, 11); + ADD(11, 15); + + out[ 0] = val0; + out[16] = val1; + out[ 8] = val2; + out[24] = val3; + out[ 4] = val4; + out[20] = val5; + out[12] = val6; + out[28] = val7; + out[ 2] = val8; + out[18] = val9; + out[10] = val10; + out[26] = val11; + out[ 6] = val12; + out[22] = val13; + out[14] = val14; + out[30] = val15; + + ADD(24, 28); + ADD(28, 26); + ADD(26, 30); + ADD(30, 25); + ADD(25, 29); + ADD(29, 27); + ADD(27, 31); + + out[ 1] = val16 + val24; + out[17] = val17 + val25; + out[ 9] = val18 + val26; + out[25] = val19 + val27; + out[ 5] = val20 + val28; + out[21] = val21 + val29; + out[13] = val22 + val30; + out[29] = val23 + val31; + out[ 3] = val24 + val20; + out[19] = val25 + val21; + out[11] = val26 + val22; + out[27] = val27 + val23; + out[ 7] = val28 + val18; + out[23] = val29 + val19; + out[15] = val30 + val17; + out[31] = val31; +} diff 
--git a/plugins/supereq/ffmpeg_fft/libavcodec/dct32.h b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.h new file mode 100644 index 00000000..dc2d847a --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/dct32.h @@ -0,0 +1,10 @@ +#ifndef DCT_32_H +#define DCT_32_H + +#define FIXHR(x) ((float)(x)) +#define MULH3(x, y, s) ((s)*(y)*(x)) +#define INTFLOAT float + +void dct32(INTFLOAT *out, const INTFLOAT *tab); + +#endif diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/fft.c b/plugins/supereq/ffmpeg_fft/libavcodec/fft.c new file mode 100644 index 00000000..04082bf4 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/fft.c @@ -0,0 +1,300 @@ +/* + * FFT/IFFT transforms + * Copyright (c) 2008 Loren Merritt + * Copyright (c) 2002 Fabrice Bellard + * Partly based on libdjbfft by D. J. Bernstein + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * FFT/IFFT transforms. 
+ */ + +#include <stdlib.h> +#include <string.h> +#include "libavutil/mathematics.h" +#include "fft.h" + +/* cos(2*pi*x/n) for 0<=x<=n/4, followed by its reverse */ +#if !CONFIG_HARDCODED_TABLES +COSTABLE(16); +COSTABLE(32); +COSTABLE(64); +COSTABLE(128); +COSTABLE(256); +COSTABLE(512); +COSTABLE(1024); +COSTABLE(2048); +COSTABLE(4096); +COSTABLE(8192); +COSTABLE(16384); +COSTABLE(32768); +COSTABLE(65536); +#endif +COSTABLE_CONST FFTSample * const ff_cos_tabs[] = { + NULL, NULL, NULL, NULL, + ff_cos_16, ff_cos_32, ff_cos_64, ff_cos_128, ff_cos_256, ff_cos_512, ff_cos_1024, + ff_cos_2048, ff_cos_4096, ff_cos_8192, ff_cos_16384, ff_cos_32768, ff_cos_65536, +}; + +static int split_radix_permutation(int i, int n, int inverse) +{ + int m; + if(n <= 2) return i&1; + m = n >> 1; + if(!(i&m)) return split_radix_permutation(i, m, inverse)*2; + m >>= 1; + if(inverse == !(i&m)) return split_radix_permutation(i, m, inverse)*4 + 1; + else return split_radix_permutation(i, m, inverse)*4 - 1; +} + +av_cold void ff_init_ff_cos_tabs(int index) +{ +#if !CONFIG_HARDCODED_TABLES + int i; + int m = 1<<index; + double freq = 2*M_PI/m; + FFTSample *tab = ff_cos_tabs[index]; + for(i=0; i<=m/4; i++) + tab[i] = cos(i*freq); + for(i=1; i<m/4; i++) + tab[m/2-i] = tab[i]; +#endif +} + +av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) +{ + int i, j, n; + + if (nbits < 2 || nbits > 16) + goto fail; + s->nbits = nbits; + n = 1 << nbits; + + s->revtab = av_malloc(n * sizeof(uint16_t)); + if (!s->revtab) + goto fail; + s->tmp_buf = av_malloc(n * sizeof(FFTComplex)); + if (!s->tmp_buf) + goto fail; + s->inverse = inverse; + + s->fft_permute = ff_fft_permute_c; + s->fft_calc = ff_fft_calc_c; +#if CONFIG_MDCT + s->imdct_calc = ff_imdct_calc_c; + s->imdct_half = ff_imdct_half_c; + s->mdct_calc = ff_mdct_calc_c; +#endif + +#if ARCH_ARM + ff_fft_init_arm(s); +#elif HAVE_ALTIVEC + if (HAVE_ALTIVEC) ff_fft_init_altivec(s); +#elif HAVE_MMX + if (HAVE_MMX) ff_fft_init_mmx(s); +#endif + + 
for(j=4; j<=nbits; j++) { + ff_init_ff_cos_tabs(j); + } + for(i=0; i<n; i++) + s->revtab[-split_radix_permutation(i, n, s->inverse) & (n-1)] = i; + + return 0; + fail: + av_freep(&s->revtab); + av_freep(&s->tmp_buf); + return -1; +} + +void ff_fft_permute_c(FFTContext *s, FFTComplex *z) +{ + int j, np; + const uint16_t *revtab = s->revtab; + np = 1 << s->nbits; + /* TODO: handle split-radix permute in a more optimal way, probably in-place */ + for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j]; + memcpy(z, s->tmp_buf, np * sizeof(FFTComplex)); +} + +av_cold void ff_fft_end(FFTContext *s) +{ + av_freep(&s->revtab); + av_freep(&s->tmp_buf); +} + +#define sqrthalf (float)M_SQRT1_2 + +#define BF(x,y,a,b) {\ + x = a - b;\ + y = a + b;\ +} + +#define BUTTERFLIES(a0,a1,a2,a3) {\ + BF(t3, t5, t5, t1);\ + BF(a2.re, a0.re, a0.re, t5);\ + BF(a3.im, a1.im, a1.im, t3);\ + BF(t4, t6, t2, t6);\ + BF(a3.re, a1.re, a1.re, t4);\ + BF(a2.im, a0.im, a0.im, t6);\ +} + +// force loading all the inputs before storing any. +// this is slightly slower for small data, but avoids store->load aliasing +// for addresses separated by large powers of 2. 
+#define BUTTERFLIES_BIG(a0,a1,a2,a3) {\ + FFTSample r0=a0.re, i0=a0.im, r1=a1.re, i1=a1.im;\ + BF(t3, t5, t5, t1);\ + BF(a2.re, a0.re, r0, t5);\ + BF(a3.im, a1.im, i1, t3);\ + BF(t4, t6, t2, t6);\ + BF(a3.re, a1.re, r1, t4);\ + BF(a2.im, a0.im, i0, t6);\ +} + +#define TRANSFORM(a0,a1,a2,a3,wre,wim) {\ + t1 = a2.re * wre + a2.im * wim;\ + t2 = a2.im * wre - a2.re * wim;\ + t5 = a3.re * wre - a3.im * wim;\ + t6 = a3.im * wre + a3.re * wim;\ + BUTTERFLIES(a0,a1,a2,a3)\ +} + +#define TRANSFORM_ZERO(a0,a1,a2,a3) {\ + t1 = a2.re;\ + t2 = a2.im;\ + t5 = a3.re;\ + t6 = a3.im;\ + BUTTERFLIES(a0,a1,a2,a3)\ +} + +/* z[0...8n-1], w[1...2n-1] */ +#define PASS(name)\ +static void name(FFTComplex *z, const FFTSample *wre, unsigned int n)\ +{\ + FFTSample t1, t2, t3, t4, t5, t6;\ + int o1 = 2*n;\ + int o2 = 4*n;\ + int o3 = 6*n;\ + const FFTSample *wim = wre+o1;\ + n--;\ +\ + TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]);\ + TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ + do {\ + z += 2;\ + wre += 2;\ + wim -= 2;\ + TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]);\ + TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]);\ + } while(--n);\ +} + +PASS(pass) +#undef BUTTERFLIES +#define BUTTERFLIES BUTTERFLIES_BIG +PASS(pass_big) + +#define DECL_FFT(n,n2,n4)\ +static void fft##n(FFTComplex *z)\ +{\ + fft##n2(z);\ + fft##n4(z+n4*2);\ + fft##n4(z+n4*3);\ + pass(z,ff_cos_##n,n4/2);\ +} + +static void fft4(FFTComplex *z) +{ + FFTSample t1, t2, t3, t4, t5, t6, t7, t8; + + BF(t3, t1, z[0].re, z[1].re); + BF(t8, t6, z[3].re, z[2].re); + BF(z[2].re, z[0].re, t1, t6); + BF(t4, t2, z[0].im, z[1].im); + BF(t7, t5, z[2].im, z[3].im); + BF(z[3].im, z[1].im, t4, t8); + BF(z[3].re, z[1].re, t3, t7); + BF(z[2].im, z[0].im, t2, t5); +} + +static void fft8(FFTComplex *z) +{ + FFTSample t1, t2, t3, t4, t5, t6, t7, t8; + + fft4(z); + + BF(t1, z[5].re, z[4].re, -z[5].re); + BF(t2, z[5].im, z[4].im, -z[5].im); + BF(t3, z[7].re, z[6].re, -z[7].re); + BF(t4, z[7].im, z[6].im, -z[7].im); + BF(t8, t1, 
t3, t1); + BF(t7, t2, t2, t4); + BF(z[4].re, z[0].re, z[0].re, t1); + BF(z[4].im, z[0].im, z[0].im, t2); + BF(z[6].re, z[2].re, z[2].re, t7); + BF(z[6].im, z[2].im, z[2].im, t8); + + TRANSFORM(z[1],z[3],z[5],z[7],sqrthalf,sqrthalf); +} + +#if !CONFIG_SMALL +static void fft16(FFTComplex *z) +{ + FFTSample t1, t2, t3, t4, t5, t6; + + fft8(z); + fft4(z+8); + fft4(z+12); + + TRANSFORM_ZERO(z[0],z[4],z[8],z[12]); + TRANSFORM(z[2],z[6],z[10],z[14],sqrthalf,sqrthalf); + TRANSFORM(z[1],z[5],z[9],z[13],ff_cos_16[1],ff_cos_16[3]); + TRANSFORM(z[3],z[7],z[11],z[15],ff_cos_16[3],ff_cos_16[1]); +} +#else +DECL_FFT(16,8,4) +#endif +DECL_FFT(32,16,8) +DECL_FFT(64,32,16) +DECL_FFT(128,64,32) +DECL_FFT(256,128,64) +DECL_FFT(512,256,128) +#if !CONFIG_SMALL +#define pass pass_big +#endif +DECL_FFT(1024,512,256) +DECL_FFT(2048,1024,512) +DECL_FFT(4096,2048,1024) +DECL_FFT(8192,4096,2048) +DECL_FFT(16384,8192,4096) +DECL_FFT(32768,16384,8192) +DECL_FFT(65536,32768,16384) + +static void (* const fft_dispatch[])(FFTComplex*) = { + fft4, fft8, fft16, fft32, fft64, fft128, fft256, fft512, fft1024, + fft2048, fft4096, fft8192, fft16384, fft32768, fft65536, +}; + +void ff_fft_calc_c(FFTContext *s, FFTComplex *z) +{ + fft_dispatch[s->nbits-2](z); +} + diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/fft.h b/plugins/supereq/ffmpeg_fft/libavcodec/fft.h new file mode 100644 index 00000000..b2e0f540 --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/fft.h @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2000, 2001, 2002 Fabrice Bellard + * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_FFT_H +#define AVCODEC_FFT_H + +#include <stdint.h> +#include "../config.h" +#include "libavutil/mem.h" +#include "avfft.h" + +/* FFT computation */ + +struct FFTContext { + int nbits; + int inverse; + uint16_t *revtab; + FFTComplex *tmp_buf; + int mdct_size; /* size of MDCT (i.e. number of input data * 2) */ + int mdct_bits; /* n = 2^nbits */ + /* pre/post rotation tables */ + FFTSample *tcos; + FFTSample *tsin; + void (*fft_permute)(struct FFTContext *s, FFTComplex *z); + void (*fft_calc)(struct FFTContext *s, FFTComplex *z); + void (*imdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); + void (*imdct_half)(struct FFTContext *s, FFTSample *output, const FFTSample *input); + void (*mdct_calc)(struct FFTContext *s, FFTSample *output, const FFTSample *input); + int permutation; +#define FF_MDCT_PERM_NONE 0 +#define FF_MDCT_PERM_INTERLEAVE 1 +}; + +#if CONFIG_HARDCODED_TABLES +#define COSTABLE_CONST const +#define SINTABLE_CONST const +#define SINETABLE_CONST const +#else +#define COSTABLE_CONST +#define SINTABLE_CONST +#define SINETABLE_CONST +#endif + +#define COSTABLE(size) \ + COSTABLE_CONST DECLARE_ALIGNED(16, FFTSample, ff_cos_##size)[size/2] +#define SINTABLE(size) \ + SINTABLE_CONST DECLARE_ALIGNED(16, FFTSample, ff_sin_##size)[size/2] +#define SINETABLE(size) \ + SINETABLE_CONST DECLARE_ALIGNED(16, float, ff_sine_##size)[size] +extern COSTABLE(16); +extern COSTABLE(32); +extern COSTABLE(64); +extern COSTABLE(128); +extern COSTABLE(256); +extern 
COSTABLE(512); +extern COSTABLE(1024); +extern COSTABLE(2048); +extern COSTABLE(4096); +extern COSTABLE(8192); +extern COSTABLE(16384); +extern COSTABLE(32768); +extern COSTABLE(65536); +extern COSTABLE_CONST FFTSample* const ff_cos_tabs[17]; + +/** + * Initialize the cosine table in ff_cos_tabs[index] + * \param index index in ff_cos_tabs array of the table to initialize + */ +void ff_init_ff_cos_tabs(int index); + +extern SINTABLE(16); +extern SINTABLE(32); +extern SINTABLE(64); +extern SINTABLE(128); +extern SINTABLE(256); +extern SINTABLE(512); +extern SINTABLE(1024); +extern SINTABLE(2048); +extern SINTABLE(4096); +extern SINTABLE(8192); +extern SINTABLE(16384); +extern SINTABLE(32768); +extern SINTABLE(65536); + +/** + * Set up a complex FFT. + * @param nbits log2 of the length of the input array + * @param inverse if 0 perform the forward transform, if 1 perform the inverse + */ +int ff_fft_init(FFTContext *s, int nbits, int inverse); +void ff_fft_permute_c(FFTContext *s, FFTComplex *z); +void ff_fft_calc_c(FFTContext *s, FFTComplex *z); + +void ff_fft_init_altivec(FFTContext *s); +void ff_fft_init_mmx(FFTContext *s); +void ff_fft_init_arm(FFTContext *s); +void ff_dct_init_mmx(DCTContext *s); + +/** + * Do the permutation needed BEFORE calling ff_fft_calc(). + */ +static inline void ff_fft_permute(FFTContext *s, FFTComplex *z) +{ + s->fft_permute(s, z); +} +/** + * Do a complex FFT with the parameters defined in ff_fft_init(). The + * input data must be permuted before. No 1.0/sqrt(n) normalization is done. 
+ */ +static inline void ff_fft_calc(FFTContext *s, FFTComplex *z) +{ + s->fft_calc(s, z); +} +void ff_fft_end(FFTContext *s); + +/* MDCT computation */ + +static inline void ff_imdct_calc(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_calc(s, output, input); +} +static inline void ff_imdct_half(FFTContext *s, FFTSample *output, const FFTSample *input) +{ + s->imdct_half(s, output, input); +} + +static inline void ff_mdct_calc(FFTContext *s, FFTSample *output, + const FFTSample *input) +{ + s->mdct_calc(s, output, input); +} + +/** + * Maximum window size for ff_kbd_window_init. + */ +#define FF_KBD_WINDOW_MAX 1024 + +/** + * Generate a Kaiser-Bessel Derived Window. + * @param window pointer to half window + * @param alpha determines window shape + * @param n size of half window, max FF_KBD_WINDOW_MAX + */ +void ff_kbd_window_init(float *window, float alpha, int n); + +/** + * Generate a sine window. + * @param window pointer to half window + * @param n size of half window + */ +void ff_sine_window_init(float *window, int n); + +/** + * initialize the specified entry of ff_sine_windows + */ +void ff_init_ff_sine_windows(int index); +extern SINETABLE( 32); +extern SINETABLE( 64); +extern SINETABLE( 128); +extern SINETABLE( 256); +extern SINETABLE( 512); +extern SINETABLE(1024); +extern SINETABLE(2048); +extern SINETABLE(4096); +extern SINETABLE_CONST float * const ff_sine_windows[13]; + +int ff_mdct_init(FFTContext *s, int nbits, int inverse, double scale); +void ff_imdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_imdct_half_c(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_calc_c(FFTContext *s, FFTSample *output, const FFTSample *input); +void ff_mdct_end(FFTContext *s); + +/* Real Discrete Fourier Transform */ + +struct RDFTContext { + int nbits; + int inverse; + int sign_convention; + + /* pre/post rotation tables */ + const FFTSample *tcos; + SINTABLE_CONST FFTSample *tsin; + 
FFTContext fft; + void (*rdft_calc)(struct RDFTContext *s, FFTSample *z); +}; + +/** + * Set up a real FFT. + * @param nbits log2 of the length of the input array + * @param trans the type of transform + */ +int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans); +void ff_rdft_end(RDFTContext *s); + +void ff_rdft_init_arm(RDFTContext *s); + +static av_always_inline void ff_rdft_calc(RDFTContext *s, FFTSample *data) +{ + s->rdft_calc(s, data); +} + +/* Discrete Cosine Transform */ + +struct DCTContext { + int nbits; + int inverse; + RDFTContext rdft; + const float *costab; + FFTSample *csc2; + void (*dct_calc)(struct DCTContext *s, FFTSample *data); + void (*dct32)(FFTSample *out, const FFTSample *in); +}; + +/** + * Set up DCT. + * @param nbits size of the input array: + * (1 << nbits) for DCT-II, DCT-III and DST-I + * (1 << nbits) + 1 for DCT-I + * + * @note the first element of the input of DST-I is ignored + */ +int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType type); +void ff_dct_calc(DCTContext *s, FFTSample *data); +void ff_dct_end (DCTContext *s); + +#endif /* AVCODEC_FFT_H */ diff --git a/plugins/supereq/ffmpeg_fft/libavcodec/rdft.c b/plugins/supereq/ffmpeg_fft/libavcodec/rdft.c new file mode 100644 index 00000000..fe6014fb --- /dev/null +++ b/plugins/supereq/ffmpeg_fft/libavcodec/rdft.c @@ -0,0 +1,137 @@ +/* + * (I)RDFT transforms + * Copyright (c) 2009 Alex Converse <alex dot converse at gmail dot com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdlib.h> +#include <math.h> +#include "libavutil/mathematics.h" +#include "fft.h" + +/** + * @file + * (Inverse) Real Discrete Fourier Transforms. + */ + +/* sin(2*pi*x/n) for 0<=x<n/4, followed by n/2<=x<3n/4 */ +#if !CONFIG_HARDCODED_TABLES +SINTABLE(16); +SINTABLE(32); +SINTABLE(64); +SINTABLE(128); +SINTABLE(256); +SINTABLE(512); +SINTABLE(1024); +SINTABLE(2048); +SINTABLE(4096); +SINTABLE(8192); +SINTABLE(16384); +SINTABLE(32768); +SINTABLE(65536); +#endif +SINTABLE_CONST FFTSample * const ff_sin_tabs[] = { + NULL, NULL, NULL, NULL, + ff_sin_16, ff_sin_32, ff_sin_64, ff_sin_128, ff_sin_256, ff_sin_512, ff_sin_1024, + ff_sin_2048, ff_sin_4096, ff_sin_8192, ff_sin_16384, ff_sin_32768, ff_sin_65536, +}; + +/** Map one real FFT into two parallel real even and odd FFTs. Then interleave + * the two real FFTs into one complex FFT. Unmangle the results. + * ref: http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM + */ +static void ff_rdft_calc_c(RDFTContext* s, FFTSample* data) +{ + int i, i1, i2; + FFTComplex ev, od; + const int n = 1 << s->nbits; + const float k1 = 0.5; + const float k2 = 0.5 - s->inverse; + const FFTSample *tcos = s->tcos; + const FFTSample *tsin = s->tsin; + + if (!s->inverse) { + ff_fft_permute(&s->fft, (FFTComplex*)data); + ff_fft_calc(&s->fft, (FFTComplex*)data); + } + /* i=0 is a special case because of packing, the DC term is real, so we + are going to throw the N/2 term (also real) in with it. 
*/ + ev.re = data[0]; + data[0] = ev.re+data[1]; + data[1] = ev.re-data[1]; + for (i = 1; i < (n>>2); i++) { + i1 = 2*i; + i2 = n-i1; + /* Separate even and odd FFTs */ + ev.re = k1*(data[i1 ]+data[i2 ]); + od.im = -k2*(data[i1 ]-data[i2 ]); + ev.im = k1*(data[i1+1]-data[i2+1]); + od.re = k2*(data[i1+1]+data[i2+1]); + /* Apply twiddle factors to the odd FFT and add to the even FFT */ + data[i1 ] = ev.re + od.re*tcos[i] - od.im*tsin[i]; + data[i1+1] = ev.im + od.im*tcos[i] + od.re*tsin[i]; + data[i2 ] = ev.re - od.re*tcos[i] + od.im*tsin[i]; + data[i2+1] = -ev.im + od.im*tcos[i] + od.re*tsin[i]; + } + data[2*i+1]=s->sign_convention*data[2*i+1]; + if (s->inverse) { + data[0] *= k1; + data[1] *= k1; + ff_fft_permute(&s->fft, (FFTComplex*)data); + ff_fft_calc(&s->fft, (FFTComplex*)data); + } +} + +av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans) +{ + int n = 1 << nbits; + int i; + const double theta = (trans == DFT_R2C || trans == DFT_C2R ? -1 : 1)*2*M_PI/n; + + s->nbits = nbits; + s->inverse = trans == IDFT_C2R || trans == DFT_C2R; + s->sign_convention = trans == IDFT_R2C || trans == DFT_C2R ? 1 : -1; + + if (nbits < 4 || nbits > 16) { + return -1; + } + + if (ff_fft_init(&s->fft, nbits-1, trans == IDFT_C2R || trans == IDFT_R2C) < 0) { + return -1; + } + + ff_init_ff_cos_tabs(nbits); + s->tcos = ff_cos_tabs[nbits]; + s->tsin = ff_sin_tabs[nbits]+(trans == DFT_R2C || trans == DFT_C2R)*(n>>2); +#if !CONFIG_HARDCODED_TABLES + for (i = 0; i < (n>>2); i++) { + s->tsin[i] = sin(i*theta); + } +#endif + s->rdft_calc = ff_rdft_calc_c; + +#if ARCH_ARM + ff_rdft_init_arm(s); +#endif + + return 0; +} + +av_cold void ff_rdft_end(RDFTContext *s) +{ + ff_fft_end(&s->fft); +} |