-rw-r--r--   mp3lib/Makefile           6
-rw-r--r--   mp3lib/d_cpu.h            3
-rw-r--r--   mp3lib/d_cpu.s           48
-rw-r--r--   mp3lib/dct36.c            2
-rw-r--r--   mp3lib/dct64_3dnow.s   1636
-rw-r--r--   mp3lib/dct64_MMX.s     1028
-rw-r--r--   mp3lib/dct64_k7.s      1469
-rw-r--r--   mp3lib/decod386.c        40
-rw-r--r--   mp3lib/decode_3dnow.s   265
-rw-r--r--   mp3lib/decode_MMX.s     117
-rw-r--r--   mp3lib/decode_k7.s      364
-rw-r--r--   mp3lib/decode_sse.s     201
-rw-r--r--   mp3lib/layer2.c           8
-rw-r--r--   mp3lib/layer3.c          25
-rw-r--r--   mp3lib/mpg123.h          33
-rw-r--r--   mp3lib/sr1.c             81
-rw-r--r--   mp3lib/tabinit.c         35
-rw-r--r--   mp3lib/tabinit_MMX.s    161
-rw-r--r--   mp3lib/test2.c            2
19 files changed, 3210 insertions, 2314 deletions
diff --git a/mp3lib/Makefile b/mp3lib/Makefile
index b82aa6215f..6aa93c4275 100644
--- a/mp3lib/Makefile
+++ b/mp3lib/Makefile
@@ -1,8 +1,10 @@
include config.mak
-SRCS = sr1.c d_cpu.s decode_i586.s $(OPTIONAL_SRCS)
-OBJS = sr1.o d_cpu.o decode_i586.o $(OPTIONAL_OBJS)
+SRCS = sr1.c d_cpu.s decode_i586.s dct64_MMX.s decode_MMX.s tabinit_MMX.s\
+dct36_3dnow.s dct64_3dnow.s dct36_k7.s dct64_k7.s
+OBJS = sr1.o d_cpu.o decode_i586.o dct64_MMX.o decode_MMX.o tabinit_MMX.o\
+dct36_3dnow.o dct64_3dnow.o dct36_k7.o dct64_k7.o
# OBJS = $(SRCS:.c,.s=.o)
CFLAGS = $(OPTFLAGS) $(EXTRA_INC)
diff --git a/mp3lib/d_cpu.h b/mp3lib/d_cpu.h
index d2c92b9415..3d221f66e4 100644
--- a/mp3lib/d_cpu.h
+++ b/mp3lib/d_cpu.h
@@ -9,9 +9,12 @@
unsigned int _CpuID;
unsigned int _i586;
unsigned int _3dnow;
+unsigned int _isse;
+unsigned int _has_mmx;
extern unsigned long CpuDetect( void );
extern unsigned long ipentium( void );
+extern unsigned long isse( void );
extern unsigned long a3dnow( void );
#endif
diff --git a/mp3lib/d_cpu.s b/mp3lib/d_cpu.s
index 0715ccccd1..6df924b241 100644
--- a/mp3lib/d_cpu.s
+++ b/mp3lib/d_cpu.s
@@ -9,6 +9,7 @@
.globl CpuDetect
.globl ipentium
.globl a3dnow
+.globl isse
/ ---------------------------------------------------------------------------
/ in C: unsigned long CpuDetect( void );
@@ -45,7 +46,9 @@ exit_cpudetect:
/ ---------------------------------------------------------------------------
/ in C: unsigned long ipentium( void );
-/ return: 0 if the processor is not P5 or above else above 1.
+/ return: 0 if this processor is an i386 or i486
+/         1 if it is an i586 or newer
+/         2 if it also supports MMX
/ ---------------------------------------------------------------------------
ipentium:
pushl %ebx
@@ -63,10 +66,15 @@ ipentium:
jz no_cpuid
movl $1,%eax
cpuid
- shrl $8,%eax
- cmpl $5,%eax
- jb no_cpuid
- movl $1,%eax
+ movl %eax, %ecx
+ xorl %eax, %eax
+ shrl $8,%ecx
+ cmpl $5,%ecx
+ jb exit
+ incl %eax
+ test $0x00800000, %edx
+ jz exit
+ incl %eax
jmp exit
no_cpuid:
xorl %eax,%eax
@@ -113,3 +121,33 @@ exit2:
popl %edx
popl %ebx
ret
+
+/ ---------------------------------------------------------------------------
+/ in C: unsigned long isse( void );
+/ return: 0 if this processor does not support SSE
+/         1 if it supports SSE
+/         2 if it also supports the SSE2 extension
+/ ---------------------------------------------------------------------------
+isse:
+ pushl %ebx
+ pushl %edx
+ pushl %ecx
+
+ call ipentium
+ testl %eax,%eax
+ jz exit3
+
+ movl $1,%eax
+ cpuid
+ xorl %eax, %eax
+ testl $0x02000000,%edx
+ jz exit3
+ incl %eax
+ testl $0x04000000,%edx
+ jz exit3
+ incl %eax
+exit3:
+ popl %ecx
+ popl %edx
+ popl %ebx
+ ret
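
The detection API above is exercised from C through the declarations in mp3lib/d_cpu.h. Below is a minimal, hypothetical sketch of how a caller could fill the new globals; the real wiring lives in sr1.c (whose hunks are not part of this excerpt), and the helper name select_decoder plus the _has_mmx derivation are illustrative only:

    #include "d_cpu.h"

    /* Hypothetical init routine: fill the detection globals from d_cpu.h. */
    static void select_decoder(void)
    {
        _CpuID   = CpuDetect();   /* see CpuDetect in d_cpu.s                  */
        _i586    = ipentium();    /* 0: i386/i486, 1: i586 or newer, 2: + MMX  */
        _has_mmx = (_i586 > 1);   /* illustrative: ipentium() returns 2 on MMX */
        _isse    = isse();        /* 0: no SSE, 1: SSE, 2: SSE2                */
        _3dnow   = a3dnow();

        /* Based on these flags a caller would pick dct64_MMX, dct64_MMX_3dnow,
         * the k7 variant, or the plain C routines. */
    }
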
diff --git a/mp3lib/dct36.c b/mp3lib/dct36.c
index 04992f09cc..18bb35a5c4 100644
--- a/mp3lib/dct36.c
+++ b/mp3lib/dct36.c
@@ -193,7 +193,7 @@ static void dct36(real *inbuf,real *o1,real *o2,real *wintab,real *tsbuf)
sum1 = (tmp2b - tmp1b) * tfcos36[(v)]; \
MACRO0(v); }
- register const real *c = nCOS9;
+ register const real *c = COS9;
register real *out2 = o2;
register real *w = wintab;
register real *out1 = o1;
diff --git a/mp3lib/dct64_3dnow.s b/mp3lib/dct64_3dnow.s
index b7540573a6..dfade383db 100644
--- a/mp3lib/dct64_3dnow.s
+++ b/mp3lib/dct64_3dnow.s
@@ -1,706 +1,932 @@
-///
-/// Replacement of dct64() with AMD's 3DNow! SIMD operations support
-///
-/// Syuuhei Kashiyama <squash@mb.kcom.ne.jp>
-///
-/// The author of this program disclaim whole expressed or implied
-/// warranties with regard to this program, and in no event shall the
-/// author of this program liable to whatever resulted from the use of
-/// this program. Use it at your own risk.
-///
-
- .globl dct64_3dnow
- .type dct64_3dnow,@function
-dct64_3dnow:
- subl $256,%esp
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %ebx
- leal 16(%esp),%ebx
- movl 284(%esp),%edi
- movl 276(%esp),%ebp
- movl 280(%esp),%edx
- leal 128(%ebx),%esi
-
- / femms
-
- // 1
- movl pnts,%eax
- movq 0(%edi),%mm0
- movq %mm0,%mm1
- movd 124(%edi),%mm2
- punpckldq 120(%edi),%mm2
- movq 0(%eax),%mm3
- pfadd %mm2,%mm0
- movq %mm0,0(%ebx)
- pfsub %mm2,%mm1
- pfmul %mm3,%mm1
- movd %mm1,124(%ebx)
- psrlq $32,%mm1
- movd %mm1,120(%ebx)
- movq 8(%edi),%mm4
- movq %mm4,%mm5
- movd 116(%edi),%mm6
- punpckldq 112(%edi),%mm6
- movq 8(%eax),%mm7
- pfadd %mm6,%mm4
- movq %mm4,8(%ebx)
- pfsub %mm6,%mm5
- pfmul %mm7,%mm5
- movd %mm5,116(%ebx)
- psrlq $32,%mm5
- movd %mm5,112(%ebx)
- movq 16(%edi),%mm0
- movq %mm0,%mm1
- movd 108(%edi),%mm2
- punpckldq 104(%edi),%mm2
- movq 16(%eax),%mm3
- pfadd %mm2,%mm0
- movq %mm0,16(%ebx)
- pfsub %mm2,%mm1
- pfmul %mm3,%mm1
- movd %mm1,108(%ebx)
- psrlq $32,%mm1
- movd %mm1,104(%ebx)
- movq 24(%edi),%mm4
- movq %mm4,%mm5
- movd 100(%edi),%mm6
- punpckldq 96(%edi),%mm6
- movq 24(%eax),%mm7
- pfadd %mm6,%mm4
- movq %mm4,24(%ebx)
- pfsub %mm6,%mm5
- pfmul %mm7,%mm5
- movd %mm5,100(%ebx)
- psrlq $32,%mm5
- movd %mm5,96(%ebx)
- movq 32(%edi),%mm0
- movq %mm0,%mm1
- movd 92(%edi),%mm2
- punpckldq 88(%edi),%mm2
- movq 32(%eax),%mm3
- pfadd %mm2,%mm0
- movq %mm0,32(%ebx)
- pfsub %mm2,%mm1
- pfmul %mm3,%mm1
- movd %mm1,92(%ebx)
- psrlq $32,%mm1
- movd %mm1,88(%ebx)
- movq 40(%edi),%mm4
- movq %mm4,%mm5
- movd 84(%edi),%mm6
- punpckldq 80(%edi),%mm6
- movq 40(%eax),%mm7
- pfadd %mm6,%mm4
- movq %mm4,40(%ebx)
- pfsub %mm6,%mm5
- pfmul %mm7,%mm5
- movd %mm5,84(%ebx)
- psrlq $32,%mm5
- movd %mm5,80(%ebx)
- movq 48(%edi),%mm0
- movq %mm0,%mm1
- movd 76(%edi),%mm2
- punpckldq 72(%edi),%mm2
- movq 48(%eax),%mm3
- pfadd %mm2,%mm0
- movq %mm0,48(%ebx)
- pfsub %mm2,%mm1
- pfmul %mm3,%mm1
- movd %mm1,76(%ebx)
- psrlq $32,%mm1
- movd %mm1,72(%ebx)
- movq 56(%edi),%mm4
- movq %mm4,%mm5
- movd 68(%edi),%mm6
- punpckldq 64(%edi),%mm6
- movq 56(%eax),%mm7
- pfadd %mm6,%mm4
- movq %mm4,56(%ebx)
- pfsub %mm6,%mm5
- pfmul %mm7,%mm5
- movd %mm5,68(%ebx)
- psrlq $32,%mm5
- movd %mm5,64(%ebx)
-
- // 2
- movl pnts+4,%eax
- / 0, 14
- movq 0(%ebx),%mm0
- movq %mm0,%mm1
- movd 60(%ebx),%mm2
- punpckldq 56(%ebx),%mm2
- movq 0(%eax),%mm3
- pfadd %mm2,%mm0
- movq %mm0,0(%esi)
- pfsub %mm2,%mm1
- pfmul %mm3,%mm1
- movd %mm1,60(%esi)
- psrlq $32,%mm1
- movd %mm1,56(%esi)
- / 16, 30
- movq 64(%ebx),%mm0
- movq %mm0,%mm1
- movd 124(%ebx),%mm2
- punpckldq 120(%ebx),%mm2
- pfadd %mm2,%mm0
- movq %mm0,64(%esi)
- pfsubr %mm2,%mm1
- pfmul %mm3,%mm1
- movd %mm1,124(%esi)
- psrlq $32,%mm1
- movd %mm1,120(%esi)
- movq 8(%ebx),%mm4
- / 2, 12
- movq %mm4,%mm5
- movd 52(%ebx),%mm6
- punpckldq 48(%ebx),%mm6
- movq 8(%eax),%mm7
- pfadd %mm6,%mm4
- movq %mm4,8(%esi)
- pfsub %mm6,%mm5
- pfmul %mm7,%mm5
- movd %mm5,52(%esi)
- psrlq $32,%mm5
- movd %mm5,48(%esi)
- movq 72(%ebx),%mm4
- / 18, 28
- movq %mm4,%mm5
- movd 116(%ebx),%mm6
- punpckldq 112(%ebx),%mm6
- pfadd %mm6,%mm4
- movq %mm4,72(%esi)
- pfsubr %mm6,%mm5
- pfmul %mm7,%mm5
- movd %mm5,116(%esi)
- psrlq $32,%mm5
- movd %mm5,112(%esi)
- movq 16(%ebx),%mm0
- / 4, 10
- movq %mm0,%mm1
- movd 44(%ebx),%mm2
- punpckldq 40(%ebx),%mm2
- movq 16(%eax),%mm3
- pfadd %mm2,%mm0
- movq %mm0,16(%esi)
- pfsub %mm2,%mm1
- pfmul %mm3,%mm1
- movd %mm1,44(%esi)
- psrlq $32,%mm1
- movd %mm1,40(%esi)
- movq 80(%ebx),%mm0
- / 20, 26
- movq %mm0,%mm1
- movd 108(%ebx),%mm2
- punpckldq 104(%ebx),%mm2
- pfadd %mm2,%mm0
- movq %mm0,80(%esi)
- pfsubr %mm2,%mm1
- pfmul %mm3,%mm1
- movd %mm1,108(%esi)
- psrlq $32,%mm1
- movd %mm1,104(%esi)
- movq 24(%ebx),%mm4
- / 6, 8
- movq %mm4,%mm5
- movd 36(%ebx),%mm6
- punpckldq 32(%ebx),%mm6
- movq 24(%eax),%mm7
- pfadd %mm6,%mm4
- movq %mm4,24(%esi)
- pfsub %mm6,%mm5
- pfmul %mm7,%mm5
- movd %mm5,36(%esi)
- psrlq $32,%mm5
- movd %mm5,32(%esi)
- movq 88(%ebx),%mm4
- / 22, 24
- movq %mm4,%mm5
- movd 100(%ebx),%mm6
- punpckldq 96(%ebx),%mm6
- pfadd %mm6,%mm4
- movq %mm4,88(%esi)
- pfsubr %mm6,%mm5
- pfmul %mm7,%mm5
- movd %mm5,100(%esi)
- psrlq $32,%mm5
- movd %mm5,96(%esi)
-
- // 3
- movl pnts+8,%eax
- movq 0(%eax),%mm0
- movq 8(%eax),%mm1
- movq 0(%esi),%mm2
- / 0, 6
- movq %mm2,%mm3
- movd 28(%esi),%mm4
- punpckldq 24(%esi),%mm4
- pfadd %mm4,%mm2
- pfsub %mm4,%mm3
- pfmul %mm0,%mm3
- movq %mm2,0(%ebx)
- movd %mm3,28(%ebx)
- psrlq $32,%mm3
- movd %mm3,24(%ebx)
- movq 8(%esi),%mm5
- / 2, 4
- movq %mm5,%mm6
- movd 20(%esi),%mm7
- punpckldq 16(%esi),%mm7
- pfadd %mm7,%mm5
- pfsub %mm7,%mm6
- pfmul %mm1,%mm6
- movq %mm5,8(%ebx)
- movd %mm6,20(%ebx)
- psrlq $32,%mm6
- movd %mm6,16(%ebx)
- movq 32(%esi),%mm2
- / 8, 14
- movq %mm2,%mm3
- movd 60(%esi),%mm4
- punpckldq 56(%esi),%mm4
- pfadd %mm4,%mm2
- pfsubr %mm4,%mm3
- pfmul %mm0,%mm3
- movq %mm2,32(%ebx)
- movd %mm3,60(%ebx)
- psrlq $32,%mm3
- movd %mm3,56(%ebx)
- movq 40(%esi),%mm5
- / 10, 12
- movq %mm5,%mm6
- movd 52(%esi),%mm7
- punpckldq 48(%esi),%mm7
- pfadd %mm7,%mm5
- pfsubr %mm7,%mm6
- pfmul %mm1,%mm6
- movq %mm5,40(%ebx)
- movd %mm6,52(%ebx)
- psrlq $32,%mm6
- movd %mm6,48(%ebx)
- movq 64(%esi),%mm2
- / 16, 22
- movq %mm2,%mm3
- movd 92(%esi),%mm4
- punpckldq 88(%esi),%mm4
- pfadd %mm4,%mm2
- pfsub %mm4,%mm3
- pfmul %mm0,%mm3
- movq %mm2,64(%ebx)
- movd %mm3,92(%ebx)
- psrlq $32,%mm3
- movd %mm3,88(%ebx)
- movq 72(%esi),%mm5
- / 18, 20
- movq %mm5,%mm6
- movd 84(%esi),%mm7
- punpckldq 80(%esi),%mm7
- pfadd %mm7,%mm5
- pfsub %mm7,%mm6
- pfmul %mm1,%mm6
- movq %mm5,72(%ebx)
- movd %mm6,84(%ebx)
- psrlq $32,%mm6
- movd %mm6,80(%ebx)
- movq 96(%esi),%mm2
- / 24, 30
- movq %mm2,%mm3
- movd 124(%esi),%mm4
- punpckldq 120(%esi),%mm4
- pfadd %mm4,%mm2
- pfsubr %mm4,%mm3
- pfmul %mm0,%mm3
- movq %mm2,96(%ebx)
- movd %mm3,124(%ebx)
- psrlq $32,%mm3
- movd %mm3,120(%ebx)
- movq 104(%esi),%mm5
- / 26, 28
- movq %mm5,%mm6
- movd 116(%esi),%mm7
- punpckldq 112(%esi),%mm7
- pfadd %mm7,%mm5
- pfsubr %mm7,%mm6
- pfmul %mm1,%mm6
- movq %mm5,104(%ebx)
- movd %mm6,116(%ebx)
- psrlq $32,%mm6
- movd %mm6,112(%ebx)
-
- // 4
- movl pnts+12,%eax
- movq 0(%eax),%mm0
- movq 0(%ebx),%mm1
- / 0
- movq %mm1,%mm2
- movd 12(%ebx),%mm3
- punpckldq 8(%ebx),%mm3
- pfadd %mm3,%mm1
- pfsub %mm3,%mm2
- pfmul %mm0,%mm2
- movq %mm1,0(%esi)
- movd %mm2,12(%esi)
- psrlq $32,%mm2
- movd %mm2,8(%esi)
- movq 16(%ebx),%mm4
- / 4
- movq %mm4,%mm5
- movd 28(%ebx),%mm6
- punpckldq 24(%ebx),%mm6
- pfadd %mm6,%mm4
- pfsubr %mm6,%mm5
- pfmul %mm0,%mm5
- movq %mm4,16(%esi)
- movd %mm5,28(%esi)
- psrlq $32,%mm5
- movd %mm5,24(%esi)
- movq 32(%ebx),%mm1
- / 8
- movq %mm1,%mm2
- movd 44(%ebx),%mm3
- punpckldq 40(%ebx),%mm3
- pfadd %mm3,%mm1
- pfsub %mm3,%mm2
- pfmul %mm0,%mm2
- movq %mm1,32(%esi)
- movd %mm2,44(%esi)
- psrlq $32,%mm2
- movd %mm2,40(%esi)
- movq 48(%ebx),%mm4
- / 12
- movq %mm4,%mm5
- movd 60(%ebx),%mm6
- punpckldq 56(%ebx),%mm6
- pfadd %mm6,%mm4
- pfsubr %mm6,%mm5
- pfmul %mm0,%mm5
- movq %mm4,48(%esi)
- movd %mm5,60(%esi)
- psrlq $32,%mm5
- movd %mm5,56(%esi)
- movq 64(%ebx),%mm1
- / 16
- movq %mm1,%mm2
- movd 76(%ebx),%mm3
- punpckldq 72(%ebx),%mm3
- pfadd %mm3,%mm1
- pfsub %mm3,%mm2
- pfmul %mm0,%mm2
- movq %mm1,64(%esi)
- movd %mm2,76(%esi)
- psrlq $32,%mm2
- movd %mm2,72(%esi)
- movq 80(%ebx),%mm4
- / 20
- movq %mm4,%mm5
- movd 92(%ebx),%mm6
- punpckldq 88(%ebx),%mm6
- pfadd %mm6,%mm4
- pfsubr %mm6,%mm5
- pfmul %mm0,%mm5
- movq %mm4,80(%esi)
- movd %mm5,92(%esi)
- psrlq $32,%mm5
- movd %mm5,88(%esi)
- movq 96(%ebx),%mm1
- / 24
- movq %mm1,%mm2
- movd 108(%ebx),%mm3
- punpckldq 104(%ebx),%mm3
- pfadd %mm3,%mm1
- pfsub %mm3,%mm2
- pfmul %mm0,%mm2
- movq %mm1,96(%esi)
- movd %mm2,108(%esi)
- psrlq $32,%mm2
- movd %mm2,104(%esi)
- movq 112(%ebx),%mm4
- / 28
- movq %mm4,%mm5
- movd 124(%ebx),%mm6
- punpckldq 120(%ebx),%mm6
- pfadd %mm6,%mm4
- pfsubr %mm6,%mm5
- pfmul %mm0,%mm5
- movq %mm4,112(%esi)
- movd %mm5,124(%esi)
- psrlq $32,%mm5
- movd %mm5,120(%esi)
-
- // 5
- movl $-1,%eax
- movd %eax,%mm1
- movl $1,%eax
- movd %eax,%mm0
- / L | H
- punpckldq %mm1,%mm0
- pi2fd %mm0,%mm0
- / 1.0 | -1.0
- movd %eax,%mm1
- pi2fd %mm1,%mm1
- movl pnts+16,%eax
- movd 0(%eax),%mm2
- punpckldq %mm2,%mm1
- / 1.0 | cos0
- movq 0(%esi),%mm2
- / 0
- movq %mm2,%mm3
- pfmul %mm0,%mm3
- pfacc %mm3,%mm2
- pfmul %mm1,%mm2
- movq %mm2,0(%ebx)
- movq 8(%esi),%mm4
- movq %mm4,%mm5
- pfmul %mm0,%mm5
- pfacc %mm5,%mm4
- pfmul %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm4,8(%ebx)
- movq 16(%esi),%mm2
- / 4
- movq %mm2,%mm3
- pfmul %mm0,%mm3
- pfacc %mm3,%mm2
- pfmul %mm1,%mm2
- movq 24(%esi),%mm4
- movq %mm4,%mm5
- pfmul %mm0,%mm5
- pfacc %mm5,%mm4
- pfmul %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm2,%mm3
- psrlq $32,%mm3
- pfadd %mm4,%mm2
- pfadd %mm3,%mm4
- movq %mm2,16(%ebx)
- movq %mm4,24(%ebx)
- movq 32(%esi),%mm2
- / 8
- movq %mm2,%mm3
- pfmul %mm0,%mm3
- pfacc %mm3,%mm2
- pfmul %mm1,%mm2
- movq %mm2,32(%ebx)
- movq 40(%esi),%mm4
- movq %mm4,%mm5
- pfmul %mm0,%mm5
- pfacc %mm5,%mm4
- pfmul %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm4,40(%ebx)
- movq 48(%esi),%mm2
- / 12
- movq %mm2,%mm3
- pfmul %mm0,%mm3
- pfacc %mm3,%mm2
- pfmul %mm1,%mm2
- movq 56(%esi),%mm4
- movq %mm4,%mm5
- pfmul %mm0,%mm5
- pfacc %mm5,%mm4
- pfmul %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm2,%mm3
- psrlq $32,%mm3
- pfadd %mm4,%mm2
- pfadd %mm3,%mm4
- movq %mm2,48(%ebx)
- movq %mm4,56(%ebx)
- movq 64(%esi),%mm2
- / 16
- movq %mm2,%mm3
- pfmul %mm0,%mm3
- pfacc %mm3,%mm2
- pfmul %mm1,%mm2
- movq %mm2,64(%ebx)
- movq 72(%esi),%mm4
- movq %mm4,%mm5
- pfmul %mm0,%mm5
- pfacc %mm5,%mm4
- pfmul %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm4,72(%ebx)
- movq 80(%esi),%mm2
- / 20
- movq %mm2,%mm3
- pfmul %mm0,%mm3
- pfacc %mm3,%mm2
- pfmul %mm1,%mm2
- movq 88(%esi),%mm4
- movq %mm4,%mm5
- pfmul %mm0,%mm5
- pfacc %mm5,%mm4
- pfmul %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm2,%mm3
- psrlq $32,%mm3
- pfadd %mm4,%mm2
- pfadd %mm3,%mm4
- movq %mm2,80(%ebx)
- movq %mm4,88(%ebx)
- movq 96(%esi),%mm2
- / 24
- movq %mm2,%mm3
- pfmul %mm0,%mm3
- pfacc %mm3,%mm2
- pfmul %mm1,%mm2
- movq %mm2,96(%ebx)
- movq 104(%esi),%mm4
- movq %mm4,%mm5
- pfmul %mm0,%mm5
- pfacc %mm5,%mm4
- pfmul %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm4,104(%ebx)
- movq 112(%esi),%mm2
- / 28
- movq %mm2,%mm3
- pfmul %mm0,%mm3
- pfacc %mm3,%mm2
- pfmul %mm1,%mm2
- movq 120(%esi),%mm4
- movq %mm4,%mm5
- pfmul %mm0,%mm5
- pfacc %mm5,%mm4
- pfmul %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm2,%mm3
- psrlq $32,%mm3
- pfadd %mm4,%mm2
- pfadd %mm3,%mm4
- movq %mm2,112(%ebx)
- movq %mm4,120(%ebx)
-
- // Phase6
- movl 0(%ebx),%eax
- movl %eax,1024(%ebp)
- movl 4(%ebx),%eax
- movl %eax,0(%ebp)
- movl %eax,0(%edx)
- movl 8(%ebx),%eax
- movl %eax,512(%ebp)
- movl 12(%ebx),%eax
- movl %eax,512(%edx)
-
- movl 16(%ebx),%eax
- movl %eax,768(%ebp)
- movl 20(%ebx),%eax
- movl %eax,256(%edx)
-
- movl 24(%ebx),%eax
- movl %eax,256(%ebp)
- movl 28(%ebx),%eax
- movl %eax,768(%edx)
-
- movq 32(%ebx),%mm0
- movq 48(%ebx),%mm1
- pfadd %mm1,%mm0
- movd %mm0,896(%ebp)
- psrlq $32,%mm0
- movd %mm0,128(%edx)
- movq 40(%ebx),%mm2
- pfadd %mm2,%mm1
- movd %mm1,640(%ebp)
- psrlq $32,%mm1
- movd %mm1,384(%edx)
-
- movq 56(%ebx),%mm3
- pfadd %mm3,%mm2
- movd %mm2,384(%ebp)
- psrlq $32,%mm2
- movd %mm2,640(%edx)
-
- movd 36(%ebx),%mm4
- pfadd %mm4,%mm3
- movd %mm3,128(%ebp)
- psrlq $32,%mm3
- movd %mm3,896(%edx)
- movq 96(%ebx),%mm0
- movq 64(%ebx),%mm1
-
- movq 112(%ebx),%mm2
- pfadd %mm2,%mm0
- movq %mm0,%mm3
- pfadd %mm1,%mm3
- movd %mm3,960(%ebp)
- psrlq $32,%mm3
- movd %mm3,64(%edx)
- movq 80(%ebx),%mm1
- pfadd %mm1,%mm0
- movd %mm0,832(%ebp)
- psrlq $32,%mm0
- movd %mm0,192(%edx)
- movq 104(%ebx),%mm3
- pfadd %mm3,%mm2
- movq %mm2,%mm4
- pfadd %mm1,%mm4
- movd %mm4,704(%ebp)
- psrlq $32,%mm4
- movd %mm4,320(%edx)
- movq 72(%ebx),%mm1
- pfadd %mm1,%mm2
- movd %mm2,576(%ebp)
- psrlq $32,%mm2
- movd %mm2,448(%edx)
-
- movq 120(%ebx),%mm4
- pfadd %mm4,%mm3
- movq %mm3,%mm5
- pfadd %mm1,%mm5
- movd %mm5,448(%ebp)
- psrlq $32,%mm5
- movd %mm5,576(%edx)
- movq 88(%ebx),%mm1
- pfadd %mm1,%mm3
- movd %mm3,320(%ebp)
- psrlq $32,%mm3
- movd %mm3,704(%edx)
-
- movd 100(%ebx),%mm5
- pfadd %mm5,%mm4
- movq %mm4,%mm6
- pfadd %mm1,%mm6
- movd %mm6,192(%ebp)
- psrlq $32,%mm6
- movd %mm6,832(%edx)
- movd 68(%ebx),%mm1
- pfadd %mm1,%mm4
- movd %mm4,64(%ebp)
- psrlq $32,%mm4
- movd %mm4,960(%edx)
-
- / femms
-
- popl %ebx
- popl %esi
- popl %edi
- popl %ebp
- addl $256,%esp
-
- ret
+# This code was taken from http://www.mpg123.org
+# See ChangeLog of mpg123-0.59s-pre.1 for details
+# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+# Partial 3dnow! optimization by Nick Kurshev
+#
+# TODO: finish 3dnow! optimization at least in scalar mode
+#
+
+.data
+ .align 8
+plus_minus_3dnow: .long 0x00000000, 0x80000000
+costab:
+ .long 1056974725
+ .long 1057056395
+ .long 1057223771
+ .long 1057485416
+ .long 1057855544
+ .long 1058356026
+ .long 1059019886
+ .long 1059897405
+ .long 1061067246
+ .long 1062657950
+ .long 1064892987
+ .long 1066774581
+ .long 1069414683
+ .long 1073984175
+ .long 1079645762
+ .long 1092815430
+ .long 1057005197
+ .long 1057342072
+ .long 1058087743
+ .long 1059427869
+ .long 1061799040
+ .long 1065862217
+ .long 1071413542
+ .long 1084439708
+ .long 1057128951
+ .long 1058664893
+ .long 1063675095
+ .long 1076102863
+ .long 1057655764
+ .long 1067924853
+ .long 1060439283
+
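+# The costab entries above are IEEE-754 single-precision values written as
+# decimal .long bit patterns, presumably the mpg123 1/(2*cos(...)) cosine
+# coefficients.  For example the last entry (used at 120(%ebx)) decodes as
+# 1060439283 == 0x3F3504F3, roughly 0.70711, i.e. 1/(2*cos(pi/4)).
+# A quick C check of one entry:
+#   uint32_t u = 1060439283; float f; memcpy(&f, &u, sizeof f); /* f ~= 0.70711f */
+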
+.text
+
+ .align 16
+
+.globl dct64_MMX_3dnow
+dct64_MMX_3dnow:
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $256,%esp
+ movl 280(%esp),%eax
+
+ leal 128(%esp),%edx
+ movl 272(%esp),%esi
+ movl 276(%esp),%edi
+ movl $costab,%ebx
+ orl %ecx,%ecx
+ movl %esp,%ecx
+ femms
+/* Phase 1*/
+ movq (%eax), %mm0
+ movq 8(%eax), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 120(%eax), %mm1
+ movq 112(%eax), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, (%edx)
+ movq %mm4, 8(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul (%ebx), %mm3
+ pfmul 8(%ebx), %mm7
+ movd %mm3, 124(%edx)
+ movd %mm7, 116(%edx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 120(%edx)
+ movd %mm7, 112(%edx)
+
+ movq 16(%eax), %mm0
+ movq 24(%eax), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 104(%eax), %mm1
+ movq 96(%eax), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 16(%edx)
+ movq %mm4, 24(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 16(%ebx), %mm3
+ pfmul 24(%ebx), %mm7
+ movd %mm3, 108(%edx)
+ movd %mm7, 100(%edx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 104(%edx)
+ movd %mm7, 96(%edx)
+
+ movq 32(%eax), %mm0
+ movq 40(%eax), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 88(%eax), %mm1
+ movq 80(%eax), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 32(%edx)
+ movq %mm4, 40(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 32(%ebx), %mm3
+ pfmul 40(%ebx), %mm7
+ movd %mm3, 92(%edx)
+ movd %mm7, 84(%edx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 88(%edx)
+ movd %mm7, 80(%edx)
+
+ movq 48(%eax), %mm0
+ movq 56(%eax), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 72(%eax), %mm1
+ movq 64(%eax), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 48(%edx)
+ movq %mm4, 56(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 48(%ebx), %mm3
+ pfmul 56(%ebx), %mm7
+ movd %mm3, 76(%edx)
+ movd %mm7, 68(%edx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 72(%edx)
+ movd %mm7, 64(%edx)
+
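+# In scalar terms, Phase 1 above computes (in = the sample pointer passed at
+# 280(%esp), tmp1 = the scratch buffer at %edx; the C names are illustrative):
+#   for (i = 0; i < 16; i++) {
+#       tmp1[i]      = in[i] + in[31 - i];
+#       tmp1[31 - i] = (in[i] - in[31 - i]) * costab[i];
+#   }
+# The movq/psrlq/punpckldq triples marked "n.b.: pswapd" emulate the
+# 3DNow!Ex pswapd dword swap, which plain 3DNow! lacks.
+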
+/* Phase 2*/
+
+ movq (%edx), %mm0
+ movq 8(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 56(%edx), %mm1
+ movq 48(%edx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, (%ecx)
+ movq %mm4, 8(%ecx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 64(%ebx), %mm3
+ pfmul 72(%ebx), %mm7
+ movd %mm3, 60(%ecx)
+ movd %mm7, 52(%ecx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 56(%ecx)
+ movd %mm7, 48(%ecx)
+
+ movq 16(%edx), %mm0
+ movq 24(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 40(%edx), %mm1
+ movq 32(%edx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 16(%ecx)
+ movq %mm4, 24(%ecx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 80(%ebx), %mm3
+ pfmul 88(%ebx), %mm7
+ movd %mm3, 44(%ecx)
+ movd %mm7, 36(%ecx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 40(%ecx)
+ movd %mm7, 32(%ecx)
+
+/* Phase 3*/
+
+ movq 64(%edx), %mm0
+ movq 72(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 120(%edx), %mm1
+ movq 112(%edx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 64(%ecx)
+ movq %mm4, 72(%ecx)
+ pfsubr %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 64(%ebx), %mm3
+ pfmul 72(%ebx), %mm7
+ movd %mm3, 124(%ecx)
+ movd %mm7, 116(%ecx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 120(%ecx)
+ movd %mm7, 112(%ecx)
+
+ movq 80(%edx), %mm0
+ movq 88(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 104(%edx), %mm1
+ movq 96(%edx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 80(%ecx)
+ movq %mm4, 88(%ecx)
+ pfsubr %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 80(%ebx), %mm3
+ pfmul 88(%ebx), %mm7
+ movd %mm3, 108(%ecx)
+ movd %mm7, 100(%ecx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 104(%ecx)
+ movd %mm7, 96(%ecx)
+
+/* Phase 4*/
+
+ movq (%ecx), %mm0
+ movq 8(%ecx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 24(%ecx), %mm1
+ movq 16(%ecx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, (%edx)
+ movq %mm4, 8(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 96(%ebx), %mm3
+ pfmul 104(%ebx), %mm7
+ movd %mm3, 28(%edx)
+ movd %mm7, 20(%edx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 24(%edx)
+ movd %mm7, 16(%edx)
+
+ movq 32(%ecx), %mm0
+ movq 40(%ecx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 56(%ecx), %mm1
+ movq 48(%ecx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 32(%edx)
+ movq %mm4, 40(%edx)
+ pfsubr %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 96(%ebx), %mm3
+ pfmul 104(%ebx), %mm7
+ movd %mm3, 60(%edx)
+ movd %mm7, 52(%edx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 56(%edx)
+ movd %mm7, 48(%edx)
+
+ movq 64(%ecx), %mm0
+ movq 72(%ecx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 88(%ecx), %mm1
+ movq 80(%ecx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 64(%edx)
+ movq %mm4, 72(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 96(%ebx), %mm3
+ pfmul 104(%ebx), %mm7
+ movd %mm3, 92(%edx)
+ movd %mm7, 84(%edx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 88(%edx)
+ movd %mm7, 80(%edx)
+
+ movq 96(%ecx), %mm0
+ movq 104(%ecx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 120(%ecx), %mm1
+ movq 112(%ecx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 96(%edx)
+ movq %mm4, 104(%edx)
+ pfsubr %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 96(%ebx), %mm3
+ pfmul 104(%ebx), %mm7
+ movd %mm3, 124(%edx)
+ movd %mm7, 116(%edx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 120(%edx)
+ movd %mm7, 112(%edx)
+
+/* Phase 5 */
+
+ movq (%edx), %mm0
+ movq 16(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 8(%edx), %mm1
+ movq 24(%edx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, (%ecx)
+ movq %mm4, 16(%ecx)
+ pfsub %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 112(%ebx), %mm3
+ pfmul 112(%ebx), %mm7
+ movd %mm3, 12(%ecx)
+ movd %mm7, 28(%ecx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 8(%ecx)
+ movd %mm7, 24(%ecx)
+
+ movq 32(%edx), %mm0
+ movq 48(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 40(%edx), %mm1
+ movq 56(%edx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 32(%ecx)
+ movq %mm4, 48(%ecx)
+ pfsub %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 112(%ebx), %mm3
+ pfmul 112(%ebx), %mm7
+ movd %mm3, 44(%ecx)
+ movd %mm7, 60(%ecx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 40(%ecx)
+ movd %mm7, 56(%ecx)
+
+ movq 64(%edx), %mm0
+ movq 80(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 72(%edx), %mm1
+ movq 88(%edx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 64(%ecx)
+ movq %mm4, 80(%ecx)
+ pfsub %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 112(%ebx), %mm3
+ pfmul 112(%ebx), %mm7
+ movd %mm3, 76(%ecx)
+ movd %mm7, 92(%ecx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 72(%ecx)
+ movd %mm7, 88(%ecx)
+
+ movq 96(%edx), %mm0
+ movq 112(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 104(%edx), %mm1
+ movq 120(%edx), %mm5
+ /* n.b.: pswapd*/
+ movq %mm1, %mm2
+ movq %mm5, %mm6
+ psrlq $32, %mm1
+ psrlq $32, %mm5
+ punpckldq %mm2, %mm1
+ punpckldq %mm6, %mm5
+ /**/
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 96(%ecx)
+ movq %mm4, 112(%ecx)
+ pfsub %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 112(%ebx), %mm3
+ pfmul 112(%ebx), %mm7
+ movd %mm3, 108(%ecx)
+ movd %mm7, 124(%ecx)
+ psrlq $32, %mm3
+ psrlq $32, %mm7
+ movd %mm3, 104(%ecx)
+ movd %mm7, 120(%ecx)
+
+/* Phase 6. This is the end of the easy road. */
+ movl $1, %eax
+ movd %eax, %mm7
+ pi2fd %mm7, %mm7
+ movq 32(%ecx), %mm0
+ punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */
+ movq %mm0, %mm1
+ movq plus_minus_3dnow, %mm6
+ /* n.b.: pfpnacc */
+ pxor %mm6, %mm1
+ pfacc %mm1, %mm0
+ /**/
+ pfmul %mm7, %mm0
+ movq %mm0, 32(%edx)
+ femms
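+# The MMX fragment above is one two-point butterfly, roughly
+#   out[8] = b[8] + b[9];
+#   out[9] = (b[8] - b[9]) * costab[30];   /* 120(%ebx) ~= 0.70711 */
+# (b = the buffer at %ecx, out = the buffer at %edx; names illustrative).
+# The pxor with plus_minus_3dnow flips the sign of the upper float so that
+# pfacc produces {sum, difference} (emulating pfpnacc), and punpckldq builds
+# the {1.0, costab[30]} multiplier.  The remaining butterflies of this phase
+# are done below in plain x87 code, as noted in the TODO at the top of the
+# file.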
+
+ flds 44(%ecx)
+ fsubs 40(%ecx)
+ fmuls 120(%ebx)
+
+ fsts 44(%edx)
+ fadds 40(%ecx) /* pfacc 40(ecx), 56(%ecx) */
+ fadds 44(%ecx)
+ fstps 40(%edx)
+
+ flds 48(%ecx)
+ fsubs 52(%ecx)
+ fmuls 120(%ebx)
+
+ flds 60(%ecx)
+ fsubs 56(%ecx)
+ fmuls 120(%ebx)
+
+ fld %st(0)
+ fadds 56(%ecx)
+ fadds 60(%ecx)
+
+ fld %st(0)
+ fadds 48(%ecx)
+ fadds 52(%ecx)
+ fstps 48(%edx)
+ fadd %st(2)
+ fstps 56(%edx)
+ fsts 60(%edx)
+ faddp %st(1)
+ fstps 52(%edx)
+/*---*/
+ flds 64(%ecx)
+ fadds 68(%ecx)
+ fstps 64(%edx)
+
+ flds 64(%ecx)
+ fsubs 68(%ecx)
+ fmuls 120(%ebx)
+ fstps 68(%edx)
+
+ flds 76(%ecx)
+ fsubs 72(%ecx)
+ fmuls 120(%ebx)
+ fsts 76(%edx)
+ fadds 72(%ecx)
+ fadds 76(%ecx)
+ fstps 72(%edx)
+
+ flds 92(%ecx)
+ fsubs 88(%ecx)
+ fmuls 120(%ebx)
+ fsts 92(%edx)
+ fadds 92(%ecx)
+ fadds 88(%ecx)
+
+ fld %st(0)
+ fadds 80(%ecx)
+ fadds 84(%ecx)
+ fstps 80(%edx)
+
+ flds 80(%ecx)
+ fsubs 84(%ecx)
+ fmuls 120(%ebx)
+ fadd %st(0), %st(1)
+ fadds 92(%edx)
+ fstps 84(%edx)
+ fstps 88(%edx)
+
+ flds 96(%ecx)
+ fadds 100(%ecx)
+ fstps 96(%edx)
+
+ flds 96(%ecx)
+ fsubs 100(%ecx)
+ fmuls 120(%ebx)
+ fstps 100(%edx)
+
+ flds 108(%ecx)
+ fsubs 104(%ecx)
+ fmuls 120(%ebx)
+ fsts 108(%edx)
+ fadds 104(%ecx)
+ fadds 108(%ecx)
+ fstps 104(%edx)
+
+ flds 124(%ecx)
+ fsubs 120(%ecx)
+ fmuls 120(%ebx)
+ fsts 124(%edx)
+ fadds 120(%ecx)
+ fadds 124(%ecx)
+
+ fld %st(0)
+ fadds 112(%ecx)
+ fadds 116(%ecx)
+ fstps 112(%edx)
+
+ flds 112(%ecx)
+ fsubs 116(%ecx)
+ fmuls 120(%ebx)
+ fadd %st(0),%st(1)
+ fadds 124(%edx)
+ fstps 116(%edx)
+ fstps 120(%edx)
+ jnz .L01
+
+/* Phase 7*/
+
+ flds (%ecx)
+ fadds 4(%ecx)
+ fstps 1024(%esi)
+
+ flds (%ecx)
+ fsubs 4(%ecx)
+ fmuls 120(%ebx)
+ fsts (%esi)
+ fstps (%edi)
+
+ flds 12(%ecx)
+ fsubs 8(%ecx)
+ fmuls 120(%ebx)
+ fsts 512(%edi)
+ fadds 12(%ecx)
+ fadds 8(%ecx)
+ fstps 512(%esi)
+
+ flds 16(%ecx)
+ fsubs 20(%ecx)
+ fmuls 120(%ebx)
+
+ flds 28(%ecx)
+ fsubs 24(%ecx)
+ fmuls 120(%ebx)
+ fsts 768(%edi)
+ fld %st(0)
+ fadds 24(%ecx)
+ fadds 28(%ecx)
+ fld %st(0)
+ fadds 16(%ecx)
+ fadds 20(%ecx)
+ fstps 768(%esi)
+ fadd %st(2)
+ fstps 256(%esi)
+ faddp %st(1)
+ fstps 256(%edi)
+
+/* Phase 8*/
+
+ flds 32(%edx)
+ fadds 48(%edx)
+ fstps 896(%esi)
+
+ flds 48(%edx)
+ fadds 40(%edx)
+ fstps 640(%esi)
+
+ flds 40(%edx)
+ fadds 56(%edx)
+ fstps 384(%esi)
+
+ flds 56(%edx)
+ fadds 36(%edx)
+ fstps 128(%esi)
+
+ flds 36(%edx)
+ fadds 52(%edx)
+ fstps 128(%edi)
+
+ flds 52(%edx)
+ fadds 44(%edx)
+ fstps 384(%edi)
+
+ flds 60(%edx)
+ fsts 896(%edi)
+ fadds 44(%edx)
+ fstps 640(%edi)
+
+ flds 96(%edx)
+ fadds 112(%edx)
+ fld %st(0)
+ fadds 64(%edx)
+ fstps 960(%esi)
+ fadds 80(%edx)
+ fstps 832(%esi)
+
+ flds 112(%edx)
+ fadds 104(%edx)
+ fld %st(0)
+ fadds 80(%edx)
+ fstps 704(%esi)
+ fadds 72(%edx)
+ fstps 576(%esi)
+
+ flds 104(%edx)
+ fadds 120(%edx)
+ fld %st(0)
+ fadds 72(%edx)
+ fstps 448(%esi)
+ fadds 88(%edx)
+ fstps 320(%esi)
+
+ flds 120(%edx)
+ fadds 100(%edx)
+ fld %st(0)
+ fadds 88(%edx)
+ fstps 192(%esi)
+ fadds 68(%edx)
+ fstps 64(%esi)
+
+ flds 100(%edx)
+ fadds 116(%edx)
+ fld %st(0)
+ fadds 68(%edx)
+ fstps 64(%edi)
+ fadds 84(%edx)
+ fstps 192(%edi)
+
+ flds 116(%edx)
+ fadds 108(%edx)
+ fld %st(0)
+ fadds 84(%edx)
+ fstps 320(%edi)
+ fadds 76(%edx)
+ fstps 448(%edi)
+
+ flds 108(%edx)
+ fadds 124(%edx)
+ fld %st(0)
+ fadds 76(%edx)
+ fstps 576(%edi)
+ fadds 92(%edx)
+ fstps 704(%edi)
+
+ flds 124(%edx)
+ fsts 960(%edi)
+ fadds 92(%edx)
+ fstps 832(%edi)
+ jmp .L_bye
+.L01:
+/* Phase 9*/
+
+ flds (%ecx)
+ fadds 4(%ecx)
+ fistp 512(%esi)
+
+ flds (%ecx)
+ fsubs 4(%ecx)
+ fmuls 120(%ebx)
+
+ fistp (%esi)
+
+
+ flds 12(%ecx)
+ fsubs 8(%ecx)
+ fmuls 120(%ebx)
+ fist 256(%edi)
+ fadds 12(%ecx)
+ fadds 8(%ecx)
+ fistp 256(%esi)
+
+ flds 16(%ecx)
+ fsubs 20(%ecx)
+ fmuls 120(%ebx)
+
+ flds 28(%ecx)
+ fsubs 24(%ecx)
+ fmuls 120(%ebx)
+ fist 384(%edi)
+ fld %st(0)
+ fadds 24(%ecx)
+ fadds 28(%ecx)
+ fld %st(0)
+ fadds 16(%ecx)
+ fadds 20(%ecx)
+ fistp 384(%esi)
+ fadd %st(2)
+ fistp 128(%esi)
+ faddp %st(1)
+ fistp 128(%edi)
+
+/* Phase 10*/
+
+ flds 32(%edx)
+ fadds 48(%edx)
+ fistp 448(%esi)
+
+ flds 48(%edx)
+ fadds 40(%edx)
+ fistp 320(%esi)
+
+ flds 40(%edx)
+ fadds 56(%edx)
+ fistp 192(%esi)
+
+ flds 56(%edx)
+ fadds 36(%edx)
+ fistp 64(%esi)
+
+ flds 36(%edx)
+ fadds 52(%edx)
+ fistp 64(%edi)
+
+ flds 52(%edx)
+ fadds 44(%edx)
+ fistp 192(%edi)
+
+ flds 60(%edx)
+ fist 448(%edi)
+ fadds 44(%edx)
+ fistp 320(%edi)
+
+ flds 96(%edx)
+ fadds 112(%edx)
+ fld %st(0)
+ fadds 64(%edx)
+ fistp 480(%esi)
+ fadds 80(%edx)
+ fistp 416(%esi)
+
+ flds 112(%edx)
+ fadds 104(%edx)
+ fld %st(0)
+ fadds 80(%edx)
+ fistp 352(%esi)
+ fadds 72(%edx)
+ fistp 288(%esi)
+
+ flds 104(%edx)
+ fadds 120(%edx)
+ fld %st(0)
+ fadds 72(%edx)
+ fistp 224(%esi)
+ fadds 88(%edx)
+ fistp 160(%esi)
+
+ flds 120(%edx)
+ fadds 100(%edx)
+ fld %st(0)
+ fadds 88(%edx)
+ fistp 96(%esi)
+ fadds 68(%edx)
+ fistp 32(%esi)
+
+ flds 100(%edx)
+ fadds 116(%edx)
+ fld %st(0)
+ fadds 68(%edx)
+ fistp 32(%edi)
+ fadds 84(%edx)
+ fistp 96(%edi)
+
+ flds 116(%edx)
+ fadds 108(%edx)
+ fld %st(0)
+ fadds 84(%edx)
+ fistp 160(%edi)
+ fadds 76(%edx)
+ fistp 224(%edi)
+
+ flds 108(%edx)
+ fadds 124(%edx)
+ fld %st(0)
+ fadds 76(%edx)
+ fistp 288(%edi)
+ fadds 92(%edx)
+ fistp 352(%edi)
+
+ flds 124(%edx)
+ fist 480(%edi)
+ fadds 92(%edx)
+ fistp 416(%edi)
+ movsw
+.L_bye:
+ addl $256,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ ret
+
diff --git a/mp3lib/dct64_MMX.s b/mp3lib/dct64_MMX.s
new file mode 100644
index 0000000000..cf288d5af9
--- /dev/null
+++ b/mp3lib/dct64_MMX.s
@@ -0,0 +1,1028 @@
+# This code was taken from http://www.mpg123.org
+# See ChangeLog of mpg123-0.59s-pre.1 for details
+# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+
+.data
+ .align 4
+costab:
+ .long 1056974725
+ .long 1057056395
+ .long 1057223771
+ .long 1057485416
+ .long 1057855544
+ .long 1058356026
+ .long 1059019886
+ .long 1059897405
+ .long 1061067246
+ .long 1062657950
+ .long 1064892987
+ .long 1066774581
+ .long 1069414683
+ .long 1073984175
+ .long 1079645762
+ .long 1092815430
+ .long 1057005197
+ .long 1057342072
+ .long 1058087743
+ .long 1059427869
+ .long 1061799040
+ .long 1065862217
+ .long 1071413542
+ .long 1084439708
+ .long 1057128951
+ .long 1058664893
+ .long 1063675095
+ .long 1076102863
+ .long 1057655764
+ .long 1067924853
+ .long 1060439283
+
+.text
+
+ .align 16
+
+.globl dct64_MMX
+dct64_MMX:
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $256,%esp
+ movl 280(%esp),%eax
+/* Phase 1*/
+ flds (%eax)
+ leal 128(%esp),%edx
+ fadds 124(%eax)
+ movl 272(%esp),%esi
+ fstps (%edx)
+ movl 276(%esp),%edi
+
+ flds 4(%eax)
+ movl $costab,%ebx
+ fadds 120(%eax)
+ orl %ecx,%ecx
+ fstps 4(%edx)
+
+ flds (%eax)
+ movl %esp,%ecx
+ fsubs 124(%eax)
+ fmuls (%ebx)
+ fstps 124(%edx)
+
+ flds 4(%eax)
+ fsubs 120(%eax)
+ fmuls 4(%ebx)
+ fstps 120(%edx)
+
+ flds 8(%eax)
+ fadds 116(%eax)
+ fstps 8(%edx)
+
+ flds 12(%eax)
+ fadds 112(%eax)
+ fstps 12(%edx)
+
+ flds 8(%eax)
+ fsubs 116(%eax)
+ fmuls 8(%ebx)
+ fstps 116(%edx)
+
+ flds 12(%eax)
+ fsubs 112(%eax)
+ fmuls 12(%ebx)
+ fstps 112(%edx)
+
+ flds 16(%eax)
+ fadds 108(%eax)
+ fstps 16(%edx)
+
+ flds 20(%eax)
+ fadds 104(%eax)
+ fstps 20(%edx)
+
+ flds 16(%eax)
+ fsubs 108(%eax)
+ fmuls 16(%ebx)
+ fstps 108(%edx)
+
+ flds 20(%eax)
+ fsubs 104(%eax)
+ fmuls 20(%ebx)
+ fstps 104(%edx)
+
+ flds 24(%eax)
+ fadds 100(%eax)
+ fstps 24(%edx)
+
+ flds 28(%eax)
+ fadds 96(%eax)
+ fstps 28(%edx)
+
+ flds 24(%eax)
+ fsubs 100(%eax)
+ fmuls 24(%ebx)
+ fstps 100(%edx)
+
+ flds 28(%eax)
+ fsubs 96(%eax)
+ fmuls 28(%ebx)
+ fstps 96(%edx)
+
+ flds 32(%eax)
+ fadds 92(%eax)
+ fstps 32(%edx)
+
+ flds 36(%eax)
+ fadds 88(%eax)
+ fstps 36(%edx)
+
+ flds 32(%eax)
+ fsubs 92(%eax)
+ fmuls 32(%ebx)
+ fstps 92(%edx)
+
+ flds 36(%eax)
+ fsubs 88(%eax)
+ fmuls 36(%ebx)
+ fstps 88(%edx)
+
+ flds 40(%eax)
+ fadds 84(%eax)
+ fstps 40(%edx)
+
+ flds 44(%eax)
+ fadds 80(%eax)
+ fstps 44(%edx)
+
+ flds 40(%eax)
+ fsubs 84(%eax)
+ fmuls 40(%ebx)
+ fstps 84(%edx)
+
+ flds 44(%eax)
+ fsubs 80(%eax)
+ fmuls 44(%ebx)
+ fstps 80(%edx)
+
+ flds 48(%eax)
+ fadds 76(%eax)
+ fstps 48(%edx)
+
+ flds 52(%eax)
+ fadds 72(%eax)
+ fstps 52(%edx)
+
+ flds 48(%eax)
+ fsubs 76(%eax)
+ fmuls 48(%ebx)
+ fstps 76(%edx)
+
+ flds 52(%eax)
+ fsubs 72(%eax)
+ fmuls 52(%ebx)
+ fstps 72(%edx)
+
+ flds 56(%eax)
+ fadds 68(%eax)
+ fstps 56(%edx)
+
+ flds 60(%eax)
+ fadds 64(%eax)
+ fstps 60(%edx)
+
+ flds 56(%eax)
+ fsubs 68(%eax)
+ fmuls 56(%ebx)
+ fstps 68(%edx)
+
+ flds 60(%eax)
+ fsubs 64(%eax)
+ fmuls 60(%ebx)
+ fstps 64(%edx)
+
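+# Phase 1 above is the same butterfly as in dct64_3dnow.s,
+#   tmp1[i] = in[i] + in[31 - i];  tmp1[31 - i] = (in[i] - in[31 - i]) * costab[i];
+# only carried out entirely on the x87 stack instead of with 3DNow!.
+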
+/* Phase 2*/
+
+ flds (%edx)
+ fadds 60(%edx)
+ fstps (%ecx)
+
+ flds 4(%edx)
+ fadds 56(%edx)
+ fstps 4(%ecx)
+
+ flds (%edx)
+ fsubs 60(%edx)
+ fmuls 64(%ebx)
+ fstps 60(%ecx)
+
+ flds 4(%edx)
+ fsubs 56(%edx)
+ fmuls 68(%ebx)
+ fstps 56(%ecx)
+
+ flds 8(%edx)
+ fadds 52(%edx)
+ fstps 8(%ecx)
+
+ flds 12(%edx)
+ fadds 48(%edx)
+ fstps 12(%ecx)
+
+ flds 8(%edx)
+ fsubs 52(%edx)
+ fmuls 72(%ebx)
+ fstps 52(%ecx)
+
+ flds 12(%edx)
+ fsubs 48(%edx)
+ fmuls 76(%ebx)
+ fstps 48(%ecx)
+
+ flds 16(%edx)
+ fadds 44(%edx)
+ fstps 16(%ecx)
+
+ flds 20(%edx)
+ fadds 40(%edx)
+ fstps 20(%ecx)
+
+ flds 16(%edx)
+ fsubs 44(%edx)
+ fmuls 80(%ebx)
+ fstps 44(%ecx)
+
+ flds 20(%edx)
+ fsubs 40(%edx)
+ fmuls 84(%ebx)
+ fstps 40(%ecx)
+
+ flds 24(%edx)
+ fadds 36(%edx)
+ fstps 24(%ecx)
+
+ flds 28(%edx)
+ fadds 32(%edx)
+ fstps 28(%ecx)
+
+ flds 24(%edx)
+ fsubs 36(%edx)
+ fmuls 88(%ebx)
+ fstps 36(%ecx)
+
+ flds 28(%edx)
+ fsubs 32(%edx)
+ fmuls 92(%ebx)
+ fstps 32(%ecx)
+
+/* Phase 3*/
+
+ flds 64(%edx)
+ fadds 124(%edx)
+ fstps 64(%ecx)
+
+ flds 68(%edx)
+ fadds 120(%edx)
+ fstps 68(%ecx)
+
+ flds 124(%edx)
+ fsubs 64(%edx)
+ fmuls 64(%ebx)
+ fstps 124(%ecx)
+
+ flds 120(%edx)
+ fsubs 68(%edx)
+ fmuls 68(%ebx)
+ fstps 120(%ecx)
+
+ flds 72(%edx)
+ fadds 116(%edx)
+ fstps 72(%ecx)
+
+ flds 76(%edx)
+ fadds 112(%edx)
+ fstps 76(%ecx)
+
+ flds 116(%edx)
+ fsubs 72(%edx)
+ fmuls 72(%ebx)
+ fstps 116(%ecx)
+
+ flds 112(%edx)
+ fsubs 76(%edx)
+ fmuls 76(%ebx)
+ fstps 112(%ecx)
+
+ flds 80(%edx)
+ fadds 108(%edx)
+ fstps 80(%ecx)
+
+ flds 84(%edx)
+ fadds 104(%edx)
+ fstps 84(%ecx)
+
+ flds 108(%edx)
+ fsubs 80(%edx)
+ fmuls 80(%ebx)
+ fstps 108(%ecx)
+
+ flds 104(%edx)
+ fsubs 84(%edx)
+ fmuls 84(%ebx)
+ fstps 104(%ecx)
+
+ flds 88(%edx)
+ fadds 100(%edx)
+ fstps 88(%ecx)
+
+ flds 92(%edx)
+ fadds 96(%edx)
+ fstps 92(%ecx)
+
+ flds 100(%edx)
+ fsubs 88(%edx)
+ fmuls 88(%ebx)
+ fstps 100(%ecx)
+
+ flds 96(%edx)
+ fsubs 92(%edx)
+ fmuls 92(%ebx)
+ fstps 96(%ecx)
+
+/* Phase 4*/
+
+ flds (%ecx)
+ fadds 28(%ecx)
+ fstps (%edx)
+
+ flds (%ecx)
+ fsubs 28(%ecx)
+ fmuls 96(%ebx)
+ fstps 28(%edx)
+
+ flds 4(%ecx)
+ fadds 24(%ecx)
+ fstps 4(%edx)
+
+ flds 4(%ecx)
+ fsubs 24(%ecx)
+ fmuls 100(%ebx)
+ fstps 24(%edx)
+
+ flds 8(%ecx)
+ fadds 20(%ecx)
+ fstps 8(%edx)
+
+ flds 8(%ecx)
+ fsubs 20(%ecx)
+ fmuls 104(%ebx)
+ fstps 20(%edx)
+
+ flds 12(%ecx)
+ fadds 16(%ecx)
+ fstps 12(%edx)
+
+ flds 12(%ecx)
+ fsubs 16(%ecx)
+ fmuls 108(%ebx)
+ fstps 16(%edx)
+
+ flds 32(%ecx)
+ fadds 60(%ecx)
+ fstps 32(%edx)
+
+ flds 60(%ecx)
+ fsubs 32(%ecx)
+ fmuls 96(%ebx)
+ fstps 60(%edx)
+
+ flds 36(%ecx)
+ fadds 56(%ecx)
+ fstps 36(%edx)
+
+ flds 56(%ecx)
+ fsubs 36(%ecx)
+ fmuls 100(%ebx)
+ fstps 56(%edx)
+
+ flds 40(%ecx)
+ fadds 52(%ecx)
+ fstps 40(%edx)
+
+ flds 52(%ecx)
+ fsubs 40(%ecx)
+ fmuls 104(%ebx)
+ fstps 52(%edx)
+
+ flds 44(%ecx)
+ fadds 48(%ecx)
+ fstps 44(%edx)
+
+ flds 48(%ecx)
+ fsubs 44(%ecx)
+ fmuls 108(%ebx)
+ fstps 48(%edx)
+
+ flds 64(%ecx)
+ fadds 92(%ecx)
+ fstps 64(%edx)
+
+ flds 64(%ecx)
+ fsubs 92(%ecx)
+ fmuls 96(%ebx)
+ fstps 92(%edx)
+
+ flds 68(%ecx)
+ fadds 88(%ecx)
+ fstps 68(%edx)
+
+ flds 68(%ecx)
+ fsubs 88(%ecx)
+ fmuls 100(%ebx)
+ fstps 88(%edx)
+
+ flds 72(%ecx)
+ fadds 84(%ecx)
+ fstps 72(%edx)
+
+ flds 72(%ecx)
+ fsubs 84(%ecx)
+ fmuls 104(%ebx)
+ fstps 84(%edx)
+
+ flds 76(%ecx)
+ fadds 80(%ecx)
+ fstps 76(%edx)
+
+ flds 76(%ecx)
+ fsubs 80(%ecx)
+ fmuls 108(%ebx)
+ fstps 80(%edx)
+
+ flds 96(%ecx)
+ fadds 124(%ecx)
+ fstps 96(%edx)
+
+ flds 124(%ecx)
+ fsubs 96(%ecx)
+ fmuls 96(%ebx)
+ fstps 124(%edx)
+
+ flds 100(%ecx)
+ fadds 120(%ecx)
+ fstps 100(%edx)
+
+ flds 120(%ecx)
+ fsubs 100(%ecx)
+ fmuls 100(%ebx)
+ fstps 120(%edx)
+
+ flds 104(%ecx)
+ fadds 116(%ecx)
+ fstps 104(%edx)
+
+ flds 116(%ecx)
+ fsubs 104(%ecx)
+ fmuls 104(%ebx)
+ fstps 116(%edx)
+
+ flds 108(%ecx)
+ fadds 112(%ecx)
+ fstps 108(%edx)
+
+ flds 112(%ecx)
+ fsubs 108(%ecx)
+ fmuls 108(%ebx)
+ fstps 112(%edx)
+
+ flds (%edx)
+ fadds 12(%edx)
+ fstps (%ecx)
+
+ flds (%edx)
+ fsubs 12(%edx)
+ fmuls 112(%ebx)
+ fstps 12(%ecx)
+
+ flds 4(%edx)
+ fadds 8(%edx)
+ fstps 4(%ecx)
+
+ flds 4(%edx)
+ fsubs 8(%edx)
+ fmuls 116(%ebx)
+ fstps 8(%ecx)
+
+ flds 16(%edx)
+ fadds 28(%edx)
+ fstps 16(%ecx)
+
+ flds 28(%edx)
+ fsubs 16(%edx)
+ fmuls 112(%ebx)
+ fstps 28(%ecx)
+
+ flds 20(%edx)
+ fadds 24(%edx)
+ fstps 20(%ecx)
+
+ flds 24(%edx)
+ fsubs 20(%edx)
+ fmuls 116(%ebx)
+ fstps 24(%ecx)
+
+ flds 32(%edx)
+ fadds 44(%edx)
+ fstps 32(%ecx)
+
+ flds 32(%edx)
+ fsubs 44(%edx)
+ fmuls 112(%ebx)
+ fstps 44(%ecx)
+
+ flds 36(%edx)
+ fadds 40(%edx)
+ fstps 36(%ecx)
+
+ flds 36(%edx)
+ fsubs 40(%edx)
+ fmuls 116(%ebx)
+ fstps 40(%ecx)
+
+ flds 48(%edx)
+ fadds 60(%edx)
+ fstps 48(%ecx)
+
+ flds 60(%edx)
+ fsubs 48(%edx)
+ fmuls 112(%ebx)
+ fstps 60(%ecx)
+
+ flds 52(%edx)
+ fadds 56(%edx)
+ fstps 52(%ecx)
+
+ flds 56(%edx)
+ fsubs 52(%edx)
+ fmuls 116(%ebx)
+ fstps 56(%ecx)
+
+ flds 64(%edx)
+ fadds 76(%edx)
+ fstps 64(%ecx)
+
+ flds 64(%edx)
+ fsubs 76(%edx)
+ fmuls 112(%ebx)
+ fstps 76(%ecx)
+
+ flds 68(%edx)
+ fadds 72(%edx)
+ fstps 68(%ecx)
+
+ flds 68(%edx)
+ fsubs 72(%edx)
+ fmuls 116(%ebx)
+ fstps 72(%ecx)
+
+ flds 80(%edx)
+ fadds 92(%edx)
+ fstps 80(%ecx)
+
+ flds 92(%edx)
+ fsubs 80(%edx)
+ fmuls 112(%ebx)
+ fstps 92(%ecx)
+
+ flds 84(%edx)
+ fadds 88(%edx)
+ fstps 84(%ecx)
+
+ flds 88(%edx)
+ fsubs 84(%edx)
+ fmuls 116(%ebx)
+ fstps 88(%ecx)
+
+ flds 96(%edx)
+ fadds 108(%edx)
+ fstps 96(%ecx)
+
+ flds 96(%edx)
+ fsubs 108(%edx)
+ fmuls 112(%ebx)
+ fstps 108(%ecx)
+
+ flds 100(%edx)
+ fadds 104(%edx)
+ fstps 100(%ecx)
+
+ flds 100(%edx)
+ fsubs 104(%edx)
+ fmuls 116(%ebx)
+ fstps 104(%ecx)
+
+ flds 112(%edx)
+ fadds 124(%edx)
+ fstps 112(%ecx)
+
+ flds 124(%edx)
+ fsubs 112(%edx)
+ fmuls 112(%ebx)
+ fstps 124(%ecx)
+
+ flds 116(%edx)
+ fadds 120(%edx)
+ fstps 116(%ecx)
+
+ flds 120(%edx)
+ fsubs 116(%edx)
+ fmuls 116(%ebx)
+ fstps 120(%ecx)
+
+/* Phase 5*/
+
+ flds 32(%ecx)
+ fadds 36(%ecx)
+ fstps 32(%edx)
+
+ flds 32(%ecx)
+ fsubs 36(%ecx)
+ fmuls 120(%ebx)
+ fstps 36(%edx)
+
+ flds 44(%ecx)
+ fsubs 40(%ecx)
+ fmuls 120(%ebx)
+ fsts 44(%edx)
+ fadds 40(%ecx)
+ fadds 44(%ecx)
+ fstps 40(%edx)
+
+ flds 48(%ecx)
+ fsubs 52(%ecx)
+ fmuls 120(%ebx)
+
+ flds 60(%ecx)
+ fsubs 56(%ecx)
+ fmuls 120(%ebx)
+ fld %st(0)
+ fadds 56(%ecx)
+ fadds 60(%ecx)
+ fld %st(0)
+ fadds 48(%ecx)
+ fadds 52(%ecx)
+ fstps 48(%edx)
+ fadd %st(2)
+ fstps 56(%edx)
+ fsts 60(%edx)
+ faddp %st(1)
+ fstps 52(%edx)
+
+ flds 64(%ecx)
+ fadds 68(%ecx)
+ fstps 64(%edx)
+
+ flds 64(%ecx)
+ fsubs 68(%ecx)
+ fmuls 120(%ebx)
+ fstps 68(%edx)
+
+ flds 76(%ecx)
+ fsubs 72(%ecx)
+ fmuls 120(%ebx)
+ fsts 76(%edx)
+ fadds 72(%ecx)
+ fadds 76(%ecx)
+ fstps 72(%edx)
+
+ flds 92(%ecx)
+ fsubs 88(%ecx)
+ fmuls 120(%ebx)
+ fsts 92(%edx)
+ fadds 92(%ecx)
+ fadds 88(%ecx)
+ fld %st(0)
+ fadds 80(%ecx)
+ fadds 84(%ecx)
+ fstps 80(%edx)
+
+ flds 80(%ecx)
+ fsubs 84(%ecx)
+ fmuls 120(%ebx)
+ fadd %st(0), %st(1)
+ fadds 92(%edx)
+ fstps 84(%edx)
+ fstps 88(%edx)
+
+ flds 96(%ecx)
+ fadds 100(%ecx)
+ fstps 96(%edx)
+
+ flds 96(%ecx)
+ fsubs 100(%ecx)
+ fmuls 120(%ebx)
+ fstps 100(%edx)
+
+ flds 108(%ecx)
+ fsubs 104(%ecx)
+ fmuls 120(%ebx)
+ fsts 108(%edx)
+ fadds 104(%ecx)
+ fadds 108(%ecx)
+ fstps 104(%edx)
+
+ flds 124(%ecx)
+ fsubs 120(%ecx)
+ fmuls 120(%ebx)
+ fsts 124(%edx)
+ fadds 120(%ecx)
+ fadds 124(%ecx)
+ fld %st(0)
+ fadds 112(%ecx)
+ fadds 116(%ecx)
+ fstps 112(%edx)
+
+ flds 112(%ecx)
+ fsubs 116(%ecx)
+ fmuls 120(%ebx)
+ fadd %st(0),%st(1)
+ fadds 124(%edx)
+ fstps 116(%edx)
+ fstps 120(%edx)
+ jnz .L01
+
+/* Phase 6*/
+
+ flds (%ecx)
+ fadds 4(%ecx)
+ fstps 1024(%esi)
+
+ flds (%ecx)
+ fsubs 4(%ecx)
+ fmuls 120(%ebx)
+ fsts (%esi)
+ fstps (%edi)
+
+ flds 12(%ecx)
+ fsubs 8(%ecx)
+ fmuls 120(%ebx)
+ fsts 512(%edi)
+ fadds 12(%ecx)
+ fadds 8(%ecx)
+ fstps 512(%esi)
+
+ flds 16(%ecx)
+ fsubs 20(%ecx)
+ fmuls 120(%ebx)
+
+ flds 28(%ecx)
+ fsubs 24(%ecx)
+ fmuls 120(%ebx)
+ fsts 768(%edi)
+ fld %st(0)
+ fadds 24(%ecx)
+ fadds 28(%ecx)
+ fld %st(0)
+ fadds 16(%ecx)
+ fadds 20(%ecx)
+ fstps 768(%esi)
+ fadd %st(2)
+ fstps 256(%esi)
+ faddp %st(1)
+ fstps 256(%edi)
+
+/* Phase 7*/
+
+ flds 32(%edx)
+ fadds 48(%edx)
+ fstps 896(%esi)
+
+ flds 48(%edx)
+ fadds 40(%edx)
+ fstps 640(%esi)
+
+ flds 40(%edx)
+ fadds 56(%edx)
+ fstps 384(%esi)
+
+ flds 56(%edx)
+ fadds 36(%edx)
+ fstps 128(%esi)
+
+ flds 36(%edx)
+ fadds 52(%edx)
+ fstps 128(%edi)
+
+ flds 52(%edx)
+ fadds 44(%edx)
+ fstps 384(%edi)
+
+ flds 60(%edx)
+ fsts 896(%edi)
+ fadds 44(%edx)
+ fstps 640(%edi)
+
+ flds 96(%edx)
+ fadds 112(%edx)
+ fld %st(0)
+ fadds 64(%edx)
+ fstps 960(%esi)
+ fadds 80(%edx)
+ fstps 832(%esi)
+
+ flds 112(%edx)
+ fadds 104(%edx)
+ fld %st(0)
+ fadds 80(%edx)
+ fstps 704(%esi)
+ fadds 72(%edx)
+ fstps 576(%esi)
+
+ flds 104(%edx)
+ fadds 120(%edx)
+ fld %st(0)
+ fadds 72(%edx)
+ fstps 448(%esi)
+ fadds 88(%edx)
+ fstps 320(%esi)
+
+ flds 120(%edx)
+ fadds 100(%edx)
+ fld %st(0)
+ fadds 88(%edx)
+ fstps 192(%esi)
+ fadds 68(%edx)
+ fstps 64(%esi)
+
+ flds 100(%edx)
+ fadds 116(%edx)
+ fld %st(0)
+ fadds 68(%edx)
+ fstps 64(%edi)
+ fadds 84(%edx)
+ fstps 192(%edi)
+
+ flds 116(%edx)
+ fadds 108(%edx)
+ fld %st(0)
+ fadds 84(%edx)
+ fstps 320(%edi)
+ fadds 76(%edx)
+ fstps 448(%edi)
+
+ flds 108(%edx)
+ fadds 124(%edx)
+ fld %st(0)
+ fadds 76(%edx)
+ fstps 576(%edi)
+ fadds 92(%edx)
+ fstps 704(%edi)
+
+ flds 124(%edx)
+ fsts 960(%edi)
+ fadds 92(%edx)
+ fstps 832(%edi)
+ addl $256,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ ret
+.L01:
+/* Phase 8*/
+
+ flds (%ecx)
+ fadds 4(%ecx)
+ fistp 512(%esi)
+
+ flds (%ecx)
+ fsubs 4(%ecx)
+ fmuls 120(%ebx)
+
+ fistp (%esi)
+
+
+ flds 12(%ecx)
+ fsubs 8(%ecx)
+ fmuls 120(%ebx)
+ fist 256(%edi)
+ fadds 12(%ecx)
+ fadds 8(%ecx)
+ fistp 256(%esi)
+
+ flds 16(%ecx)
+ fsubs 20(%ecx)
+ fmuls 120(%ebx)
+
+ flds 28(%ecx)
+ fsubs 24(%ecx)
+ fmuls 120(%ebx)
+ fist 384(%edi)
+ fld %st(0)
+ fadds 24(%ecx)
+ fadds 28(%ecx)
+ fld %st(0)
+ fadds 16(%ecx)
+ fadds 20(%ecx)
+ fistp 384(%esi)
+ fadd %st(2)
+ fistp 128(%esi)
+ faddp %st(1)
+ fistp 128(%edi)
+
+/* Phase 9*/
+
+ flds 32(%edx)
+ fadds 48(%edx)
+ fistp 448(%esi)
+
+ flds 48(%edx)
+ fadds 40(%edx)
+ fistp 320(%esi)
+
+ flds 40(%edx)
+ fadds 56(%edx)
+ fistp 192(%esi)
+
+ flds 56(%edx)
+ fadds 36(%edx)
+ fistp 64(%esi)
+
+ flds 36(%edx)
+ fadds 52(%edx)
+ fistp 64(%edi)
+
+ flds 52(%edx)
+ fadds 44(%edx)
+ fistp 192(%edi)
+
+ flds 60(%edx)
+ fist 448(%edi)
+ fadds 44(%edx)
+ fistp 320(%edi)
+
+ flds 96(%edx)
+ fadds 112(%edx)
+ fld %st(0)
+ fadds 64(%edx)
+ fistp 480(%esi)
+ fadds 80(%edx)
+ fistp 416(%esi)
+
+ flds 112(%edx)
+ fadds 104(%edx)
+ fld %st(0)
+ fadds 80(%edx)
+ fistp 352(%esi)
+ fadds 72(%edx)
+ fistp 288(%esi)
+
+ flds 104(%edx)
+ fadds 120(%edx)
+ fld %st(0)
+ fadds 72(%edx)
+ fistp 224(%esi)
+ fadds 88(%edx)
+ fistp 160(%esi)
+
+ flds 120(%edx)
+ fadds 100(%edx)
+ fld %st(0)
+ fadds 88(%edx)
+ fistp 96(%esi)
+ fadds 68(%edx)
+ fistp 32(%esi)
+
+ flds 100(%edx)
+ fadds 116(%edx)
+ fld %st(0)
+ fadds 68(%edx)
+ fistp 32(%edi)
+ fadds 84(%edx)
+ fistp 96(%edi)
+
+ flds 116(%edx)
+ fadds 108(%edx)
+ fld %st(0)
+ fadds 84(%edx)
+ fistp 160(%edi)
+ fadds 76(%edx)
+ fistp 224(%edi)
+
+ flds 108(%edx)
+ fadds 124(%edx)
+ fld %st(0)
+ fadds 76(%edx)
+ fistp 288(%edi)
+ fadds 92(%edx)
+ fistp 352(%edi)
+
+ flds 124(%edx)
+ fist 480(%edi)
+ fadds 92(%edx)
+ fistp 416(%edi)
+ movsw
+ addl $256,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ ret
+
+
diff --git a/mp3lib/dct64_k7.s b/mp3lib/dct64_k7.s
index 6a82d618c4..e2dcf07195 100644
--- a/mp3lib/dct64_k7.s
+++ b/mp3lib/dct64_k7.s
@@ -1,677 +1,804 @@
-///
-/// Replacement of dct64() with AMD's 3DNowEx(DSP)! SIMD operations support
-///
-/// This code based 'dct64_3dnow.s' by Syuuhei Kashiyama
-/// <squash@mb.kcom.ne.jp>,only some types of changes have been made:
-///
-/// - added new opcodes PSWAPD, PFPNACC
-/// - decreased number of opcodes (as it was suggested by k7 manual)
-/// (using memory reference as operand of instructions)
-/// - Phase 6 is rewritten with mixing of cpu and mmx opcodes
-/// - change function name for support 3DNowEx! automatic detect
-/// - negation of 3dnow reg was replaced with PXOR 0x800000000, MMi instead
-/// of PFMUL as it was suggested by athlon manual. (Two not separated PFMUL
-/// can not be paired, but PXOR can be).
-///
-/// note: because K7 processors are an aggresive out-of-order three-way
-/// superscalar ones instruction order is not significand for them.
-///
-/// Modified by Nick Kurshev <nickols_k@mail.ru>
-///
-/// The author of this program disclaim whole expressed or implied
-/// warranties with regard to this program, and in no event shall the
-/// author of this program liable to whatever resulted from the use of
-/// this program. Use it at your own risk.
-///
+# This code was taken from http://www.mpg123.org
+# See ChangeLog of mpg123-0.59s-pre.1 for details
+# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+# Partial 3dnowex-DSP! optimization by Nick Kurshev
+#
+# TODO: finish 3dnow! optimization at least in scalar mode
+#
.data
- .align 8
+ .align 8
plus_minus_3dnow: .long 0x00000000, 0x80000000
+costab:
+ .long 1056974725
+ .long 1057056395
+ .long 1057223771
+ .long 1057485416
+ .long 1057855544
+ .long 1058356026
+ .long 1059019886
+ .long 1059897405
+ .long 1061067246
+ .long 1062657950
+ .long 1064892987
+ .long 1066774581
+ .long 1069414683
+ .long 1073984175
+ .long 1079645762
+ .long 1092815430
+ .long 1057005197
+ .long 1057342072
+ .long 1058087743
+ .long 1059427869
+ .long 1061799040
+ .long 1065862217
+ .long 1071413542
+ .long 1084439708
+ .long 1057128951
+ .long 1058664893
+ .long 1063675095
+ .long 1076102863
+ .long 1057655764
+ .long 1067924853
+ .long 1060439283
.text
- .globl dct64_3dnowex
- .type dct64_3dnowex,@function
-
-/* Discrete Cosine Tansform (DCT) for subband synthesis */
-/* void dct64(real *a,real *b,real *c) */
-dct64_3dnowex:
- subl $256,%esp
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %ebx
- leal 16(%esp),%ebx /* ebx -> real tmp1[32] */
- movl 284(%esp),%edi /* edi -> c */
- movl 276(%esp),%ebp /* ebp -> a */
- movl 280(%esp),%edx /* edx -> b */
- leal 128(%ebx),%esi /* esi -> real tmp2[32] */
-
- / femms
-
- // 1
- movl pnts,%eax
-
- movq 0(%edi),%mm0 /* mm0 = c[0x00] | c[0x01]*/
- movq %mm0,%mm1 /* mm1 = mm0 */
- movd 124(%edi),%mm2 /* mm2 = c[0x1f] */
- punpckldq 120(%edi),%mm2 /* mm2 = c[0x1f] | c[0x1E] */
- pfadd %mm2,%mm0 /* mm0 = c[0x00]+c[0x1F] | c[0x1E]+c[0x01] */
- movq %mm0,0(%ebx) /* tmp[0, 1] = mm0 */
- pfsub %mm2,%mm1 /* c[0x00]-c[0x1f] | c[0x01]-c[0x1e] */
- pfmul 0(%eax),%mm1 /* (c[0x00]-c[0x1f])*pnts[0]|(c[0x01]-c[0x1e])*pnts[1]*/
- pswapd %mm1, %mm1 /* (c[0x01]-c[0x1e])*pnts[1]|(c[0x00]-c[0x1f])*pnts[0]*/
- movq %mm1, 120(%ebx) /* tmp1[30, 31]=mm1 */
-
- movq 8(%edi),%mm4
- movq %mm4,%mm5
- movd 116(%edi),%mm6
- punpckldq 112(%edi),%mm6
- pfadd %mm6,%mm4
- movq %mm4,8(%ebx)
- pfsub %mm6,%mm5
- pfmul 8(%eax),%mm5
- pswapd %mm5, %mm5
- movq %mm5, 112(%ebx)
-
- movq 16(%edi),%mm0
- movq %mm0,%mm1
- movd 108(%edi),%mm2
- punpckldq 104(%edi),%mm2
- pfadd %mm2,%mm0
- movq %mm0,16(%ebx)
- pfsub %mm2,%mm1
- pfmul 16(%eax),%mm1
- pswapd %mm1, %mm1
- movq %mm1, 104(%ebx)
-
- movq 24(%edi),%mm4
- movq %mm4,%mm5
- movd 100(%edi),%mm6
- punpckldq 96(%edi),%mm6
- pfadd %mm6,%mm4
- movq %mm4,24(%ebx)
- pfsub %mm6,%mm5
- pfmul 24(%eax),%mm5
- pswapd %mm5, %mm5
- movq %mm5, 96(%ebx)
-
- movq 32(%edi),%mm0
- movq %mm0,%mm1
- movd 92(%edi),%mm2
- punpckldq 88(%edi),%mm2
- pfadd %mm2,%mm0
- movq %mm0,32(%ebx)
- pfsub %mm2,%mm1
- pfmul 32(%eax),%mm1
- pswapd %mm1, %mm1
- movq %mm1, 88(%ebx)
-
- movq 40(%edi),%mm4
- movq %mm4,%mm5
- movd 84(%edi),%mm6
- punpckldq 80(%edi),%mm6
- pfadd %mm6,%mm4
- movq %mm4,40(%ebx)
- pfsub %mm6,%mm5
- pfmul 40(%eax),%mm5
- pswapd %mm5, %mm5
- movq %mm5, 80(%ebx)
-
- movq 48(%edi),%mm0
- movq %mm0,%mm1
- movd 76(%edi),%mm2
- punpckldq 72(%edi),%mm2
- pfadd %mm2,%mm0
- movq %mm0,48(%ebx)
- pfsub %mm2,%mm1
- pfmul 48(%eax),%mm1
- pswapd %mm1, %mm1
- movq %mm1, 72(%ebx)
-
- movq 56(%edi),%mm4
- movq %mm4,%mm5
- movd 68(%edi),%mm6
- punpckldq 64(%edi),%mm6
- pfadd %mm6,%mm4
- movq %mm4,56(%ebx)
- pfsub %mm6,%mm5
- pfmul 56(%eax),%mm5
- pswapd %mm5, %mm5
- movq %mm5, 64(%ebx)
-
- // 2
- movl pnts+4,%eax
- / 0, 14
- movq 0(%ebx),%mm0 /* mm0 = tmp1[0] | tmp1[1] */
- movq %mm0,%mm1
- movd 60(%ebx),%mm2 /* mm2 = tmp1[0x0F] */
- punpckldq 56(%ebx),%mm2 /* mm2 = tmp1[0x0E] | tmp1[0x0F] */
- movq 0(%eax),%mm3 /* mm3 = pnts[0] | pnts[1] */
- pfadd %mm2,%mm0 /* mm0 = tmp1[0]+tmp1[0x0F]|tmp1[1]+tmp1[0x0E]*/
- movq %mm0,0(%esi) /* tmp2[0, 1] = mm0 */
- pfsub %mm2,%mm1 /* mm1 = tmp1[0]-tmp1[0x0F]|tmp1[1]-tmp1[0x0E]*/
- pfmul %mm3,%mm1 /* mm1 = (tmp1[0]-tmp1[0x0F])*pnts[0]|(tmp1[1]-tmp1[0x0E])*pnts[1]*/
- pswapd %mm1, %mm1 /* mm1 = (tmp1[1]-tmp1[0x0E])*pnts[1]|(tmp1[0]-tmp1[0x0F])*pnts[0]*/
- movq %mm1, 56(%esi) /* tmp2[0x0E, 0x0F] = mm1 */
- / 16, 30
- movq 64(%ebx),%mm0
- movq %mm0,%mm1
- movd 124(%ebx),%mm2
- punpckldq 120(%ebx),%mm2
- pfadd %mm2,%mm0
- movq %mm0,64(%esi)
- pfsubr %mm2,%mm1
- pfmul %mm3,%mm1
- pswapd %mm1, %mm1
- movq %mm1, 120(%esi)
- movq 8(%ebx),%mm4
- / 2, 12
- movq %mm4,%mm5
- movd 52(%ebx),%mm6
- punpckldq 48(%ebx),%mm6
- movq 8(%eax),%mm7
- pfadd %mm6,%mm4
- movq %mm4,8(%esi)
- pfsub %mm6,%mm5
- pfmul %mm7,%mm5
- pswapd %mm5, %mm5
- movq %mm5, 48(%esi)
- movq 72(%ebx),%mm4
- / 18, 28
- movq %mm4,%mm5
- movd 116(%ebx),%mm6
- punpckldq 112(%ebx),%mm6
- pfadd %mm6,%mm4
- movq %mm4,72(%esi)
- pfsubr %mm6,%mm5
- pfmul %mm7,%mm5
- pswapd %mm5, %mm5
- movq %mm5, 112(%esi)
- movq 16(%ebx),%mm0
- / 4, 10
- movq %mm0,%mm1
- movd 44(%ebx),%mm2
- punpckldq 40(%ebx),%mm2
- movq 16(%eax),%mm3
- pfadd %mm2,%mm0
- movq %mm0,16(%esi)
- pfsub %mm2,%mm1
- pfmul %mm3,%mm1
- pswapd %mm1, %mm1
- movq %mm1, 40(%esi)
- movq 80(%ebx),%mm0
- / 20, 26
- movq %mm0,%mm1
- movd 108(%ebx),%mm2
- punpckldq 104(%ebx),%mm2
- pfadd %mm2,%mm0
- movq %mm0,80(%esi)
- pfsubr %mm2,%mm1
- pfmul %mm3,%mm1
- pswapd %mm1, %mm1
- movq %mm1, 104(%esi)
- movq 24(%ebx),%mm4
- / 6, 8
- movq %mm4,%mm5
- movd 36(%ebx),%mm6
- punpckldq 32(%ebx),%mm6
- movq 24(%eax),%mm7
- pfadd %mm6,%mm4
- movq %mm4,24(%esi)
- pfsub %mm6,%mm5
- pfmul %mm7,%mm5
- pswapd %mm5, %mm5
- movq %mm5, 32(%esi)
- movq 88(%ebx),%mm4
- / 22, 24
- movq %mm4,%mm5
- movd 100(%ebx),%mm6
- punpckldq 96(%ebx),%mm6
- pfadd %mm6,%mm4
- movq %mm4,88(%esi)
- pfsubr %mm6,%mm5
- pfmul %mm7,%mm5
- pswapd %mm5, %mm5
- movq %mm5, 96(%esi)
-
- // 3
- movl pnts+8,%eax
- movq 0(%eax),%mm0
- movq 8(%eax),%mm1
- movq 0(%esi),%mm2
- / 0, 6
- movq %mm2,%mm3
- movd 28(%esi),%mm4
- punpckldq 24(%esi),%mm4
- pfadd %mm4,%mm2
- pfsub %mm4,%mm3
- pfmul %mm0,%mm3
- movq %mm2,0(%ebx)
- pswapd %mm3, %mm3
- movq %mm3, 24(%ebx)
- movq 8(%esi),%mm5
- / 2, 4
- movq %mm5,%mm6
- movd 20(%esi),%mm7
- punpckldq 16(%esi),%mm7
- pfadd %mm7,%mm5
- pfsub %mm7,%mm6
- pfmul %mm1,%mm6
- movq %mm5,8(%ebx)
- pswapd %mm6, %mm6
- movq %mm6, 16(%ebx)
- movq 32(%esi),%mm2
- / 8, 14
- movq %mm2,%mm3
- movd 60(%esi),%mm4
- punpckldq 56(%esi),%mm4
- pfadd %mm4,%mm2
- pfsubr %mm4,%mm3
- pfmul %mm0,%mm3
- movq %mm2,32(%ebx)
- pswapd %mm3, %mm3
- movq %mm3, 56(%ebx)
- movq 40(%esi),%mm5
- / 10, 12
- movq %mm5,%mm6
- movd 52(%esi),%mm7
- punpckldq 48(%esi),%mm7
- pfadd %mm7,%mm5
- pfsubr %mm7,%mm6
- pfmul %mm1,%mm6
- movq %mm5,40(%ebx)
- pswapd %mm6, %mm6
- movq %mm6, 48(%ebx)
- movq 64(%esi),%mm2
- / 16, 22
- movq %mm2,%mm3
- movd 92(%esi),%mm4
- punpckldq 88(%esi),%mm4
- pfadd %mm4,%mm2
- pfsub %mm4,%mm3
- pfmul %mm0,%mm3
- movq %mm2,64(%ebx)
- pswapd %mm3, %mm3
- movq %mm3, 88(%ebx)
- movq 72(%esi),%mm5
- / 18, 20
- movq %mm5,%mm6
- movd 84(%esi),%mm7
- punpckldq 80(%esi),%mm7
- pfadd %mm7,%mm5
- pfsub %mm7,%mm6
- pfmul %mm1,%mm6
- movq %mm5,72(%ebx)
- pswapd %mm6, %mm6
- movq %mm6, 80(%ebx)
- movq 96(%esi),%mm2
- / 24, 30
- movq %mm2,%mm3
- movd 124(%esi),%mm4
- punpckldq 120(%esi),%mm4
- pfadd %mm4,%mm2
- pfsubr %mm4,%mm3
- pfmul %mm0,%mm3
- movq %mm2,96(%ebx)
- pswapd %mm3, %mm3
- movq %mm3, 120(%ebx)
- movq 104(%esi),%mm5
- / 26, 28
- movq %mm5,%mm6
- movd 116(%esi),%mm7
- punpckldq 112(%esi),%mm7
- pfadd %mm7,%mm5
- pfsubr %mm7,%mm6
- pfmul %mm1,%mm6
- movq %mm5,104(%ebx)
- pswapd %mm6, %mm6
- movq %mm6, 112(%ebx)
-
- // 4
- movl pnts+12,%eax
- movq 0(%eax),%mm0 /* mm0 = pnts[3] | pnts[4] */
- movq 0(%ebx),%mm1 /* mm1 = tmp1[0] | tmp1[1] */
- / 0
- movq %mm1,%mm2
- movd 12(%ebx),%mm3 /* mm3 = tmp1[3] */
- punpckldq 8(%ebx),%mm3 /* mm3 = tmp1[3] | tmp1[2] */
- pfadd %mm3,%mm1 /* mm1 = tmp1[0]+tmp1[3] | tmp1[1]+tmp1[2]*/
- pfsub %mm3,%mm2 /* mm2 = tmp1[0]-tmp1[3] | tmp1[0]-tmp1[2]*/
- pfmul %mm0,%mm2 /* mm2 = tmp1[0]-tmp1[3]*pnts[3]|tmp1[0]-tmp1[2]*pnts[4]*/
- movq %mm1,0(%esi) /* tmp2[0, 1] = mm1 */
-	pswapd %mm2, %mm2	/* mm2 = (tmp1[1]-tmp1[2])*pnts[4] | (tmp1[0]-tmp1[3])*pnts[3] */
- movq %mm2, 8(%esi) /* tmp2[2, 3] = mm2 */
- movq 16(%ebx),%mm4
- / 4
- movq %mm4,%mm5
- movd 28(%ebx),%mm6
- punpckldq 24(%ebx),%mm6
- pfadd %mm6,%mm4
- pfsubr %mm6,%mm5
- pfmul %mm0,%mm5
- movq %mm4,16(%esi)
- pswapd %mm5, %mm5
- movq %mm5, 24(%esi)
- movq 32(%ebx),%mm1
- / 8
- movq %mm1,%mm2
- movd 44(%ebx),%mm3
- punpckldq 40(%ebx),%mm3
- pfadd %mm3,%mm1
- pfsub %mm3,%mm2
- pfmul %mm0,%mm2
- movq %mm1,32(%esi)
- pswapd %mm2, %mm2
- movq %mm2, 40(%esi)
- movq 48(%ebx),%mm4
- / 12
- movq %mm4,%mm5
- movd 60(%ebx),%mm6
- punpckldq 56(%ebx),%mm6
- pfadd %mm6,%mm4
- pfsubr %mm6,%mm5
- pfmul %mm0,%mm5
- movq %mm4,48(%esi)
- pswapd %mm5, %mm5
- movq %mm5, 56(%esi)
- movq 64(%ebx),%mm1
- / 16
- movq %mm1,%mm2
- movd 76(%ebx),%mm3
- punpckldq 72(%ebx),%mm3
- pfadd %mm3,%mm1
- pfsub %mm3,%mm2
- pfmul %mm0,%mm2
- movq %mm1,64(%esi)
- pswapd %mm2, %mm2
- movq %mm2, 72(%esi)
- movq 80(%ebx),%mm4
- / 20
- movq %mm4,%mm5
- movd 92(%ebx),%mm6
- punpckldq 88(%ebx),%mm6
- pfadd %mm6,%mm4
- pfsubr %mm6,%mm5
- pfmul %mm0,%mm5
- movq %mm4,80(%esi)
- pswapd %mm5, %mm5
- movq %mm5, 88(%esi)
- movq 96(%ebx),%mm1
- / 24
- movq %mm1,%mm2
- movd 108(%ebx),%mm3
- punpckldq 104(%ebx),%mm3
- pfadd %mm3,%mm1
- pfsub %mm3,%mm2
- pfmul %mm0,%mm2
- movq %mm1,96(%esi)
- pswapd %mm2, %mm2
- movq %mm2, 104(%esi)
- movq 112(%ebx),%mm4
- / 28
- movq %mm4,%mm5
- movd 124(%ebx),%mm6
- punpckldq 120(%ebx),%mm6
- pfadd %mm6,%mm4
- pfsubr %mm6,%mm5
- pfmul %mm0,%mm5
- movq %mm4,112(%esi)
- pswapd %mm5, %mm5
- movq %mm5, 120(%esi)
-
- // 5
- movq plus_minus_3dnow, %mm0 /* mm0 = 1.0 | -1.0 */
- movl $1,%eax
- movd %eax,%mm1
- pi2fd %mm1,%mm1
- movl pnts+16,%eax
- movd 0(%eax),%mm2
- punpckldq %mm2,%mm1 /* mm1 = 1.0 | cos0 */
- movq 0(%esi),%mm2 /* mm2 = tmp2[0] | tmp2[1] */
- / 0
- pfpnacc %mm2, %mm2
- pswapd %mm2, %mm2 /* mm2 = tmp2[0]+tmp2[1]|tmp2[0]-tmp2[1]*/
- pfmul %mm1,%mm2 /* mm2 = tmp2[0]+tmp2[1]|(tmp2[0]-tmp2[1])*cos0*/
- movq %mm2,0(%ebx) /* tmp1[0, 1] = mm2 */
- movq 8(%esi),%mm4 /* mm4 = tmp2[2] | tmp2[3]*/
- pfpnacc %mm4, %mm4
- pswapd %mm4, %mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[2]-tmp2[3]*/
- pxor %mm0,%mm4 /* mm4 = tmp2[2]+tmp2[3]|tmp2[3]-tmp2[2]*/
- pfmul %mm1,%mm4 /* mm4 = tmp2[2]+tmp2[3]|(tmp2[3]-tmp2[2])*cos0*/
- movq %mm4,%mm5
- psrlq $32,%mm5 /* mm5 = (tmp2[3]-tmp2[2])*cos0 */
- pfacc %mm5,%mm4 /* mm4 = tmp2[2]+tmp2[3]+(tmp2[3]-tmp2[2])*cos0|(tmp2[3]-tmp2[2])*cos0*/
- movq %mm4,8(%ebx) /* tmp1[2, 3] = mm4 */
- movq 16(%esi),%mm2
- / 4
- pfpnacc %mm2, %mm2
- pswapd %mm2, %mm2
-
- pfmul %mm1,%mm2
- movq 24(%esi),%mm4
- pfpnacc %mm4, %mm4
- pswapd %mm4, %mm4
-
- pxor %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm2,%mm3
- psrlq $32,%mm3
- pfadd %mm4,%mm2
- pfadd %mm3,%mm4
- movq %mm2,16(%ebx)
- movq %mm4,24(%ebx)
- movq 32(%esi),%mm2
- / 8
- pfpnacc %mm2, %mm2
- pswapd %mm2, %mm2
-
- pfmul %mm1,%mm2
- movq %mm2,32(%ebx)
- movq 40(%esi),%mm4
- pfpnacc %mm4, %mm4
- pswapd %mm4, %mm4
- pxor %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm4,40(%ebx)
- movq 48(%esi),%mm2
- / 12
- pfpnacc %mm2, %mm2
- pswapd %mm2, %mm2
- pfmul %mm1,%mm2
- movq 56(%esi),%mm4
- pfpnacc %mm4, %mm4
- pswapd %mm4, %mm4
- pxor %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm2,%mm3
- psrlq $32,%mm3
- pfadd %mm4,%mm2
- pfadd %mm3,%mm4
- movq %mm2,48(%ebx)
- movq %mm4,56(%ebx)
- movq 64(%esi),%mm2
- / 16
- pfpnacc %mm2, %mm2
- pswapd %mm2, %mm2
- pfmul %mm1,%mm2
- movq %mm2,64(%ebx)
- movq 72(%esi),%mm4
- pfpnacc %mm4, %mm4
- pswapd %mm4, %mm4
- pxor %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm4,72(%ebx)
- movq 80(%esi),%mm2
- / 20
- pfpnacc %mm2, %mm2
- pswapd %mm2, %mm2
- pfmul %mm1,%mm2
- movq 88(%esi),%mm4
- pfpnacc %mm4, %mm4
- pswapd %mm4, %mm4
- pxor %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm2,%mm3
- psrlq $32,%mm3
- pfadd %mm4,%mm2
- pfadd %mm3,%mm4
- movq %mm2,80(%ebx)
- movq %mm4,88(%ebx)
- movq 96(%esi),%mm2
- / 24
- pfpnacc %mm2, %mm2
- pswapd %mm2, %mm2
- pfmul %mm1,%mm2
- movq %mm2,96(%ebx)
- movq 104(%esi),%mm4
- pfpnacc %mm4, %mm4
- pswapd %mm4, %mm4
- pxor %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm4,104(%ebx)
- movq 112(%esi),%mm2
- / 28
- pfpnacc %mm2, %mm2
- pswapd %mm2, %mm2
- pfmul %mm1,%mm2
- movq 120(%esi),%mm4
- pfpnacc %mm4, %mm4
- pswapd %mm4, %mm4
- pxor %mm0,%mm4
- pfmul %mm1,%mm4
- movq %mm4,%mm5
- psrlq $32,%mm5
- pfacc %mm5,%mm4
- movq %mm2,%mm3
- psrlq $32,%mm3
- pfadd %mm4,%mm2
- pfadd %mm3,%mm4
- movq %mm2,112(%ebx)
- movq %mm4,120(%ebx)
-
- // Phase6
- movd 0(%ebx),%mm0
- movd %mm0,1024(%ebp)
- movl 4(%ebx),%eax
- movl %eax,0(%ebp)
- movl %eax,0(%edx)
- movd 8(%ebx),%mm2
- movd %mm2,512(%ebp)
- movd 12(%ebx),%mm3
- movd %mm3,512(%edx)
-
- movl 16(%ebx),%eax
- movl %eax,768(%ebp)
- movd 20(%ebx),%mm5
- movd %mm5,256(%edx)
-
- movd 24(%ebx),%mm6
- movd %mm6,256(%ebp)
- movd 28(%ebx),%mm7
- movd %mm7,768(%edx)
-
- movq 32(%ebx),%mm0 /* mm0 = tmp1[8] | tmp1[9] */
- movq 48(%ebx),%mm1 /* mm1 = tmp1[12] | tmp1[13] */
- pfadd %mm1,%mm0 /* mm0 = tmp1[8]+tmp1[12]| tmp1[9]+tmp1[13]*/
- movd %mm0,896(%ebp) /* a[0xE0] = tmp1[8]+tmp1[12] */
- psrlq $32,%mm0
- movd %mm0,128(%edx) /* a[0x20] = tmp1[9]+tmp1[13] */
- movq 40(%ebx),%mm2
- pfadd %mm2,%mm1
- movd %mm1,640(%ebp)
- psrlq $32,%mm1
- movd %mm1,384(%edx)
-
- movq 56(%ebx),%mm3
- pfadd %mm3,%mm2
- movd %mm2,384(%ebp)
- psrlq $32,%mm2
- movd %mm2,640(%edx)
-
- movd 36(%ebx),%mm4
- pfadd %mm4,%mm3
- movd %mm3,128(%ebp)
- psrlq $32,%mm3
- movd %mm3,896(%edx)
- movq 96(%ebx),%mm0
- movq 64(%ebx),%mm1
-
- movq 112(%ebx),%mm2
- pfadd %mm2,%mm0
- movq %mm0,%mm3
- pfadd %mm1,%mm3
- movd %mm3,960(%ebp)
- psrlq $32,%mm3
- movd %mm3,64(%edx)
- movq 80(%ebx),%mm1
- pfadd %mm1,%mm0
- movd %mm0,832(%ebp)
- psrlq $32,%mm0
- movd %mm0,192(%edx)
- movq 104(%ebx),%mm3
- pfadd %mm3,%mm2
- movq %mm2,%mm4
- pfadd %mm1,%mm4
- movd %mm4,704(%ebp)
- psrlq $32,%mm4
- movd %mm4,320(%edx)
- movq 72(%ebx),%mm1
- pfadd %mm1,%mm2
- movd %mm2,576(%ebp)
- psrlq $32,%mm2
- movd %mm2,448(%edx)
-
- movq 120(%ebx),%mm4
- pfadd %mm4,%mm3
- movq %mm3,%mm5
- pfadd %mm1,%mm5
- movd %mm5,448(%ebp)
- psrlq $32,%mm5
- movd %mm5,576(%edx)
- movq 88(%ebx),%mm1
- pfadd %mm1,%mm3
- movd %mm3,320(%ebp)
- psrlq $32,%mm3
- movd %mm3,704(%edx)
-
- movd 100(%ebx),%mm5
- pfadd %mm5,%mm4
- movq %mm4,%mm6
- pfadd %mm1,%mm6
- movd %mm6,192(%ebp)
- psrlq $32,%mm6
- movd %mm6,832(%edx)
- movd 68(%ebx),%mm1
- pfadd %mm1,%mm4
- movd %mm4,64(%ebp)
- psrlq $32,%mm4
- movd %mm4,960(%edx)
-
- / femms
-
- popl %ebx
- popl %esi
- popl %edi
- popl %ebp
- addl $256,%esp
-
- ret $12
+
+ .align 16
+
+.globl dct64_MMX_3dnowex
+dct64_MMX_3dnowex:
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ subl $256,%esp
+ movl 280(%esp),%eax
+
+ leal 128(%esp),%edx
+ movl 272(%esp),%esi
+ movl 276(%esp),%edi
+ movl $costab,%ebx
+ orl %ecx,%ecx
+ movl %esp,%ecx
+ femms
+/* Phase 1*/
+ movq (%eax), %mm0
+ movq 8(%eax), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 120(%eax), %mm1
+ movq 112(%eax), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, (%edx)
+ movq %mm4, 8(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul (%ebx), %mm3
+ pfmul 8(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 120(%edx)
+ movq %mm7, 112(%edx)
+
+ movq 16(%eax), %mm0
+ movq 24(%eax), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 104(%eax), %mm1
+ movq 96(%eax), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 16(%edx)
+ movq %mm4, 24(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 16(%ebx), %mm3
+ pfmul 24(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 104(%edx)
+ movq %mm7, 96(%edx)
+
+ movq 32(%eax), %mm0
+ movq 40(%eax), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 88(%eax), %mm1
+ movq 80(%eax), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 32(%edx)
+ movq %mm4, 40(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 32(%ebx), %mm3
+ pfmul 40(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 88(%edx)
+ movq %mm7, 80(%edx)
+
+ movq 48(%eax), %mm0
+ movq 56(%eax), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 72(%eax), %mm1
+ movq 64(%eax), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 48(%edx)
+ movq %mm4, 56(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 48(%ebx), %mm3
+ pfmul 56(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 72(%edx)
+ movq %mm7, 64(%edx)
+
+/* Phase 2*/
+
+ movq (%edx), %mm0
+ movq 8(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 56(%edx), %mm1
+ movq 48(%edx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, (%ecx)
+ movq %mm4, 8(%ecx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 64(%ebx), %mm3
+ pfmul 72(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 56(%ecx)
+ movq %mm7, 48(%ecx)
+
+ movq 16(%edx), %mm0
+ movq 24(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 40(%edx), %mm1
+ movq 32(%edx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 16(%ecx)
+ movq %mm4, 24(%ecx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 80(%ebx), %mm3
+ pfmul 88(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 40(%ecx)
+ movq %mm7, 32(%ecx)
+
+/* Phase 3*/
+
+ movq 64(%edx), %mm0
+ movq 72(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 120(%edx), %mm1
+ movq 112(%edx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 64(%ecx)
+ movq %mm4, 72(%ecx)
+ pfsubr %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 64(%ebx), %mm3
+ pfmul 72(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 120(%ecx)
+ movq %mm7, 112(%ecx)
+
+ movq 80(%edx), %mm0
+ movq 88(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 104(%edx), %mm1
+ movq 96(%edx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 80(%ecx)
+ movq %mm4, 88(%ecx)
+ pfsubr %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 80(%ebx), %mm3
+ pfmul 88(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 104(%ecx)
+ movq %mm7, 96(%ecx)
+
+/* Phase 4*/
+
+ movq (%ecx), %mm0
+ movq 8(%ecx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 24(%ecx), %mm1
+ movq 16(%ecx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, (%edx)
+ movq %mm4, 8(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 96(%ebx), %mm3
+ pfmul 104(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 24(%edx)
+ movq %mm7, 16(%edx)
+
+ movq 32(%ecx), %mm0
+ movq 40(%ecx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 56(%ecx), %mm1
+ movq 48(%ecx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 32(%edx)
+ movq %mm4, 40(%edx)
+ pfsubr %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 96(%ebx), %mm3
+ pfmul 104(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 56(%edx)
+ movq %mm7, 48(%edx)
+
+ movq 64(%ecx), %mm0
+ movq 72(%ecx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 88(%ecx), %mm1
+ movq 80(%ecx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 64(%edx)
+ movq %mm4, 72(%edx)
+ pfsub %mm1, %mm3
+ pfsub %mm5, %mm7
+ pfmul 96(%ebx), %mm3
+ pfmul 104(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 88(%edx)
+ movq %mm7, 80(%edx)
+
+ movq 96(%ecx), %mm0
+ movq 104(%ecx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 120(%ecx), %mm1
+ movq 112(%ecx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 96(%edx)
+ movq %mm4, 104(%edx)
+ pfsubr %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 96(%ebx), %mm3
+ pfmul 104(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 120(%edx)
+ movq %mm7, 112(%edx)
+
+/* Phase 5 */
+
+ movq (%edx), %mm0
+ movq 16(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 8(%edx), %mm1
+ movq 24(%edx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, (%ecx)
+ movq %mm4, 16(%ecx)
+ pfsub %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 112(%ebx), %mm3
+ pfmul 112(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 8(%ecx)
+ movq %mm7, 24(%ecx)
+
+ movq 32(%edx), %mm0
+ movq 48(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 40(%edx), %mm1
+ movq 56(%edx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 32(%ecx)
+ movq %mm4, 48(%ecx)
+ pfsub %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 112(%ebx), %mm3
+ pfmul 112(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 40(%ecx)
+ movq %mm7, 56(%ecx)
+
+ movq 64(%edx), %mm0
+ movq 80(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 72(%edx), %mm1
+ movq 88(%edx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 64(%ecx)
+ movq %mm4, 80(%ecx)
+ pfsub %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 112(%ebx), %mm3
+ pfmul 112(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 72(%ecx)
+ movq %mm7, 88(%ecx)
+
+ movq 96(%edx), %mm0
+ movq 112(%edx), %mm4
+ movq %mm0, %mm3
+ movq %mm4, %mm7
+ movq 104(%edx), %mm1
+ movq 120(%edx), %mm5
+ pswapd %mm1, %mm1
+ pswapd %mm5, %mm5
+ pfadd %mm1, %mm0
+ pfadd %mm5, %mm4
+ movq %mm0, 96(%ecx)
+ movq %mm4, 112(%ecx)
+ pfsub %mm1, %mm3
+ pfsubr %mm5, %mm7
+ pfmul 112(%ebx), %mm3
+ pfmul 112(%ebx), %mm7
+ pswapd %mm3, %mm3
+ pswapd %mm7, %mm7
+ movq %mm3, 104(%ecx)
+ movq %mm7, 120(%ecx)
+
+/* Phase 6. This is the end of the easy road. */
+ movl $1, %eax
+ movd %eax, %mm7
+ pi2fd %mm7, %mm7
+ movq 32(%ecx), %mm0
+ punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */
+ movq %mm0, %mm1
+ movq plus_minus_3dnow, %mm6
+ /* n.b.: pfpnacc */
+ pxor %mm6, %mm1
+ pfacc %mm1, %mm0
+ /**/
+ pfmul %mm7, %mm0
+ movq %mm0, 32(%edx)
+ femms
+
+ flds 44(%ecx)
+ fsubs 40(%ecx)
+ fmuls 120(%ebx)
+
+ fsts 44(%edx)
+ fadds 40(%ecx) /* pfacc 40(ecx), 56(%ecx) */
+ fadds 44(%ecx)
+ fstps 40(%edx)
+
+ flds 48(%ecx)
+ fsubs 52(%ecx)
+ fmuls 120(%ebx)
+
+ flds 60(%ecx)
+ fsubs 56(%ecx)
+ fmuls 120(%ebx)
+
+ fld %st(0)
+ fadds 56(%ecx)
+ fadds 60(%ecx)
+
+ fld %st(0)
+ fadds 48(%ecx)
+ fadds 52(%ecx)
+ fstps 48(%edx)
+ fadd %st(2)
+ fstps 56(%edx)
+ fsts 60(%edx)
+ faddp %st(1)
+ fstps 52(%edx)
+/*---*/
+ flds 64(%ecx)
+ fadds 68(%ecx)
+ fstps 64(%edx)
+
+ flds 64(%ecx)
+ fsubs 68(%ecx)
+ fmuls 120(%ebx)
+ fstps 68(%edx)
+
+ flds 76(%ecx)
+ fsubs 72(%ecx)
+ fmuls 120(%ebx)
+ fsts 76(%edx)
+ fadds 72(%ecx)
+ fadds 76(%ecx)
+ fstps 72(%edx)
+
+ flds 92(%ecx)
+ fsubs 88(%ecx)
+ fmuls 120(%ebx)
+ fsts 92(%edx)
+ fadds 92(%ecx)
+ fadds 88(%ecx)
+
+ fld %st(0)
+ fadds 80(%ecx)
+ fadds 84(%ecx)
+ fstps 80(%edx)
+
+ flds 80(%ecx)
+ fsubs 84(%ecx)
+ fmuls 120(%ebx)
+ fadd %st(0), %st(1)
+ fadds 92(%edx)
+ fstps 84(%edx)
+ fstps 88(%edx)
+
+ flds 96(%ecx)
+ fadds 100(%ecx)
+ fstps 96(%edx)
+
+ flds 96(%ecx)
+ fsubs 100(%ecx)
+ fmuls 120(%ebx)
+ fstps 100(%edx)
+
+ flds 108(%ecx)
+ fsubs 104(%ecx)
+ fmuls 120(%ebx)
+ fsts 108(%edx)
+ fadds 104(%ecx)
+ fadds 108(%ecx)
+ fstps 104(%edx)
+
+ flds 124(%ecx)
+ fsubs 120(%ecx)
+ fmuls 120(%ebx)
+ fsts 124(%edx)
+ fadds 120(%ecx)
+ fadds 124(%ecx)
+
+ fld %st(0)
+ fadds 112(%ecx)
+ fadds 116(%ecx)
+ fstps 112(%edx)
+
+ flds 112(%ecx)
+ fsubs 116(%ecx)
+ fmuls 120(%ebx)
+ fadd %st(0),%st(1)
+ fadds 124(%edx)
+ fstps 116(%edx)
+ fstps 120(%edx)
+ jnz .L01
+
+/* Phase 7*/
+
+ flds (%ecx)
+ fadds 4(%ecx)
+ fstps 1024(%esi)
+
+ flds (%ecx)
+ fsubs 4(%ecx)
+ fmuls 120(%ebx)
+ fsts (%esi)
+ fstps (%edi)
+
+ flds 12(%ecx)
+ fsubs 8(%ecx)
+ fmuls 120(%ebx)
+ fsts 512(%edi)
+ fadds 12(%ecx)
+ fadds 8(%ecx)
+ fstps 512(%esi)
+
+ flds 16(%ecx)
+ fsubs 20(%ecx)
+ fmuls 120(%ebx)
+
+ flds 28(%ecx)
+ fsubs 24(%ecx)
+ fmuls 120(%ebx)
+ fsts 768(%edi)
+ fld %st(0)
+ fadds 24(%ecx)
+ fadds 28(%ecx)
+ fld %st(0)
+ fadds 16(%ecx)
+ fadds 20(%ecx)
+ fstps 768(%esi)
+ fadd %st(2)
+ fstps 256(%esi)
+ faddp %st(1)
+ fstps 256(%edi)
+
+/* Phase 8*/
+
+ flds 32(%edx)
+ fadds 48(%edx)
+ fstps 896(%esi)
+
+ flds 48(%edx)
+ fadds 40(%edx)
+ fstps 640(%esi)
+
+ flds 40(%edx)
+ fadds 56(%edx)
+ fstps 384(%esi)
+
+ flds 56(%edx)
+ fadds 36(%edx)
+ fstps 128(%esi)
+
+ flds 36(%edx)
+ fadds 52(%edx)
+ fstps 128(%edi)
+
+ flds 52(%edx)
+ fadds 44(%edx)
+ fstps 384(%edi)
+
+ flds 60(%edx)
+ fsts 896(%edi)
+ fadds 44(%edx)
+ fstps 640(%edi)
+
+ flds 96(%edx)
+ fadds 112(%edx)
+ fld %st(0)
+ fadds 64(%edx)
+ fstps 960(%esi)
+ fadds 80(%edx)
+ fstps 832(%esi)
+
+ flds 112(%edx)
+ fadds 104(%edx)
+ fld %st(0)
+ fadds 80(%edx)
+ fstps 704(%esi)
+ fadds 72(%edx)
+ fstps 576(%esi)
+
+ flds 104(%edx)
+ fadds 120(%edx)
+ fld %st(0)
+ fadds 72(%edx)
+ fstps 448(%esi)
+ fadds 88(%edx)
+ fstps 320(%esi)
+
+ flds 120(%edx)
+ fadds 100(%edx)
+ fld %st(0)
+ fadds 88(%edx)
+ fstps 192(%esi)
+ fadds 68(%edx)
+ fstps 64(%esi)
+
+ flds 100(%edx)
+ fadds 116(%edx)
+ fld %st(0)
+ fadds 68(%edx)
+ fstps 64(%edi)
+ fadds 84(%edx)
+ fstps 192(%edi)
+
+ flds 116(%edx)
+ fadds 108(%edx)
+ fld %st(0)
+ fadds 84(%edx)
+ fstps 320(%edi)
+ fadds 76(%edx)
+ fstps 448(%edi)
+
+ flds 108(%edx)
+ fadds 124(%edx)
+ fld %st(0)
+ fadds 76(%edx)
+ fstps 576(%edi)
+ fadds 92(%edx)
+ fstps 704(%edi)
+
+ flds 124(%edx)
+ fsts 960(%edi)
+ fadds 92(%edx)
+ fstps 832(%edi)
+ jmp .L_bye
+.L01:
+/* Phase 9*/
+
+ flds (%ecx)
+ fadds 4(%ecx)
+ fistp 512(%esi)
+
+ flds (%ecx)
+ fsubs 4(%ecx)
+ fmuls 120(%ebx)
+
+ fistp (%esi)
+
+
+ flds 12(%ecx)
+ fsubs 8(%ecx)
+ fmuls 120(%ebx)
+ fist 256(%edi)
+ fadds 12(%ecx)
+ fadds 8(%ecx)
+ fistp 256(%esi)
+
+ flds 16(%ecx)
+ fsubs 20(%ecx)
+ fmuls 120(%ebx)
+
+ flds 28(%ecx)
+ fsubs 24(%ecx)
+ fmuls 120(%ebx)
+ fist 384(%edi)
+ fld %st(0)
+ fadds 24(%ecx)
+ fadds 28(%ecx)
+ fld %st(0)
+ fadds 16(%ecx)
+ fadds 20(%ecx)
+ fistp 384(%esi)
+ fadd %st(2)
+ fistp 128(%esi)
+ faddp %st(1)
+ fistp 128(%edi)
+
+/* Phase 10*/
+
+ flds 32(%edx)
+ fadds 48(%edx)
+ fistp 448(%esi)
+
+ flds 48(%edx)
+ fadds 40(%edx)
+ fistp 320(%esi)
+
+ flds 40(%edx)
+ fadds 56(%edx)
+ fistp 192(%esi)
+
+ flds 56(%edx)
+ fadds 36(%edx)
+ fistp 64(%esi)
+
+ flds 36(%edx)
+ fadds 52(%edx)
+ fistp 64(%edi)
+
+ flds 52(%edx)
+ fadds 44(%edx)
+ fistp 192(%edi)
+
+ flds 60(%edx)
+ fist 448(%edi)
+ fadds 44(%edx)
+ fistp 320(%edi)
+
+ flds 96(%edx)
+ fadds 112(%edx)
+ fld %st(0)
+ fadds 64(%edx)
+ fistp 480(%esi)
+ fadds 80(%edx)
+ fistp 416(%esi)
+
+ flds 112(%edx)
+ fadds 104(%edx)
+ fld %st(0)
+ fadds 80(%edx)
+ fistp 352(%esi)
+ fadds 72(%edx)
+ fistp 288(%esi)
+
+ flds 104(%edx)
+ fadds 120(%edx)
+ fld %st(0)
+ fadds 72(%edx)
+ fistp 224(%esi)
+ fadds 88(%edx)
+ fistp 160(%esi)
+
+ flds 120(%edx)
+ fadds 100(%edx)
+ fld %st(0)
+ fadds 88(%edx)
+ fistp 96(%esi)
+ fadds 68(%edx)
+ fistp 32(%esi)
+
+ flds 100(%edx)
+ fadds 116(%edx)
+ fld %st(0)
+ fadds 68(%edx)
+ fistp 32(%edi)
+ fadds 84(%edx)
+ fistp 96(%edi)
+
+ flds 116(%edx)
+ fadds 108(%edx)
+ fld %st(0)
+ fadds 84(%edx)
+ fistp 160(%edi)
+ fadds 76(%edx)
+ fistp 224(%edi)
+
+ flds 108(%edx)
+ fadds 124(%edx)
+ fld %st(0)
+ fadds 76(%edx)
+ fistp 288(%edi)
+ fadds 92(%edx)
+ fistp 352(%edi)
+
+ flds 124(%edx)
+ fist 480(%edi)
+ fadds 92(%edx)
+ fistp 416(%edi)
+ movsw
+.L_bye:
+ addl $256,%esp
+ popl %edi
+ popl %esi
+ popl %ebx
+ ret
+
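Note on the routine added above: dct64_MMX_3dnowex runs the regular butterfly phases (1-5) two floats at a time in 3DNow! registers, using pswapd to reverse each mirrored pair before the pfadd/pfsub/pfmul against costab, then falls back to x87 for the irregular tail; the jnz .L01 selects between the float-output path (Phases 7-8) and the integer-output path (Phases 9-10). As a rough orientation only, each butterfly pair computes approximately the following. This is a hedged C sketch with illustrative names (in, out, costab, n), not the routine's actual interface:

/* Minimal C sketch of one butterfly stage, assuming the mirrored
 * input layout used by the asm above.  Illustrative only. */
static void butterfly_stage(const float *in, float *out,
                            const float *costab, int n)
{
    int i;
    for (i = 0; i < n / 2; i++) {
        float a = in[i], b = in[n - 1 - i];
        out[i]         = a + b;                  /* pfadd         */
        out[n - 1 - i] = (a - b) * costab[i];    /* pfsub + pfmul */
    }
}
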
diff --git a/mp3lib/decod386.c b/mp3lib/decod386.c
index e0c2c570b9..9939886023 100644
--- a/mp3lib/decod386.c
+++ b/mp3lib/decod386.c
@@ -105,6 +105,15 @@ static int synth_1to1_r(real *bandPtr,int channel,unsigned char *out,int *pnt)
}
#endif
+synth_func_t synth_func;
+
+int synth_1to1_MMX( real *bandPtr,int channel,short * samples)
+{
+ static short buffs[2][2][0x110];
+ static int bo = 1;
+ synth_1to1_MMX_s(bandPtr, channel, samples, (short *) buffs, &bo);
+ return 0;
+}
static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt)
{
@@ -117,40 +126,13 @@ static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt)
int clip = 0;
int bo1;
- #ifdef HAVE_SSE_MP3
- //if ( _3dnow )
- {
- int ret;
- ret=synth_1to1_sse( bandPtr,channel,out+*pnt );
- *pnt+=128;
- return ret;
- }
- #endif
- #ifdef HAVE_3DNOWEX
- if ( _3dnow > 1 )
+ if ( synth_func )
{
int ret;
- ret=synth_1to1_3dnowex( bandPtr,channel,out+*pnt );
+ ret=(*synth_func)( bandPtr,channel,samples);
*pnt+=128;
return ret;
}
- #endif
- #ifdef HAVE_3DNOW
- if ( _3dnow )
- {
- int ret;
- ret=synth_1to1_3dnow( bandPtr,channel,out+*pnt );
- *pnt+=128;
- return ret;
- }
- #endif
- if ( _i586 )
- {
- int ret;
- ret=synth_1to1_pent( bandPtr,channel,out+*pnt );
- *pnt+=128;
- return ret;
- }
if(!channel) { /* channel=0 */
bo--;
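
Note on the decod386.c hunk above: the compile-time #ifdef ladder in synth_1to1() collapses into a single function pointer filled in once at init time (see the MP3_Init() changes in sr1.c further down). A hedged sketch of the resulting control flow; the (short *)(out + *pnt) cast and the real typedef are assumptions for illustration, since the declaration of samples lies outside the hunk:

typedef float real;                       /* assumed: mp3lib's real as float */
typedef int (*synth_func_t)(real *, int, short *);

synth_func_t synth_func;                  /* NULL => plain C synth path */

static int synth_1to1(real *bandPtr, int channel, unsigned char *out, int *pnt)
{
    if (synth_func) {                     /* MMX / SSE / 3DNow! fast path */
        int ret = (*synth_func)(bandPtr, channel, (short *)(out + *pnt));
        *pnt += 128;
        return ret;
    }
    /* ... the generic C implementation would continue here ... */
    return 0;
}
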
diff --git a/mp3lib/decode_3dnow.s b/mp3lib/decode_3dnow.s
deleted file mode 100644
index 155508eadf..0000000000
--- a/mp3lib/decode_3dnow.s
+++ /dev/null
@@ -1,265 +0,0 @@
-/ synth_1to1_3dnow works the same way as the c version of
-/ synth_1to1. This assembler code is based on 'decode-i586.s'
-/ (by Stefan Bieschewski <stb@acm.org>); two types of changes
-/ have been made:
-/ - use {MMX,3DNow!} instructions to reduce cpu load
-/ - remove unused(?) local symbols
-/
-/ useful sources of information on optimizing 3DNow! code include:
-/ AMD 3DNow! Technology Manual (Publication #21928)
-/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
-/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
-/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
-/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
-/
-/ This code was tested only on AMD-K6-2 processor Linux systems,
-/ please tell me:
-/ - whether this code works on other 3DNow! capable processors
-/ (ex.IDT-C6-2) or not
-/ - whether this code works on other OSes or not
-/
-/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
-/ <kim@comtec.co.jp> - after 1.Apr.1998
-
-/ Enhancements for q-word operation by Michael Hipp
-
-.bss
- .comm buffs,4352,4
-.data
- .align 4
-bo:
- .long 1
-.text
-.globl synth_1to1_3dnow
-synth_1to1_3dnow:
- subl $12,%esp
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %ebx
- movl 32(%esp),%eax
- movl 40(%esp),%esi
- movl $0,%edi
- movl bo,%ebp
- cmpl %edi,36(%esp)
- jne .L48
- decl %ebp
- andl $15,%ebp
- movl %ebp,bo
- movl $buffs,%ecx
- jmp .L49
-.L48:
- addl $2,%esi
- movl $buffs+2176,%ecx
-.L49:
- testl $1,%ebp
- je .L50
- movl %ecx,%ebx
- movl %ebp,16(%esp)
- pushl %eax
- movl 20(%esp),%edx
- leal (%ebx,%edx,4),%eax
- pushl %eax
- movl 24(%esp),%eax
- incl %eax
- andl $15,%eax
- leal 1088(,%eax,4),%eax
- addl %ebx,%eax
- jmp .L74
-.L50:
- leal 1088(%ecx),%ebx
- leal 1(%ebp),%edx
- movl %edx,16(%esp)
- pushl %eax
- leal 1092(%ecx,%ebp,4),%eax
- pushl %eax
- leal (%ecx,%ebp,4),%eax
-.L74:
- pushl %eax
- call dct64_3dnow
- addl $12,%esp
- movl 16(%esp),%edx
- leal 0(,%edx,4),%edx
- movl $decwin+64,%eax
- movl %eax,%ecx
- subl %edx,%ecx
- movl $16,%ebp
-
-.L55:
- movq (%ecx),%mm4
- movq (%ebx),%mm3
- movq 8(%ecx),%mm0
- movq 8(%ebx),%mm1
- pfmul %mm3,%mm4
-
- movq 16(%ecx),%mm2
- pfmul %mm1,%mm0
- movq 16(%ebx),%mm3
- pfadd %mm0,%mm4
-
- movq 24(%ecx),%mm0
- pfmul %mm2,%mm3
- movq 24(%ebx),%mm1
- pfadd %mm3,%mm4
-
- movq 32(%ecx),%mm2
- pfmul %mm1,%mm0
- movq 32(%ebx),%mm3
- pfadd %mm0,%mm4
-
- movq 40(%ecx),%mm0
- pfmul %mm2,%mm3
- movq 40(%ebx),%mm1
- pfadd %mm3,%mm4
-
- movq 48(%ecx),%mm2
- pfmul %mm1,%mm0
- movq 48(%ebx),%mm3
- pfadd %mm0,%mm4
-
- movq 56(%ecx),%mm0
- pfmul %mm2,%mm3
- movq 56(%ebx),%mm1
- pfadd %mm3,%mm4
-
- pfmul %mm1,%mm0
- pfadd %mm0,%mm4
-
- movq %mm4,%mm0
- psrlq $32,%mm0
- pfsub %mm0,%mm4
-
- pf2id %mm4,%mm4
- movd %mm4,%eax
-
- sar $16,%eax
- movw %ax,(%esi)
-
- addl $64,%ebx
- subl $-128,%ecx
- addl $4,%esi
- decl %ebp
- jnz .L55
-
-/ --- end of loop 1 ---
-
- movd (%ecx),%mm2
- movd (%ebx),%mm1
- pfmul %mm1,%mm2
-
- movd 8(%ecx),%mm0
- movd 8(%ebx),%mm1
- pfmul %mm0,%mm1
- pfadd %mm1,%mm2
-
- movd 16(%ecx),%mm0
- movd 16(%ebx),%mm1
- pfmul %mm0,%mm1
- pfadd %mm1,%mm2
-
- movd 24(%ecx),%mm0
- movd 24(%ebx),%mm1
- pfmul %mm0,%mm1
- pfadd %mm1,%mm2
-
- movd 32(%ecx),%mm0
- movd 32(%ebx),%mm1
- pfmul %mm0,%mm1
- pfadd %mm1,%mm2
-
- movd 40(%ecx),%mm0
- movd 40(%ebx),%mm1
- pfmul %mm0,%mm1
- pfadd %mm1,%mm2
-
- movd 48(%ecx),%mm0
- movd 48(%ebx),%mm1
- pfmul %mm0,%mm1
- pfadd %mm1,%mm2
-
- movd 56(%ecx),%mm0
- movd 56(%ebx),%mm1
- pfmul %mm0,%mm1
- pfadd %mm1,%mm2
-
- pf2id %mm2,%mm2
- movd %mm2,%eax
-
- sar $16,%eax
-
- movw %ax,(%esi)
-
- addl $-64,%ebx
- addl $4,%esi
- addl $256,%ecx
- movl $15,%ebp
-
-.L68:
- psubd %mm0,%mm0
-
- movq (%ebx),%mm1
- movq (%ecx),%mm2
- pfmul %mm1,%mm2
- pfsub %mm2,%mm0
-
- movq 8(%ebx),%mm3
- movq 8(%ecx),%mm4
- pfmul %mm3,%mm4
- pfsub %mm4,%mm0
-
- movq 16(%ebx),%mm1
- movq 16(%ecx),%mm2
- pfmul %mm1,%mm2
- pfsub %mm2,%mm0
-
- movq 24(%ebx),%mm3
- movq 24(%ecx),%mm4
- pfmul %mm3,%mm4
- pfsub %mm4,%mm0
-
- movq 32(%ebx),%mm1
- movq 32(%ecx),%mm2
- pfmul %mm1,%mm2
- pfsub %mm2,%mm0
-
- movq 40(%ebx),%mm3
- movq 40(%ecx),%mm4
- pfmul %mm3,%mm4
- pfsub %mm4,%mm0
-
- movq 48(%ebx),%mm1
- movq 48(%ecx),%mm2
- pfmul %mm1,%mm2
- pfsub %mm2,%mm0
-
- movq 56(%ebx),%mm3
- movq 56(%ecx),%mm4
- pfmul %mm3,%mm4
- pfsub %mm4,%mm0
-
- pfacc %mm0,%mm0
-
- pf2id %mm0,%mm0
- movd %mm0,%eax
-
- sar $16,%eax
-
- movw %ax,(%esi)
-
- addl $-64,%ebx
- subl $-128,%ecx
- addl $4,%esi
- decl %ebp
- jnz .L68
-
-/ --- end of loop 2
-
- femms
-
- movl %edi,%eax
- popl %ebx
- popl %esi
- popl %edi
- popl %ebp
- addl $12,%esp
- ret
diff --git a/mp3lib/decode_MMX.s b/mp3lib/decode_MMX.s
new file mode 100644
index 0000000000..41c8d34c4d
--- /dev/null
+++ b/mp3lib/decode_MMX.s
@@ -0,0 +1,117 @@
+# This code comes under the GPL.
+# This code was taken from http://www.mpg123.org
+# See the ChangeLog of mpg123-0.59s-pre.1 for details.
+# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+#
+# TODO: partial loop unrolling and removing the MOVW insn.
+#
+
+.text
+
+.globl synth_1to1_MMX_s
+
+synth_1to1_MMX_s:
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+ movl 24(%esp),%ecx
+ movl 28(%esp),%edi
+ movl $15,%ebx
+ movl 36(%esp),%edx
+ leal (%edi,%ecx,2),%edi
+ decl %ecx
+ movl 32(%esp),%esi
+ movl (%edx),%eax
+ jecxz .L1
+ decl %eax
+ andl %ebx,%eax
+ leal 1088(%esi),%esi
+ movl %eax,(%edx)
+.L1:
+ leal (%esi,%eax,2),%edx
+ movl %eax,%ebp
+ incl %eax
+ pushl 20(%esp)
+ andl %ebx,%eax
+ leal 544(%esi,%eax,2),%ecx
+ incl %ebx
+ testl $1, %eax
+ jnz .L2
+ xchgl %edx,%ecx
+ incl %ebp
+ leal 544(%esi),%esi
+.L2:
+ emms
+ pushl %edx
+ pushl %ecx
+ call *dct64_MMX_func
+ addl $12,%esp
+ leal 1(%ebx), %ecx
+ subl %ebp,%ebx
+
+ leal decwins(%ebx,%ebx,1), %edx
+.L3:
+ movq (%edx),%mm0
+ pmaddwd (%esi),%mm0
+ movq 8(%edx),%mm1
+ pmaddwd 8(%esi),%mm1
+ movq 16(%edx),%mm2
+ pmaddwd 16(%esi),%mm2
+ movq 24(%edx),%mm3
+ pmaddwd 24(%esi),%mm3
+ paddd %mm1,%mm0
+ paddd %mm2,%mm0
+ paddd %mm3,%mm0
+ movq %mm0,%mm1
+ psrlq $32,%mm1
+ paddd %mm1,%mm0
+ psrad $13,%mm0
+ packssdw %mm0,%mm0
+ movd %mm0,%eax
+ movw %ax, (%edi)
+
+ leal 32(%esi),%esi
+ leal 64(%edx),%edx
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz .L3
+
+
+ subl $64,%esi
+ movl $15,%ecx
+.L4:
+ movq (%edx),%mm0
+ pmaddwd (%esi),%mm0
+ movq 8(%edx),%mm1
+ pmaddwd 8(%esi),%mm1
+ movq 16(%edx),%mm2
+ pmaddwd 16(%esi),%mm2
+ movq 24(%edx),%mm3
+ pmaddwd 24(%esi),%mm3
+ paddd %mm1,%mm0
+ paddd %mm2,%mm0
+ paddd %mm3,%mm0
+ movq %mm0,%mm1
+ psrlq $32,%mm1
+ paddd %mm0,%mm1
+ psrad $13,%mm1
+ packssdw %mm1,%mm1
+ psubd %mm0,%mm0
+ psubsw %mm1,%mm0
+ movd %mm0,%eax
+ movw %ax,(%edi)
+
+ subl $32,%esi
+ addl $64,%edx
+ leal 4(%edi),%edi
+ decl %ecx
+ jnz .L4
+ emms
+ popl %ebx
+ popl %esi
+ popl %edi
+ popl %ebp
+ ret
+
+
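Note on the new decode_MMX.s above: every output sample of the .L3 loop is a 16-tap dot product of 16-bit window coefficients (decwins, built by tabinit_MMX.s further down) against the 16-bit output of the dct64_MMX_* routines, computed with pmaddwd/paddd, scaled by psrad $13 and saturated by packssdw. A hedged C model of that value path, with illustrative names (the real pointers and strides come from the register setup above):

#include <stdint.h>

/* Rough model of one .L3 iteration: 4 quadwords = 16 products,
 * 32-bit accumulation, >>13, saturate to 16 bits.  Illustrative only. */
static int16_t window_sample(const int16_t *win, const int16_t *b0)
{
    int32_t sum = 0;
    int i;
    for (i = 0; i < 16; i++)          /* pmaddwd + paddd */
        sum += (int32_t)win[i] * b0[i];
    sum >>= 13;                       /* psrad $13 */
    if (sum >  32767) sum =  32767;   /* packssdw saturation */
    if (sum < -32768) sum = -32768;
    return (int16_t)sum;
}
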
diff --git a/mp3lib/decode_k7.s b/mp3lib/decode_k7.s
deleted file mode 100644
index 4f26c67ca4..0000000000
--- a/mp3lib/decode_k7.s
+++ /dev/null
@@ -1,364 +0,0 @@
-///
-/// Replacement of synth_1to1() with AMD's 3DNowEx(DSP)! SIMD operations support
-///
-/// This code is based on 'decode_3dnow.s' by Syuuhei Kashiyama
-/// <squash@mb.kcom.ne.jp>; only some types of changes have been made:
-///
-/// - added the new opcode PFNACC
-/// - decreased the number of opcodes (as suggested by the K7 manual)
-///   (using memory references as instruction operands)
-/// - added the PREFETCHW opcode. It has different semantics on K7 than on K6-2
-///   and saves 15-25 cpu clocks on the Athlon.
-/// - partial loop unrolling to remove the slower MOVW insns.
-///   (Note: probably the same should be done for decode_3dnow.s)
-/// - changed the function name to support automatic 3DNowEx! detection
-/// - added loop alignment
-///
-/// note: because K7 processors are aggressive out-of-order three-way
-///       superscalar ones, instruction order is not significant for them.
-///
-/// Benchmark: measured by mplayer on Duron-700:
-/// 3dNow! optimized code - 1.4% of cpu usage
-/// k7 optimized code (without partial loop unrolling) - 1.3% of cpu usage
-/// k7 optimized code - 1.1% of cpu usage
-/// Note: K6-2 users have a chance with partial loop unrolling
-///
-/// Modified by Nick Kurshev <nickols_k@mail.ru>
-///
-/ synth_1to1_3dnow works the same way as the c version of
-/ synth_1to1. This assembler code is based on 'decode-i586.s'
-/ (by Stefan Bieschewski <stb@acm.org>); two types of changes
-/ have been made:
-/ - use {MMX,3DNow!} instructions to reduce cpu load
-/ - remove unused(?) local symbols
-/
-/ useful sources of information on optimizing 3DNow! code include:
-/ AMD 3DNow! Technology Manual (Publication #21928)
-/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
-/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
-/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
-/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
-/
-/ This code was tested only on AMD-K6-2 processor Linux systems,
-/ please tell me:
-/ - whether this code works on other 3DNow! capable processors
-/ (ex.IDT-C6-2) or not
-/ - whether this code works on other OSes or not
-/
-/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
-/ <kim@comtec.co.jp> - after 1.Apr.1998
-
-/ Enhancements for q-word operation by Michael Hipp
-
-.bss
- .comm buffs,4352,4
-.data
- .align 8
-null_one: .long 0x0000ffff, 0x0000ffff
-one_null: .long 0xffff0000, 0xffff0000
-bo: .long 1
-.text
-/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */
-.globl synth_1to1_3dnowex
-synth_1to1_3dnowex:
- subl $12,%esp
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %ebx
-
- movl 32(%esp),%eax
- movl 40(%esp),%esi
- movl $0,%edi
- movl bo,%ebp
- cmpl %edi,36(%esp)
- jne .L48
- decl %ebp
- andl $15,%ebp
- movl %ebp,bo
- movl $buffs,%ecx
- jmp .L49
-.L48:
- addl $2,%esi
- movl $buffs+2176,%ecx
-.L49:
- testl $1,%ebp
- je .L50
- movl %ecx,%ebx
- movl %ebp,16(%esp)
- pushl %eax
- movl 20(%esp),%edx
- leal (%ebx,%edx,4),%eax
- pushl %eax
- movl 24(%esp),%eax
- incl %eax
- andl $15,%eax
- leal 1088(,%eax,4),%eax
- addl %ebx,%eax
- jmp .L74
-.L50:
- leal 1088(%ecx),%ebx
- leal 1(%ebp),%edx
- movl %edx,16(%esp)
- pushl %eax
- leal 1092(%ecx,%ebp,4),%eax
- pushl %eax
- leal (%ecx,%ebp,4),%eax
-.L74:
- pushl %eax
- call dct64_3dnowex
- movl 16(%esp),%edx
- leal 0(,%edx,4),%edx
- movl $decwin+64,%eax
- movl %eax,%ecx
- subl %edx,%ecx
- movl $8,%ebp
- prefetchw (%esi)
-.align 16
-.L55:
-
- movq (%ecx),%mm0
- pfmul (%ebx),%mm0
- movq 128(%ecx),%mm4
- pfmul 64(%ebx),%mm4
-
- movq 8(%ecx),%mm1
- pfmul 8(%ebx),%mm1
- pfadd %mm1,%mm0
- movq 136(%ecx),%mm5
- pfmul 72(%ebx),%mm5
- pfadd %mm5,%mm4
-
- movq 16(%ebx),%mm2
- pfmul 16(%ecx),%mm2
- pfadd %mm2,%mm0
- movq 80(%ebx),%mm6
- pfmul 144(%ecx),%mm6
- pfadd %mm6,%mm4
-
- movq 24(%ecx),%mm3
- pfmul 24(%ebx),%mm3
- pfadd %mm3,%mm0
- movq 152(%ecx),%mm7
- pfmul 88(%ebx),%mm7
- pfadd %mm7,%mm4
-
- movq 32(%ebx),%mm1
- pfmul 32(%ecx),%mm1
- pfadd %mm1,%mm0
- movq 96(%ebx),%mm5
- pfmul 160(%ecx),%mm5
- pfadd %mm5,%mm4
-
- movq 40(%ecx),%mm2
- pfmul 40(%ebx),%mm2
- pfadd %mm2,%mm0
- movq 168(%ecx),%mm6
- pfmul 104(%ebx),%mm6
- pfadd %mm6,%mm4
-
- movq 48(%ebx),%mm3
- pfmul 48(%ecx),%mm3
- pfadd %mm3,%mm0
- movq 112(%ebx),%mm7
- pfmul 176(%ecx),%mm7
- pfadd %mm7,%mm4
-
- movq 56(%ecx),%mm1
- pfmul 56(%ebx),%mm1
- pfadd %mm1,%mm0
- movq 184(%ecx),%mm5
- pfmul 120(%ebx),%mm5
- pfadd %mm5,%mm4
-
- pfnacc %mm4, %mm0
- movq (%esi), %mm1
- pf2id %mm0, %mm0
- pand one_null, %mm1
- psrld $16,%mm0
- pand null_one, %mm0
- por %mm0, %mm1
- movq %mm1,(%esi)
-
- addl $128,%ebx
- addl $256,%ecx
- addl $8,%esi
- decl %ebp
- jnz .L55
-
-/ --- end of loop 1 ---
-
- prefetchw (%esi) /* prefetching for writing this block and next loop */
-
- movd (%ecx),%mm0
- pfmul (%ebx),%mm0
-
- movd 8(%ebx),%mm1
- pfmul 8(%ecx),%mm1
- pfadd %mm1,%mm0
-
- movd 16(%ebx),%mm2
- pfmul 16(%ecx),%mm2
- pfadd %mm2,%mm0
-
- movd 24(%ebx),%mm3
- pfmul 24(%ecx),%mm3
- pfadd %mm3,%mm0
-
- movd 32(%ebx),%mm4
- pfmul 32(%ecx),%mm4
- pfadd %mm4,%mm0
-
- movd 40(%ebx),%mm5
- pfmul 40(%ecx),%mm5
- pfadd %mm5,%mm0
-
- movd 48(%ebx),%mm6
- pfmul 48(%ecx),%mm6
- pfadd %mm6,%mm0
-
- movd 56(%ebx),%mm7
- pfmul 56(%ecx),%mm7
- pfadd %mm7,%mm0
-
- pf2id %mm0,%mm0
- movd %mm0,%eax
-
- sar $16,%eax
-
- movw %ax,(%esi)
-
- subl $64,%ebx
- addl $4,%esi
- addl $256,%ecx
- movl $7,%ebp
-.align 16
-.L68:
- pxor %mm0, %mm0
- pxor %mm4, %mm4
-
- movq (%ecx),%mm1
- pfmul (%ebx),%mm1
- pfsub %mm1,%mm0
- movq 128(%ecx),%mm5
- pfmul -64(%ebx),%mm5
- pfsub %mm5,%mm4
-
- movq 8(%ecx),%mm2
- pfmul 8(%ebx),%mm2
- pfsub %mm2,%mm0
- movq 136(%ecx),%mm6
- pfmul -56(%ebx),%mm6
- pfsub %mm6,%mm4
-
- movq 16(%ecx),%mm3
- pfmul 16(%ebx),%mm3
- pfsub %mm3,%mm0
- movq 144(%ecx),%mm7
- pfmul -48(%ebx),%mm7
- pfsub %mm7,%mm4
-
- movq 24(%ecx),%mm1
- pfmul 24(%ebx),%mm1
- pfsub %mm1,%mm0
- movq 152(%ecx),%mm5
- pfmul -40(%ebx),%mm5
- pfsub %mm5,%mm4
-
- movq 32(%ecx),%mm2
- pfmul 32(%ebx),%mm2
- pfsub %mm2,%mm0
- movq 160(%ecx),%mm6
- pfmul -32(%ebx),%mm6
- pfsub %mm6,%mm4
-
- movq 40(%ecx),%mm3
- pfmul 40(%ebx),%mm3
- pfsub %mm3,%mm0
- movq 168(%ecx),%mm7
- pfmul -24(%ebx),%mm7
- pfsub %mm7,%mm4
-
- movq 48(%ecx),%mm1
- pfmul 48(%ebx),%mm1
- pfsub %mm1,%mm0
- movq 176(%ecx),%mm5
- pfmul -16(%ebx),%mm5
- pfsub %mm5,%mm4
-
- movq 56(%ecx),%mm2
- pfmul 56(%ebx),%mm2
- pfsub %mm2,%mm0
- movq 184(%ecx),%mm6
- pfmul -8(%ebx),%mm6
- pfsub %mm6,%mm4
-
- pfacc %mm4,%mm0
- movq (%esi), %mm1
- pf2id %mm0, %mm0
- pand one_null, %mm1
- psrld $16,%mm0
- pand null_one, %mm0
- por %mm0, %mm1
- movq %mm1,(%esi)
-
- subl $128,%ebx
- addl $256,%ecx
- addl $8,%esi
- decl %ebp
- jnz .L68
-
-/ --- end of loop 2
-
- pxor %mm0, %mm0
-
- movq (%ecx),%mm1
- pfmul (%ebx),%mm1
- pfsub %mm1,%mm0
-
- movq 8(%ecx),%mm2
- pfmul 8(%ebx),%mm2
- pfsub %mm2,%mm0
-
- movq 16(%ecx),%mm3
- pfmul 16(%ebx),%mm3
- pfsub %mm3,%mm0
-
- movq 24(%ecx),%mm4
- pfmul 24(%ebx),%mm4
- pfsub %mm4,%mm0
-
- movq 32(%ecx),%mm5
- pfmul 32(%ebx),%mm5
- pfsub %mm5,%mm0
-
- movq 40(%ecx),%mm6
- pfmul 40(%ebx),%mm6
- pfsub %mm6,%mm0
-
- movq 48(%ecx),%mm7
- pfmul 48(%ebx),%mm7
- pfsub %mm7,%mm0
-
- movq 56(%ecx),%mm1
- pfmul 56(%ebx),%mm1
- pfsub %mm1,%mm0
-
- pfacc %mm0,%mm0
-
- pf2id %mm0,%mm0
- movd %mm0,%eax
-
- sar $16,%eax
-
- movw %ax,(%esi)
-
- femms
-
- movl %edi,%eax
- popl %ebx
- popl %esi
- popl %edi
- popl %ebp
- addl $12,%esp
- ret
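
Note on the deleted decode_k7.s above: one of the K7-specific tricks going away here is the partial unrolling its header describes. Instead of a MOVW per sample, the loop converts two results at once (pf2id), shifts them into the low 16 bits of each dword, and merges them into the interleaved output with the one_null/null_one masks and por. A hedged C model of that merge, with illustrative names:

#include <stdint.h>

/* Rough model of the pand/psrld/pand/por sequence: write two 16-bit
 * samples for this channel into the low halves of two 32-bit slots,
 * preserving the other channel already interleaved in memory. */
static void store_pair(uint32_t *out, int32_t s0, int32_t s1)
{
    out[0] = (out[0] & 0xffff0000u) | ((uint32_t)(s0 >> 16) & 0xffffu);
    out[1] = (out[1] & 0xffff0000u) | ((uint32_t)(s1 >> 16) & 0xffffu);
}
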
diff --git a/mp3lib/decode_sse.s b/mp3lib/decode_sse.s
deleted file mode 100644
index eebd2f6cfb..0000000000
--- a/mp3lib/decode_sse.s
+++ /dev/null
@@ -1,201 +0,0 @@
-///
-/// Replacement of synth_1to1() with Intel's SSE SIMD operations support
-///
-/// This code is based on 'decode_k7.s' by Nick Kurshev
-/// <squash@mb.kcom.ne.jp>; only some types of changes have been made:
-///
-/// - SSE optimization
-/// - changed the function name to support automatic SSE detection
-///
-/// Modified by Nick Kurshev <nickols_k@mail.ru>
-///
-/ synth_1to1_3dnow works the same way as the c version of
-/ synth_1to1. This assembler code is based on 'decode-i586.s'
-/ (by Stefan Bieschewski <stb@acm.org>); two types of changes
-/ have been made:
-/ - use {MMX,3DNow!} instructions to reduce cpu load
-/ - remove unused(?) local symbols
-/
-/ useful sources of information on optimizing 3DNow! code include:
-/ AMD 3DNow! Technology Manual (Publication #21928)
-/ English: http://www.amd.com/K6/k6docs/pdf/21928d.pdf
-/ (Japanese: http://www.amd.com/japan/K6/k6docs/j21928c.pdf)
-/ AMD-K6-2 Processor Code Optimization Application Note (Publication #21924)
-/ English: http://www.amd.com/K6/k6docs/pdf/21924b.pdf
-/
-/ This code was tested only on AMD-K6-2 processor Linux systems,
-/ please tell me:
-/ - whether this code works on other 3DNow! capable processors
-/ (ex.IDT-C6-2) or not
-/ - whether this code works on other OSes or not
-/
-/ by KIMURA Takuhiro <kim@hannah.ipc.miyakyo-u.ac.jp> - until 31.Mar.1998
-/ <kim@comtec.co.jp> - after 1.Apr.1998
-
-/ Enhancements for q-word operation by Michael Hipp
-
-.bss
- .comm buffs,4352,4
-.data
- .align 4
-bo:
- .long 1
-.text
-/* int synth_1to1(real *bandPtr,int channel,unsigned char *out) */
-.globl synth_1to1_sse
-synth_1to1_sse:
- subl $12,%esp
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %ebx
-
- movl 32(%esp),%eax
- movl 40(%esp),%esi
- movl $0,%edi
- movl bo,%ebp
- cmpl %edi,36(%esp)
- jne .L48
- decl %ebp
- andl $15,%ebp
- movl %ebp,bo
- movl $buffs,%ecx
- jmp .L49
-.L48:
- addl $2,%esi
- movl $buffs+2176,%ecx
-.L49:
- testl $1,%ebp
- je .L50
- movl %ecx,%ebx
- movl %ebp,16(%esp)
- pushl %eax
- movl 20(%esp),%edx
- leal (%ebx,%edx,4),%eax
- pushl %eax
- movl 24(%esp),%eax
- incl %eax
- andl $15,%eax
- leal 1088(,%eax,4),%eax
- addl %ebx,%eax
- jmp .L74
-.L50:
- leal 1088(%ecx),%ebx
- leal 1(%ebp),%edx
- movl %edx,16(%esp)
- pushl %eax
- leal 1092(%ecx,%ebp,4),%eax
- pushl %eax
- leal (%ecx,%ebp,4),%eax
-.L74:
- pushl %eax
- call dct64
- addl $12, %esp
- movl 16(%esp),%edx
- leal 0(,%edx,4),%edx
- movl $decwin+64,%eax
- movl %eax,%ecx
- subl %edx,%ecx
- movl $16,%ebp
-
-.L55:
- movups (%ecx), %xmm0
- mulps (%ebx), %xmm0
- movups 16(%ecx), %xmm1
- mulps 16(%ebx), %xmm1
- addps %xmm1, %xmm0
- movups 32(%ecx), %xmm1
- mulps 32(%ebx), %xmm1
- addps %xmm1, %xmm0
- movups 48(%ecx), %xmm1
- mulps 48(%ebx), %xmm1
- addps %xmm1, %xmm0
-/* pfnacc -> PFNACC mmreg1, mmreg2 performs the following operations: */
-/* temp = mmreg2 */
-/* mmreg1[31:0] = mmreg1[31:0] - mmreg1[63:32] */
-/* mmreg1[63:32]= temp [31:0] - temp[63:32] */
-/* save difference of mmreg1's low-word and high-word into mmreg1's low-word */
-/* save difference of mmreg2's low-word and high-word into mmreg1's high-word */
- movhlps %xmm0, %xmm1
- addps %xmm1, %xmm0
- movaps %xmm0, %xmm1
- shufps $0x55, %xmm1, %xmm1 /* fake of pfnacc. 1|1|1|1 */
-
- subss %xmm1, %xmm0
- cvtss2si %xmm0, %eax
-
-/ sar $16,%eax
- movw %ax,(%esi)
-
- addl $64,%ebx
- subl $-128,%ecx
- addl $4,%esi
- decl %ebp
- jnz .L55
-
-/ --- end of loop 1 ---
-
- movups (%ecx), %xmm0
- mulps (%ebx), %xmm0
- movups 16(%ecx), %xmm1
- mulps 16(%ebx), %xmm1
- addps %xmm1, %xmm0
- movups 32(%ecx), %xmm1
- mulps 32(%ebx), %xmm1
- addps %xmm1, %xmm0
- movups 48(%ecx), %xmm1
- mulps 48(%ebx), %xmm1
- addps %xmm1, %xmm0
- movhlps %xmm0, %xmm1
- addss %xmm1, %xmm0
- cvtss2si %xmm0, %eax
-
-/ sar $16,%eax
-
- movw %ax,(%esi)
-
- addl $-64,%ebx
- addl $4,%esi
- addl $256,%ecx
- movl $15,%ebp
-
-.L68:
- xorps %xmm0, %xmm0
- movups (%ecx), %xmm1
- mulps (%ebx), %xmm1
- subps %xmm1, %xmm0
- movups 16(%ecx), %xmm1
- mulps 16(%ebx), %xmm1
- subps %xmm1, %xmm0
- movups 32(%ecx), %xmm1
- mulps 32(%ebx), %xmm1
- subps %xmm1, %xmm0
- movups 48(%ecx), %xmm1
- mulps 48(%ebx), %xmm1
- subps %xmm1, %xmm0
- movhlps %xmm0, %xmm1
- subps %xmm1, %xmm0
- movaps %xmm0, %xmm1
- shufps $0x55, %xmm1, %xmm1 /* fake of pfacc 1|1|1|1 */
- addss %xmm1, %xmm0
- cvtss2si %xmm0, %eax
-
-/ sar $16,%eax
-
- movw %ax,(%esi)
-
- addl $-64,%ebx
- subl $-128,%ecx
- addl $4,%esi
- decl %ebp
- jnz .L68
-
-/ --- end of loop 2
-
- movl %edi,%eax
- popl %ebx
- popl %esi
- popl %edi
- popl %ebp
- addl $12,%esp
- ret
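
Note on the deleted decode_sse.s above: its comment block spells out what the 3DNow! PFNACC it emulates actually does, and the movhlps/addps/shufps/subss sequence is the SSE stand-in the code's own comments call a "fake of pfnacc". Restated as a small C model (packed pair of floats, illustrative struct):

/* C model of "pfnacc dst, src" as described in the comments above:
 * each register holds two packed floats (lo = bits 31:0, hi = bits 63:32). */
typedef struct { float lo, hi; } v2sf;

static v2sf pfnacc(v2sf dst, v2sf src)
{
    v2sf r;
    r.lo = dst.lo - dst.hi;   /* dst[31:0] - dst[63:32] */
    r.hi = src.lo - src.hi;   /* src[31:0] - src[63:32] */
    return r;
}
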
diff --git a/mp3lib/layer2.c b/mp3lib/layer2.c
index b8d7d12df7..21722e5533 100644
--- a/mp3lib/layer2.c
+++ b/mp3lib/layer2.c
@@ -50,8 +50,16 @@ static void init_layer2(void)
{
double m=mulmul[k];
table = muls[k];
+ if(_has_mmx)
+ {
+ for(j=3,i=0;i<63;i++,j--)
+ *table++ = 16384 * m * pow(2.0,(double) j / 3.0);
+ }
+ else
for(j=3,i=0;i<63;i++,j--)
+ {
*table++ = m * pow(2.0,(double) j / 3.0);
+ }
*table++ = 0.0;
}
}
diff --git a/mp3lib/layer3.c b/mp3lib/layer3.c
index 0983f86333..97f6c44712 100644
--- a/mp3lib/layer3.c
+++ b/mp3lib/layer3.c
@@ -22,9 +22,9 @@ static real win1[4][36];
#define GP2MAX (256+118+4)
static real gainpow2[GP2MAX];
-static real nCOS9[9];
+real COS9[9];
static real COS6_1,COS6_2;
-static real tfcos36[9];
+real tfcos36[9];
static real tfcos12[3];
#ifdef NEW_DCT9
static real cos9[3],cos18[3];
@@ -111,8 +111,12 @@ void init_layer3(int down_sample_sblimit)
int i,j,k,l;
for(i=-256;i<118+4;i++)
- gainpow2[i+256] = pow((double)2.0,-0.25 * (double) (i+210) );
-
+ {
+ if(_has_mmx)
+ gainpow2[i+256] = 16384.0 * pow((double)2.0,-0.25 * (double) (i+210) );
+ else
+ gainpow2[i+256] = pow((double)2.0,-0.25 * (double) (i+210) );
+ }
for(i=0;i<8207;i++)
ispow[i] = pow((double)i,(double)4.0/3.0);
@@ -139,7 +143,7 @@ void init_layer3(int down_sample_sblimit)
}
for(i=0;i<9;i++)
- nCOS9[i] = cos( M_PI / 18.0 * (double) i);
+ COS9[i] = cos( M_PI / 18.0 * (double) i);
for(i=0;i<9;i++)
tfcos36[i] = 0.5 / cos ( M_PI * (double) (i*2+1) / 36.0 );
@@ -1533,6 +1537,9 @@ static void III_antialias(real xr[SBLIMIT][SSLIMIT],struct gr_info_s *gr_info)
/*
* III_hybrid
*/
+
+dct36_func_t dct36_func;
+
static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT],real tsOut[SSLIMIT][SBLIMIT],
int ch,struct gr_info_s *gr_info)
{
@@ -1553,8 +1560,8 @@ static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT],real tsOut[SSLIMIT][SBLIMIT],
if(gr_info->mixed_block_flag) {
sb = 2;
- dct36(fsIn[0],rawout1,rawout2,win[0],tspnt);
- dct36(fsIn[1],rawout1+18,rawout2+18,win1[0],tspnt+1);
+ (*dct36_func)(fsIn[0],rawout1,rawout2,win[0],tspnt);
+ (*dct36_func)(fsIn[1],rawout1+18,rawout2+18,win1[0],tspnt+1);
rawout1 += 36; rawout2 += 36; tspnt += 2;
}
@@ -1567,8 +1574,8 @@ static void III_hybrid(real fsIn[SBLIMIT][SSLIMIT],real tsOut[SSLIMIT][SBLIMIT],
}
else {
for (; sb<gr_info->maxb; sb+=2,tspnt+=2,rawout1+=36,rawout2+=36) {
- dct36(fsIn[sb],rawout1,rawout2,win[bt],tspnt);
- dct36(fsIn[sb+1],rawout1+18,rawout2+18,win1[bt],tspnt+1);
+ (*dct36_func)(fsIn[sb],rawout1,rawout2,win[bt],tspnt);
+ (*dct36_func)(fsIn[sb+1],rawout1+18,rawout2+18,win1[bt],tspnt+1);
}
}
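
Note on the layer2.c and layer3.c hunks above: both apply the same 16384 (2^14) prescale whenever _has_mmx is set. The patch does not say why; presumably the MMX/SSE synth path works on 16-bit fixed-point data (see decode_MMX.s and tabinit_MMX.s), so the gain is folded into the tables up front instead of being applied during windowing as in the float path. A toy restatement of the two table variants, assumption flagged in the comment:

#include <math.h>

/* Assumed rationale only: with the MMX-class synth the band values are
 * later reduced to 16-bit fixed point, so the gain tables carry an
 * extra 2^14 factor here. */
static double gainpow2_entry(int i, int has_mmx)
{
    double g = pow(2.0, -0.25 * (double)(i + 210));
    return has_mmx ? 16384.0 * g : g;
}
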
diff --git a/mp3lib/mpg123.h b/mp3lib/mpg123.h
index cadeab7347..ff4fef3df8 100644
--- a/mp3lib/mpg123.h
+++ b/mp3lib/mpg123.h
@@ -104,33 +104,22 @@ struct III_sideinfo
};
static long freqs[9];
-#ifdef HAVE_3DNOW
- real decwin[2*(512+32)];
-#else
- real decwin[512+32];
-#endif
- real *pnts[];
+extern real decwin[(512+32)];
+extern real *pnts[];
static int do_layer2(struct frame *fr,int single);
static int do_layer3(struct frame *fr,int single);
static int synth_1to1(real *bandPtr,int channel,unsigned char *out,int *pnt);
-extern int synth_1to1_pent( real *,int,unsigned char * );
+extern int synth_1to1_pent( real *,int,short * );
+extern void make_decode_tables_MMX(long scaleval);
+extern int synth_1to1_MMX( real *,int,short * );
+extern int synth_1to1_MMX_s(real *, int, short *, short *, int *);
extern void dct64(real *a,real *b,real *c);
-#ifdef HAVE_3DNOW
- extern void dct64_3dnow( real *,real *, real * );
- extern void dct36_3dnow(real *,real *,real *,real *,real *);
- extern int synth_1to1_3dnow( real *,int,unsigned char * );
-#endif
-#ifdef HAVE_3DNOWEX
- extern void dct64_3dnowex( real *,real *, real * );
- extern void dct36_3dnowex(real *,real *,real *,real *,real *);
- extern int synth_1to1_3dnowex( real *,int,unsigned char * );
-#endif
-#ifdef HAVE_SSE_MP3
-// extern void dct64_3dnow( real *,real *, real * );
-// extern void dct36_3dnow(real *,real *,real *,real *,real *);
- extern int synth_1to1_sse( real *,int,unsigned char * );
-#endif
+extern void dct36_3dnow(real *,real *,real *,real *,real *);
+extern void dct36_3dnowex(real *,real *,real *,real *,real *);
+extern void dct36_sse(real *,real *,real *,real *,real *);
+typedef int (*synth_func_t)( real *,int,short * );
+typedef void (*dct36_func_t)(real *,real *,real *,real *,real *);
diff --git a/mp3lib/sr1.c b/mp3lib/sr1.c
index 146e12f9f3..7ada065875 100644
--- a/mp3lib/sr1.c
+++ b/mp3lib/sr1.c
@@ -343,6 +343,12 @@ retry1:
static int tables_done_flag=0;
+/* These are hidden from gcc in the assembler files */
+extern void dct64_MMX( void );
+extern void dct64_MMX_3dnow( void );
+extern void dct64_MMX_3dnowex( void );
+void (*dct64_MMX_func)( void );
+
// Init decoder tables. Call first, once!
#ifdef USE_FAKE_MONO
void MP3_Init(int fakemono){
@@ -351,20 +357,41 @@ void MP3_Init(){
#endif
_CpuID=CpuDetect();
_i586=ipentium();
-#ifdef HAVE_3DNOW
+#ifndef HAVE_MMX
+ _i586 &= 1;
+#endif
_3dnow=a3dnow();
+#ifndef HAVE_3DNOW
+ _3dnow = 0;
#endif
-
- printf( "mp3lib: Processor ID: %x\n",_CpuID );
- printf( "mp3lib: i586 processor %sdetected.\n",(_i586?"":"not ") );
-#ifdef HAVE_3DNOW
- printf( "mp3lib: AMD 3dnow! extension %sdetected.\n",(_3dnow?"":"not ") );
+#ifndef HAVE_3DNOWEX
+ _3dnow &= 1;
+#endif
+ _isse=isse();
+#ifndef HAVE_SSE
+ _isse = 0;
#endif
-#ifdef HAVE_3DNOWEX
- printf( "mp3lib: AMD 3dnow-dsp! extension %sdetected.\n",(_3dnow>1?"":"not ") );
+#ifndef HAVE_SSE2
+ _isse &= 1;
#endif
+ _has_mmx=_i586>1||_3dnow||_isse;
+ printf( "mp3lib: Processor ID: %x\n",_CpuID );
+ if(_i586&&!_3dnow&&!_isse)
+ printf( "mp3lib: Using Pentium%s optimized decore.\n",(_i586>1?"-MMX":""));
+ else
+ if(_isse)
+ /*
+	      Note: this is OK, since the K8 will have SSE2 support and will be
+	      much faster than the P4 ;)
+ */
+ printf( "mp3lib: Using SSE%s! optimized decore.\n",(_isse>1?"2":""));
+ else
+ if(_3dnow)
+ printf( "mp3lib: Using AMD 3dnow%s! optimized decore.\n",(_3dnow>1?"-dsp(k7)":""));
- make_decode_tables(outscale);
+/* Use it for any MMX cpu */
+ if(_has_mmx) make_decode_tables_MMX(outscale);
+ else make_decode_tables(outscale);
#ifdef USE_FAKE_MONO
if (fakemono == 1)
fr.synth=synth_1to1_l;
@@ -381,6 +408,42 @@ void MP3_Init(){
init_layer2();
init_layer3(fr.down_sample_sblimit);
tables_done_flag=1;
+
+ dct36_func=dct36;
+ if(_isse)
+ {
+ synth_func=synth_1to1_MMX;
+ dct64_MMX_func=dct64_MMX;
+ }
+ else
+ if ( _3dnow > 1 )
+ {
+ synth_func=synth_1to1_MMX;
+ dct36_func=dct36_3dnowex;
+ dct64_MMX_func=dct64_MMX_3dnowex;
+ }
+ else
+ if ( _3dnow )
+ {
+ synth_func=synth_1to1_MMX;
+ dct36_func=dct36_3dnow;
+ dct64_MMX_func=dct64_MMX_3dnow;
+ }
+ else
+ if ( _i586 > 1)
+ {
+ synth_func=synth_1to1_MMX;
+ dct64_MMX_func=dct64_MMX;
+ }
+ else
+ if ( _i586 )
+ {
+ synth_func=synth_1to1_pent;
+ }
+ else
+ {
+ synth_func = NULL;
+ }
}
#if 0
diff --git a/mp3lib/tabinit.c b/mp3lib/tabinit.c
index 60c2e258b4..b6b7a3386b 100644
--- a/mp3lib/tabinit.c
+++ b/mp3lib/tabinit.c
@@ -1,20 +1,7 @@
+real decwin[(512+32)], cos64[32], cos32[16], cos16[8], cos8[4], cos4[2];
+real *pnts[]={ cos64,cos32,cos16,cos8,cos4 };
-
-#ifdef HAVE_3DNOW
- real decwin[2*(512+32)] __attribute__((aligned(8)));
- real cos64[32] __attribute__((aligned(8)));
- real cos32[16] __attribute__((aligned(8)));
- real cos16[8] __attribute__((aligned(8)));
- real cos8[4] __attribute__((aligned(8)));
- real cos4[2] __attribute__((aligned(8)));
- real *pnts[]={ cos64,cos32,cos16,cos8,cos4 };
-#else
- real decwin[512+32];
- real cos64[16],cos32[8],cos16[4],cos8[2],cos4[1];
- real *pnts[] = { cos64,cos32,cos16,cos8,cos4 };
-#endif
-
-long intwinbase[] = {
+static long intwinbase[] = {
0, -1, -1, -1, -1, -1, -1, -2, -2, -2,
-2, -3, -3, -4, -4, -5, -5, -6, -7, -7,
-8, -9, -10, -11, -13, -14, -16, -17, -19, -21,
@@ -42,7 +29,7 @@ long intwinbase[] = {
64019, 65290, 66494, 67629, 68692, 69679, 70590, 71420, 72169, 72835,
73415, 73908, 74313, 74630, 74856, 74992, 75038 };
- void make_decode_tables(long scaleval)
+void make_decode_tables(long scaleval)
{
int i,j,k,kr,divv;
real *table,*costab;
@@ -53,17 +40,13 @@ long intwinbase[] = {
kr=0x10>>i; divv=0x40>>i;
costab = pnts[i];
for(k=0;k<kr;k++) costab[k] = 1.0 / (2.0 * cos(M_PI * ((double) k * 2.0 + 1.0) / (double) divv));
- #ifdef HAVE_3DNOW
- if ( _3dnow ) for(k=0;k<kr;k++) costab[k+kr]=-costab[k];
- #endif
-
}
table = decwin;
scaleval = -scaleval;
for(i=0,j=0;i<256;i++,j++,table+=32)
{
- if(table < decwin+512+16)
+ if(table < decwin+512+16)
table[16] = table[0] = (double) intwinbase[j] / 65536.0 * (double) scaleval;
if(i % 32 == 31)
table -= 1023;
@@ -80,14 +63,6 @@ long intwinbase[] = {
if(i % 64 == 63)
scaleval = - scaleval;
}
- #ifdef HAVE_3DNOW
- if ( _3dnow )
- for(i=0;i<512+32;i++)
- {
- decwin[512+31-i]*=65536.0; // allows faster clipping in 3dnow code
- decwin[512+32+i]=decwin[512+31-i];
- }
- #endif
}
diff --git a/mp3lib/tabinit_MMX.s b/mp3lib/tabinit_MMX.s
new file mode 100644
index 0000000000..90b4d1d223
--- /dev/null
+++ b/mp3lib/tabinit_MMX.s
@@ -0,0 +1,161 @@
+# This code was taken from http://www.mpg123.org
+# See the ChangeLog of mpg123-0.59s-pre.1 for details.
+# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+.bss
+ .align 8
+ .comm decwin,2176,32
+ .align 8
+ .comm decwins,2176,32
+.data
+ .align 8
+intwinbase_MMX:
+ .value 0, -1, -1, -1, -1, -1, -1, -2
+ .value -2, -2, -2, -3, -3, -4, -4, -5
+ .value -5, -6, -7, -7, -8, -9, -10, -11
+ .value -13, -14, -16, -17, -19, -21, -24, -26
+ .value -29, -31, -35, -38, -41, -45, -49, -53
+ .value -58, -63, -68, -73, -79, -85, -91, -97
+ .value -104, -111, -117, -125, -132, -139, -147, -154
+ .value -161, -169, -176, -183, -190, -196, -202, -208
+ .value -213, -218, -222, -225, -227, -228, -228, -227
+ .value -224, -221, -215, -208, -200, -189, -177, -163
+ .value -146, -127, -106, -83, -57, -29, 2, 36
+ .value 72, 111, 153, 197, 244, 294, 347, 401
+ .value 459, 519, 581, 645, 711, 779, 848, 919
+ .value 991, 1064, 1137, 1210, 1283, 1356, 1428, 1498
+ .value 1567, 1634, 1698, 1759, 1817, 1870, 1919, 1962
+ .value 2001, 2032, 2057, 2075, 2085, 2087, 2080, 2063
+ .value 2037, 2000, 1952, 1893, 1822, 1739, 1644, 1535
+ .value 1414, 1280, 1131, 970, 794, 605, 402, 185
+ .value -45, -288, -545, -814, -1095, -1388, -1692, -2006
+ .value -2330, -2663, -3004, -3351, -3705, -4063, -4425, -4788
+ .value -5153, -5517, -5879, -6237, -6589, -6935, -7271, -7597
+ .value -7910, -8209, -8491, -8755, -8998, -9219, -9416, -9585
+ .value -9727, -9838, -9916, -9959, -9966, -9935, -9863, -9750
+ .value -9592, -9389, -9139, -8840, -8492, -8092, -7640, -7134
+ .value -6574, -5959, -5288, -4561, -3776, -2935, -2037, -1082
+ .value -70, 998, 2122, 3300, 4533, 5818, 7154, 8540
+ .value 9975, 11455, 12980, 14548, 16155, 17799, 19478, 21189
+ .value 22929, 24694, 26482, 28289, 30112, 31947,-26209,-24360
+ .value -22511,-20664,-18824,-16994,-15179,-13383,-11610, -9863
+ .value -8147, -6466, -4822, -3222, -1667, -162, 1289, 2684
+ .value 4019, 5290, 6494, 7629, 8692, 9679, 10590, 11420
+ .value 12169, 12835, 13415, 13908, 14313, 14630, 14856, 14992
+ .value 15038
+
+intwindiv:
+ .long 0x47800000 # 65536.0
+.text
+ .align 32
+.globl make_decode_tables_MMX
+make_decode_tables_MMX:
+ pushl %edi
+ pushl %esi
+ pushl %ebx
+
+ xorl %ecx,%ecx
+ xorl %ebx,%ebx
+ movl $32,%esi
+ movl $intwinbase_MMX,%edi
+ negl 16(%esp) # scaleval
+ pushl $2 # intwinbase step
+.L00:
+ cmpl $528,%ecx
+ jnc .L02
+ movswl (%edi),%eax
+ cmpl $intwinbase_MMX+444,%edi
+ jc .L01
+ addl $60000,%eax
+.L01:
+ pushl %eax
+ fildl (%esp)
+ fdivs intwindiv
+ fimull 24(%esp)
+ popl %eax
+ fsts decwin(,%ecx,4)
+ fstps decwin+64(,%ecx,4)
+.L02:
+ leal -1(%esi),%edx
+ and %ebx,%edx
+ cmp $31,%edx
+ jnz .L03
+ addl $-1023,%ecx
+ test %esi,%ebx
+ jz .L03
+ negl 20(%esp)
+.L03:
+ addl %esi,%ecx
+ addl (%esp),%edi
+ incl %ebx
+ cmpl $intwinbase_MMX,%edi
+ jz .L04
+ cmp $256,%ebx
+ jnz .L00
+ negl (%esp)
+ jmp .L00
+.L04:
+ popl %eax
+
+ xorl %ecx,%ecx
+ xorl %ebx,%ebx
+ pushl $2
+.L05:
+ cmpl $528,%ecx
+ jnc .L11
+ movswl (%edi),%eax
+ cmpl $intwinbase_MMX+444,%edi
+ jc .L06
+ addl $60000,%eax
+.L06:
+ cltd
+ imull 20(%esp)
+ shrdl $17,%edx,%eax
+ cmpl $32767,%eax
+ movl $1055,%edx
+ jle .L07
+ movl $32767,%eax
+ jmp .L08
+.L07:
+ cmpl $-32767,%eax
+ jge .L08
+ movl $-32767,%eax
+.L08:
+ cmpl $512,%ecx
+ jnc .L09
+ subl %ecx,%edx
+ movw %ax,decwins(,%edx,2)
+ movw %ax,decwins-32(,%edx,2)
+.L09:
+ testl $1,%ecx
+ jnz .L10
+ negl %eax
+.L10:
+ movw %ax,decwins(,%ecx,2)
+ movw %ax,decwins+32(,%ecx,2)
+.L11:
+ leal -1(%esi),%edx
+ and %ebx,%edx
+ cmp $31,%edx
+ jnz .L12
+ addl $-1023,%ecx
+ test %esi,%ebx
+ jz .L12
+ negl 20(%esp)
+.L12:
+ addl %esi,%ecx
+ addl (%esp),%edi
+ incl %ebx
+ cmpl $intwinbase_MMX,%edi
+ jz .L13
+ cmp $256,%ebx
+ jnz .L05
+ negl (%esp)
+ jmp .L05
+.L13:
+ popl %eax
+
+ popl %ebx
+ popl %esi
+ popl %edi
+ ret
+
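Note on the new tabinit_MMX.s above: make_decode_tables_MMX fills two tables, the float decwin (the first loop, fildl/fdivs/fimull) and the 16-bit decwins used by the pmaddwd synth (the second loop). A hedged C model of the integer value path in the second loop; the index bookkeeping (the 32-entry stride and the mirrored store into decwins(,%edx,2)) is left out:

#include <stdint.h>

/* Rough model of .L06-.L10: scale, >>17, clamp to +/-32767, and flip
 * the sign on every other entry (the testl $1,%ecx / negl pair).
 * Illustrative names only. */
static int16_t decwins_entry(int32_t base, int32_t scaleval, int idx)
{
    int64_t v = (int64_t)base * scaleval;      /* cltd + imull        */
    int32_t q = (int32_t)(v >> 17);            /* shrdl $17,%edx,%eax */
    if (q >  32767) q =  32767;
    if (q < -32767) q = -32767;
    if (!(idx & 1))                            /* negl unless bit 0 set */
        q = -q;
    return (int16_t)q;
}
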
diff --git a/mp3lib/test2.c b/mp3lib/test2.c
index e520ddfc7d..ccde3de91a 100644
--- a/mp3lib/test2.c
+++ b/mp3lib/test2.c
@@ -1,5 +1,5 @@
-// gcc test.c -I.. -L. -lMP3 -lm -o test2 -O4
+// gcc test2.c -O2 -I.. -L. ../libvo/aclib.c -lMP3 -lm -o test2
#include <stdio.h>
#include <stdlib.h>