diff options
author | nick <nick@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-06-29 17:55:35 +0000 |
---|---|---|
committer | nick <nick@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-06-29 17:55:35 +0000 |
commit | 2ec6762923fea7f28331849b1d394f30dfce1aff (patch) | |
tree | 58ff3fcc1ac955a2b07e81d74fe489076e1fe631 /mp3lib/dct64_3dnow.s | |
parent | bf8a76c06387345aa448b66ce2dff37ba0fcd69e (diff) |
Added the newest MMX-optimized decoder, which speeds up decoding by at least 13% on any CPU.
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@1246 b3059339-0415-0410-9bf9-f77b7e298cf2
Diffstat (limited to 'mp3lib/dct64_3dnow.s')
-rw-r--r-- | mp3lib/dct64_3dnow.s | 1636 |
1 files changed, 931 insertions, 705 deletions
diff --git a/mp3lib/dct64_3dnow.s b/mp3lib/dct64_3dnow.s index b7540573a6..dfade383db 100644 --- a/mp3lib/dct64_3dnow.s +++ b/mp3lib/dct64_3dnow.s @@ -1,706 +1,932 @@ -/// -/// Replacement of dct64() with AMD's 3DNow! SIMD operations support -/// -/// Syuuhei Kashiyama <squash@mb.kcom.ne.jp> -/// -/// The author of this program disclaim whole expressed or implied -/// warranties with regard to this program, and in no event shall the -/// author of this program liable to whatever resulted from the use of -/// this program. Use it at your own risk. -/// - - .globl dct64_3dnow - .type dct64_3dnow,@function -dct64_3dnow: - subl $256,%esp - pushl %ebp - pushl %edi - pushl %esi - pushl %ebx - leal 16(%esp),%ebx - movl 284(%esp),%edi - movl 276(%esp),%ebp - movl 280(%esp),%edx - leal 128(%ebx),%esi - - / femms - - // 1 - movl pnts,%eax - movq 0(%edi),%mm0 - movq %mm0,%mm1 - movd 124(%edi),%mm2 - punpckldq 120(%edi),%mm2 - movq 0(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,0(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,124(%ebx) - psrlq $32,%mm1 - movd %mm1,120(%ebx) - movq 8(%edi),%mm4 - movq %mm4,%mm5 - movd 116(%edi),%mm6 - punpckldq 112(%edi),%mm6 - movq 8(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,8(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,116(%ebx) - psrlq $32,%mm5 - movd %mm5,112(%ebx) - movq 16(%edi),%mm0 - movq %mm0,%mm1 - movd 108(%edi),%mm2 - punpckldq 104(%edi),%mm2 - movq 16(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,16(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,108(%ebx) - psrlq $32,%mm1 - movd %mm1,104(%ebx) - movq 24(%edi),%mm4 - movq %mm4,%mm5 - movd 100(%edi),%mm6 - punpckldq 96(%edi),%mm6 - movq 24(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,24(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,100(%ebx) - psrlq $32,%mm5 - movd %mm5,96(%ebx) - movq 32(%edi),%mm0 - movq %mm0,%mm1 - movd 92(%edi),%mm2 - punpckldq 88(%edi),%mm2 - movq 32(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,32(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd 
%mm1,92(%ebx) - psrlq $32,%mm1 - movd %mm1,88(%ebx) - movq 40(%edi),%mm4 - movq %mm4,%mm5 - movd 84(%edi),%mm6 - punpckldq 80(%edi),%mm6 - movq 40(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,40(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,84(%ebx) - psrlq $32,%mm5 - movd %mm5,80(%ebx) - movq 48(%edi),%mm0 - movq %mm0,%mm1 - movd 76(%edi),%mm2 - punpckldq 72(%edi),%mm2 - movq 48(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,48(%ebx) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,76(%ebx) - psrlq $32,%mm1 - movd %mm1,72(%ebx) - movq 56(%edi),%mm4 - movq %mm4,%mm5 - movd 68(%edi),%mm6 - punpckldq 64(%edi),%mm6 - movq 56(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,56(%ebx) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,68(%ebx) - psrlq $32,%mm5 - movd %mm5,64(%ebx) - - // 2 - movl pnts+4,%eax - / 0, 14 - movq 0(%ebx),%mm0 - movq %mm0,%mm1 - movd 60(%ebx),%mm2 - punpckldq 56(%ebx),%mm2 - movq 0(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,0(%esi) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,60(%esi) - psrlq $32,%mm1 - movd %mm1,56(%esi) - / 16, 30 - movq 64(%ebx),%mm0 - movq %mm0,%mm1 - movd 124(%ebx),%mm2 - punpckldq 120(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,64(%esi) - pfsubr %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,124(%esi) - psrlq $32,%mm1 - movd %mm1,120(%esi) - movq 8(%ebx),%mm4 - / 2, 12 - movq %mm4,%mm5 - movd 52(%ebx),%mm6 - punpckldq 48(%ebx),%mm6 - movq 8(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,8(%esi) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,52(%esi) - psrlq $32,%mm5 - movd %mm5,48(%esi) - movq 72(%ebx),%mm4 - / 18, 28 - movq %mm4,%mm5 - movd 116(%ebx),%mm6 - punpckldq 112(%ebx),%mm6 - pfadd %mm6,%mm4 - movq %mm4,72(%esi) - pfsubr %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,116(%esi) - psrlq $32,%mm5 - movd %mm5,112(%esi) - movq 16(%ebx),%mm0 - / 4, 10 - movq %mm0,%mm1 - movd 44(%ebx),%mm2 - punpckldq 40(%ebx),%mm2 - movq 16(%eax),%mm3 - pfadd %mm2,%mm0 - movq %mm0,16(%esi) - pfsub %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,44(%esi) - psrlq $32,%mm1 - movd 
%mm1,40(%esi) - movq 80(%ebx),%mm0 - / 20, 26 - movq %mm0,%mm1 - movd 108(%ebx),%mm2 - punpckldq 104(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,80(%esi) - pfsubr %mm2,%mm1 - pfmul %mm3,%mm1 - movd %mm1,108(%esi) - psrlq $32,%mm1 - movd %mm1,104(%esi) - movq 24(%ebx),%mm4 - / 6, 8 - movq %mm4,%mm5 - movd 36(%ebx),%mm6 - punpckldq 32(%ebx),%mm6 - movq 24(%eax),%mm7 - pfadd %mm6,%mm4 - movq %mm4,24(%esi) - pfsub %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,36(%esi) - psrlq $32,%mm5 - movd %mm5,32(%esi) - movq 88(%ebx),%mm4 - / 22, 24 - movq %mm4,%mm5 - movd 100(%ebx),%mm6 - punpckldq 96(%ebx),%mm6 - pfadd %mm6,%mm4 - movq %mm4,88(%esi) - pfsubr %mm6,%mm5 - pfmul %mm7,%mm5 - movd %mm5,100(%esi) - psrlq $32,%mm5 - movd %mm5,96(%esi) - - // 3 - movl pnts+8,%eax - movq 0(%eax),%mm0 - movq 8(%eax),%mm1 - movq 0(%esi),%mm2 - / 0, 6 - movq %mm2,%mm3 - movd 28(%esi),%mm4 - punpckldq 24(%esi),%mm4 - pfadd %mm4,%mm2 - pfsub %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,0(%ebx) - movd %mm3,28(%ebx) - psrlq $32,%mm3 - movd %mm3,24(%ebx) - movq 8(%esi),%mm5 - / 2, 4 - movq %mm5,%mm6 - movd 20(%esi),%mm7 - punpckldq 16(%esi),%mm7 - pfadd %mm7,%mm5 - pfsub %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,8(%ebx) - movd %mm6,20(%ebx) - psrlq $32,%mm6 - movd %mm6,16(%ebx) - movq 32(%esi),%mm2 - / 8, 14 - movq %mm2,%mm3 - movd 60(%esi),%mm4 - punpckldq 56(%esi),%mm4 - pfadd %mm4,%mm2 - pfsubr %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,32(%ebx) - movd %mm3,60(%ebx) - psrlq $32,%mm3 - movd %mm3,56(%ebx) - movq 40(%esi),%mm5 - / 10, 12 - movq %mm5,%mm6 - movd 52(%esi),%mm7 - punpckldq 48(%esi),%mm7 - pfadd %mm7,%mm5 - pfsubr %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,40(%ebx) - movd %mm6,52(%ebx) - psrlq $32,%mm6 - movd %mm6,48(%ebx) - movq 64(%esi),%mm2 - / 16, 22 - movq %mm2,%mm3 - movd 92(%esi),%mm4 - punpckldq 88(%esi),%mm4 - pfadd %mm4,%mm2 - pfsub %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,64(%ebx) - movd %mm3,92(%ebx) - psrlq $32,%mm3 - movd %mm3,88(%ebx) - movq 72(%esi),%mm5 - / 18, 20 - movq %mm5,%mm6 - movd 
84(%esi),%mm7 - punpckldq 80(%esi),%mm7 - pfadd %mm7,%mm5 - pfsub %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,72(%ebx) - movd %mm6,84(%ebx) - psrlq $32,%mm6 - movd %mm6,80(%ebx) - movq 96(%esi),%mm2 - / 24, 30 - movq %mm2,%mm3 - movd 124(%esi),%mm4 - punpckldq 120(%esi),%mm4 - pfadd %mm4,%mm2 - pfsubr %mm4,%mm3 - pfmul %mm0,%mm3 - movq %mm2,96(%ebx) - movd %mm3,124(%ebx) - psrlq $32,%mm3 - movd %mm3,120(%ebx) - movq 104(%esi),%mm5 - / 26, 28 - movq %mm5,%mm6 - movd 116(%esi),%mm7 - punpckldq 112(%esi),%mm7 - pfadd %mm7,%mm5 - pfsubr %mm7,%mm6 - pfmul %mm1,%mm6 - movq %mm5,104(%ebx) - movd %mm6,116(%ebx) - psrlq $32,%mm6 - movd %mm6,112(%ebx) - - // 4 - movl pnts+12,%eax - movq 0(%eax),%mm0 - movq 0(%ebx),%mm1 - / 0 - movq %mm1,%mm2 - movd 12(%ebx),%mm3 - punpckldq 8(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,0(%esi) - movd %mm2,12(%esi) - psrlq $32,%mm2 - movd %mm2,8(%esi) - movq 16(%ebx),%mm4 - / 4 - movq %mm4,%mm5 - movd 28(%ebx),%mm6 - punpckldq 24(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,16(%esi) - movd %mm5,28(%esi) - psrlq $32,%mm5 - movd %mm5,24(%esi) - movq 32(%ebx),%mm1 - / 8 - movq %mm1,%mm2 - movd 44(%ebx),%mm3 - punpckldq 40(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,32(%esi) - movd %mm2,44(%esi) - psrlq $32,%mm2 - movd %mm2,40(%esi) - movq 48(%ebx),%mm4 - / 12 - movq %mm4,%mm5 - movd 60(%ebx),%mm6 - punpckldq 56(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,48(%esi) - movd %mm5,60(%esi) - psrlq $32,%mm5 - movd %mm5,56(%esi) - movq 64(%ebx),%mm1 - / 16 - movq %mm1,%mm2 - movd 76(%ebx),%mm3 - punpckldq 72(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,64(%esi) - movd %mm2,76(%esi) - psrlq $32,%mm2 - movd %mm2,72(%esi) - movq 80(%ebx),%mm4 - / 20 - movq %mm4,%mm5 - movd 92(%ebx),%mm6 - punpckldq 88(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,80(%esi) - movd 
%mm5,92(%esi) - psrlq $32,%mm5 - movd %mm5,88(%esi) - movq 96(%ebx),%mm1 - / 24 - movq %mm1,%mm2 - movd 108(%ebx),%mm3 - punpckldq 104(%ebx),%mm3 - pfadd %mm3,%mm1 - pfsub %mm3,%mm2 - pfmul %mm0,%mm2 - movq %mm1,96(%esi) - movd %mm2,108(%esi) - psrlq $32,%mm2 - movd %mm2,104(%esi) - movq 112(%ebx),%mm4 - / 28 - movq %mm4,%mm5 - movd 124(%ebx),%mm6 - punpckldq 120(%ebx),%mm6 - pfadd %mm6,%mm4 - pfsubr %mm6,%mm5 - pfmul %mm0,%mm5 - movq %mm4,112(%esi) - movd %mm5,124(%esi) - psrlq $32,%mm5 - movd %mm5,120(%esi) - - // 5 - movl $-1,%eax - movd %eax,%mm1 - movl $1,%eax - movd %eax,%mm0 - / L | H - punpckldq %mm1,%mm0 - pi2fd %mm0,%mm0 - / 1.0 | -1.0 - movd %eax,%mm1 - pi2fd %mm1,%mm1 - movl pnts+16,%eax - movd 0(%eax),%mm2 - punpckldq %mm2,%mm1 - / 1.0 | cos0 - movq 0(%esi),%mm2 - / 0 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq %mm2,0(%ebx) - movq 8(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,8(%ebx) - movq 16(%esi),%mm2 - / 4 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 24(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,16(%ebx) - movq %mm4,24(%ebx) - movq 32(%esi),%mm2 - / 8 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq %mm2,32(%ebx) - movq 40(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,40(%ebx) - movq 48(%esi),%mm2 - / 12 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 56(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 
- movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,48(%ebx) - movq %mm4,56(%ebx) - movq 64(%esi),%mm2 - / 16 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq %mm2,64(%ebx) - movq 72(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,72(%ebx) - movq 80(%esi),%mm2 - / 20 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 88(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,80(%ebx) - movq %mm4,88(%ebx) - movq 96(%esi),%mm2 - / 24 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq %mm2,96(%ebx) - movq 104(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm4,104(%ebx) - movq 112(%esi),%mm2 - / 28 - movq %mm2,%mm3 - pfmul %mm0,%mm3 - pfacc %mm3,%mm2 - pfmul %mm1,%mm2 - movq 120(%esi),%mm4 - movq %mm4,%mm5 - pfmul %mm0,%mm5 - pfacc %mm5,%mm4 - pfmul %mm0,%mm4 - pfmul %mm1,%mm4 - movq %mm4,%mm5 - psrlq $32,%mm5 - pfacc %mm5,%mm4 - movq %mm2,%mm3 - psrlq $32,%mm3 - pfadd %mm4,%mm2 - pfadd %mm3,%mm4 - movq %mm2,112(%ebx) - movq %mm4,120(%ebx) - - // Phase6 - movl 0(%ebx),%eax - movl %eax,1024(%ebp) - movl 4(%ebx),%eax - movl %eax,0(%ebp) - movl %eax,0(%edx) - movl 8(%ebx),%eax - movl %eax,512(%ebp) - movl 12(%ebx),%eax - movl %eax,512(%edx) - - movl 16(%ebx),%eax - movl %eax,768(%ebp) - movl 20(%ebx),%eax - movl %eax,256(%edx) - - movl 24(%ebx),%eax - movl %eax,256(%ebp) - movl 28(%ebx),%eax - movl %eax,768(%edx) - - movq 32(%ebx),%mm0 - movq 48(%ebx),%mm1 - pfadd %mm1,%mm0 - movd %mm0,896(%ebp) - psrlq $32,%mm0 - movd %mm0,128(%edx) - movq 40(%ebx),%mm2 - pfadd 
%mm2,%mm1 - movd %mm1,640(%ebp) - psrlq $32,%mm1 - movd %mm1,384(%edx) - - movq 56(%ebx),%mm3 - pfadd %mm3,%mm2 - movd %mm2,384(%ebp) - psrlq $32,%mm2 - movd %mm2,640(%edx) - - movd 36(%ebx),%mm4 - pfadd %mm4,%mm3 - movd %mm3,128(%ebp) - psrlq $32,%mm3 - movd %mm3,896(%edx) - movq 96(%ebx),%mm0 - movq 64(%ebx),%mm1 - - movq 112(%ebx),%mm2 - pfadd %mm2,%mm0 - movq %mm0,%mm3 - pfadd %mm1,%mm3 - movd %mm3,960(%ebp) - psrlq $32,%mm3 - movd %mm3,64(%edx) - movq 80(%ebx),%mm1 - pfadd %mm1,%mm0 - movd %mm0,832(%ebp) - psrlq $32,%mm0 - movd %mm0,192(%edx) - movq 104(%ebx),%mm3 - pfadd %mm3,%mm2 - movq %mm2,%mm4 - pfadd %mm1,%mm4 - movd %mm4,704(%ebp) - psrlq $32,%mm4 - movd %mm4,320(%edx) - movq 72(%ebx),%mm1 - pfadd %mm1,%mm2 - movd %mm2,576(%ebp) - psrlq $32,%mm2 - movd %mm2,448(%edx) - - movq 120(%ebx),%mm4 - pfadd %mm4,%mm3 - movq %mm3,%mm5 - pfadd %mm1,%mm5 - movd %mm5,448(%ebp) - psrlq $32,%mm5 - movd %mm5,576(%edx) - movq 88(%ebx),%mm1 - pfadd %mm1,%mm3 - movd %mm3,320(%ebp) - psrlq $32,%mm3 - movd %mm3,704(%edx) - - movd 100(%ebx),%mm5 - pfadd %mm5,%mm4 - movq %mm4,%mm6 - pfadd %mm1,%mm6 - movd %mm6,192(%ebp) - psrlq $32,%mm6 - movd %mm6,832(%edx) - movd 68(%ebx),%mm1 - pfadd %mm1,%mm4 - movd %mm4,64(%ebp) - psrlq $32,%mm4 - movd %mm4,960(%edx) - - / femms - - popl %ebx - popl %esi - popl %edi - popl %ebp - addl $256,%esp - - ret +# This code was taken from http://www.mpg123.org +# See ChangeLog of mpg123-0.59s-pre.1 for detail +# Applied to mplayer by Nick Kurshev <nickols_k@mail.ru> +# Partial 3dnow! optimization by Nick Kurshev +# +# TODO: finish 3dnow! 
optimization at least in scalar mode +# + +.data + .align 8 +plus_minus_3dnow: .long 0x00000000, 0x80000000 +costab: + .long 1056974725 + .long 1057056395 + .long 1057223771 + .long 1057485416 + .long 1057855544 + .long 1058356026 + .long 1059019886 + .long 1059897405 + .long 1061067246 + .long 1062657950 + .long 1064892987 + .long 1066774581 + .long 1069414683 + .long 1073984175 + .long 1079645762 + .long 1092815430 + .long 1057005197 + .long 1057342072 + .long 1058087743 + .long 1059427869 + .long 1061799040 + .long 1065862217 + .long 1071413542 + .long 1084439708 + .long 1057128951 + .long 1058664893 + .long 1063675095 + .long 1076102863 + .long 1057655764 + .long 1067924853 + .long 1060439283 + +.text + + .align 16 + +.globl dct64_MMX_3dnow +dct64_MMX_3dnow: + pushl %ebx + pushl %esi + pushl %edi + subl $256,%esp + movl 280(%esp),%eax + + leal 128(%esp),%edx + movl 272(%esp),%esi + movl 276(%esp),%edi + movl $costab,%ebx + orl %ecx,%ecx + movl %esp,%ecx + femms +/* Phase 1*/ + movq (%eax), %mm0 + movq 8(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%eax), %mm1 + movq 112(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%edx) + movq %mm4, 8(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul (%ebx), %mm3 + pfmul 8(%ebx), %mm7 + movd %mm3, 124(%edx) + movd %mm7, 116(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 120(%edx) + movd %mm7, 112(%edx) + + movq 16(%eax), %mm0 + movq 24(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%eax), %mm1 + movq 96(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 16(%edx) + movq %mm4, 24(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 16(%ebx), %mm3 + pfmul 24(%ebx), %mm7 + movd 
%mm3, 108(%edx) + movd %mm7, 100(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 104(%edx) + movd %mm7, 96(%edx) + + movq 32(%eax), %mm0 + movq 40(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 88(%eax), %mm1 + movq 80(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%edx) + movq %mm4, 40(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 32(%ebx), %mm3 + pfmul 40(%ebx), %mm7 + movd %mm3, 92(%edx) + movd %mm7, 84(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 88(%edx) + movd %mm7, 80(%edx) + + movq 48(%eax), %mm0 + movq 56(%eax), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 72(%eax), %mm1 + movq 64(%eax), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 48(%edx) + movq %mm4, 56(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 48(%ebx), %mm3 + pfmul 56(%ebx), %mm7 + movd %mm3, 76(%edx) + movd %mm7, 68(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 72(%edx) + movd %mm7, 64(%edx) + +/* Phase 2*/ + + movq (%edx), %mm0 + movq 8(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 56(%edx), %mm1 + movq 48(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%ecx) + movq %mm4, 8(%ecx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 64(%ebx), %mm3 + pfmul 72(%ebx), %mm7 + movd %mm3, 60(%ecx) + movd %mm7, 52(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 56(%ecx) + movd %mm7, 48(%ecx) + + movq 16(%edx), %mm0 + movq 24(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 40(%edx), %mm1 + movq 32(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + 
psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 16(%ecx) + movq %mm4, 24(%ecx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 80(%ebx), %mm3 + pfmul 88(%ebx), %mm7 + movd %mm3, 44(%ecx) + movd %mm7, 36(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 40(%ecx) + movd %mm7, 32(%ecx) + +/* Phase 3*/ + + movq 64(%edx), %mm0 + movq 72(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%edx), %mm1 + movq 112(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%ecx) + movq %mm4, 72(%ecx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 64(%ebx), %mm3 + pfmul 72(%ebx), %mm7 + movd %mm3, 124(%ecx) + movd %mm7, 116(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 120(%ecx) + movd %mm7, 112(%ecx) + + movq 80(%edx), %mm0 + movq 88(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%edx), %mm1 + movq 96(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 80(%ecx) + movq %mm4, 88(%ecx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 80(%ebx), %mm3 + pfmul 88(%ebx), %mm7 + movd %mm3, 108(%ecx) + movd %mm7, 100(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 104(%ecx) + movd %mm7, 96(%ecx) + +/* Phase 4*/ + + movq (%ecx), %mm0 + movq 8(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 24(%ecx), %mm1 + movq 16(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%edx) + movq %mm4, 8(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 28(%edx) 
+ movd %mm7, 20(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 24(%edx) + movd %mm7, 16(%edx) + + movq 32(%ecx), %mm0 + movq 40(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 56(%ecx), %mm1 + movq 48(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%edx) + movq %mm4, 40(%edx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 60(%edx) + movd %mm7, 52(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 56(%edx) + movd %mm7, 48(%edx) + + movq 64(%ecx), %mm0 + movq 72(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 88(%ecx), %mm1 + movq 80(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%edx) + movq %mm4, 72(%edx) + pfsub %mm1, %mm3 + pfsub %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 92(%edx) + movd %mm7, 84(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 88(%edx) + movd %mm7, 80(%edx) + + movq 96(%ecx), %mm0 + movq 104(%ecx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 120(%ecx), %mm1 + movq 112(%ecx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 96(%edx) + movq %mm4, 104(%edx) + pfsubr %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 96(%ebx), %mm3 + pfmul 104(%ebx), %mm7 + movd %mm3, 124(%edx) + movd %mm7, 116(%edx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 120(%edx) + movd %mm7, 112(%edx) + +/* Phase 5 */ + + movq (%edx), %mm0 + movq 16(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 8(%edx), %mm1 + movq 24(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + 
psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, (%ecx) + movq %mm4, 16(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 12(%ecx) + movd %mm7, 28(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 8(%ecx) + movd %mm7, 24(%ecx) + + movq 32(%edx), %mm0 + movq 48(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 40(%edx), %mm1 + movq 56(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 32(%ecx) + movq %mm4, 48(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 44(%ecx) + movd %mm7, 60(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 40(%ecx) + movd %mm7, 56(%ecx) + + movq 64(%edx), %mm0 + movq 80(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 72(%edx), %mm1 + movq 88(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 64(%ecx) + movq %mm4, 80(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 76(%ecx) + movd %mm7, 92(%ecx) + psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 72(%ecx) + movd %mm7, 88(%ecx) + + movq 96(%edx), %mm0 + movq 112(%edx), %mm4 + movq %mm0, %mm3 + movq %mm4, %mm7 + movq 104(%edx), %mm1 + movq 120(%edx), %mm5 + /* n.b.: pswapd*/ + movq %mm1, %mm2 + movq %mm5, %mm6 + psrlq $32, %mm1 + psrlq $32, %mm5 + punpckldq %mm2, %mm1 + punpckldq %mm6, %mm5 + /**/ + pfadd %mm1, %mm0 + pfadd %mm5, %mm4 + movq %mm0, 96(%ecx) + movq %mm4, 112(%ecx) + pfsub %mm1, %mm3 + pfsubr %mm5, %mm7 + pfmul 112(%ebx), %mm3 + pfmul 112(%ebx), %mm7 + movd %mm3, 108(%ecx) + movd %mm7, 124(%ecx) + 
psrlq $32, %mm3 + psrlq $32, %mm7 + movd %mm3, 104(%ecx) + movd %mm7, 120(%ecx) + +/* Phase 6. This is the end of easy road. */ + movl $1, %eax + movd %eax, %mm7 + pi2fd %mm7, %mm7 + movq 32(%ecx), %mm0 + punpckldq 120(%ebx), %mm7 /* 1.0 | 120(%ebx) */ + movq %mm0, %mm1 + movq plus_minus_3dnow, %mm6 + /* n.b.: pfpnacc */ + pxor %mm6, %mm1 + pfacc %mm1, %mm0 + /**/ + pfmul %mm7, %mm0 + movq %mm0, 32(%edx) + femms + + flds 44(%ecx) + fsubs 40(%ecx) + fmuls 120(%ebx) + + fsts 44(%edx) + fadds 40(%ecx) /* pfacc 40(ecx), 56(%ecx) */ + fadds 44(%ecx) + fstps 40(%edx) + + flds 48(%ecx) + fsubs 52(%ecx) + fmuls 120(%ebx) + + flds 60(%ecx) + fsubs 56(%ecx) + fmuls 120(%ebx) + + fld %st(0) + fadds 56(%ecx) + fadds 60(%ecx) + + fld %st(0) + fadds 48(%ecx) + fadds 52(%ecx) + fstps 48(%edx) + fadd %st(2) + fstps 56(%edx) + fsts 60(%edx) + faddp %st(1) + fstps 52(%edx) +/*---*/ + flds 64(%ecx) + fadds 68(%ecx) + fstps 64(%edx) + + flds 64(%ecx) + fsubs 68(%ecx) + fmuls 120(%ebx) + fstps 68(%edx) + + flds 76(%ecx) + fsubs 72(%ecx) + fmuls 120(%ebx) + fsts 76(%edx) + fadds 72(%ecx) + fadds 76(%ecx) + fstps 72(%edx) + + flds 92(%ecx) + fsubs 88(%ecx) + fmuls 120(%ebx) + fsts 92(%edx) + fadds 92(%ecx) + fadds 88(%ecx) + + fld %st(0) + fadds 80(%ecx) + fadds 84(%ecx) + fstps 80(%edx) + + flds 80(%ecx) + fsubs 84(%ecx) + fmuls 120(%ebx) + fadd %st(0), %st(1) + fadds 92(%edx) + fstps 84(%edx) + fstps 88(%edx) + + flds 96(%ecx) + fadds 100(%ecx) + fstps 96(%edx) + + flds 96(%ecx) + fsubs 100(%ecx) + fmuls 120(%ebx) + fstps 100(%edx) + + flds 108(%ecx) + fsubs 104(%ecx) + fmuls 120(%ebx) + fsts 108(%edx) + fadds 104(%ecx) + fadds 108(%ecx) + fstps 104(%edx) + + flds 124(%ecx) + fsubs 120(%ecx) + fmuls 120(%ebx) + fsts 124(%edx) + fadds 120(%ecx) + fadds 124(%ecx) + + fld %st(0) + fadds 112(%ecx) + fadds 116(%ecx) + fstps 112(%edx) + + flds 112(%ecx) + fsubs 116(%ecx) + fmuls 120(%ebx) + fadd %st(0),%st(1) + fadds 124(%edx) + fstps 116(%edx) + fstps 120(%edx) + jnz .L01 + +/* Phase 7*/ + 
+ flds (%ecx) + fadds 4(%ecx) + fstps 1024(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + fsts (%esi) + fstps (%edi) + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fsts 512(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fstps 512(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fsts 768(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fstps 768(%esi) + fadd %st(2) + fstps 256(%esi) + faddp %st(1) + fstps 256(%edi) + +/* Phase 8*/ + + flds 32(%edx) + fadds 48(%edx) + fstps 896(%esi) + + flds 48(%edx) + fadds 40(%edx) + fstps 640(%esi) + + flds 40(%edx) + fadds 56(%edx) + fstps 384(%esi) + + flds 56(%edx) + fadds 36(%edx) + fstps 128(%esi) + + flds 36(%edx) + fadds 52(%edx) + fstps 128(%edi) + + flds 52(%edx) + fadds 44(%edx) + fstps 384(%edi) + + flds 60(%edx) + fsts 896(%edi) + fadds 44(%edx) + fstps 640(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fstps 960(%esi) + fadds 80(%edx) + fstps 832(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fstps 704(%esi) + fadds 72(%edx) + fstps 576(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fstps 448(%esi) + fadds 88(%edx) + fstps 320(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fstps 192(%esi) + fadds 68(%edx) + fstps 64(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fstps 64(%edi) + fadds 84(%edx) + fstps 192(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fstps 320(%edi) + fadds 76(%edx) + fstps 448(%edi) + + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fstps 576(%edi) + fadds 92(%edx) + fstps 704(%edi) + + flds 124(%edx) + fsts 960(%edi) + fadds 92(%edx) + fstps 832(%edi) + jmp .L_bye +.L01: +/* Phase 9*/ + + flds (%ecx) + fadds 4(%ecx) + fistp 512(%esi) + + flds (%ecx) + fsubs 4(%ecx) + fmuls 120(%ebx) + + 
fistp (%esi) + + + flds 12(%ecx) + fsubs 8(%ecx) + fmuls 120(%ebx) + fist 256(%edi) + fadds 12(%ecx) + fadds 8(%ecx) + fistp 256(%esi) + + flds 16(%ecx) + fsubs 20(%ecx) + fmuls 120(%ebx) + + flds 28(%ecx) + fsubs 24(%ecx) + fmuls 120(%ebx) + fist 384(%edi) + fld %st(0) + fadds 24(%ecx) + fadds 28(%ecx) + fld %st(0) + fadds 16(%ecx) + fadds 20(%ecx) + fistp 384(%esi) + fadd %st(2) + fistp 128(%esi) + faddp %st(1) + fistp 128(%edi) + +/* Phase 10*/ + + flds 32(%edx) + fadds 48(%edx) + fistp 448(%esi) + + flds 48(%edx) + fadds 40(%edx) + fistp 320(%esi) + + flds 40(%edx) + fadds 56(%edx) + fistp 192(%esi) + + flds 56(%edx) + fadds 36(%edx) + fistp 64(%esi) + + flds 36(%edx) + fadds 52(%edx) + fistp 64(%edi) + + flds 52(%edx) + fadds 44(%edx) + fistp 192(%edi) + + flds 60(%edx) + fist 448(%edi) + fadds 44(%edx) + fistp 320(%edi) + + flds 96(%edx) + fadds 112(%edx) + fld %st(0) + fadds 64(%edx) + fistp 480(%esi) + fadds 80(%edx) + fistp 416(%esi) + + flds 112(%edx) + fadds 104(%edx) + fld %st(0) + fadds 80(%edx) + fistp 352(%esi) + fadds 72(%edx) + fistp 288(%esi) + + flds 104(%edx) + fadds 120(%edx) + fld %st(0) + fadds 72(%edx) + fistp 224(%esi) + fadds 88(%edx) + fistp 160(%esi) + + flds 120(%edx) + fadds 100(%edx) + fld %st(0) + fadds 88(%edx) + fistp 96(%esi) + fadds 68(%edx) + fistp 32(%esi) + + flds 100(%edx) + fadds 116(%edx) + fld %st(0) + fadds 68(%edx) + fistp 32(%edi) + fadds 84(%edx) + fistp 96(%edi) + + flds 116(%edx) + fadds 108(%edx) + fld %st(0) + fadds 84(%edx) + fistp 160(%edi) + fadds 76(%edx) + fistp 224(%edi) + + flds 108(%edx) + fadds 124(%edx) + fld %st(0) + fadds 76(%edx) + fistp 288(%edi) + fadds 92(%edx) + fistp 352(%edi) + + flds 124(%edx) + fist 480(%edi) + fadds 92(%edx) + fistp 416(%edi) + movsw +.L_bye: + addl $256,%esp + popl %edi + popl %esi + popl %ebx + ret + |