| author    | michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-11-22 19:40:38 +0000 |
| --- | --- | --- |
| committer | michael <michael@b3059339-0415-0410-9bf9-f77b7e298cf2> | 2001-11-22 19:40:38 +0000 |
| commit    | 6c6cc954f5eef6e800f4cb46db31d9932dc61f00 (patch)       |                           |
| tree      | d005591526537e9be19929124940af0f5f8a2295               |                           |
| parent    | 2a6e9d9edae6aaf94e643f026cb63774e907cbe6 (diff)        |                           |
10-20% faster fastmemcpy :) on my P3, at least. The algorithm is mostly taken from the "AMD Athlon Processor x86 Code Optimization Guide", so it should be faster on AMD chips too, but I fear it might be slower for mem->vram copies (someone should check that, I can't). There are 2 #defines to fine-tune it (BLOCK_SIZE & CONFUSION_FACTOR).
git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@3078 b3059339-0415-0410-9bf9-f77b7e298cf2
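For orientation, the technique behind the patch is the block-prefetch copy described in the AMD guide: read a whole BLOCK_SIZE chunk once so it is pulled into cache, then write it out (the real code uses non-temporal MOVNTQ stores), optionally followed by a few dummy reads (the CONFUSION_FACTOR) to give out-of-order CPUs extra independent work. The following is only a minimal plain-C sketch of that two-pass structure, assuming no MMX/SSE and leaving out the destination alignment and 64-byte unrolling of the real code; `block_memcpy` is an illustrative name, not MPlayer's fast_memcpy.

```c
#include <stddef.h>
#include <string.h>

#define BLOCK_SIZE       4096 /* bytes handled per block, as in the patch */
#define CONFUSION_FACTOR 0    /* extra dummy reads between blocks */

/* Hypothetical illustration of the block-prefetch pattern, not the
 * actual MPlayer routine (which does this with MMX/SSE inline asm). */
void *block_memcpy(void *to, const void *from, size_t len)
{
    unsigned char *d = to;
    const unsigned char *s = from;

    while (len >= BLOCK_SIZE) {
        volatile unsigned char sink;
        size_t i;

        /* Pass 1: touch the source block so it ends up in cache. */
        for (i = 0; i < BLOCK_SIZE; i += 64)
            sink = s[i];

        /* Pass 2: copy the now-cached block to the destination. */
        memcpy(d, s, BLOCK_SIZE);

        /* Optional dummy reads; the patch inserts these in assembly. */
        for (i = 0; i < (size_t)CONFUSION_FACTOR; i++)
            sink = s[0];

        (void)sink;
        d += BLOCK_SIZE;
        s += BLOCK_SIZE;
        len -= BLOCK_SIZE;
    }

    memcpy(d, s, len); /* tail */
    return to;
}
```

The 4096-byte BLOCK_SIZE matches the default added by the patch; the 64-byte read stride above is only a cache-line guess, while the real assembly touches the block every 32 bytes in the prefetch pass and moves 64 bytes per iteration in the store pass.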
-rw-r--r-- | libvo/aclib.c          | 122
-rw-r--r-- | libvo/aclib_template.c | 122
2 files changed, 232 insertions, 12 deletions
```diff
diff --git a/libvo/aclib.c b/libvo/aclib.c
index 2366a28d51..d2c51c3157 100644
--- a/libvo/aclib.c
+++ b/libvo/aclib.c
@@ -1,13 +1,19 @@
 #include "../config.h"
 
 #ifdef USE_FASTMEMCPY
-/*
+/* 
   aclib - advanced C library ;)
   This file contains functions which improve and expand standard C-library
 */
 
 #include <stddef.h>
 
+#define BLOCK_SIZE 4096
+#define CONFUSION_FACTOR 0
+//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
+
+//#define STATISTICS
+
 #ifndef HAVE_SSE2
 /*
    P3 processor has only one SSE decoder so can execute only 1 sse insn per
@@ -103,7 +109,7 @@ __asm__ __volatile__(\
 #ifdef HAVE_SSE
 #define MMREG_SIZE 16
 #else
-#define MMREG_SIZE 8
+#define MMREG_SIZE 64 //8
 #endif
 
 /* Small defines (for readability only) ;) */
@@ -132,7 +138,20 @@ void * fast_memcpy(void * to, const void * from, size_t len)
 {
     void *retval;
     size_t i;
-    retval = to;
+    retval = to; 
+#ifdef STATISTICS
+    {
+        static int freq[33];
+        static int t=0;
+        int i;
+        for(i=0; len>(1<<i); i++);
+        freq[i]++;
+        t++;
+        if(1024*1024*1024 % t == 0)
+            for(i=0; i<32; i++)
+                printf("freq < %8d %4d\n", 1<<i, freq[i]);
+    }
+#endif
 #ifndef HAVE_MMX1
     /* PREFETCH has effect even for MOVSB instruction ;) */
     __asm__ __volatile__ (
@@ -184,7 +203,7 @@ void * fast_memcpy(void * to, const void * from, size_t len)
         ((const unsigned char *)from)+=64;
         ((unsigned char *)to)+=64;
     }
-    else
+    else 
     /*
        Only if SRC is aligned on 16-byte boundary.
        It allows to use movaps instead of movups, which required data
@@ -207,6 +226,96 @@ void * fast_memcpy(void * to, const void * from, size_t len)
         ((unsigned char *)to)+=64;
     }
 #else
+    // Align destination at BLOCK_SIZE boundary
+    for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
+    {
+        __asm__ __volatile__ (
+#ifndef HAVE_MMX1
+        PREFETCH" 320(%0)\n"
+#endif
+        "movq (%0), %%mm0\n"
+        "movq 8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n"
+        "movq 24(%0), %%mm3\n"
+        "movq 32(%0), %%mm4\n"
+        "movq 40(%0), %%mm5\n"
+        "movq 48(%0), %%mm6\n"
+        "movq 56(%0), %%mm7\n"
+        MOVNTQ" %%mm0, (%1)\n"
+        MOVNTQ" %%mm1, 8(%1)\n"
+        MOVNTQ" %%mm2, 16(%1)\n"
+        MOVNTQ" %%mm3, 24(%1)\n"
+        MOVNTQ" %%mm4, 32(%1)\n"
+        MOVNTQ" %%mm5, 40(%1)\n"
+        MOVNTQ" %%mm6, 48(%1)\n"
+        MOVNTQ" %%mm7, 56(%1)\n"
+        :: "r" (from), "r" (to) : "memory");
+        ((const unsigned char *)from)+=64;
+        ((unsigned char *)to)+=64;
+    }
+
+//  printf(" %d %d\n", (int)from&1023, (int)to&1023);
+    // Pure Assembly cuz gcc is a bit unpredictable ;)
+    if(i>=BLOCK_SIZE/64)
+        asm volatile(
+            "xorl %%eax, %%eax          \n\t"
+            ".balign 16                 \n\t"
+            "1:                         \n\t"
+            "movl (%0, %%eax), %%ebx    \n\t"
+            "movl 32(%0, %%eax), %%ebx  \n\t"
+            "movl 64(%0, %%eax), %%ebx  \n\t"
+            "movl 96(%0, %%eax), %%ebx  \n\t"
+            "addl $128, %%eax           \n\t"
+            "cmpl %3, %%eax             \n\t"
+            " jb 1b                     \n\t"
+
+            "xorl %%eax, %%eax          \n\t"
+
+            ".balign 16                 \n\t"
+            "2:                         \n\t"
+            "movq (%0, %%eax), %%mm0\n"
+            "movq 8(%0, %%eax), %%mm1\n"
+            "movq 16(%0, %%eax), %%mm2\n"
+            "movq 24(%0, %%eax), %%mm3\n"
+            "movq 32(%0, %%eax), %%mm4\n"
+            "movq 40(%0, %%eax), %%mm5\n"
+            "movq 48(%0, %%eax), %%mm6\n"
+            "movq 56(%0, %%eax), %%mm7\n"
+            MOVNTQ" %%mm0, (%1, %%eax)\n"
+            MOVNTQ" %%mm1, 8(%1, %%eax)\n"
+            MOVNTQ" %%mm2, 16(%1, %%eax)\n"
+            MOVNTQ" %%mm3, 24(%1, %%eax)\n"
+            MOVNTQ" %%mm4, 32(%1, %%eax)\n"
+            MOVNTQ" %%mm5, 40(%1, %%eax)\n"
+            MOVNTQ" %%mm6, 48(%1, %%eax)\n"
+            MOVNTQ" %%mm7, 56(%1, %%eax)\n"
+            "addl $64, %%eax            \n\t"
+            "cmpl %3, %%eax             \n\t"
+            "jb 2b                      \n\t"
+
+#if CONFUSION_FACTOR > 0
+    // a few percent speedup on out of order executing CPUs
+            "movl %5, %%eax             \n\t"
+            "2:                         \n\t"
+            "movl (%0), %%ebx           \n\t"
+            "movl (%0), %%ebx           \n\t"
+            "movl (%0), %%ebx           \n\t"
+            "movl (%0), %%ebx           \n\t"
+            "decl %%eax                 \n\t"
+            " jnz 2b                    \n\t"
+#endif
+
+            "xorl %%eax, %%eax          \n\t"
+            "addl %3, %0                \n\t"
+            "addl %3, %1                \n\t"
+            "subl %4, %2                \n\t"
+            "cmpl %4, %2                \n\t"
+            " jae 1b                    \n\t"
+            : "+r" (from), "+r" (to), "+r" (i)
+            : "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
+            : "%eax", "%ebx"
+        );
+
     for(; i>0; i--)
     {
         __asm__ __volatile__ (
@@ -233,16 +342,17 @@ void * fast_memcpy(void * to, const void * from, size_t len)
         ((const unsigned char *)from)+=64;
         ((unsigned char *)to)+=64;
     }
+
 #endif /* Have SSE */
 #ifdef HAVE_MMX2
     /* since movntq is weakly-ordered, a "sfence"
      * is needed to become ordered again. */
     __asm__ __volatile__ ("sfence":::"memory");
 #endif
-#ifndef HAVE_SSE
+#ifndef HAVE_SSE 
     /* enables to use FPU */
     __asm__ __volatile__ (EMMS:::"memory");
-#endif
+#endif 
     }
     /*
      * Now do the tail of the block
diff --git a/libvo/aclib_template.c b/libvo/aclib_template.c
index 2366a28d51..d2c51c3157 100644
--- a/libvo/aclib_template.c
+++ b/libvo/aclib_template.c
@@ -1,13 +1,19 @@
 #include "../config.h"
 
 #ifdef USE_FASTMEMCPY
-/*
+/* 
   aclib - advanced C library ;)
   This file contains functions which improve and expand standard C-library
 */
 
 #include <stddef.h>
 
+#define BLOCK_SIZE 4096
+#define CONFUSION_FACTOR 0
+//Feel free to fine-tune the above 2, it might be possible to get some speedup with them :)
+
+//#define STATISTICS
+
 #ifndef HAVE_SSE2
 /*
    P3 processor has only one SSE decoder so can execute only 1 sse insn per
@@ -103,7 +109,7 @@ __asm__ __volatile__(\
 #ifdef HAVE_SSE
 #define MMREG_SIZE 16
 #else
-#define MMREG_SIZE 8
+#define MMREG_SIZE 64 //8
 #endif
 
 /* Small defines (for readability only) ;) */
@@ -132,7 +138,20 @@ void * fast_memcpy(void * to, const void * from, size_t len)
 {
     void *retval;
     size_t i;
-    retval = to;
+    retval = to; 
+#ifdef STATISTICS
+    {
+        static int freq[33];
+        static int t=0;
+        int i;
+        for(i=0; len>(1<<i); i++);
+        freq[i]++;
+        t++;
+        if(1024*1024*1024 % t == 0)
+            for(i=0; i<32; i++)
+                printf("freq < %8d %4d\n", 1<<i, freq[i]);
+    }
+#endif
 #ifndef HAVE_MMX1
     /* PREFETCH has effect even for MOVSB instruction ;) */
     __asm__ __volatile__ (
@@ -184,7 +203,7 @@ void * fast_memcpy(void * to, const void * from, size_t len)
         ((const unsigned char *)from)+=64;
         ((unsigned char *)to)+=64;
     }
-    else
+    else 
     /*
       Only if SRC is aligned on 16-byte boundary.
       It allows to use movaps instead of movups, which required data
@@ -207,6 +226,96 @@ void * fast_memcpy(void * to, const void * from, size_t len)
         ((unsigned char *)to)+=64;
     }
 #else
+    // Align destination at BLOCK_SIZE boundary
+    for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
+    {
+        __asm__ __volatile__ (
+#ifndef HAVE_MMX1
+        PREFETCH" 320(%0)\n"
+#endif
+        "movq (%0), %%mm0\n"
+        "movq 8(%0), %%mm1\n"
+        "movq 16(%0), %%mm2\n"
+        "movq 24(%0), %%mm3\n"
+        "movq 32(%0), %%mm4\n"
+        "movq 40(%0), %%mm5\n"
+        "movq 48(%0), %%mm6\n"
+        "movq 56(%0), %%mm7\n"
+        MOVNTQ" %%mm0, (%1)\n"
+        MOVNTQ" %%mm1, 8(%1)\n"
+        MOVNTQ" %%mm2, 16(%1)\n"
+        MOVNTQ" %%mm3, 24(%1)\n"
+        MOVNTQ" %%mm4, 32(%1)\n"
+        MOVNTQ" %%mm5, 40(%1)\n"
+        MOVNTQ" %%mm6, 48(%1)\n"
+        MOVNTQ" %%mm7, 56(%1)\n"
+        :: "r" (from), "r" (to) : "memory");
+        ((const unsigned char *)from)+=64;
+        ((unsigned char *)to)+=64;
+    }
+
+//  printf(" %d %d\n", (int)from&1023, (int)to&1023);
+    // Pure Assembly cuz gcc is a bit unpredictable ;)
+    if(i>=BLOCK_SIZE/64)
+        asm volatile(
+            "xorl %%eax, %%eax          \n\t"
+            ".balign 16                 \n\t"
+            "1:                         \n\t"
+            "movl (%0, %%eax), %%ebx    \n\t"
+            "movl 32(%0, %%eax), %%ebx  \n\t"
+            "movl 64(%0, %%eax), %%ebx  \n\t"
+            "movl 96(%0, %%eax), %%ebx  \n\t"
+            "addl $128, %%eax           \n\t"
+            "cmpl %3, %%eax             \n\t"
+            " jb 1b                     \n\t"
+
+            "xorl %%eax, %%eax          \n\t"
+
+            ".balign 16                 \n\t"
+            "2:                         \n\t"
+            "movq (%0, %%eax), %%mm0\n"
+            "movq 8(%0, %%eax), %%mm1\n"
+            "movq 16(%0, %%eax), %%mm2\n"
+            "movq 24(%0, %%eax), %%mm3\n"
+            "movq 32(%0, %%eax), %%mm4\n"
+            "movq 40(%0, %%eax), %%mm5\n"
+            "movq 48(%0, %%eax), %%mm6\n"
+            "movq 56(%0, %%eax), %%mm7\n"
+            MOVNTQ" %%mm0, (%1, %%eax)\n"
+            MOVNTQ" %%mm1, 8(%1, %%eax)\n"
+            MOVNTQ" %%mm2, 16(%1, %%eax)\n"
+            MOVNTQ" %%mm3, 24(%1, %%eax)\n"
+            MOVNTQ" %%mm4, 32(%1, %%eax)\n"
+            MOVNTQ" %%mm5, 40(%1, %%eax)\n"
+            MOVNTQ" %%mm6, 48(%1, %%eax)\n"
+            MOVNTQ" %%mm7, 56(%1, %%eax)\n"
+            "addl $64, %%eax            \n\t"
+            "cmpl %3, %%eax             \n\t"
+            "jb 2b                      \n\t"
+
+#if CONFUSION_FACTOR > 0
+    // a few percent speedup on out of order executing CPUs
+            "movl %5, %%eax             \n\t"
+            "2:                         \n\t"
+            "movl (%0), %%ebx           \n\t"
+            "movl (%0), %%ebx           \n\t"
+            "movl (%0), %%ebx           \n\t"
+            "movl (%0), %%ebx           \n\t"
+            "decl %%eax                 \n\t"
+            " jnz 2b                    \n\t"
+#endif
+
+            "xorl %%eax, %%eax          \n\t"
+            "addl %3, %0                \n\t"
+            "addl %3, %1                \n\t"
+            "subl %4, %2                \n\t"
+            "cmpl %4, %2                \n\t"
+            " jae 1b                    \n\t"
+            : "+r" (from), "+r" (to), "+r" (i)
+            : "r" (BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" (CONFUSION_FACTOR)
+            : "%eax", "%ebx"
+        );
+
     for(; i>0; i--)
     {
         __asm__ __volatile__ (
@@ -233,16 +342,17 @@ void * fast_memcpy(void * to, const void * from, size_t len)
         ((const unsigned char *)from)+=64;
         ((unsigned char *)to)+=64;
     }
+
 #endif /* Have SSE */
 #ifdef HAVE_MMX2
     /* since movntq is weakly-ordered, a "sfence"
      * is needed to become ordered again. */
     __asm__ __volatile__ ("sfence":::"memory");
 #endif
-#ifndef HAVE_SSE
+#ifndef HAVE_SSE 
     /* enables to use FPU */
    __asm__ __volatile__ (EMMS:::"memory");
-#endif
+#endif 
     }
     /*
      * Now do the tail of the block
```
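The commit message leaves the mem->vram question unmeasured, and the 10-20% figure is from one P3. A rough user-space harness along the following lines is one way to get comparable numbers. This is only a sketch under assumptions: it times copies between ordinary malloc'd buffers, BUF_SIZE and RUNS are arbitrary placeholders, and the fast_memcpy() hookup is left as a comment; checking the vram path would instead require the destination to be a mapped framebuffer.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#define BUF_SIZE (8 * 1024 * 1024)  /* bytes per copy (arbitrary) */
#define RUNS     100                /* repetitions per measurement */

static double now_sec(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec / 1e6;
}

int main(void)
{
    unsigned char *src = malloc(BUF_SIZE);
    unsigned char *dst = malloc(BUF_SIZE);
    double t0, t1;
    int i;

    if (!src || !dst)
        return 1;
    memset(src, 0x55, BUF_SIZE);

    /* Baseline: libc memcpy. */
    t0 = now_sec();
    for (i = 0; i < RUNS; i++)
        memcpy(dst, src, BUF_SIZE);
    t1 = now_sec();
    printf("memcpy: %.1f MB/s\n", RUNS * (BUF_SIZE / 1e6) / (t1 - t0));

    /* Swap in fast_memcpy() here when linking against libvo/aclib.c
     * built with USE_FASTMEMCPY, and rerun to compare. */

    free(src);
    free(dst);
    return 0;
}
```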