author    Alexey Yakovenko <wakeroid@gmail.com>    2009-09-12 15:27:29 +0200
committer Alexey Yakovenko <wakeroid@gmail.com>    2009-09-12 15:27:57 +0200
commit    370dfdaddd05af7edccd419bb1249650f524770b (patch)
tree      a800bb7fe5acd2e775fa5dbcab0ebde3f55be490 /plugins/ffap
parent    6a1e3ccfdea89c653a858ee9baec5af8f658248e (diff)
more sse optimizations from ffmpeg
Diffstat (limited to 'plugins/ffap')
-rw-r--r-- plugins/ffap/ffap.c | 66
1 file changed, 64 insertions, 2 deletions
diff --git a/plugins/ffap/ffap.c b/plugins/ffap/ffap.c
index 42985127..01821f42 100644
--- a/plugins/ffap/ffap.c
+++ b/plugins/ffap/ffap.c
@@ -18,6 +18,9 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
+#if HAVE_CONFIG_H
+#include "config.h"
+#endif
#include <stdio.h>
#include <string.h>
#include <limits.h>
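The new HAVE_CONFIG_H guard is the standard autoconf idiom: config.h carries the build-time feature macros, presumably including the HAVE_SSE2 flag that gates the assembly paths added further down in this commit.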
@@ -1134,7 +1137,16 @@ static void init_filter(APEContext * ctx, APEFilter *f, int16_t * buf, int order
do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order);
}
+#if __WORDSIZE==64
typedef int64_t x86_reg;
+#elif __WORDSIZE==32
+typedef int32_t x86_reg;
+#else
+#warning unknown arch
+typedef int x86_reg;
+#endif
+
+#ifdef HAVE_SSE2
typedef struct { uint64_t a, b; } xmm_reg;
#define DECLARE_ALIGNED(n,t,v) t v __attribute__ ((aligned (n)))
#define DECLARE_ALIGNED_16(t, v) DECLARE_ALIGNED(16, t, v)
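The x86_reg typedef picks a pointer-width integer for inline-asm index operands, so the negative byte offsets used below sign-extend correctly inside addressing modes like (%0,%2) on both 32- and 64-bit targets. DECLARE_ALIGNED wraps GCC's aligned attribute; as an illustrative sketch (the buffer name here is hypothetical):

    /* DECLARE_ALIGNED_16(int16_t, hist[HISTORY_SIZE]); expands to: */
    int16_t hist[HISTORY_SIZE] __attribute__ ((aligned (16)));

That 16-byte guarantee is what lets the loops below use the aligned movdqa on v1 while v2, whose alignment is unknown, only gets the unaligned movdqu.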
@@ -1169,10 +1181,60 @@ static int32_t scalarproduct_int16 (int16_t * v1, int16_t * v2, int order, int s
);
return res;
}
+static void add_int16(int16_t * v1, int16_t * v2, int order)
+{
+ x86_reg o = -(order << 1);
+ v1 += order;
+ v2 += order;
+ __asm__ volatile(
+ "1: \n\t"
+ "movdqu (%1,%2), %%xmm0 \n\t"
+ "movdqu 16(%1,%2), %%xmm1 \n\t"
+ "paddw (%0,%2), %%xmm0 \n\t"
+ "paddw 16(%0,%2), %%xmm1 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "movdqa %%xmm1, 16(%0,%2) \n\t"
+ "add $32, %2 \n\t"
+ "js 1b \n\t"
+ : "+r"(v1), "+r"(v2), "+r"(o)
+ );
+}
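add_int16 uses ffmpeg's counted-up negative-offset idiom: both pointers are advanced one past the end, the index register starts at -(order << 1) bytes (order int16_t elements), and each iteration processes 32 bytes (two xmm registers, 16 samples) before add $32 moves the offset toward zero; js loops back while it is still negative. A minimal C sketch of the same control flow, assuming order is a non-zero multiple of 16 as the assembly requires:

    static void add_int16_ref (int16_t *v1, int16_t *v2, int order) {
        long o = -((long)order << 1);    /* negative byte offset, counts up to 0 */
        char *e1 = (char *)(v1 + order); /* one past the end of each vector */
        char *e2 = (char *)(v2 + order);
        do {
            *(int16_t *)(e1 + o) += *(int16_t *)(e2 + o);
            o += 2;                      /* the asm steps 32 bytes at a time */
        } while (o < 0);
    }

Keeping the loop counter and the address offset in one register saves a separate compare: the add instruction both advances the offset and sets the sign flag that js tests.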
+
+static void sub_int16(int16_t * v1, int16_t * v2, int order)
+{
+ x86_reg o = -(order << 1);
+ v1 += order;
+ v2 += order;
+ __asm__ volatile(
+ "1: \n\t"
+ "movdqa (%0,%2), %%xmm0 \n\t"
+ "movdqa 16(%0,%2), %%xmm2 \n\t"
+ "movdqu (%1,%2), %%xmm1 \n\t"
+ "movdqu 16(%1,%2), %%xmm3 \n\t"
+ "psubw %%xmm1, %%xmm0 \n\t"
+ "psubw %%xmm3, %%xmm2 \n\t"
+ "movdqa %%xmm0, (%0,%2) \n\t"
+ "movdqa %%xmm2, 16(%0,%2) \n\t"
+ "add $32, %2 \n\t"
+ "js 1b \n\t"
+ : "+r"(v1), "+r"(v2), "+r"(o)
+ );
+}
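sub_int16 mirrors add_int16 and makes the alignment contract explicit: v1, the decoder's own 16-byte-aligned buffer, is loaded and stored with movdqa, while v2 may be misaligned and goes through movdqu. The same operation written with SSE2 intrinsics, as a hedged equivalent sketch rather than the code the commit uses:

    #include <emmintrin.h>

    /* order must be a multiple of 8; v1 must be 16-byte aligned */
    static void sub_int16_sse2 (int16_t *v1, int16_t *v2, int order) {
        for (int i = 0; i < order; i += 8) {
            __m128i a = _mm_load_si128 ((const __m128i *)(v1 + i));  /* aligned load */
            __m128i b = _mm_loadu_si128 ((const __m128i *)(v2 + i)); /* unaligned load */
            _mm_store_si128 ((__m128i *)(v1 + i), _mm_sub_epi16 (a, b));
        }
    }

On CPUs of this era movdqa was noticeably cheaper than movdqu, which is why the code distinguishes the two cases instead of using unaligned accesses everywhere.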
+
+#else
+static int32_t
+scalarproduct_int16(int16_t * v1, int16_t * v2, int order, int shift)
+{
+ int res = 0;
+
+ while (order--)
+ res += (*v1++ * *v2++) >> shift;
+
+ return res;
+}
static inline void
add_int16 (int16_t *v1/*align 16*/, int16_t *v2, int len) {
- int i;
while (len--) {
*v1++ += *v2++;
}
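Note that the portable fallback applies the shift to each product before accumulating, not to the final sum. For example, with shift = 1, v1 = {3, 1} and v2 = {5, 1}, it computes (15 >> 1) + (1 >> 1) = 7 rather than (15 + 1) >> 1 = 8; that per-product rounding is part of the function's contract as written.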
@@ -1180,11 +1242,11 @@ add_int16 (int16_t *v1/*align 16*/, int16_t *v2, int len) {
static inline void
sub_int16 (int16_t *v1/*align 16*/, int16_t *v2, int len) {
- int i;
while (len--) {
*v1++ -= *v2++;
}
}
+#endif
static inline int16_t clip_int16(int a)
{