17 files changed, 2605 insertions, 2511 deletions
diff --git a/libmpeg2/Makefile b/libmpeg2/Makefile
index 914b41844d..6ee925ddb9 100644
--- a/libmpeg2/Makefile
+++ b/libmpeg2/Makefile
@@ -3,9 +3,8 @@ LIBNAME = libmpeg2.a
 
 include ../config.mak
 
-SRCS	= header.c idct.c idct_mmx.c idct_mlib.c \
-		motion_comp.c motion_comp_mmx.c motion_comp_mlib.c \
-		slice.c stats.c # decode.c
+SRCS	= alloc.c cpu_accel.c cpu_state.c decode.c header.c idct.c idct_alpha.c idct_altivec.c idct_mlib.c idct_mmx.c motion_comp.c motion_comp_alpha.c motion_comp_altivec.c motion_comp_mlib.c motion_comp_mmx.c slice.c
+
 OBJS	= $(SRCS:.c=.o)
 INCLUDE = -I. -I../libvo -I.. $(EXTRA_INC) $(MLIB_INC)
 CFLAGS  = $(OPTFLAGS) $(INCLUDE) -DMPG12PLAY
diff --git a/libmpeg2/attributes.h b/libmpeg2/attributes.h
index ab7105c2df..96a86b26c0 100644
--- a/libmpeg2/attributes.h
+++ b/libmpeg2/attributes.h
@@ -1,8 +1,10 @@
 /*
  * attributes.h
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -21,7 +23,15 @@
 
 /* use gcc attribs to align critical data structures */
 #ifdef ATTRIBUTE_ALIGNED_MAX
-#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < (align)) ? ATTRIBUTE_ALIGNED_MAX : (align))))
+#define ATTR_ALIGN(align) __attribute__ ((__aligned__ ((ATTRIBUTE_ALIGNED_MAX < (align)) ? ATTRIBUTE_ALIGNED_MAX : (align))))
 #else
 #define ATTR_ALIGN(align)
 #endif
+
+#ifdef HAVE_BUILTIN_EXPECT
+#define likely(x) __builtin_expect ((x) != 0, 1)
+#define unlikely(x) __builtin_expect ((x) != 0, 0)
+#else
+#define likely(x) (x)
+#define unlikely(x) (x)
+#endif
diff --git a/libmpeg2/header.c b/libmpeg2/header.c
index 68483a71c1..548d6bf21e 100644
--- a/libmpeg2/header.c
+++ b/libmpeg2/header.c
@@ -1,8 +1,10 @@
 /*
- * slice.c
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * header.c
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,13 +24,23 @@
 #include "config.h"
 
 #include <inttypes.h>
-#include <stdio.h>
+#include <stdlib.h>	/* defines NULL */
+#include <string.h>	/* memcmp */
 
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
+#include "convert.h"
 #include "attributes.h"
 
+#define SEQ_EXT 2
+#define SEQ_DISPLAY_EXT 4
+#define QUANT_MATRIX_EXT 8
+#define COPYRIGHT_EXT 0x10
+#define PIC_DISPLAY_EXT 0x80
+#define PIC_CODING_EXT 0x100
+
 /* default intra quant matrix, in zig-zag order */
-static uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = {
+static const uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = {
     8,
     16, 16,
     19, 16, 19,
@@ -46,214 +58,634 @@ static uint8_t default_intra_quantizer_matrix[64] ATTR_ALIGN(16) = {
     83
 };
 
-uint8_t scan_norm[64] ATTR_ALIGN(16) =
-{
+uint8_t mpeg2_scan_norm[64] ATTR_ALIGN(16) = {
     /* Zig-Zag scan pattern */
-     0, 1, 8,16, 9, 2, 3,10,
-    17,24,32,25,18,11, 4, 5,
-    12,19,26,33,40,48,41,34,
-    27,20,13, 6, 7,14,21,28,
-    35,42,49,56,57,50,43,36,
-    29,22,15,23,30,37,44,51,
-    58,59,52,45,38,31,39,46,
-    53,60,61,54,47,55,62,63
+     0,  1,  8, 16,  9,  2,  3, 10, 17, 24, 32, 25, 18, 11,  4,  5,
+    12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13,  6,  7, 14, 21, 28,
+    35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+    58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
 };
 
-uint8_t scan_alt[64] ATTR_ALIGN(16) =
-{
+uint8_t mpeg2_scan_alt[64] ATTR_ALIGN(16) = {
     /* Alternate scan pattern */
-    0,8,16,24,1,9,2,10,17,25,32,40,48,56,57,49,
-    41,33,26,18,3,11,4,12,19,27,34,42,50,58,35,43,
-    51,59,20,28,5,13,6,14,21,29,36,44,52,60,37,45,
-    53,61,22,30,7,15,23,31,38,46,54,62,39,47,55,63
+     0, 8,  16, 24,  1,  9,  2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
+    41, 33, 26, 18,  3, 11,  4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
+    51, 59, 20, 28,  5, 13,  6, 14, 21, 29, 36, 44, 52, 60, 37, 45,
+    53, 61, 22, 30,  7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63
 };
 
-void header_state_init (picture_t * picture)
+void mpeg2_header_state_init (mpeg2dec_t * mpeg2dec)
 {
-    picture->scan = scan_norm;
+    mpeg2dec->decoder.scan = mpeg2_scan_norm;
+    mpeg2dec->picture = mpeg2dec->pictures;
+    mpeg2dec->fbuf[0] = &mpeg2dec->fbuf_alloc[0].fbuf;
+    mpeg2dec->fbuf[1] = &mpeg2dec->fbuf_alloc[1].fbuf;
+    mpeg2dec->fbuf[2] = &mpeg2dec->fbuf_alloc[2].fbuf;
+    mpeg2dec->first = 1;
+    mpeg2dec->alloc_index = 0;
+    mpeg2dec->alloc_index_user = 0;
 }
 
-int header_process_sequence_header (picture_t * picture, uint8_t * buffer)
+static void reset_info (mpeg2_info_t * info)
 {
+    info->current_picture = info->current_picture_2nd = NULL;
+    info->display_picture = info->display_picture_2nd = NULL;
+    info->current_fbuf = info->display_fbuf = info->discard_fbuf = NULL;
+    info->user_data = NULL;	info->user_data_len = 0;
+}
+
+int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    sequence_t * sequence = &(mpeg2dec->new_sequence);
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    static unsigned int frame_period[9] = {
+	0, 1126125, 1125000, 1080000, 900900, 900000, 540000, 450450, 450000
+    };
     int width, height;
     int i;
 
-    if ((buffer[6] & 0x20) != 0x20){
-	printf("missing marker bit!\n");
-	return 1;	/* missing marker_bit */
-    }
+    if ((buffer[6] & 0x20) != 0x20)	/* missing marker_bit */
+	return 1;
 
-    height = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2];
+    i = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2];
+    sequence->display_width = sequence->picture_width = width = i >> 12;
+    sequence->display_height = sequence->picture_height = height = i & 0xfff;
+    decoder->width = sequence->width = width = (width + 15) & ~15;
+    decoder->height = sequence->height = height = (height + 15) & ~15;
+    decoder->vertical_position_extension = (height > 2800);
+    sequence->chroma_width = width >> 1;
+    sequence->chroma_height = height >> 1;
 
-    picture->display_picture_width = (height >> 12);
-    picture->display_picture_height = (height & 0xfff);
+    sequence->flags = SEQ_FLAG_PROGRESSIVE_SEQUENCE;
 
-    width = ((height >> 12) + 15) & ~15;
-    height = ((height & 0xfff) + 15) & ~15;
+    sequence->pixel_width = buffer[3] >> 4;	/* aspect ratio */
+    sequence->frame_period = 0;
+    if ((buffer[3] & 15) < 9)
+	sequence->frame_period = frame_period[buffer[3] & 15];
 
-    if ((width > 768) || (height > 576)){
-	printf("size restrictions for MP@ML or MPEG1 exceeded! (%dx%d)\n",width,height);
-//	return 1;	/* size restrictions for MP@ML or MPEG1 */
-    }
-    
-    picture->coded_picture_width = width;
-    picture->coded_picture_height = height;
+    sequence->byte_rate = (buffer[4]<<10) | (buffer[5]<<2) | (buffer[6]>>6);
 
-    /* this is not used by the decoder */
-    picture->aspect_ratio_information = buffer[3] >> 4;
-    picture->frame_rate_code = buffer[3] & 15;
-    picture->bitrate = (buffer[4]<<10)|(buffer[5]<<2)|(buffer[6]>>6);
+    sequence->vbv_buffer_size = ((buffer[6]<<16)|(buffer[7]<<8))&0x1ff800;
+
+    if (buffer[7] & 4)
+	sequence->flags |= SEQ_FLAG_CONSTRAINED_PARAMETERS;
 
     if (buffer[7] & 2) {
 	for (i = 0; i < 64; i++)
-	    picture->intra_quantizer_matrix[scan_norm[i]] =
+	    decoder->intra_quantizer_matrix[mpeg2_scan_norm[i]] =
 		(buffer[i+7] << 7) | (buffer[i+8] >> 1);
 	buffer += 64;
-    } else {
+    } else
 	for (i = 0; i < 64; i++)
-	    picture->intra_quantizer_matrix[scan_norm[i]] =
+	    decoder->intra_quantizer_matrix[mpeg2_scan_norm[i]] =
 		default_intra_quantizer_matrix [i];
-    }
 
-    if (buffer[7] & 1) {
+    if (buffer[7] & 1)
 	for (i = 0; i < 64; i++)
-	    picture->non_intra_quantizer_matrix[scan_norm[i]] =
+	    decoder->non_intra_quantizer_matrix[mpeg2_scan_norm[i]] =
 		buffer[i+8];
-    } else {
+    else
 	for (i = 0; i < 64; i++)
-	    picture->non_intra_quantizer_matrix[i] = 16;
+	    decoder->non_intra_quantizer_matrix[i] = 16;
+
+    sequence->profile_level_id = 0x80;
+    sequence->colour_primaries = 1;
+    sequence->transfer_characteristics = 1;
+    sequence->matrix_coefficients = 1;
+
+    decoder->mpeg1 = 1;
+    decoder->intra_dc_precision = 0;
+    decoder->frame_pred_frame_dct = 1;
+    decoder->q_scale_type = 0;
+    decoder->concealment_motion_vectors = 0;
+    decoder->scan = mpeg2_scan_norm;
+    decoder->picture_structure = FRAME_PICTURE;
+
+    mpeg2dec->ext_state = SEQ_EXT;
+    mpeg2dec->state = STATE_SEQUENCE;
+    mpeg2dec->display_offset_x = mpeg2dec->display_offset_y = 0;
+
+    reset_info (&(mpeg2dec->info));
+    return 0;
+}
+
+static int sequence_ext (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    sequence_t * sequence = &(mpeg2dec->new_sequence);
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    int width, height;
+    uint32_t flags;
+
+    if (!(buffer[3] & 1))
+	return 1;
+
+    sequence->profile_level_id = (buffer[0] << 4) | (buffer[1] >> 4);
+
+    width = sequence->display_width = sequence->picture_width +=
+	((buffer[1] << 13) | (buffer[2] << 5)) & 0x3000;
+    height = sequence->display_height = sequence->picture_height +=
+	(buffer[2] << 7) & 0x3000;
+    decoder->vertical_position_extension = (height > 2800);
+    flags = sequence->flags | SEQ_FLAG_MPEG2;
+    if (!(buffer[1] & 8)) {
+	flags &= ~SEQ_FLAG_PROGRESSIVE_SEQUENCE;
+	height = (height + 31) & ~31;
+    }
+    if (buffer[5] & 0x80)
+	flags |= SEQ_FLAG_LOW_DELAY;
+    sequence->flags = flags;
+    decoder->width = sequence->width = width = (width + 15) & ~15;
+    decoder->height = sequence->height = height = (height + 15) & ~15;
+    switch (buffer[1] & 6) {	/* selects how chroma dimensions are derived */
+    case 0:	/* invalid */
+	return 1;
+    case 2:	/* 4:2:0 */
+	height >>= 1;	/* fall through: 4:2:0 halves the width as well */
+    case 4:	/* 4:2:2 */
+	width >>= 1;
     }
+    sequence->chroma_width = width;
+    sequence->chroma_height = height;
 
-    /* MPEG1 - for testing only */
-    picture->mpeg1 = 1;
-    picture->intra_dc_precision = 0;
-    picture->frame_pred_frame_dct = 1;
-    picture->q_scale_type = 0;
-    picture->concealment_motion_vectors = 0;
-    /* picture->alternate_scan = 0; */
-    picture->picture_structure = FRAME_PICTURE;
-    /* picture->second_field = 0; */
+    sequence->byte_rate += ((buffer[2]<<25) | (buffer[3]<<17)) & 0x3ffc0000;
+
+    sequence->vbv_buffer_size |= buffer[4] << 21;
+
+    sequence->frame_period =	/* period *= (ext_d+1)/(ext_n+1); n = bits 6-5 */
+	sequence->frame_period * ((buffer[5]&31)+1) / (((buffer[5]>>5)&3)+1);
+
+    decoder->mpeg1 = 0;
+
+    mpeg2dec->ext_state = SEQ_DISPLAY_EXT;
 
     return 0;
 }
 
-static int header_process_sequence_extension (picture_t * picture,
-					      uint8_t * buffer)
+static int sequence_display_ext (mpeg2dec_t * mpeg2dec)
 {
-    /* check chroma format, size extensions, marker bit */
-    if (((buffer[1] & 0x07) != 0x02) || (buffer[2] & 0xe0) ||
-	((buffer[3] & 0x01) != 0x01))
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    sequence_t * sequence = &(mpeg2dec->new_sequence);
+    uint32_t flags;
+
+    flags = ((sequence->flags & ~SEQ_MASK_VIDEO_FORMAT) |
+	     ((buffer[0]<<4) & SEQ_MASK_VIDEO_FORMAT));
+    if (buffer[0] & 1) {
+	flags |= SEQ_FLAG_COLOUR_DESCRIPTION;
+	sequence->colour_primaries = buffer[1];
+	sequence->transfer_characteristics = buffer[2];
+	sequence->matrix_coefficients = buffer[3];
+	buffer += 3;
+    }
+
+    if (!(buffer[2] & 2))	/* missing marker_bit */
 	return 1;
 
-    /* this is not used by the decoder */
-    picture->progressive_sequence = (buffer[1] >> 3) & 1;
+    sequence->display_width = (buffer[1] << 6) | (buffer[2] >> 2);
+    sequence->display_height =
+	((buffer[2]& 1 ) << 13) | (buffer[3] << 5) | (buffer[4] >> 3);
+
+    return 0;
+}
+
+static inline void finalize_sequence (sequence_t * sequence)
+{
+    int width;
+    int height;
+
+    sequence->byte_rate *= 50;
+
+    if (sequence->flags & SEQ_FLAG_MPEG2) {
+	switch (sequence->pixel_width) {
+	case 1:		/* square pixels */
+	    sequence->pixel_width = sequence->pixel_height = 1;	return;
+	case 2:		/* 4:3 aspect ratio */
+	    width = 4; height = 3;	break;
+	case 3:		/* 16:9 aspect ratio */
+	    width = 16; height = 9;	break;
+	case 4:		/* 2.21:1 aspect ratio */
+	    width = 221; height = 100;	break;
+	default:	/* illegal */
+	    sequence->pixel_width = sequence->pixel_height = 0;	return;
+	}
+	width *= sequence->display_height;
+	height *= sequence->display_width;
+
+    } else {
+	if (sequence->byte_rate == 50 * 0x3ffff) 
+	    sequence->byte_rate = 0;        /* mpeg-1 VBR */ 
+
+	switch (sequence->pixel_width) {
+	case 0:	case 15:	/* illegal */
+	    sequence->pixel_width = sequence->pixel_height = 0;		return;
+	case 1:	/* square pixels */
+	    sequence->pixel_width = sequence->pixel_height = 1;		return;
+	case 3:	/* 720x576 16:9 */
+	    sequence->pixel_width = 64;	sequence->pixel_height = 45;	return;
+	case 6:	/* 720x480 16:9 */
+	    sequence->pixel_width = 32;	sequence->pixel_height = 27;	return;
+	case 12:	/* 720*480 4:3 */
+	    sequence->pixel_width = 8;	sequence->pixel_height = 9;	return;
+	default:
+	    height = 88 * sequence->pixel_width + 1171;
+	    width = 2000;
+	}
+    }
 
-    if (picture->progressive_sequence)
-	picture->coded_picture_height =
-	    (picture->coded_picture_height + 31) & ~31;
+    sequence->pixel_width = width;
+    sequence->pixel_height = height;
+    while (width) {	/* Euclid's algorithm: the gcd ends up in height */
+	int tmp = width;
+	width = height % tmp;	height = tmp;
+    }
+    if (!height)	/* 0x0 display size: leave the ratio 0:0, don't divide */
+	return;
+    sequence->pixel_width /= height;	sequence->pixel_height /= height;
+}
 
-    /* MPEG1 - for testing only */
-    picture->mpeg1 = 0;
+void mpeg2_header_sequence_finalize (mpeg2dec_t * mpeg2dec)
+{
+    sequence_t * sequence = &(mpeg2dec->new_sequence);
+
+    finalize_sequence (sequence);
+
+    /*
+     * according to 6.1.1.6, repeat sequence headers should be
+     * identical to the original. However some DVDs don't respect that
+     * and have different bitrates in the repeat sequence headers. So
+     * we'll ignore that in the comparison and still consider these as
+     * repeat sequence headers.
+     */
+    mpeg2dec->sequence.byte_rate = sequence->byte_rate;
+    if (!memcmp (&(mpeg2dec->sequence), sequence, sizeof (sequence_t)))
+	mpeg2dec->state = STATE_SEQUENCE_REPEATED;
+    mpeg2dec->sequence = *sequence;
+
+    mpeg2dec->info.sequence = &(mpeg2dec->sequence);
+}
 
+int mpeg2_header_gop (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2dec->state = STATE_GOP;
+    reset_info (&(mpeg2dec->info));
     return 0;
 }
 
-static int header_process_quant_matrix_extension (picture_t * picture,
-						  uint8_t * buffer)
+void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int coding_type)
 {
     int i;
 
-    if (buffer[0] & 8) {
-	for (i = 0; i < 64; i++)
-	    picture->intra_quantizer_matrix[scan_norm[i]] =
-		(buffer[i] << 5) | (buffer[i+1] >> 3);
-	buffer += 64;
+    for (i = 0; i < 3; i++)
+	if (mpeg2dec->fbuf[1] != &mpeg2dec->fbuf_alloc[i].fbuf &&
+	    mpeg2dec->fbuf[2] != &mpeg2dec->fbuf_alloc[i].fbuf) {
+	    mpeg2dec->fbuf[0] = &mpeg2dec->fbuf_alloc[i].fbuf;
+	    mpeg2dec->info.current_fbuf = mpeg2dec->fbuf[0];
+	    if ((coding_type == B_TYPE) ||
+		(mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY)) {
+		if ((coding_type == B_TYPE) || (mpeg2dec->convert_start))
+		    mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[0];
+		mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[0];
+	    }
+	    break;
+	}
+}
+
+int mpeg2_header_picture_start (mpeg2dec_t * mpeg2dec)
+{
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    picture_t * picture;
+
+    if (mpeg2dec->state != STATE_SLICE_1ST) {
+	mpeg2dec->state = STATE_PICTURE;
+	picture = mpeg2dec->pictures;
+	if ((decoder->coding_type != PIC_FLAG_CODING_TYPE_B) ^
+	    (mpeg2dec->picture >= mpeg2dec->pictures + 2))
+	    picture += 2;
+    } else {
+	mpeg2dec->state = STATE_PICTURE_2ND;
+	picture = mpeg2dec->picture + 1;	/* second field picture */
     }
+    mpeg2dec->picture = picture;
+    picture->flags = 0;
+    if (mpeg2dec->num_pts) {
+	if (mpeg2dec->bytes_since_pts >= 4) {
+	    mpeg2dec->num_pts = 0;
+	    picture->pts = mpeg2dec->pts_current;
+	    picture->flags = PIC_FLAG_PTS;
+	} else if (mpeg2dec->num_pts > 1) {
+	    mpeg2dec->num_pts = 1;
+	    picture->pts = mpeg2dec->pts_previous;
+	    picture->flags = PIC_FLAG_PTS;
+	}
+    }
+    picture->display_offset[0].x = picture->display_offset[1].x =
+	picture->display_offset[2].x = mpeg2dec->display_offset_x;
+    picture->display_offset[0].y = picture->display_offset[1].y =
+	picture->display_offset[2].y = mpeg2dec->display_offset_y;
+    return mpeg2_parse_header (mpeg2dec);
+}
 
-    if (buffer[0] & 4) {
-	for (i = 0; i < 64; i++)
-	    picture->non_intra_quantizer_matrix[scan_norm[i]] =
-		(buffer[i] << 6) | (buffer[i+1] >> 2);
+int mpeg2_header_picture (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    picture_t * picture = mpeg2dec->picture;
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    int type;
+    int low_delay;
+
+    type = (buffer [1] >> 3) & 7;
+    low_delay = mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY;
+
+    if (mpeg2dec->state == STATE_PICTURE) {
+	picture_t * other;
+
+	decoder->second_field = 0;
+	other = mpeg2dec->pictures;
+	if (other == picture)
+	    other += 2;
+	if (decoder->coding_type != PIC_FLAG_CODING_TYPE_B) {
+	    mpeg2dec->fbuf[2] = mpeg2dec->fbuf[1];
+	    mpeg2dec->fbuf[1] = mpeg2dec->fbuf[0];
+	}
+	mpeg2dec->fbuf[0] = NULL;
+	reset_info (&(mpeg2dec->info));
+	mpeg2dec->info.current_picture = picture;
+	mpeg2dec->info.display_picture = picture;
+	if (type != PIC_FLAG_CODING_TYPE_B) {
+	    if (!low_delay) {
+		if (mpeg2dec->first) {
+		    mpeg2dec->info.display_picture = NULL;
+		    mpeg2dec->first = 0;
+		} else {
+		    mpeg2dec->info.display_picture = other;
+		    if (other->nb_fields == 1)
+			mpeg2dec->info.display_picture_2nd = other + 1;
+		    mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[1];
+		}
+	    }
+	    if (!low_delay + !mpeg2dec->convert_start)
+		mpeg2dec->info.discard_fbuf =
+		    mpeg2dec->fbuf[!low_delay + !mpeg2dec->convert_start];
+	}
+	if (!mpeg2dec->custom_fbuf) {
+	    while (mpeg2dec->alloc_index < 3) {
+		fbuf_t * fbuf;
+
+		fbuf = &(mpeg2dec->fbuf_alloc[mpeg2dec->alloc_index++].fbuf);
+		fbuf->id = NULL;
+		if (mpeg2dec->convert_start) {    
+		    fbuf->buf[0] =
+			(uint8_t *) mpeg2_malloc (mpeg2dec->convert_size[0],
+						  ALLOC_CONVERTED);
+		    fbuf->buf[1] = fbuf->buf[0] + mpeg2dec->convert_size[1];
+		    fbuf->buf[2] = fbuf->buf[0] + mpeg2dec->convert_size[2];
+		} else {
+		    int size;
+		    size = mpeg2dec->decoder.width * mpeg2dec->decoder.height;
+		    fbuf->buf[0] = (uint8_t *) mpeg2_malloc (6 * size >> 2,
+							     ALLOC_YUV);
+		    fbuf->buf[1] = fbuf->buf[0] + size;
+		    fbuf->buf[2] = fbuf->buf[1] + (size >> 2);
+		}
+	    }
+	    mpeg2_set_fbuf (mpeg2dec, type);
+	}
+    } else {
+	decoder->second_field = 1;
+	mpeg2dec->info.current_picture_2nd = picture;
+	mpeg2dec->info.user_data = NULL; mpeg2dec->info.user_data_len = 0;
+	if (low_delay || type == PIC_FLAG_CODING_TYPE_B)
+	    mpeg2dec->info.display_picture_2nd = picture;
+    }
+    mpeg2dec->ext_state = PIC_CODING_EXT;
+
+    picture->temporal_reference = (buffer[0] << 2) | (buffer[1] >> 6);
+
+    decoder->coding_type = type;
+    picture->flags |= type;
+
+    if (type == PIC_FLAG_CODING_TYPE_P || type == PIC_FLAG_CODING_TYPE_B) {
+	/* forward_f_code and backward_f_code - used in mpeg1 only */
+	decoder->f_motion.f_code[1] = (buffer[3] >> 2) & 1;
+	decoder->f_motion.f_code[0] =
+	    (((buffer[3] << 1) | (buffer[4] >> 7)) & 7) - 1;
+	decoder->b_motion.f_code[1] = (buffer[4] >> 6) & 1;
+	decoder->b_motion.f_code[0] = ((buffer[4] >> 3) & 7) - 1;
     }
 
+    /* XXXXXX decode extra_information_picture as well */
+
+    picture->nb_fields = 2;
+
     return 0;
 }
 
-static int header_process_picture_coding_extension (picture_t * picture, uint8_t * buffer)
+static int picture_coding_ext (mpeg2dec_t * mpeg2dec)
 {
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    picture_t * picture = mpeg2dec->picture;
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    uint32_t flags;
+
     /* pre subtract 1 for use later in compute_motion_vector */
-    picture->f_motion.f_code[0] = (buffer[0] & 15) - 1;
-    picture->f_motion.f_code[1] = (buffer[1] >> 4) - 1;
-    picture->b_motion.f_code[0] = (buffer[1] & 15) - 1;
-    picture->b_motion.f_code[1] = (buffer[2] >> 4) - 1;
-
-    picture->intra_dc_precision = (buffer[2] >> 2) & 3;
-    picture->picture_structure = buffer[2] & 3;
-    picture->frame_pred_frame_dct = (buffer[3] >> 6) & 1;
-    picture->concealment_motion_vectors = (buffer[3] >> 5) & 1;
-    picture->q_scale_type = (buffer[3] >> 4) & 1;
-    picture->intra_vlc_format = (buffer[3] >> 3) & 1;
-
-    if (buffer[3] & 4)	/* alternate_scan */
-	picture->scan = scan_alt;
-    else
-	picture->scan = scan_norm;
-
-    /* these are not used by the decoder */
-    picture->top_field_first = buffer[3] >> 7;
-    picture->repeat_first_field = (buffer[3] >> 1) & 1;
-    picture->progressive_frame = buffer[4] >> 7;
-
-    // repeat_first implementation by A'rpi/ESP-team, based on libmpeg3:
-    picture->display_time=100;
-    if(picture->repeat_first_field){
-        if(picture->progressive_sequence){
-            if(picture->top_field_first)
-                picture->display_time+=200;
-            else
-                picture->display_time+=100;
-        } else
-        if(picture->progressive_frame){
-                picture->display_time+=50;
-        }
+    decoder->f_motion.f_code[0] = (buffer[0] & 15) - 1;
+    decoder->f_motion.f_code[1] = (buffer[1] >> 4) - 1;
+    decoder->b_motion.f_code[0] = (buffer[1] & 15) - 1;
+    decoder->b_motion.f_code[1] = (buffer[2] >> 4) - 1;
+
+    flags = picture->flags;
+    decoder->intra_dc_precision = (buffer[2] >> 2) & 3;
+    decoder->picture_structure = buffer[2] & 3;
+    switch (decoder->picture_structure) {
+    case TOP_FIELD:
+	flags |= PIC_FLAG_TOP_FIELD_FIRST;	/* fall through */
+    case BOTTOM_FIELD:
+	picture->nb_fields = 1;
+	break;
+    case FRAME_PICTURE:
+	if (!(mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE)) {
+	    picture->nb_fields = (buffer[3] & 2) ? 3 : 2;
+	    flags |= (buffer[3] & 128) ? PIC_FLAG_TOP_FIELD_FIRST : 0;
+	} else
+	    picture->nb_fields = (buffer[3]&2) ? ((buffer[3]&128) ? 6 : 4) : 2;
+	break;
+    default:
+	return 1;
+    }
+    decoder->top_field_first = buffer[3] >> 7;
+    decoder->frame_pred_frame_dct = (buffer[3] >> 6) & 1;
+    decoder->concealment_motion_vectors = (buffer[3] >> 5) & 1;
+    decoder->q_scale_type = (buffer[3] >> 4) & 1;
+    decoder->intra_vlc_format = (buffer[3] >> 3) & 1;
+    decoder->scan = (buffer[3] & 4) ? mpeg2_scan_alt : mpeg2_scan_norm;
+    flags |= (buffer[4] & 0x80) ? PIC_FLAG_PROGRESSIVE_FRAME : 0;
+    if (buffer[4] & 0x40)
+	flags |= (((buffer[4]<<26) | (buffer[5]<<18) | (buffer[6]<<10)) &
+		  PIC_MASK_COMPOSITE_DISPLAY) | PIC_FLAG_COMPOSITE_DISPLAY;
+    picture->flags = flags;
+
+    mpeg2dec->ext_state = PIC_DISPLAY_EXT | COPYRIGHT_EXT | QUANT_MATRIX_EXT;
+
+    return 0;
+}
+
+static int picture_display_ext (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    picture_t * picture = mpeg2dec->picture;
+    int i, nb_pos;
+
+    nb_pos = picture->nb_fields;
+    if (mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE)
+	nb_pos >>= 1;
+
+    for (i = 0; i < nb_pos; i++) {
+	int x, y;
+
+	x = ((buffer[4*i] << 24) | (buffer[4*i+1] << 16) |
+	     (buffer[4*i+2] << 8) | buffer[4*i+3]) >> (11-2*i);
+	y = ((buffer[4*i+2] << 24) | (buffer[4*i+3] << 16) |
+	     (buffer[4*i+4] << 8) | buffer[4*i+5]) >> (10-2*i);
+	if (! (x & y & 1))
+	    return 1;
+	picture->display_offset[i].x = mpeg2dec->display_offset_x = x >> 1;
+	picture->display_offset[i].y = mpeg2dec->display_offset_y = y >> 1;
+    }
+    for (; i < 3; i++) {
+	picture->display_offset[i].x = mpeg2dec->display_offset_x;
+	picture->display_offset[i].y = mpeg2dec->display_offset_y;
     }
-    //temopral hack. We calc time on every field, so if we have 2 fields
-    // interlaced we'll end with double time for 1 frame
-    if( picture->picture_structure!=3 ) picture->display_time/=2;
     return 0;
 }
 
-int header_process_extension (picture_t * picture, uint8_t * buffer)
+static int copyright_ext (mpeg2dec_t * mpeg2dec)
 {
-    switch (buffer[0] & 0xf0) {
-    case 0x10:	/* sequence extension */
-	return header_process_sequence_extension (picture, buffer);
+    return 0;
+}
 
-    case 0x30:	/* quant matrix extension */
-	return header_process_quant_matrix_extension (picture, buffer);
+static int quant_matrix_ext (mpeg2dec_t * mpeg2dec)
+{
+    uint8_t * buffer = mpeg2dec->chunk_start;
+    decoder_t * decoder = &(mpeg2dec->decoder);
+    int i;
 
-    case 0x80:	/* picture coding extension */
-	return header_process_picture_coding_extension (picture, buffer);
+    if (buffer[0] & 8) {
+	for (i = 0; i < 64; i++)
+	    decoder->intra_quantizer_matrix[mpeg2_scan_norm[i]] =
+		(buffer[i] << 5) | (buffer[i+1] >> 3);
+	buffer += 64;
     }
 
+    if (buffer[0] & 4)
+	for (i = 0; i < 64; i++)
+	    decoder->non_intra_quantizer_matrix[mpeg2_scan_norm[i]] =
+		(buffer[i] << 6) | (buffer[i+1] >> 2);
+
     return 0;
 }
 
-int header_process_picture_header (picture_t *picture, uint8_t * buffer)
+int mpeg2_header_extension (mpeg2dec_t * mpeg2dec)
 {
-    picture->picture_coding_type = (buffer [1] >> 3) & 7;
+    static int (* parser[]) (mpeg2dec_t *) = {
+	0, sequence_ext, sequence_display_ext, quant_matrix_ext,
+	copyright_ext, 0, 0, picture_display_ext, picture_coding_ext
+    };
+    int ext, ext_bit;
+
+    ext = mpeg2dec->chunk_start[0] >> 4;
+    ext_bit = 1 << ext;
+
+    if (!(mpeg2dec->ext_state & ext_bit))
+	return 0;	/* ignore illegal extensions */
+    mpeg2dec->ext_state &= ~ext_bit;
+    return parser[ext] (mpeg2dec);
+}
 
-    /* forward_f_code and backward_f_code - used in mpeg1 only */
-    picture->f_motion.f_code[1] = (buffer[3] >> 2) & 1;
-    picture->f_motion.f_code[0] =
-	(((buffer[3] << 1) | (buffer[4] >> 7)) & 7) - 1;
-    picture->b_motion.f_code[1] = (buffer[4] >> 6) & 1;
-    picture->b_motion.f_code[0] = ((buffer[4] >> 3) & 7) - 1;
+int mpeg2_header_user_data (mpeg2dec_t * mpeg2dec)
+{
+    if (!mpeg2dec->info.user_data_len)
+	mpeg2dec->info.user_data = mpeg2dec->chunk_start;
+    else
+	mpeg2dec->info.user_data_len += 3;
+    mpeg2dec->info.user_data_len += (mpeg2dec->chunk_ptr - 4 -
+				     mpeg2dec->chunk_start);
+    mpeg2dec->chunk_start = mpeg2dec->chunk_ptr - 1;
+    
+    return 0;
+}
 
-    /* move in header_process_picture_header */
-        picture->second_field =
-            (picture->picture_structure != FRAME_PICTURE) &&
-            !(picture->second_field);
+int mpeg2_header_slice_start (mpeg2dec_t * mpeg2dec)
+{
+    mpeg2dec->state = ((mpeg2dec->picture->nb_fields > 1 ||
+			mpeg2dec->state == STATE_PICTURE_2ND) ?
+		       STATE_SLICE : STATE_SLICE_1ST);
+
+    if (!(mpeg2dec->nb_decode_slices))
+	mpeg2dec->picture->flags |= PIC_FLAG_SKIP;
+    else if (mpeg2dec->convert_start) {
+	int flags;
+
+	switch (mpeg2dec->decoder.picture_structure) {
+	case TOP_FIELD:		flags = CONVERT_TOP_FIELD;	break;
+	case BOTTOM_FIELD:	flags = CONVERT_BOTTOM_FIELD;	break;
+	default:
+	    flags =
+		((mpeg2dec->sequence.flags & SEQ_FLAG_PROGRESSIVE_SEQUENCE) ?
+		 CONVERT_FRAME : CONVERT_BOTH_FIELDS);
+	}
+	mpeg2dec->convert_start (mpeg2dec->convert_id,
+				 mpeg2dec->fbuf[0]->buf, flags);
+
+	mpeg2dec->decoder.convert = mpeg2dec->convert_copy;
+	mpeg2dec->decoder.fbuf_id = mpeg2dec->convert_id;
+
+	if (mpeg2dec->decoder.coding_type == B_TYPE)
+	    mpeg2_init_fbuf (&(mpeg2dec->decoder), mpeg2dec->yuv_buf[2],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index ^ 1],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index]);
+	else {
+	    mpeg2_init_fbuf (&(mpeg2dec->decoder),
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index ^ 1],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index],
+			     mpeg2dec->yuv_buf[mpeg2dec->yuv_index]);
+	    if (mpeg2dec->state == STATE_SLICE)
+		mpeg2dec->yuv_index ^= 1;
+	}
+    } else {
+	int b_type;
 
+	mpeg2dec->decoder.convert = NULL;
+	b_type = (mpeg2dec->decoder.coding_type == B_TYPE);
+	mpeg2_init_fbuf (&(mpeg2dec->decoder), mpeg2dec->fbuf[0]->buf,
+			 mpeg2dec->fbuf[b_type + 1]->buf,
+			 mpeg2dec->fbuf[b_type]->buf);
+    }
+    mpeg2dec->action = NULL;
     return 0;
 }
+
+int mpeg2_header_end (mpeg2dec_t * mpeg2dec)
+{
+    picture_t * picture;
+    int b_type;
+
+    picture = mpeg2dec->pictures;
+    if (mpeg2dec->picture < picture + 2)
+	picture = mpeg2dec->pictures + 2;
+
+    mpeg2dec->state = STATE_INVALID;
+    reset_info (&(mpeg2dec->info));
+    b_type = (mpeg2dec->decoder.coding_type == B_TYPE);
+    if (!(mpeg2dec->sequence.flags & SEQ_FLAG_LOW_DELAY)) {
+	mpeg2dec->info.display_picture = picture;
+	if (picture->nb_fields == 1)
+	    mpeg2dec->info.display_picture_2nd = picture + 1;
+	mpeg2dec->info.display_fbuf = mpeg2dec->fbuf[b_type];
+	if (!mpeg2dec->convert_start)
+	    mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[b_type + 1];
+    } else if (!mpeg2dec->convert_start)
+	mpeg2dec->info.discard_fbuf = mpeg2dec->fbuf[b_type];
+    mpeg2dec->action = mpeg2_seek_sequence;
+    return STATE_END;
+}
diff --git a/libmpeg2/idct.c b/libmpeg2/idct.c
index 1e869c37de..bcae078156 100644
--- a/libmpeg2/idct.c
+++ b/libmpeg2/idct.c
@@ -1,12 +1,10 @@
 /*
  * idct.c
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
- *
- * Portions of this code are from the MPEG software simulation group
- * idct implementation. This code will be replaced with a new
- * implementation soon.
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -23,27 +21,14 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-/**********************************************************/
-/* inverse two dimensional DCT, Chen-Wang algorithm */
-/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984) */
-/* 32-bit integer arithmetic (8 bit coefficients) */
-/* 11 mults, 29 adds per DCT */
-/* sE, 18.8.91 */
-/**********************************************************/
-/* coefficients extended to 12 bit for IEEE1180-1990 */
-/* compliance sE, 2.1.94 */
-/**********************************************************/
-
-/* this code assumes >> to be a two's-complement arithmetic */
-/* right shift: (-2)>>1 == -1 , (-3)>>1 == -2 */
-
 #include "config.h"
 
-#include <stdio.h>
+#include <stdlib.h>
 #include <inttypes.h>
 
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
-#include "mm_accel.h"
+#include "attributes.h"
 
 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -53,199 +38,131 @@
 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
 
 /* idct main entry point  */
-void (*idct_block_copy) (int16_t * block, uint8_t * dest, int stride);
-void (*idct_block_add) (int16_t * block, uint8_t * dest, int stride);
-
-static void idct_block_copy_c (int16_t *block, uint8_t * dest, int stride);
-static void idct_block_add_c (int16_t *block, uint8_t * dest, int stride);
+void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride);	/* assigned by mpeg2_idct_init () */
+void (* mpeg2_idct_add) (int last, int16_t * block,	/* assigned by mpeg2_idct_init () */
+			 uint8_t * dest, int stride);
 
 static uint8_t clip_lut[1024];
-#define CLIP(i) ((clip_lut+384)[ (i)])
+#define CLIP(i) ((clip_lut+384)[(i)])	/* table lookup; i must lie in [-384, 639] */
 
-void idct_init (void)
-{
-#ifdef ARCH_X86
-    if (config.flags & MM_ACCEL_X86_MMXEXT) {
-	printf ("libmpeg2: Using MMXEXT for IDCT transform\n");
-	idct_block_copy = idct_block_copy_mmxext;
-	idct_block_add = idct_block_add_mmxext;
-	idct_mmx_init ();
-    } else if (config.flags & MM_ACCEL_X86_MMX) {
-	printf ("libmpeg2: Using MMX for IDCT transform\n");
-	idct_block_copy = idct_block_copy_mmx;
-	idct_block_add = idct_block_add_mmx;
-	idct_mmx_init ();
-    } else
-#endif
-#ifdef LIBMPEG2_MLIB
-    if (config.flags & MM_ACCEL_MLIB) {
-	printf ("libmpeg2: Using mlib for IDCT transform\n");
-	idct_block_copy = idct_block_copy_mlib;
-	idct_block_add = idct_block_add_mlib;
-    } else
+#if 0	/* reference form: four multiplies per butterfly */
+#define BUTTERFLY(t0,t1,W0,W1,d0,d1)	\
+do {					\
+    t0 = (W0) * (d0) + (W1) * (d1);	\
+    t1 = (W0) * (d1) - (W1) * (d0);	\
+} while (0)
+#else	/* algebraically the same, three multiplies: W0 * (d0 + d1) is shared */
+#define BUTTERFLY(t0,t1,W0,W1,d0,d1)	\
+do {					\
+    int tmp = (W0) * ((d0) + (d1));	\
+    t0 = tmp + ((W1) - (W0)) * (d1);	\
+    t1 = tmp - ((W1) + (W0)) * (d0);	\
+} while (0)
 #endif
-    {
-	int i;
-
-	printf ("libmpeg2: No accelerated IDCT transform found\n");
-	idct_block_copy = idct_block_copy_c;
-	idct_block_add = idct_block_add_c;
-	for (i = -384; i < 640; i++)
-	    clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
-    }
-}
 
-/* row (horizontal) IDCT
- *
- * 7 pi 1
- * dst[k] = sum c[l] * src[l] * cos ( -- * ( k + - ) * l )
- * l=0 8 2
- *
- * where: c[0] = 128
- * c[1..7] = 128*sqrt (2)
- */
-
-static inline void idct_row (int16_t * block)
+static inline void idct_row (int16_t * const block)
 {
-    int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-
-    x1 = block[4] << 11;
-    x2 = block[6];
-    x3 = block[2];
-    x4 = block[1];
-    x5 = block[7];
-    x6 = block[5];
-    x7 = block[3];
+    int d0, d1, d2, d3;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+    int t0, t1, t2, t3;
 
     /* shortcut */
-    if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) {
-	block[0] = block[1] = block[2] = block[3] = block[4] =
-	    block[5] = block[6] = block[7] = block[0]<<3;
+    if (likely (!(block[1] | block[2] | block[3] | block[4] |
+		  block[5] | block[6] | block[7]))) {
+	/* Only block[0] is non-zero: all eight outputs equal block[0] << 3.
+	 * The block is read and written as int16_t only, so nothing here
+	 * depends on type-punning it through int32_t * or on its alignment. */
+	const int16_t dc = (int16_t) (block[0] << 3);
+	block[0] = dc;	block[1] = dc;	block[2] = dc;	block[3] = dc;
+	block[4] = dc;	block[5] = dc;	block[6] = dc;	block[7] = dc;
 	return;
     }
 
-    x0 = (block[0] << 11) + 128; /* for proper rounding in the fourth stage */
-
-    /* first stage */
-    x8 = W7 * (x4 + x5);
-    x4 = x8 + (W1 - W7) * x4;
-    x5 = x8 - (W1 + W7) * x5;
-    x8 = W3 * (x6 + x7);
-    x6 = x8 - (W3 - W5) * x6;
-    x7 = x8 - (W3 + W5) * x7;
- 
-    /* second stage */
-    x8 = x0 + x1;
-    x0 -= x1;
-    x1 = W6 * (x3 + x2);
-    x2 = x1 - (W2 + W6) * x2;
-    x3 = x1 + (W2 - W6) * x3;
-    x1 = x4 + x6;
-    x4 -= x6;
-    x6 = x5 + x7;
-    x5 -= x7;
- 
-    /* third stage */
-    x7 = x8 + x3;
-    x8 -= x3;
-    x3 = x0 + x2;
-    x0 -= x2;
-    x2 = (181 * (x4 + x5) + 128) >> 8;
-    x4 = (181 * (x4 - x5) + 128) >> 8;
- 
-    /* fourth stage */
-    block[0] = (x7 + x1) >> 8;
-    block[1] = (x3 + x2) >> 8;
-    block[2] = (x0 + x4) >> 8;
-    block[3] = (x8 + x6) >> 8;
-    block[4] = (x8 - x6) >> 8;
-    block[5] = (x0 - x4) >> 8;
-    block[6] = (x3 - x2) >> 8;
-    block[7] = (x7 - x1) >> 8;
+    d0 = (block[0] << 11) + 128;	/* +128 rounds the final >> 8 */
+    d1 = block[1];
+    d2 = block[2] << 11;
+    d3 = block[3];
+    t0 = d0 + d2;
+    t1 = d0 - d2;
+    BUTTERFLY (t2, t3, W6, W2, d3, d1);
+    a0 = t0 + t2;	/* a0..a3: contribution of block[0..3] */
+    a1 = t1 + t3;
+    a2 = t1 - t3;
+    a3 = t0 - t2;
+
+    d0 = block[4];
+    d1 = block[5];
+    d2 = block[6];
+    d3 = block[7];
+    BUTTERFLY (t0, t1, W7, W1, d3, d0);
+    BUTTERFLY (t2, t3, W3, W5, d1, d2);
+    b0 = t0 + t2;	/* b0..b3: contribution of block[4..7] */
+    b3 = t1 + t3;
+    t0 -= t2;
+    t1 -= t3;
+    b1 = ((t0 + t1) * 181) >> 8;	/* 181/256 ~= 1/sqrt (2) */
+    b2 = ((t0 - t1) * 181) >> 8;
+
+    block[0] = (a0 + b0) >> 8;	/* outputs mirror: a + b, then a - b */
+    block[1] = (a1 + b1) >> 8;
+    block[2] = (a2 + b2) >> 8;
+    block[3] = (a3 + b3) >> 8;
+    block[4] = (a3 - b3) >> 8;
+    block[5] = (a2 - b2) >> 8;
+    block[6] = (a1 - b1) >> 8;
+    block[7] = (a0 - b0) >> 8;
 }
 
-/* column (vertical) IDCT
- *
- * 7 pi 1
- * dst[8*k] = sum c[l] * src[8*l] * cos ( -- * ( k + - ) * l )
- * l=0 8 2
- *
- * where: c[0] = 1/1024
- * c[1..7] = (1/1024)*sqrt (2)
- */
-
-static inline void idct_col (int16_t *block)
+static inline void idct_col (int16_t * const block)
 {
-    int x0, x1, x2, x3, x4, x5, x6, x7, x8;
-
-    /* shortcut */
-    x1 = block [8*4] << 8;
-    x2 = block [8*6];
-    x3 = block [8*2];
-    x4 = block [8*1];
-    x5 = block [8*7];
-    x6 = block [8*5];
-    x7 = block [8*3];
-
-#if 0
-    if (! (x1 | x2 | x3 | x4 | x5 | x6 | x7 )) {
-	block[8*0] = block[8*1] = block[8*2] = block[8*3] = block[8*4] =
-	    block[8*5] = block[8*6] = block[8*7] = (block[8*0] + 32) >> 6;
-	return;
-    }
-#endif
-
-    x0 = (block[8*0] << 8) + 8192;
-
-    /* first stage */
-    x8 = W7 * (x4 + x5) + 4;
-    x4 = (x8 + (W1 - W7) * x4) >> 3;
-    x5 = (x8 - (W1 + W7) * x5) >> 3;
-    x8 = W3 * (x6 + x7) + 4;
-    x6 = (x8 - (W3 - W5) * x6) >> 3;
-    x7 = (x8 - (W3 + W5) * x7) >> 3;
- 
-    /* second stage */
-    x8 = x0 + x1;
-    x0 -= x1;
-    x1 = W6 * (x3 + x2) + 4;
-    x2 = (x1 - (W2 + W6) * x2) >> 3;
-    x3 = (x1 + (W2 - W6) * x3) >> 3;
-    x1 = x4 + x6;
-    x4 -= x6;
-    x6 = x5 + x7;
-    x5 -= x7;
- 
-    /* third stage */
-    x7 = x8 + x3;
-    x8 -= x3;
-    x3 = x0 + x2;
-    x0 -= x2;
-    x2 = (181 * (x4 + x5) + 128) >> 8;
-    x4 = (181 * (x4 - x5) + 128) >> 8;
- 
-    /* fourth stage */
-    block[8*0] = (x7 + x1) >> 14;
-    block[8*1] = (x3 + x2) >> 14;
-    block[8*2] = (x0 + x4) >> 14;
-    block[8*3] = (x8 + x6) >> 14;
-    block[8*4] = (x8 - x6) >> 14;
-    block[8*5] = (x0 - x4) >> 14;
-    block[8*6] = (x3 - x2) >> 14;
-    block[8*7] = (x7 - x1) >> 14;
+    int d0, d1, d2, d3;
+    int a0, a1, a2, a3, b0, b1, b2, b3;
+    int t0, t1, t2, t3;
+
+    d0 = (block[8*0] << 11) + 65536;	/* +65536 rounds the final >> 17 */
+    d1 = block[8*1];
+    d2 = block[8*2] << 11;
+    d3 = block[8*3];
+    t0 = d0 + d2;
+    t1 = d0 - d2;
+    BUTTERFLY (t2, t3, W6, W2, d3, d1);
+    a0 = t0 + t2;	/* a0..a3: contribution of rows 0..3 of this column */
+    a1 = t1 + t3;
+    a2 = t1 - t3;
+    a3 = t0 - t2;
+
+    d0 = block[8*4];
+    d1 = block[8*5];
+    d2 = block[8*6];
+    d3 = block[8*7];
+    BUTTERFLY (t0, t1, W7, W1, d3, d0);
+    BUTTERFLY (t2, t3, W3, W5, d1, d2);
+    b0 = t0 + t2;	/* b0..b3: contribution of rows 4..7 of this column */
+    b3 = t1 + t3;
+    t0 = (t0 - t2) >> 8;	/* shifted before the * 181 here, after it in idct_row */
+    t1 = (t1 - t3) >> 8;
+    b1 = (t0 + t1) * 181;	/* 181/256 ~= 1/sqrt (2) once the >> 8 above is counted */
+    b2 = (t0 - t1) * 181;
+
+    block[8*0] = (a0 + b0) >> 17;	/* outputs mirror: a + b, then a - b */
+    block[8*1] = (a1 + b1) >> 17;
+    block[8*2] = (a2 + b2) >> 17;
+    block[8*3] = (a3 + b3) >> 17;
+    block[8*4] = (a3 - b3) >> 17;
+    block[8*5] = (a2 - b2) >> 17;
+    block[8*6] = (a1 - b1) >> 17;
+    block[8*7] = (a0 - b0) >> 17;
 }
 
-void idct_block_copy_c (int16_t * block, uint8_t * dest, int stride)
+static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest,	/* IDCT in place, store clipped result, clear block */
+			       const int stride)	/* stride: bytes between successive dest rows */
 {
     int i;
 
     for (i = 0; i < 8; i++)
 	idct_row (block + 8 * i);
-
     for (i = 0; i < 8; i++)
 	idct_col (block + i);
-
-    i = 8;
     do {
 	dest[0] = CLIP (block[0]);
 	dest[1] = CLIP (block[1]);
@@ -256,33 +173,112 @@ void idct_block_copy_c (int16_t * block, uint8_t * dest, int stride)
 	dest[6] = CLIP (block[6]);
 	dest[7] = CLIP (block[7]);
 
+	block[0] = 0;	block[1] = 0;	block[2] = 0;	block[3] = 0;	/* row has been written out: */
+	block[4] = 0;	block[5] = 0;	block[6] = 0;	block[7] = 0;	/* zero its eight coefficients */
+
 	dest += stride;
 	block += 8;
     } while (--i);
 }
 
-void idct_block_add_c (int16_t * block, uint8_t * dest, int stride)
+static void mpeg2_idct_add_c (const int last, int16_t * block,	/* adds the IDCT of block to dest, clipped */
+			      uint8_t * dest, const int stride)	/* stride: bytes between successive dest rows */
 {
     int i;
 
-    for (i = 0; i < 8; i++)
-	idct_row (block + 8 * i);
-
-    for (i = 0; i < 8; i++)
-	idct_col (block + i);
+    if (last != 129 || (block[0] & 7) == 4) {	/* full transform; NOTE(review): meaning of 129 is set by the caller - confirm */
+	for (i = 0; i < 8; i++)
+	    idct_row (block + 8 * i);
+	for (i = 0; i < 8; i++)
+	    idct_col (block + i);
+	do {	/* i == 8 here, left by the loop above */
+	    dest[0] = CLIP (block[0] + dest[0]);
+	    dest[1] = CLIP (block[1] + dest[1]);
+	    dest[2] = CLIP (block[2] + dest[2]);
+	    dest[3] = CLIP (block[3] + dest[3]);
+	    dest[4] = CLIP (block[4] + dest[4]);
+	    dest[5] = CLIP (block[5] + dest[5]);
+	    dest[6] = CLIP (block[6] + dest[6]);
+	    dest[7] = CLIP (block[7] + dest[7]);
+
+	    block[0] = 0;	block[1] = 0;	block[2] = 0;	block[3] = 0;	/* row consumed: */
+	    block[4] = 0;	block[5] = 0;	block[6] = 0;	block[7] = 0;	/* zero its coefficients */
+
+	    dest += stride;
+	    block += 8;
+	} while (--i);
+    } else {	/* shortcut: one constant is added to all 64 pixels */
+	int DC;
+
+	DC = (block[0] + 4) >> 3;	/* block[0] / 8, rounded */
+	block[0] = block[63] = 0;	/* only these two entries are cleared on this path */
+	i = 8;
+	do {
+	    dest[0] = CLIP (DC + dest[0]);
+	    dest[1] = CLIP (DC + dest[1]);
+	    dest[2] = CLIP (DC + dest[2]);
+	    dest[3] = CLIP (DC + dest[3]);
+	    dest[4] = CLIP (DC + dest[4]);
+	    dest[5] = CLIP (DC + dest[5]);
+	    dest[6] = CLIP (DC + dest[6]);
+	    dest[7] = CLIP (DC + dest[7]);
+	    dest += stride;
+	} while (--i);
+    }
+}
 
-    i = 8;
-    do {
-	dest[0] = CLIP (block[0] + dest[0]);
-	dest[1] = CLIP (block[1] + dest[1]);
-	dest[2] = CLIP (block[2] + dest[2]);
-	dest[3] = CLIP (block[3] + dest[3]);
-	dest[4] = CLIP (block[4] + dest[4]);
-	dest[5] = CLIP (block[5] + dest[5]);
-	dest[6] = CLIP (block[6] + dest[6]);
-	dest[7] = CLIP (block[7] + dest[7]);
+void mpeg2_idct_init (uint32_t accel)	/* points mpeg2_idct_copy/add at one implementation, chosen from the accel bits */
+{
+#ifdef ARCH_X86
+    if (accel & MPEG2_ACCEL_X86_MMXEXT) {	/* MMXEXT is tested before plain MMX */
+	mpeg2_idct_copy = mpeg2_idct_copy_mmxext;
+	mpeg2_idct_add = mpeg2_idct_add_mmxext;
+	mpeg2_idct_mmx_init ();
+    } else if (accel & MPEG2_ACCEL_X86_MMX) {
+	mpeg2_idct_copy = mpeg2_idct_copy_mmx;
+	mpeg2_idct_add = mpeg2_idct_add_mmx;
+	mpeg2_idct_mmx_init ();
+    } else
+#endif
+#ifdef ARCH_PPC
+    if (accel & MPEG2_ACCEL_PPC_ALTIVEC) {
+	mpeg2_idct_copy = mpeg2_idct_copy_altivec;
+	mpeg2_idct_add = mpeg2_idct_add_altivec;
+	mpeg2_idct_altivec_init ();
+    } else
+#endif
+#ifdef ARCH_ALPHA
+    if (accel & MPEG2_ACCEL_ALPHA_MVI) {	/* MVI is tested before the generic Alpha variant */
+	mpeg2_idct_copy = mpeg2_idct_copy_mvi;
+	mpeg2_idct_add = mpeg2_idct_add_mvi;
+	mpeg2_idct_alpha_init (0);
+    } else if (accel & MPEG2_ACCEL_ALPHA) {
+	mpeg2_idct_copy = mpeg2_idct_copy_alpha;
+	mpeg2_idct_add = mpeg2_idct_add_alpha;
+	mpeg2_idct_alpha_init (1);
+    } else
+#endif
+#ifdef LIBMPEG2_MLIB
+    if (accel & MPEG2_ACCEL_MLIB) {
+	mpeg2_idct_copy = mpeg2_idct_copy_mlib_non_ieee;	/* copy has no IEEE alternative here */
+	mpeg2_idct_add = (getenv ("MLIB_NON_IEEE") ?	/* environment variable selects the add variant */
+			  mpeg2_idct_add_mlib_non_ieee : mpeg2_idct_add_mlib);
+    } else
+#endif
+    {	/* no accelerated variant chosen above: plain C routines */
+	extern uint8_t mpeg2_scan_norm[64];
+	extern uint8_t mpeg2_scan_alt[64];
+	int i, j;
 
-	dest += stride;
-	block += 8;
-    } while (--i);
+	mpeg2_idct_copy = mpeg2_idct_copy_c;
+	mpeg2_idct_add = mpeg2_idct_add_c;
+	for (i = -384; i < 640; i++)	/* fill the saturation table read through CLIP () */
+	    clip_lut[i+384] = (i < 0) ? 0 : ((i > 255) ? 255 : i);
+	for (i = 0; i < 64; i++) {	/* rewrites both scan tables in place; a second pass would permute them again */
+	    j = mpeg2_scan_norm[i];
+	    mpeg2_scan_norm[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);	/* bits 1,2,4,5 move down one place; bits 0,3 move up two */
+	    j = mpeg2_scan_alt[i];
+	    mpeg2_scan_alt[i] = ((j & 0x36) >> 1) | ((j & 0x09) << 2);
+	}
+    }
 }
diff --git a/libmpeg2/idct_mlib.c b/libmpeg2/idct_mlib.c
index 876ab574a4..eae2a2f1be 100644
--- a/libmpeg2/idct_mlib.c
+++ b/libmpeg2/idct_mlib.c
@@ -1,8 +1,9 @@
 /*
  * idct_mlib.c
- * Copyright (C) 1999-2001 H�kan Hjort <d95hjort@dtek.chalmers.se>
+ * Copyright (C) 1999-2002 Håkan Hjort <d95hjort@dtek.chalmers.se>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -23,25 +24,37 @@
 
 #ifdef LIBMPEG2_MLIB
 
-#include <inttypes.h>
 #include <mlib_types.h>
 #include <mlib_status.h>
 #include <mlib_sys.h>
 #include <mlib_video.h>
+#include <string.h>
+#include <inttypes.h>
 
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
 
-void idct_block_copy_mlib (int16_t * block, uint8_t * dest, int stride)
+void mpeg2_idct_add_mlib (const int last, int16_t * const block,	/* last is not used by this variant */
+			  uint8_t * const dest, const int stride)
+{
+    mlib_VideoIDCT_IEEE_S16_S16 (block, block);	/* in-place transform */
+    mlib_VideoAddBlock_U8_S16 (dest, block, stride);
+    memset (block, 0, 64 * sizeof (*block));	/* clear all 64 coefficients; size follows block's own type */
+}
+
+void mpeg2_idct_copy_mlib_non_ieee (int16_t * const block,	/* transform block straight into dest */
+				    uint8_t * const dest, const int stride)
 {
     mlib_VideoIDCT8x8_U8_S16 (dest, block, stride);
+    memset (block, 0, 64 * sizeof (*block));	/* clear all 64 coefficients; size follows block's own type */
 }
 
-void idct_block_add_mlib (int16_t * block, uint8_t * dest, int stride)
+void mpeg2_idct_add_mlib_non_ieee (const int last, int16_t * const block,	/* last is not used by this variant */
+				   uint8_t * const dest, const int stride)
 {
-    /* Should we use mlib_VideoIDCT_IEEE_S16_S16 here ?? */
-    /* it's ~30% slower. */
     mlib_VideoIDCT8x8_S16_S16 (block, block);
     mlib_VideoAddBlock_U8_S16 (dest, block, stride);
+    memset (block, 0, 64 * sizeof (*block));	/* clear all 64 coefficients; size follows block's own type */
 }
 
 #endif
diff --git a/libmpeg2/idct_mmx.c b/libmpeg2/idct_mmx.c
index 70b3b9b95e..4915b93750 100644
--- a/libmpeg2/idct_mmx.c
+++ b/libmpeg2/idct_mmx.c
@@ -1,8 +1,10 @@
 /*
  * idct_mmx.c
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -25,6 +27,7 @@
 
 #include <inttypes.h>
 
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
 #include "attributes.h"
 #include "mmx.h"
@@ -87,104 +90,107 @@ static inline void idct_row (int16_t * row, int offset,
 						   c5, -c1,  c3, -c1,	\
 						   c7,  c3,  c7, -c5 }
 
-static inline void mmxext_row_head (int16_t * row, int offset, int16_t * table)
+static inline void mmxext_row_head (int16_t * const row, const int offset,
+				    const int16_t * const table)	/* row prologue: loads row+offset and the first two table quads */
 {
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
 
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
 
-    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
 
-    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
-    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
+    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
 
-    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
+    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
 }
 
-static inline void mmxext_row (int16_t * table, int32_t * rounder)
+static inline void mmxext_row (const int16_t * const table,
+			       const int32_t * const rounder)	/* *rounder is added to both accumulators before the ROW_SHIFT shifts */
 {
-    movq_m2r (*(table+8), mm1);		// mm1 = -C5 -C1 C3 C1
-    pmaddwd_r2r (mm2, mm4);		// mm4 = C4*x0+C6*x2 C4*x4+C6*x6
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C5 -C1 C3 C1 */
+    pmaddwd_r2r (mm2, mm4);		/* mm4 = C4*x0+C6*x2 C4*x4+C6*x6 */
 
-    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x4-C6*x6 C4*x0-C6*x2
-    pshufw_r2r (mm6, mm6, 0x4e);	// mm6 = x3 x1 x7 x5
+    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x4-C6*x6 C4*x0-C6*x2 */
+    pshufw_r2r (mm6, mm6, 0x4e);	/* mm6 = x3 x1 x7 x5 */
 
-    movq_m2r (*(table+12), mm7);	// mm7 = -C7 C3 C7 C5
-    pmaddwd_r2r (mm5, mm1);		// mm1 = -C1*x5-C5*x7 C1*x1+C3*x3
+    movq_m2r (*(table+12), mm7);	/* mm7 = -C7 C3 C7 C5 */
+    pmaddwd_r2r (mm5, mm1);		/* mm1 = -C1*x5-C5*x7 C1*x1+C3*x3 */
 
-    paddd_m2r (*rounder, mm3);		// mm3 += rounder
-    pmaddwd_r2r (mm6, mm7);		// mm7 = C3*x1-C7*x3 C5*x5+C7*x7
+    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
+    pmaddwd_r2r (mm6, mm7);		/* mm7 = C3*x1-C7*x3 C5*x5+C7*x7 */
 
-    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x0-C2*x2 -C4*x4+C2*x6
-    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
+    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x0-C2*x2 -C4*x4+C2*x6 */
+    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
 
-    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C3*x5-C1*x7 C5*x1-C1*x3
-    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
+    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C3*x5-C1*x7 C5*x1-C1*x3 */
+    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
 
-    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C7*x1-C5*x3 C7*x5+C3*x7
-    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
+    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C7*x1-C5*x3 C7*x5+C3*x7 */
+    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
 
-    paddd_m2r (*rounder, mm0);		// mm0 += rounder
-    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
+    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
+    psubd_r2r (mm1, mm3);		/* mm3 = a1-b1 a0-b0 + rounder */
 
-    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
-    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
+    psrad_i2r (ROW_SHIFT, mm3);		/* mm3 = y6 y7 */
+    paddd_r2r (mm4, mm1);		/* mm1 = a1+b1 a0+b0 + rounder */
 
-    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
-    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
+    paddd_r2r (mm2, mm0);		/* mm0 = a3 a2 + rounder */
+    psrad_i2r (ROW_SHIFT, mm1);		/* mm1 = y1 y0 */
 
-    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
-    movq_r2r (mm0, mm4);		// mm4 = a3 a2 + rounder
+    paddd_r2r (mm6, mm5);		/* mm5 = b3 b2 */
+    movq_r2r (mm0, mm4);		/* mm4 = a3 a2 + rounder */
 
-    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
-    psubd_r2r (mm5, mm4);		// mm4 = a3-b3 a2-b2 + rounder
+    paddd_r2r (mm5, mm0);		/* mm0 = a3+b3 a2+b2 + rounder */
+    psubd_r2r (mm5, mm4);		/* mm4 = a3-b3 a2-b2 + rounder */
 }
 
-static inline void mmxext_row_tail (int16_t * row, int store)
+static inline void mmxext_row_tail (int16_t * const row, const int store)	/* shifts, packs and writes the eight results to row+store */
 {
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
 
-    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
+    psrad_i2r (ROW_SHIFT, mm4);		/* mm4 = y4 y5 */
 
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
 
-    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
+    packssdw_r2r (mm3, mm4);		/* mm4 = y6 y7 y4 y5 */
 
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */
 
     /* slot */
 
-    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
+    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */
 }
 
-static inline void mmxext_row_mid (int16_t * row, int store,
-				   int offset, int16_t * table)
+static inline void mmxext_row_mid (int16_t * const row, const int store,
+				   const int offset,
+				   const int16_t * const table)	/* writes the finished row to row+store while loading a row from row+offset */
 {
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
 
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    psrad_i2r (ROW_SHIFT, mm4);		// mm4 = y4 y5
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    psrad_i2r (ROW_SHIFT, mm4);		/* mm4 = y4 y5 */
 
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
 
-    packssdw_r2r (mm3, mm4);		// mm4 = y6 y7 y4 y5
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+    packssdw_r2r (mm3, mm4);		/* mm4 = y6 y7 y4 y5 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
 
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    pshufw_r2r (mm4, mm4, 0xb1);	// mm4 = y7 y6 y5 y4
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    pshufw_r2r (mm4, mm4, 0xb1);	/* mm4 = y7 y6 y5 y4 */
 
-    movq_m2r (*table, mm3);		// mm3 = -C2 -C4 C2 C4
-    movq_r2m (mm4, *(row+store+4));	// save y7 y6 y5 y4
+    movq_m2r (*table, mm3);		/* mm3 = -C2 -C4 C2 C4 */
+    movq_r2m (mm4, *(row+store+4));	/* save y7 y6 y5 y4 */
 
-    pmaddwd_r2r (mm0, mm3);		// mm3 = -C4*x4-C2*x6 C4*x0+C2*x2
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = -C4*x4-C2*x6 C4*x0+C2*x2 */
 
-    movq_m2r (*(table+4), mm4);		// mm4 = C6 C4 C6 C4
-    pshufw_r2r (mm2, mm2, 0x4e);	// mm2 = x2 x0 x6 x4
+    movq_m2r (*(table+4), mm4);		/* mm4 = C6 C4 C6 C4 */
+    pshufw_r2r (mm2, mm2, 0x4e);	/* mm2 = x2 x0 x6 x4 */
 }
 
 
@@ -199,125 +205,127 @@ static inline void mmxext_row_mid (int16_t * row, int store,
 					   c5, -c1,  c7, -c5,	\
 					   c7,  c3,  c3, -c1 }
 
-static inline void mmx_row_head (int16_t * row, int offset, int16_t * table)
+static inline void mmx_row_head (int16_t * const row, const int offset,
+				 const int16_t * const table)	/* row prologue: loads row+offset and the first three table quads */
 {
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
 
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
 
-    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
 
-    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
+    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
 
-    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
-    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
+    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
 
-    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
-    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
+    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
 }
 
-static inline void mmx_row (int16_t * table, int32_t * rounder)
+static inline void mmx_row (const int16_t * const table,
+			    const int32_t * const rounder)	/* *rounder is added to both accumulators before the ROW_SHIFT shifts */
 {
-    pmaddwd_r2r (mm2, mm4);		// mm4 = -C4*x4-C2*x6 C4*x4+C6*x6
-    punpckldq_r2r (mm5, mm5);		// mm5 = x3 x1 x3 x1
+    pmaddwd_r2r (mm2, mm4);		/* mm4 = -C4*x4-C2*x6 C4*x4+C6*x6 */
+    punpckldq_r2r (mm5, mm5);		/* mm5 = x3 x1 x3 x1 */
 
-    pmaddwd_m2r (*(table+16), mm0);	// mm0 = C4*x0-C2*x2 C4*x0-C6*x2
-    punpckhdq_r2r (mm6, mm6);		// mm6 = x7 x5 x7 x5
+    pmaddwd_m2r (*(table+16), mm0);	/* mm0 = C4*x0-C2*x2 C4*x0-C6*x2 */
+    punpckhdq_r2r (mm6, mm6);		/* mm6 = x7 x5 x7 x5 */
 
-    movq_m2r (*(table+12), mm7);	// mm7 = -C5 -C1 C7 C5
-    pmaddwd_r2r (mm5, mm1);		// mm1 = C3*x1-C7*x3 C1*x1+C3*x3
+    movq_m2r (*(table+12), mm7);	/* mm7 = -C5 -C1 C7 C5 */
+    pmaddwd_r2r (mm5, mm1);		/* mm1 = C3*x1-C7*x3 C1*x1+C3*x3 */
 
-    paddd_m2r (*rounder, mm3);		// mm3 += rounder
-    pmaddwd_r2r (mm6, mm7);		// mm7 = -C1*x5-C5*x7 C5*x5+C7*x7
+    paddd_m2r (*rounder, mm3);		/* mm3 += rounder */
+    pmaddwd_r2r (mm6, mm7);		/* mm7 = -C1*x5-C5*x7 C5*x5+C7*x7 */
 
-    pmaddwd_m2r (*(table+20), mm2);	// mm2 = C4*x4-C6*x6 -C4*x4+C2*x6
-    paddd_r2r (mm4, mm3);		// mm3 = a1 a0 + rounder
+    pmaddwd_m2r (*(table+20), mm2);	/* mm2 = C4*x4-C6*x6 -C4*x4+C2*x6 */
+    paddd_r2r (mm4, mm3);		/* mm3 = a1 a0 + rounder */
 
-    pmaddwd_m2r (*(table+24), mm5);	// mm5 = C7*x1-C5*x3 C5*x1-C1*x3
-    movq_r2r (mm3, mm4);		// mm4 = a1 a0 + rounder
+    pmaddwd_m2r (*(table+24), mm5);	/* mm5 = C7*x1-C5*x3 C5*x1-C1*x3 */
+    movq_r2r (mm3, mm4);		/* mm4 = a1 a0 + rounder */
 
-    pmaddwd_m2r (*(table+28), mm6);	// mm6 = C3*x5-C1*x7 C7*x5+C3*x7
-    paddd_r2r (mm7, mm1);		// mm1 = b1 b0
+    pmaddwd_m2r (*(table+28), mm6);	/* mm6 = C3*x5-C1*x7 C7*x5+C3*x7 */
+    paddd_r2r (mm7, mm1);		/* mm1 = b1 b0 */
 
-    paddd_m2r (*rounder, mm0);		// mm0 += rounder
-    psubd_r2r (mm1, mm3);		// mm3 = a1-b1 a0-b0 + rounder
+    paddd_m2r (*rounder, mm0);		/* mm0 += rounder */
+    psubd_r2r (mm1, mm3);		/* mm3 = a1-b1 a0-b0 + rounder */
 
-    psrad_i2r (ROW_SHIFT, mm3);		// mm3 = y6 y7
-    paddd_r2r (mm4, mm1);		// mm1 = a1+b1 a0+b0 + rounder
+    psrad_i2r (ROW_SHIFT, mm3);		/* mm3 = y6 y7 */
+    paddd_r2r (mm4, mm1);		/* mm1 = a1+b1 a0+b0 + rounder */
 
-    paddd_r2r (mm2, mm0);		// mm0 = a3 a2 + rounder
-    psrad_i2r (ROW_SHIFT, mm1);		// mm1 = y1 y0
+    paddd_r2r (mm2, mm0);		/* mm0 = a3 a2 + rounder */
+    psrad_i2r (ROW_SHIFT, mm1);		/* mm1 = y1 y0 */
 
-    paddd_r2r (mm6, mm5);		// mm5 = b3 b2
-    movq_r2r (mm0, mm7);		// mm7 = a3 a2 + rounder
+    paddd_r2r (mm6, mm5);		/* mm5 = b3 b2 */
+    movq_r2r (mm0, mm7);		/* mm7 = a3 a2 + rounder */
 
-    paddd_r2r (mm5, mm0);		// mm0 = a3+b3 a2+b2 + rounder
-    psubd_r2r (mm5, mm7);		// mm7 = a3-b3 a2-b2 + rounder
+    paddd_r2r (mm5, mm0);		/* mm0 = a3+b3 a2+b2 + rounder */
+    psubd_r2r (mm5, mm7);		/* mm7 = a3-b3 a2-b2 + rounder */
 }
 
-static inline void mmx_row_tail (int16_t * row, int store)
+static inline void mmx_row_tail (int16_t * const row, const int store)	/* shifts, packs, reorders the upper half and writes the row to row+store */
 {
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
 
-    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
+    psrad_i2r (ROW_SHIFT, mm7);		/* mm7 = y4 y5 */
 
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
 
-    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
+    packssdw_r2r (mm3, mm7);		/* mm7 = y6 y7 y4 y5 */
 
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    movq_r2r (mm7, mm4);		// mm4 = y6 y7 y4 y5
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    movq_r2r (mm7, mm4);		/* mm4 = y6 y7 y4 y5 */
 
-    pslld_i2r (16, mm7);		// mm7 = y7 0 y5 0
+    pslld_i2r (16, mm7);		/* mm7 = y7 0 y5 0 */
 
-    psrld_i2r (16, mm4);		// mm4 = 0 y6 0 y4
+    psrld_i2r (16, mm4);		/* mm4 = 0 y6 0 y4 */
 
-    por_r2r (mm4, mm7);			// mm7 = y7 y6 y5 y4
+    por_r2r (mm4, mm7);			/* mm7 = y7 y6 y5 y4 */
 
     /* slot */
 
-    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
+    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
 }
 
-static inline void mmx_row_mid (int16_t * row, int store,
-				int offset, int16_t * table)
+static inline void mmx_row_mid (int16_t * const row, const int store,
+				const int offset, const int16_t * const table)	/* writes the finished row to row+store while loading a row from row+offset */
 {
-    movq_m2r (*(row+offset), mm2);	// mm2 = x6 x4 x2 x0
-    psrad_i2r (ROW_SHIFT, mm0);		// mm0 = y3 y2
+    movq_m2r (*(row+offset), mm2);	/* mm2 = x6 x4 x2 x0 */
+    psrad_i2r (ROW_SHIFT, mm0);		/* mm0 = y3 y2 */
 
-    movq_m2r (*(row+offset+4), mm5);	// mm5 = x7 x5 x3 x1
-    psrad_i2r (ROW_SHIFT, mm7);		// mm7 = y4 y5
+    movq_m2r (*(row+offset+4), mm5);	/* mm5 = x7 x5 x3 x1 */
+    psrad_i2r (ROW_SHIFT, mm7);		/* mm7 = y4 y5 */
 
-    packssdw_r2r (mm0, mm1);		// mm1 = y3 y2 y1 y0
-    movq_r2r (mm5, mm6);		// mm6 = x7 x5 x3 x1
+    packssdw_r2r (mm0, mm1);		/* mm1 = y3 y2 y1 y0 */
+    movq_r2r (mm5, mm6);		/* mm6 = x7 x5 x3 x1 */
 
-    packssdw_r2r (mm3, mm7);		// mm7 = y6 y7 y4 y5
-    movq_r2r (mm2, mm0);		// mm0 = x6 x4 x2 x0
+    packssdw_r2r (mm3, mm7);		/* mm7 = y6 y7 y4 y5 */
+    movq_r2r (mm2, mm0);		/* mm0 = x6 x4 x2 x0 */
 
-    movq_r2m (mm1, *(row+store));	// save y3 y2 y1 y0
-    movq_r2r (mm7, mm1);		// mm1 = y6 y7 y4 y5
+    movq_r2m (mm1, *(row+store));	/* save y3 y2 y1 y0 */
+    movq_r2r (mm7, mm1);		/* mm1 = y6 y7 y4 y5 */
 
-    punpckldq_r2r (mm0, mm0);		// mm0 = x2 x0 x2 x0
-    psrld_i2r (16, mm7);		// mm7 = 0 y6 0 y4
+    punpckldq_r2r (mm0, mm0);		/* mm0 = x2 x0 x2 x0 */
+    psrld_i2r (16, mm7);		/* mm7 = 0 y6 0 y4 */
 
-    movq_m2r (*table, mm3);		// mm3 = C6 C4 C2 C4
-    pslld_i2r (16, mm1);		// mm1 = y7 0 y5 0
+    movq_m2r (*table, mm3);		/* mm3 = C6 C4 C2 C4 */
+    pslld_i2r (16, mm1);		/* mm1 = y7 0 y5 0 */
 
-    movq_m2r (*(table+4), mm4);		// mm4 = -C2 -C4 C6 C4
-    por_r2r (mm1, mm7);			// mm7 = y7 y6 y5 y4
+    movq_m2r (*(table+4), mm4);		/* mm4 = -C2 -C4 C6 C4 */
+    por_r2r (mm1, mm7);			/* mm7 = y7 y6 y5 y4 */
 
-    movq_m2r (*(table+8), mm1);		// mm1 = -C7 C3 C3 C1
-    punpckhdq_r2r (mm2, mm2);		// mm2 = x6 x4 x6 x4
+    movq_m2r (*(table+8), mm1);		/* mm1 = -C7 C3 C3 C1 */
+    punpckhdq_r2r (mm2, mm2);		/* mm2 = x6 x4 x6 x4 */
 
-    movq_r2m (mm7, *(row+store+4));	// save y7 y6 y5 y4
-    pmaddwd_r2r (mm0, mm3);		// mm3 = C4*x0+C6*x2 C4*x0+C2*x2
+    movq_r2m (mm7, *(row+store+4));	/* save y7 y6 y5 y4 */
+    pmaddwd_r2r (mm0, mm3);		/* mm3 = C4*x0+C6*x2 C4*x0+C2*x2 */
 }
 
 
 #if 0
-// C column IDCT - its just here to document the MMXEXT and MMX versions
+/* C column IDCT - its just here to document the MMXEXT and MMX versions */
 static inline void idct_col (int16_t * col, int offset)
 {
 /* multiplication - as implemented on mmx */
@@ -388,178 +396,178 @@ static inline void idct_col (int16_t * col, int offset)
 #endif
 
 
-// MMX column IDCT
-static inline void idct_col (int16_t * col, int offset)
+/* MMX column IDCT */
+static inline void idct_col (int16_t * const col, const int offset)
 {
 #define T1 13036
 #define T2 27146
 #define T3 43790
 #define C4 23170
 
-    static short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
-    static short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
-    static short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
-    static short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
+    static const short _T1[] ATTR_ALIGN(8) = {T1,T1,T1,T1};
+    static const short _T2[] ATTR_ALIGN(8) = {T2,T2,T2,T2};
+    static const short _T3[] ATTR_ALIGN(8) = {T3,T3,T3,T3};
+    static const short _C4[] ATTR_ALIGN(8) = {C4,C4,C4,C4};
 
     /* column code adapted from peter gubanov */
     /* http://www.elecard.com/peter/idct.shtml */
 
-    movq_m2r (*_T1, mm0);		// mm0 = T1
+    movq_m2r (*_T1, mm0);		/* mm0 = T1 */
 
-    movq_m2r (*(col+offset+1*8), mm1);	// mm1 = x1
-    movq_r2r (mm0, mm2);		// mm2 = T1
+    movq_m2r (*(col+offset+1*8), mm1);	/* mm1 = x1 */
+    movq_r2r (mm0, mm2);		/* mm2 = T1 */
 
-    movq_m2r (*(col+offset+7*8), mm4);	// mm4 = x7
-    pmulhw_r2r (mm1, mm0);		// mm0 = T1*x1
+    movq_m2r (*(col+offset+7*8), mm4);	/* mm4 = x7 */
+    pmulhw_r2r (mm1, mm0);		/* mm0 = T1*x1 */
 
-    movq_m2r (*_T3, mm5);		// mm5 = T3
-    pmulhw_r2r (mm4, mm2);		// mm2 = T1*x7
+    movq_m2r (*_T3, mm5);		/* mm5 = T3 */
+    pmulhw_r2r (mm4, mm2);		/* mm2 = T1*x7 */
 
-    movq_m2r (*(col+offset+5*8), mm6);	// mm6 = x5
-    movq_r2r (mm5, mm7);		// mm7 = T3-1
+    movq_m2r (*(col+offset+5*8), mm6);	/* mm6 = x5 */
+    movq_r2r (mm5, mm7);		/* mm7 = T3-1 */
 
-    movq_m2r (*(col+offset+3*8), mm3);	// mm3 = x3
-    psubsw_r2r (mm4, mm0);		// mm0 = v17
+    movq_m2r (*(col+offset+3*8), mm3);	/* mm3 = x3 */
+    psubsw_r2r (mm4, mm0);		/* mm0 = v17 */
 
-    movq_m2r (*_T2, mm4);		// mm4 = T2
-    pmulhw_r2r (mm3, mm5);		// mm5 = (T3-1)*x3
+    movq_m2r (*_T2, mm4);		/* mm4 = T2 */
+    pmulhw_r2r (mm3, mm5);		/* mm5 = (T3-1)*x3 */
 
-    paddsw_r2r (mm2, mm1);		// mm1 = u17
-    pmulhw_r2r (mm6, mm7);		// mm7 = (T3-1)*x5
+    paddsw_r2r (mm2, mm1);		/* mm1 = u17 */
+    pmulhw_r2r (mm6, mm7);		/* mm7 = (T3-1)*x5 */
 
     /* slot */
 
-    movq_r2r (mm4, mm2);		// mm2 = T2
-    paddsw_r2r (mm3, mm5);		// mm5 = T3*x3
+    movq_r2r (mm4, mm2);		/* mm2 = T2 */
+    paddsw_r2r (mm3, mm5);		/* mm5 = T3*x3 */
 
-    pmulhw_m2r (*(col+offset+2*8), mm4);// mm4 = T2*x2
-    paddsw_r2r (mm6, mm7);		// mm7 = T3*x5
+    pmulhw_m2r (*(col+offset+2*8), mm4);/* mm4 = T2*x2 */
+    paddsw_r2r (mm6, mm7);		/* mm7 = T3*x5 */
 
-    psubsw_r2r (mm6, mm5);		// mm5 = v35
-    paddsw_r2r (mm3, mm7);		// mm7 = u35
+    psubsw_r2r (mm6, mm5);		/* mm5 = v35 */
+    paddsw_r2r (mm3, mm7);		/* mm7 = u35 */
 
-    movq_m2r (*(col+offset+6*8), mm3);	// mm3 = x6
-    movq_r2r (mm0, mm6);		// mm6 = v17
+    movq_m2r (*(col+offset+6*8), mm3);	/* mm3 = x6 */
+    movq_r2r (mm0, mm6);		/* mm6 = v17 */
 
-    pmulhw_r2r (mm3, mm2);		// mm2 = T2*x6
-    psubsw_r2r (mm5, mm0);		// mm0 = b3
+    pmulhw_r2r (mm3, mm2);		/* mm2 = T2*x6 */
+    psubsw_r2r (mm5, mm0);		/* mm0 = b3 */
 
-    psubsw_r2r (mm3, mm4);		// mm4 = v26
-    paddsw_r2r (mm6, mm5);		// mm5 = v12
+    psubsw_r2r (mm3, mm4);		/* mm4 = v26 */
+    paddsw_r2r (mm6, mm5);		/* mm5 = v12 */
 
-    movq_r2m (mm0, *(col+offset+3*8));	// save b3 in scratch0
-    movq_r2r (mm1, mm6);		// mm6 = u17
+    movq_r2m (mm0, *(col+offset+3*8));	/* save b3 in scratch0 */
+    movq_r2r (mm1, mm6);		/* mm6 = u17 */
 
-    paddsw_m2r (*(col+offset+2*8), mm2);// mm2 = u26
-    paddsw_r2r (mm7, mm6);		// mm6 = b0
+    paddsw_m2r (*(col+offset+2*8), mm2);/* mm2 = u26 */
+    paddsw_r2r (mm7, mm6);		/* mm6 = b0 */
 
-    psubsw_r2r (mm7, mm1);		// mm1 = u12
-    movq_r2r (mm1, mm7);		// mm7 = u12
+    psubsw_r2r (mm7, mm1);		/* mm1 = u12 */
+    movq_r2r (mm1, mm7);		/* mm7 = u12 */
 
-    movq_m2r (*(col+offset+0*8), mm3);	// mm3 = x0
-    paddsw_r2r (mm5, mm1);		// mm1 = u12+v12
+    movq_m2r (*(col+offset+0*8), mm3);	/* mm3 = x0 */
+    paddsw_r2r (mm5, mm1);		/* mm1 = u12+v12 */
 
-    movq_m2r (*_C4, mm0);		// mm0 = C4/2
-    psubsw_r2r (mm5, mm7);		// mm7 = u12-v12
+    movq_m2r (*_C4, mm0);		/* mm0 = C4/2 */
+    psubsw_r2r (mm5, mm7);		/* mm7 = u12-v12 */
 
-    movq_r2m (mm6, *(col+offset+5*8));	// save b0 in scratch1
-    pmulhw_r2r (mm0, mm1);		// mm1 = b1/2
+    movq_r2m (mm6, *(col+offset+5*8));	/* save b0 in scratch1 */
+    pmulhw_r2r (mm0, mm1);		/* mm1 = b1/2 */
 
-    movq_r2r (mm4, mm6);		// mm6 = v26
-    pmulhw_r2r (mm0, mm7);		// mm7 = b2/2
+    movq_r2r (mm4, mm6);		/* mm6 = v26 */
+    pmulhw_r2r (mm0, mm7);		/* mm7 = b2/2 */
 
-    movq_m2r (*(col+offset+4*8), mm5);	// mm5 = x4
-    movq_r2r (mm3, mm0);		// mm0 = x0
+    movq_m2r (*(col+offset+4*8), mm5);	/* mm5 = x4 */
+    movq_r2r (mm3, mm0);		/* mm0 = x0 */
 
-    psubsw_r2r (mm5, mm3);		// mm3 = v04
-    paddsw_r2r (mm5, mm0);		// mm0 = u04
+    psubsw_r2r (mm5, mm3);		/* mm3 = v04 */
+    paddsw_r2r (mm5, mm0);		/* mm0 = u04 */
 
-    paddsw_r2r (mm3, mm4);		// mm4 = a1
-    movq_r2r (mm0, mm5);		// mm5 = u04
+    paddsw_r2r (mm3, mm4);		/* mm4 = a1 */
+    movq_r2r (mm0, mm5);		/* mm5 = u04 */
 
-    psubsw_r2r (mm6, mm3);		// mm3 = a2
-    paddsw_r2r (mm2, mm5);		// mm5 = a0
+    psubsw_r2r (mm6, mm3);		/* mm3 = a2 */
+    paddsw_r2r (mm2, mm5);		/* mm5 = a0 */
 
-    paddsw_r2r (mm1, mm1);		// mm1 = b1
-    psubsw_r2r (mm2, mm0);		// mm0 = a3
+    paddsw_r2r (mm1, mm1);		/* mm1 = b1 */
+    psubsw_r2r (mm2, mm0);		/* mm0 = a3 */
 
-    paddsw_r2r (mm7, mm7);		// mm7 = b2
-    movq_r2r (mm3, mm2);		// mm2 = a2
+    paddsw_r2r (mm7, mm7);		/* mm7 = b2 */
+    movq_r2r (mm3, mm2);		/* mm2 = a2 */
 
-    movq_r2r (mm4, mm6);		// mm6 = a1
-    paddsw_r2r (mm7, mm3);		// mm3 = a2+b2
+    movq_r2r (mm4, mm6);		/* mm6 = a1 */
+    paddsw_r2r (mm7, mm3);		/* mm3 = a2+b2 */
 
-    psraw_i2r (COL_SHIFT, mm3);		// mm3 = y2
-    paddsw_r2r (mm1, mm4);		// mm4 = a1+b1
+    psraw_i2r (COL_SHIFT, mm3);		/* mm3 = y2 */
+    paddsw_r2r (mm1, mm4);		/* mm4 = a1+b1 */
 
-    psraw_i2r (COL_SHIFT, mm4);		// mm4 = y1
-    psubsw_r2r (mm1, mm6);		// mm6 = a1-b1
+    psraw_i2r (COL_SHIFT, mm4);		/* mm4 = y1 */
+    psubsw_r2r (mm1, mm6);		/* mm6 = a1-b1 */
 
-    movq_m2r (*(col+offset+5*8), mm1);	// mm1 = b0
-    psubsw_r2r (mm7, mm2);		// mm2 = a2-b2
+    movq_m2r (*(col+offset+5*8), mm1);	/* mm1 = b0 */
+    psubsw_r2r (mm7, mm2);		/* mm2 = a2-b2 */
 
-    psraw_i2r (COL_SHIFT, mm6);		// mm6 = y6
-    movq_r2r (mm5, mm7);		// mm7 = a0
+    psraw_i2r (COL_SHIFT, mm6);		/* mm6 = y6 */
+    movq_r2r (mm5, mm7);		/* mm7 = a0 */
 
-    movq_r2m (mm4, *(col+offset+1*8));	// save y1
-    psraw_i2r (COL_SHIFT, mm2);		// mm2 = y5
+    movq_r2m (mm4, *(col+offset+1*8));	/* save y1 */
+    psraw_i2r (COL_SHIFT, mm2);		/* mm2 = y5 */
 
-    movq_r2m (mm3, *(col+offset+2*8));	// save y2
-    paddsw_r2r (mm1, mm5);		// mm5 = a0+b0
+    movq_r2m (mm3, *(col+offset+2*8));	/* save y2 */
+    paddsw_r2r (mm1, mm5);		/* mm5 = a0+b0 */
 
-    movq_m2r (*(col+offset+3*8), mm4);	// mm4 = b3
-    psubsw_r2r (mm1, mm7);		// mm7 = a0-b0
+    movq_m2r (*(col+offset+3*8), mm4);	/* mm4 = b3 */
+    psubsw_r2r (mm1, mm7);		/* mm7 = a0-b0 */
 
-    psraw_i2r (COL_SHIFT, mm5);		// mm5 = y0
-    movq_r2r (mm0, mm3);		// mm3 = a3
+    psraw_i2r (COL_SHIFT, mm5);		/* mm5 = y0 */
+    movq_r2r (mm0, mm3);		/* mm3 = a3 */
 
-    movq_r2m (mm2, *(col+offset+5*8));	// save y5
-    psubsw_r2r (mm4, mm3);		// mm3 = a3-b3
+    movq_r2m (mm2, *(col+offset+5*8));	/* save y5 */
+    psubsw_r2r (mm4, mm3);		/* mm3 = a3-b3 */
 
-    psraw_i2r (COL_SHIFT, mm7);		// mm7 = y7
-    paddsw_r2r (mm0, mm4);		// mm4 = a3+b3
+    psraw_i2r (COL_SHIFT, mm7);		/* mm7 = y7 */
+    paddsw_r2r (mm0, mm4);		/* mm4 = a3+b3 */
 
-    movq_r2m (mm5, *(col+offset+0*8));	// save y0
-    psraw_i2r (COL_SHIFT, mm3);		// mm3 = y4
+    movq_r2m (mm5, *(col+offset+0*8));	/* save y0 */
+    psraw_i2r (COL_SHIFT, mm3);		/* mm3 = y4 */
 
-    movq_r2m (mm6, *(col+offset+6*8));	// save y6
-    psraw_i2r (COL_SHIFT, mm4);		// mm4 = y3
+    movq_r2m (mm6, *(col+offset+6*8));	/* save y6 */
+    psraw_i2r (COL_SHIFT, mm4);		/* mm4 = y3 */
 
-    movq_r2m (mm7, *(col+offset+7*8));	// save y7
+    movq_r2m (mm7, *(col+offset+7*8));	/* save y7 */
 
-    movq_r2m (mm3, *(col+offset+4*8));	// save y4
+    movq_r2m (mm3, *(col+offset+4*8));	/* save y4 */
 
-    movq_r2m (mm4, *(col+offset+3*8));	// save y3
+    movq_r2m (mm4, *(col+offset+3*8));	/* save y3 */
 }
 
 
-static int32_t rounder0[] ATTR_ALIGN(8) =
+static const int32_t rounder0[] ATTR_ALIGN(8) =
     rounder ((1 << (COL_SHIFT - 1)) - 0.5);
-static int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
-static int32_t rounder1[] ATTR_ALIGN(8) =
+static const int32_t rounder4[] ATTR_ALIGN(8) = rounder (0);
+static const int32_t rounder1[] ATTR_ALIGN(8) =
     rounder (1.25683487303);	/* C1*(C1/C4+C1+C7)/2 */
-static int32_t rounder7[] ATTR_ALIGN(8) =
+static const int32_t rounder7[] ATTR_ALIGN(8) =
     rounder (-0.25);		/* C1*(C7/C4+C7-C1)/2 */
-static int32_t rounder2[] ATTR_ALIGN(8) =
+static const int32_t rounder2[] ATTR_ALIGN(8) =
     rounder (0.60355339059);	/* C2 * (C6+C2)/2 */
-static int32_t rounder6[] ATTR_ALIGN(8) =
+static const int32_t rounder6[] ATTR_ALIGN(8) =
     rounder (-0.25);		/* C2 * (C6-C2)/2 */
-static int32_t rounder3[] ATTR_ALIGN(8) =
+static const int32_t rounder3[] ATTR_ALIGN(8) =
     rounder (0.087788325588);	/* C3*(-C3/C4+C3+C5)/2 */
-static int32_t rounder5[] ATTR_ALIGN(8) =
+static const int32_t rounder5[] ATTR_ALIGN(8) =
     rounder (-0.441341716183);	/* C3*(-C5/C4+C5-C3)/2 */
 
 
 #define declare_idct(idct,table,idct_row_head,idct_row,idct_row_tail,idct_row_mid)	\
-inline void idct (int16_t * block)					\
+static inline void idct (int16_t * const block)				\
 {									\
-    static int16_t table04[] ATTR_ALIGN(16) =				\
+    static const int16_t table04[] ATTR_ALIGN(16) =			\
 	table (22725, 21407, 19266, 16384, 12873,  8867, 4520);		\
-    static int16_t table17[] ATTR_ALIGN(16) =				\
+    static const int16_t table17[] ATTR_ALIGN(16) =			\
 	table (31521, 29692, 26722, 22725, 17855, 12299, 6270);		\
-    static int16_t table26[] ATTR_ALIGN(16) =				\
+    static const int16_t table26[] ATTR_ALIGN(16) =			\
 	table (29692, 27969, 25172, 21407, 16819, 11585, 5906);		\
-    static int16_t table35[] ATTR_ALIGN(16) =				\
+    static const int16_t table35[] ATTR_ALIGN(16) =			\
 	table (26722, 25172, 22654, 19266, 15137, 10426, 5315);		\
 									\
     idct_row_head (block, 0*8, table04);				\
@@ -594,7 +602,8 @@ do {					\
     packuswb_r2r (r1, r0);		\
 } while (0)
 
-static void block_copy (int16_t * block, uint8_t * dest, int stride)
+static inline void block_copy (int16_t * const block, uint8_t * dest,
+			       const int stride)
 {
     movq_m2r (*(block+0*8), mm0);
     movq_m2r (*(block+0*8+4), mm1);
@@ -626,7 +635,8 @@ do {					\
     paddsw_m2r (*(block+offset+4), r2);	\
 } while (0)
 
-static void block_add (int16_t * block, uint8_t * dest, int stride)
+static inline void block_add (int16_t * const block, uint8_t * dest,
+			      const int stride)
 {
     movq_m2r (*dest, mm1);
     pxor_r2r (mm0, mm0);
@@ -654,51 +664,150 @@ static void block_add (int16_t * block, uint8_t * dest, int stride)
 }
 
 
+static inline void block_zero (int16_t * const block)
+{
+    pxor_r2r (mm0, mm0);
+    movq_r2m (mm0, *(block+0*4));
+    movq_r2m (mm0, *(block+1*4));
+    movq_r2m (mm0, *(block+2*4));
+    movq_r2m (mm0, *(block+3*4));
+    movq_r2m (mm0, *(block+4*4));
+    movq_r2m (mm0, *(block+5*4));
+    movq_r2m (mm0, *(block+6*4));
+    movq_r2m (mm0, *(block+7*4));
+    movq_r2m (mm0, *(block+8*4));
+    movq_r2m (mm0, *(block+9*4));
+    movq_r2m (mm0, *(block+10*4));
+    movq_r2m (mm0, *(block+11*4));
+    movq_r2m (mm0, *(block+12*4));
+    movq_r2m (mm0, *(block+13*4));
+    movq_r2m (mm0, *(block+14*4));
+    movq_r2m (mm0, *(block+15*4));
+}
+
+
+#define CPU_MMXEXT 0
+#define CPU_MMX 1
+
+#define dup4(reg)			\
+do {					\
+    if (cpu != CPU_MMXEXT) {		\
+	punpcklwd_r2r (reg, reg);	\
+	punpckldq_r2r (reg, reg);	\
+    } else				\
+	pshufw_r2r (reg, reg, 0x00);	\
+} while (0)
+
+static inline void block_add_DC (int16_t * const block, uint8_t * dest,
+				 const int stride, const int cpu)
+{
+    movd_v2r ((block[0] + 4) >> 3, mm0);
+    pxor_r2r (mm1, mm1);
+    movq_m2r (*dest, mm2);
+    dup4 (mm0);
+    psubsw_r2r (mm0, mm1);
+    packuswb_r2r (mm0, mm0);
+    paddusb_r2r (mm0, mm2);
+    packuswb_r2r (mm1, mm1);
+    movq_m2r (*(dest + stride), mm3);
+    psubusb_r2r (mm1, mm2);
+    block[0] = 0;
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *dest);
+    psubusb_r2r (mm1, mm3);
+    movq_m2r (*(dest + 2*stride), mm2);
+    dest += stride;
+    movq_r2m (mm3, *dest);
+    paddusb_r2r (mm0, mm2);
+    movq_m2r (*(dest + 2*stride), mm3);
+    psubusb_r2r (mm1, mm2);
+    dest += stride;
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *dest);
+    psubusb_r2r (mm1, mm3);
+    movq_m2r (*(dest + 2*stride), mm2);
+    dest += stride;
+    movq_r2m (mm3, *dest);
+    paddusb_r2r (mm0, mm2);
+    movq_m2r (*(dest + 2*stride), mm3);
+    psubusb_r2r (mm1, mm2);
+    dest += stride;
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *dest);
+    psubusb_r2r (mm1, mm3);
+    movq_m2r (*(dest + 2*stride), mm2);
+    dest += stride;
+    movq_r2m (mm3, *dest);
+    paddusb_r2r (mm0, mm2);
+    movq_m2r (*(dest + 2*stride), mm3);
+    psubusb_r2r (mm1, mm2);
+    block[63] = 0;
+    paddusb_r2r (mm0, mm3);
+    movq_r2m (mm2, *(dest + stride));
+    psubusb_r2r (mm1, mm3);
+    movq_r2m (mm3, *(dest + 2*stride));
+}
+
+
 declare_idct (mmxext_idct, mmxext_table,
 	      mmxext_row_head, mmxext_row, mmxext_row_tail, mmxext_row_mid)
 
-void idct_block_copy_mmxext (int16_t * block, uint8_t * dest, int stride)
+void mpeg2_idct_copy_mmxext (int16_t * const block, uint8_t * const dest,
+			     const int stride)
 {
     mmxext_idct (block);
     block_copy (block, dest, stride);
+    block_zero (block);
 }
 
-void idct_block_add_mmxext (int16_t * block, uint8_t * dest, int stride)
+void mpeg2_idct_add_mmxext (const int last, int16_t * const block,
+			    uint8_t * const dest, const int stride)
 {
-    mmxext_idct (block);
-    block_add (block, dest, stride);
+    if (last != 129 || (block[0] & 7) == 4) {
+	mmxext_idct (block);
+	block_add (block, dest, stride);
+	block_zero (block);
+    } else
+	block_add_DC (block, dest, stride, CPU_MMXEXT);
 }
 
 
 declare_idct (mmx_idct, mmx_table,
 	      mmx_row_head, mmx_row, mmx_row_tail, mmx_row_mid)
 
-void idct_block_copy_mmx (int16_t * block, uint8_t * dest, int stride)
+void mpeg2_idct_copy_mmx (int16_t * const block, uint8_t * const dest,
+			  const int stride)
 {
     mmx_idct (block);
     block_copy (block, dest, stride);
+    block_zero (block);
 }
 
-void idct_block_add_mmx (int16_t * block, uint8_t * dest, int stride)
+void mpeg2_idct_add_mmx (const int last, int16_t * const block,
+			 uint8_t * const dest, const int stride)
 {
-    mmx_idct (block);
-    block_add (block, dest, stride);
+    if (last != 129 || (block[0] & 7) == 4) {
+	mmx_idct (block);
+	block_add (block, dest, stride);
+	block_zero (block);
+    } else
+	block_add_DC (block, dest, stride, CPU_MMX);
 }
 
 
-void idct_mmx_init (void)
+void mpeg2_idct_mmx_init (void)
 {
-    extern uint8_t scan_norm[64];
-    extern uint8_t scan_alt[64];
+    extern uint8_t mpeg2_scan_norm[64];
+    extern uint8_t mpeg2_scan_alt[64];
     int i, j;
 
     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
 
     for (i = 0; i < 64; i++) {
-	j = scan_norm[i];
-	scan_norm[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
-	j = scan_alt[i];
-	scan_alt[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
+	j = mpeg2_scan_norm[i];
+	mpeg2_scan_norm[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
+	j = mpeg2_scan_alt[i];
+	mpeg2_scan_alt[i] = (j & 0x38) | ((j & 6) >> 1) | ((j & 1) << 2);
     }
 }
 
diff --git a/libmpeg2/mm_accel.h b/libmpeg2/mm_accel.h
deleted file mode 100644
index 133d6acb03..0000000000
--- a/libmpeg2/mm_accel.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * oms_accel.h
- * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
- *
- * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
- *
- * mpeg2dec is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * mpeg2dec is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-// generic accelerations
-#define MM_ACCEL_MLIB		0x00000001
-
-// x86 accelerations
-#define MM_ACCEL_X86_MMX	0x80000000
-#define MM_ACCEL_X86_3DNOW	0x40000000
-#define MM_ACCEL_X86_MMXEXT	0x20000000
-
-//uint32_t mm_accel (void);
diff --git a/libmpeg2/mmx.h b/libmpeg2/mmx.h
index ac23866690..c05bfe1ccb 100644
--- a/libmpeg2/mmx.h
+++ b/libmpeg2/mmx.h
@@ -1,8 +1,10 @@
 /*
  * mmx.h
- * Copyright (C) 1997-2001 H. Dietz and R. Fisher
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -61,7 +63,12 @@ typedef	union {
 
 #define	movd_m2r(var,reg)	mmx_m2r (movd, var, reg)
 #define	movd_r2m(reg,var)	mmx_r2m (movd, reg, var)
-#define	movd_r2r(regs,regd)	mmx_r2r (movd, regs, regd)
+#define	movd_v2r(var,reg)	__asm__ __volatile__ ("movd %0, %%" #reg \
+						      : /* nothing */ \
+						      : "rm" (var))
+#define	movd_r2v(reg,var)	__asm__ __volatile__ ("movd %%" #reg ", %0" \
+						      : "=rm" (var) \
+						      : /* nothing */ )
 
 #define	movq_m2r(var,reg)	mmx_m2r (movq, var, reg)
 #define	movq_r2m(reg,var)	mmx_r2m (movq, reg, var)
@@ -196,18 +203,19 @@ typedef	union {
 
 
 #define mmx_m2ri(op,mem,reg,imm) \
-        __asm__ __volatile__ (#op " %1, %0, %%" #reg \
-                              : /* nothing */ \
-                              : "X" (mem), "X" (imm))
+	__asm__ __volatile__ (#op " %1, %0, %%" #reg \
+			      : /* nothing */ \
+			      : "m" (mem), "i" (imm))
+
 #define mmx_r2ri(op,regs,regd,imm) \
-        __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
-                              : /* nothing */ \
-                              : "X" (imm) )
+	__asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
+			      : /* nothing */ \
+			      : "i" (imm) )
 
 #define	mmx_fetch(mem,hint) \
 	__asm__ __volatile__ ("prefetch" #hint " %0" \
 			      : /* nothing */ \
-			      : "X" (mem))
+			      : "m" (mem))
 
 
 #define	maskmovq(regs,maskreg)		mmx_r2ri (maskmovq, regs, maskreg)
diff --git a/libmpeg2/motion_comp.c b/libmpeg2/motion_comp.c
index 6f4d979317..25c001584d 100644
--- a/libmpeg2/motion_comp.c
+++ b/libmpeg2/motion_comp.c
@@ -1,8 +1,10 @@
 /*
  * motion_comp.c
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -21,100 +23,102 @@
 
 #include "config.h"
 
-#include <stdio.h>
 #include <inttypes.h>
 
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
-#include "mm_accel.h"
 
-mc_functions_t mc_functions;
+mpeg2_mc_t mpeg2_mc;
 
-void motion_comp_init (void)
+void mpeg2_mc_init (uint32_t accel)
 {
-
 #ifdef ARCH_X86
-    if (config.flags & MM_ACCEL_X86_MMXEXT) {
-	printf ("libmpeg2: Using MMXEXT for motion compensation\n");
-	mc_functions = mc_functions_mmxext;
-    } else if (config.flags & MM_ACCEL_X86_3DNOW) {
-	printf ("libmpeg2: Using 3DNOW for motion compensation\n");
-	mc_functions = mc_functions_3dnow;
-    } else if (config.flags & MM_ACCEL_X86_MMX) {
-	printf ("libmpeg2: Using MMX for motion compensation\n");
-	mc_functions = mc_functions_mmx;
-    } else
+    if (accel & MPEG2_ACCEL_X86_MMXEXT)
+	mpeg2_mc = mpeg2_mc_mmxext;
+    else if (accel & MPEG2_ACCEL_X86_3DNOW)
+	mpeg2_mc = mpeg2_mc_3dnow;
+    else if (accel & MPEG2_ACCEL_X86_MMX)
+	mpeg2_mc = mpeg2_mc_mmx;
+    else
+#endif
+#ifdef ARCH_PPC
+    if (accel & MPEG2_ACCEL_PPC_ALTIVEC)
+	mpeg2_mc = mpeg2_mc_altivec;
+    else
+#endif
+#ifdef ARCH_ALPHA
+    if (accel & MPEG2_ACCEL_ALPHA)
+	mpeg2_mc = mpeg2_mc_alpha;
+    else
 #endif
 #ifdef LIBMPEG2_MLIB
-    if (config.flags & MM_ACCEL_MLIB) {
-	printf ("libmpeg2: Using mlib for motion compensation\n");
-	mc_functions = mc_functions_mlib;
-    } else
+    if (accel & MPEG2_ACCEL_MLIB)
+	mpeg2_mc = mpeg2_mc_mlib;
+    else
 #endif
-    {
-	printf ("libmpeg2: No accelerated motion compensation found\n");
-	mc_functions = mc_functions_c;
-    }
+	mpeg2_mc = mpeg2_mc_c;
 }
 
 #define avg2(a,b) ((a+b+1)>>1)
 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
 
-#define predict_(i) (ref[i])
+#define predict_o(i) (ref[i])
 #define predict_x(i) (avg2 (ref[i], ref[i+1]))
 #define predict_y(i) (avg2 (ref[i], (ref+stride)[i]))
-#define predict_xy(i) (avg4 (ref[i], ref[i+1], (ref+stride)[i], (ref+stride)[i+1]))
+#define predict_xy(i) (avg4 (ref[i], ref[i+1], \
+			     (ref+stride)[i], (ref+stride)[i+1]))
 
 #define put(predictor,i) dest[i] = predictor (i)
 #define avg(predictor,i) dest[i] = avg2 (predictor (i), dest[i])
 
 /* mc function template */
 
-#define MC_FUNC(op,xy)						\
-static void MC_##op##_##xy##16_c (uint8_t * dest, uint8_t * ref,\
-				 int stride, int height)	\
-{								\
-    do {							\
-	op (predict_##xy, 0);					\
-	op (predict_##xy, 1);					\
-	op (predict_##xy, 2);					\
-	op (predict_##xy, 3);					\
-	op (predict_##xy, 4);					\
-	op (predict_##xy, 5);					\
-	op (predict_##xy, 6);					\
-	op (predict_##xy, 7);					\
-	op (predict_##xy, 8);					\
-	op (predict_##xy, 9);					\
-	op (predict_##xy, 10);					\
-	op (predict_##xy, 11);					\
-	op (predict_##xy, 12);					\
-	op (predict_##xy, 13);					\
-	op (predict_##xy, 14);					\
-	op (predict_##xy, 15);					\
-	ref += stride;						\
-	dest += stride;						\
-    } while (--height);						\
-}								\
-static void MC_##op##_##xy##8_c (uint8_t * dest, uint8_t * ref,	\
-				int stride, int height)		\
-{								\
-    do {							\
-	op (predict_##xy, 0);					\
-	op (predict_##xy, 1);					\
-	op (predict_##xy, 2);					\
-	op (predict_##xy, 3);					\
-	op (predict_##xy, 4);					\
-	op (predict_##xy, 5);					\
-	op (predict_##xy, 6);					\
-	op (predict_##xy, 7);					\
-	ref += stride;						\
-	dest += stride;						\
-    } while (--height);						\
+#define MC_FUNC(op,xy)							\
+static void MC_##op##_##xy##_16_c (uint8_t * dest, const uint8_t * ref,	\
+				   const int stride, int height)	\
+{									\
+    do {								\
+	op (predict_##xy, 0);						\
+	op (predict_##xy, 1);						\
+	op (predict_##xy, 2);						\
+	op (predict_##xy, 3);						\
+	op (predict_##xy, 4);						\
+	op (predict_##xy, 5);						\
+	op (predict_##xy, 6);						\
+	op (predict_##xy, 7);						\
+	op (predict_##xy, 8);						\
+	op (predict_##xy, 9);						\
+	op (predict_##xy, 10);						\
+	op (predict_##xy, 11);						\
+	op (predict_##xy, 12);						\
+	op (predict_##xy, 13);						\
+	op (predict_##xy, 14);						\
+	op (predict_##xy, 15);						\
+	ref += stride;							\
+	dest += stride;							\
+    } while (--height);							\
+}									\
+static void MC_##op##_##xy##_8_c (uint8_t * dest, const uint8_t * ref,	\
+				  const int stride, int height)		\
+{									\
+    do {								\
+	op (predict_##xy, 0);						\
+	op (predict_##xy, 1);						\
+	op (predict_##xy, 2);						\
+	op (predict_##xy, 3);						\
+	op (predict_##xy, 4);						\
+	op (predict_##xy, 5);						\
+	op (predict_##xy, 6);						\
+	op (predict_##xy, 7);						\
+	ref += stride;							\
+	dest += stride;							\
+    } while (--height);							\
 }
 
 /* definitions of the actual mc functions */
 
-MC_FUNC (put,)
-MC_FUNC (avg,)
+MC_FUNC (put,o)
+MC_FUNC (avg,o)
 MC_FUNC (put,x)
 MC_FUNC (avg,x)
 MC_FUNC (put,y)
@@ -122,4 +126,4 @@ MC_FUNC (avg,y)
 MC_FUNC (put,xy)
 MC_FUNC (avg,xy)
 
-MOTION_COMP_EXTERN (c)
+MPEG2_MC_EXTERN (c)
diff --git a/libmpeg2/motion_comp_mlib.c b/libmpeg2/motion_comp_mlib.c
index 91c0fb5a87..de181c0651 100644
--- a/libmpeg2/motion_comp_mlib.c
+++ b/libmpeg2/motion_comp_mlib.c
@@ -1,8 +1,9 @@
 /*
  * motion_comp_mlib.c
- * Copyright (C) 2000-2001 H�kan Hjort <d95hjort@dtek.chalmers.se>
+ * Copyright (C) 2000-2002 Håkan Hjort <d95hjort@dtek.chalmers.se>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -23,158 +24,167 @@
 
 #ifdef LIBMPEG2_MLIB
 
-#include <inttypes.h>
 #include <mlib_types.h>
 #include <mlib_status.h>
 #include <mlib_sys.h>
 #include <mlib_video.h>
+#include <inttypes.h>
 
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
 
-static void MC_put_16_mlib (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_put_o_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
-    if (height == 16) 
-	mlib_VideoCopyRef_U8_U8_16x16 (dest, ref, stride);
+    if (height == 16)
+	mlib_VideoCopyRef_U8_U8_16x16 (dest, (uint8_t *) ref, stride);
     else
-	mlib_VideoCopyRef_U8_U8_16x8 (dest, ref, stride);
+	mlib_VideoCopyRef_U8_U8_16x8 (dest, (uint8_t *) ref, stride);
 }
 
-static void MC_put_x16_mlib (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_put_x_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
     if (height == 16)
-	mlib_VideoInterpX_U8_U8_16x16 (dest, ref, stride, stride);
+	mlib_VideoInterpX_U8_U8_16x16 (dest, (uint8_t *) ref, stride, stride);
     else
-	mlib_VideoInterpX_U8_U8_16x8 (dest, ref, stride, stride);
+	mlib_VideoInterpX_U8_U8_16x8 (dest, (uint8_t *) ref, stride, stride);
 }
 
-static void MC_put_y16_mlib (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_put_y_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
     if (height == 16)
-	mlib_VideoInterpY_U8_U8_16x16 (dest, ref, stride, stride);
+	mlib_VideoInterpY_U8_U8_16x16 (dest, (uint8_t *) ref, stride, stride);
     else
-	mlib_VideoInterpY_U8_U8_16x8 (dest, ref, stride, stride);
+	mlib_VideoInterpY_U8_U8_16x8 (dest, (uint8_t *) ref, stride, stride);
 }
 
-static void MC_put_xy16_mlib (uint8_t * dest, uint8_t * ref,
-			      int stride, int height)
+static void MC_put_xy_16_mlib (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
     if (height == 16)
-	mlib_VideoInterpXY_U8_U8_16x16 (dest, ref, stride, stride);
+	mlib_VideoInterpXY_U8_U8_16x16 (dest, (uint8_t *) ref, stride, stride);
     else
-	mlib_VideoInterpXY_U8_U8_16x8 (dest, ref, stride, stride);
+	mlib_VideoInterpXY_U8_U8_16x8 (dest, (uint8_t *) ref, stride, stride);
 }
 
-static void MC_put_8_mlib (uint8_t * dest, uint8_t * ref,
-			   int stride, int height)
+static void MC_put_o_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     if (height == 8)
-	mlib_VideoCopyRef_U8_U8_8x8 (dest, ref, stride);
+	mlib_VideoCopyRef_U8_U8_8x8 (dest, (uint8_t *) ref, stride);
     else
-	mlib_VideoCopyRef_U8_U8_8x4 (dest, ref, stride);
+	mlib_VideoCopyRef_U8_U8_8x4 (dest, (uint8_t *) ref, stride);
 }
 
-static void MC_put_x8_mlib (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_put_x_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     if (height == 8)
-	mlib_VideoInterpX_U8_U8_8x8 (dest, ref, stride, stride);
+	mlib_VideoInterpX_U8_U8_8x8 (dest, (uint8_t *) ref, stride, stride);
     else
-	mlib_VideoInterpX_U8_U8_8x4 (dest, ref, stride, stride);
+	mlib_VideoInterpX_U8_U8_8x4 (dest, (uint8_t *) ref, stride, stride);
 }
 
-static void MC_put_y8_mlib (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_put_y_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     if (height == 8)
-	mlib_VideoInterpY_U8_U8_8x8 (dest, ref, stride, stride);
+	mlib_VideoInterpY_U8_U8_8x8 (dest, (uint8_t *) ref, stride, stride);
     else
-	mlib_VideoInterpY_U8_U8_8x4 (dest, ref, stride, stride);
+	mlib_VideoInterpY_U8_U8_8x4 (dest, (uint8_t *) ref, stride, stride);
 }
 
-static void MC_put_xy8_mlib (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_put_xy_8_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
-    if (height == 8) 
-	mlib_VideoInterpXY_U8_U8_8x8 (dest, ref, stride, stride);
+    if (height == 8)
+	mlib_VideoInterpXY_U8_U8_8x8 (dest, (uint8_t *) ref, stride, stride);
     else
-	mlib_VideoInterpXY_U8_U8_8x4 (dest, ref, stride, stride);
+	mlib_VideoInterpXY_U8_U8_8x4 (dest, (uint8_t *) ref, stride, stride);
 }
 
-static void MC_avg_16_mlib (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_avg_o_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
     if (height == 16)
-	mlib_VideoCopyRefAve_U8_U8_16x16 (dest, ref, stride);
+	mlib_VideoCopyRefAve_U8_U8_16x16 (dest, (uint8_t *) ref, stride);
     else
-	mlib_VideoCopyRefAve_U8_U8_16x8 (dest, ref, stride);
+	mlib_VideoCopyRefAve_U8_U8_16x8 (dest, (uint8_t *) ref, stride);
 }
 
-static void MC_avg_x16_mlib (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_avg_x_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
     if (height == 16)
-	mlib_VideoInterpAveX_U8_U8_16x16 (dest, ref, stride, stride);
+	mlib_VideoInterpAveX_U8_U8_16x16 (dest, (uint8_t *) ref,
+					  stride, stride);
     else
-	mlib_VideoInterpAveX_U8_U8_16x8 (dest, ref, stride, stride);
+	mlib_VideoInterpAveX_U8_U8_16x8 (dest, (uint8_t *) ref,
+					 stride, stride);
 }
 
-static void MC_avg_y16_mlib (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_avg_y_16_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
     if (height == 16)
-	mlib_VideoInterpAveY_U8_U8_16x16 (dest, ref, stride, stride);
+	mlib_VideoInterpAveY_U8_U8_16x16 (dest, (uint8_t *) ref,
+					  stride, stride);
     else
-	mlib_VideoInterpAveY_U8_U8_16x8 (dest, ref, stride, stride);
+	mlib_VideoInterpAveY_U8_U8_16x8 (dest, (uint8_t *) ref,
+					 stride, stride);
 }
 
-static void MC_avg_xy16_mlib (uint8_t * dest, uint8_t * ref,
-			      int stride, int height)
+static void MC_avg_xy_16_mlib (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
     if (height == 16)
-	mlib_VideoInterpAveXY_U8_U8_16x16 (dest, ref, stride, stride);
+	mlib_VideoInterpAveXY_U8_U8_16x16 (dest, (uint8_t *) ref,
+					   stride, stride);
     else
-	mlib_VideoInterpAveXY_U8_U8_16x8 (dest, ref, stride, stride);
+	mlib_VideoInterpAveXY_U8_U8_16x8 (dest, (uint8_t *) ref,
+					  stride, stride);
 }
 
-static void MC_avg_8_mlib (uint8_t * dest, uint8_t * ref,
-			   int stride, int height)
+static void MC_avg_o_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     if (height == 8)
-	mlib_VideoCopyRefAve_U8_U8_8x8 (dest, ref, stride);
+	mlib_VideoCopyRefAve_U8_U8_8x8 (dest, (uint8_t *) ref, stride);
     else
-	mlib_VideoCopyRefAve_U8_U8_8x4 (dest, ref, stride);
+	mlib_VideoCopyRefAve_U8_U8_8x4 (dest, (uint8_t *) ref, stride);
 }
 
-static void MC_avg_x8_mlib (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_avg_x_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     if (height == 8)
-	mlib_VideoInterpAveX_U8_U8_8x8 (dest, ref, stride, stride);
+	mlib_VideoInterpAveX_U8_U8_8x8 (dest, (uint8_t *) ref, stride, stride);
     else
-	mlib_VideoInterpAveX_U8_U8_8x4 (dest, ref, stride, stride);
+	mlib_VideoInterpAveX_U8_U8_8x4 (dest, (uint8_t *) ref, stride, stride);
 }
 
-static void MC_avg_y8_mlib (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_avg_y_8_mlib (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     if (height == 8)
-	mlib_VideoInterpAveY_U8_U8_8x8 (dest, ref, stride, stride);
+	mlib_VideoInterpAveY_U8_U8_8x8 (dest, (uint8_t *) ref, stride, stride);
     else
-	mlib_VideoInterpAveY_U8_U8_8x4 (dest, ref, stride, stride);
+	mlib_VideoInterpAveY_U8_U8_8x4 (dest, (uint8_t *) ref, stride, stride);
 }
 
-static void MC_avg_xy8_mlib (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_avg_xy_8_mlib (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
     if (height == 8)
-	mlib_VideoInterpAveXY_U8_U8_8x8 (dest, ref, stride, stride);
+	mlib_VideoInterpAveXY_U8_U8_8x8 (dest, (uint8_t *) ref,
+					 stride, stride);
     else
-	mlib_VideoInterpAveXY_U8_U8_8x4 (dest, ref, stride, stride);
+	mlib_VideoInterpAveXY_U8_U8_8x4 (dest, (uint8_t *) ref,
+					 stride, stride);
 }
 
-MOTION_COMP_EXTERN (mlib)
+MPEG2_MC_EXTERN (mlib)
 
 #endif
diff --git a/libmpeg2/motion_comp_mmx.c b/libmpeg2/motion_comp_mmx.c
index 51b40bac55..33103e1738 100644
--- a/libmpeg2/motion_comp_mmx.c
+++ b/libmpeg2/motion_comp_mmx.c
@@ -1,8 +1,10 @@
 /*
  * motion_comp_mmx.c
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -25,6 +27,7 @@
 
 #include <inttypes.h>
 
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
 #include "attributes.h"
 #include "mmx.h"
@@ -35,15 +38,22 @@
 
 /* MMX code - needs a rewrite */
 
-
-
-
-
-
+/*
+ * Motion Compensation frequently needs to average values using the
+ * formula (x+y+1)>>1. Both MMXEXT and 3DNow! include one instruction
+ * to compute this, but it has been left out of classic MMX.
+ *
+ * We need to be careful of overflows when doing this computation.
+ * Rather than unpacking data to 16 bits, which reduces parallelism,
+ * we use the following formulas on each byte; mask1 clears bit 0 of
+ * every byte so the 64-bit shift cannot leak into the byte below:
+ * (x+y)>>1 == (x&y)+((x^y)>>1)
+ * (x+y+1)>>1 == (x|y)-((x^y)>>1)
+ */
 
 /* some rounding constants */
-mmx_t round1 = {0x0001000100010001LL};
-mmx_t round4 = {0x0002000200020002LL};
+static mmx_t mask1 = {0xfefefefefefefefeLL};
+static mmx_t round4 = {0x0002000200020002LL};
 
 /*
  * This code should probably be compiled with loop unrolling
@@ -59,202 +69,176 @@ static inline void mmx_zero_reg ()
     pxor_r2r (mm0, mm0);
 }
 
-static inline void mmx_average_2_U8 (uint8_t * dest,
-				     uint8_t * src1, uint8_t * src2)
+static inline void mmx_average_2_U8 (uint8_t * dest, const uint8_t * src1,
+				     const uint8_t * src2)
 {
     /* *dest = (*src1 + *src2 + 1)/ 2; */
 
-    movq_m2r (*src1, mm1);	// load 8 src1 bytes
-    movq_r2r (mm1, mm2);	// copy 8 src1 bytes
-
-    movq_m2r (*src2, mm3);	// load 8 src2 bytes
-    movq_r2r (mm3, mm4);	// copy 8 src2 bytes
-
-    punpcklbw_r2r (mm0, mm1);	// unpack low src1 bytes
-    punpckhbw_r2r (mm0, mm2);	// unpack high src1 bytes
+    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */
 
-    punpcklbw_r2r (mm0, mm3);	// unpack low src2 bytes
-    punpckhbw_r2r (mm0, mm4);	// unpack high src2 bytes
+    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */
 
-    paddw_r2r (mm3, mm1);	// add lows to mm1
-    paddw_m2r (round1, mm1);
-    psraw_i2r (1, mm1);		// /2
-
-    paddw_r2r (mm4, mm2);	// add highs to mm2
-    paddw_m2r (round1, mm2);
-    psraw_i2r (1, mm2);		// /2
-
-    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
-    movq_r2m (mm1, *dest);	// store result in dest
+    pxor_r2r (mm1, mm3);	/* xor src1 and src2 */
+    pand_m2r (mask1, mm3);	/* mask lower bits */
+    psrlq_i2r (1, mm3);		/* /2 */
+    por_r2r (mm2, mm4);		/* or src1 and src2 */
+    psubb_r2r (mm3, mm4);	/* subtract subresults */
+    movq_r2m (mm4, *dest);	/* store result in dest */
 }
 
 static inline void mmx_interp_average_2_U8 (uint8_t * dest,
-					    uint8_t * src1, uint8_t * src2)
+					    const uint8_t * src1,
+					    const uint8_t * src2)
 {
     /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */
 
-    movq_m2r (*dest, mm1);	// load 8 dest bytes
-    movq_r2r (mm1, mm2);	// copy 8 dest bytes
-
-    movq_m2r (*src1, mm3);	// load 8 src1 bytes
-    movq_r2r (mm3, mm4);	// copy 8 src1 bytes
-
-    movq_m2r (*src2, mm5);	// load 8 src2 bytes
-    movq_r2r (mm5, mm6);	// copy 8 src2 bytes
-
-    punpcklbw_r2r (mm0, mm1);	// unpack low dest bytes
-    punpckhbw_r2r (mm0, mm2);	// unpack high dest bytes
+    movq_m2r (*dest, mm1);	/* load 8 dest bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 dest bytes */
 
-    punpcklbw_r2r (mm0, mm3);	// unpack low src1 bytes
-    punpckhbw_r2r (mm0, mm4);	// unpack high src1 bytes
+    movq_m2r (*src1, mm3);	/* load 8 src1 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src1 bytes */
 
-    punpcklbw_r2r (mm0, mm5);	// unpack low src2 bytes
-    punpckhbw_r2r (mm0, mm6);	// unpack high src2 bytes
+    movq_m2r (*src2, mm5);	/* load 8 src2 bytes */
+    movq_r2r (mm5, mm6);	/* copy 8 src2 bytes */
 
-    paddw_r2r (mm5, mm3);	// add lows
-    paddw_m2r (round1, mm3);
-    psraw_i2r (1, mm3);		// /2
+    pxor_r2r (mm3, mm5);	/* xor src1 and src2 */
+    pand_m2r (mask1, mm5);	/* mask lower bits */
+    psrlq_i2r (1, mm5);		/* /2 */
+    por_r2r (mm4, mm6);		/* or src1 and src2 */
+    psubb_r2r (mm5, mm6);	/* subtract subresults */
+    movq_r2r (mm6, mm5);	/* copy subresult */
 
-    paddw_r2r (mm6, mm4);	// add highs
-    paddw_m2r (round1, mm4);
-    psraw_i2r (1, mm4);		// /2
-
-    paddw_r2r (mm3, mm1);	// add lows
-    paddw_m2r (round1, mm1);
-    psraw_i2r (1, mm1);		// /2
-
-    paddw_r2r (mm4, mm2);	// add highs
-    paddw_m2r (round1, mm2);
-    psraw_i2r (1, mm2);		// /2
-
-    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
-    movq_r2m (mm1, *dest);	// store result in dest
+    pxor_r2r (mm1, mm5);	/* xor srcavg and dest */
+    pand_m2r (mask1, mm5);	/* mask lower bits */
+    psrlq_i2r (1, mm5);		/* /2 */
+    por_r2r (mm2, mm6);		/* or srcavg and dest */
+    psubb_r2r (mm5, mm6);	/* subtract subresults */
+    movq_r2m (mm6, *dest);	/* store result in dest */
 }
 
-static inline void mmx_average_4_U8 (uint8_t * dest,
-				     uint8_t * src1, uint8_t * src2,
-				     uint8_t * src3, uint8_t * src4)
+static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
+				     const uint8_t * src2,
+				     const uint8_t * src3,
+				     const uint8_t * src4)
 {
     /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */
 
-    movq_m2r (*src1, mm1);	// load 8 src1 bytes
-    movq_r2r (mm1, mm2);	// copy 8 src1 bytes
+    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */
 
-    punpcklbw_r2r (mm0, mm1);	// unpack low src1 bytes
-    punpckhbw_r2r (mm0, mm2);	// unpack high src1 bytes
+    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
+    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */
 
-    movq_m2r (*src2, mm3);	// load 8 src2 bytes
-    movq_r2r (mm3, mm4);	// copy 8 src2 bytes
+    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */
 
-    punpcklbw_r2r (mm0, mm3);	// unpack low src2 bytes
-    punpckhbw_r2r (mm0, mm4);	// unpack high src2 bytes
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */
 
-    paddw_r2r (mm3, mm1);	// add lows
-    paddw_r2r (mm4, mm2);	// add highs
+    paddw_r2r (mm3, mm1);	/* add lows */
+    paddw_r2r (mm4, mm2);	/* add highs */
 
     /* now have partials in mm1 and mm2 */
 
-    movq_m2r (*src3, mm3);	// load 8 src3 bytes
-    movq_r2r (mm3, mm4);	// copy 8 src3 bytes
+    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */
 
-    punpcklbw_r2r (mm0, mm3);	// unpack low src3 bytes
-    punpckhbw_r2r (mm0, mm4);	// unpack high src3 bytes
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */
 
-    paddw_r2r (mm3, mm1);	// add lows
-    paddw_r2r (mm4, mm2);	// add highs
+    paddw_r2r (mm3, mm1);	/* add lows */
+    paddw_r2r (mm4, mm2);	/* add highs */
 
-    movq_m2r (*src4, mm5);	// load 8 src4 bytes
-    movq_r2r (mm5, mm6);	// copy 8 src4 bytes
+    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
+    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */
 
-    punpcklbw_r2r (mm0, mm5);	// unpack low src4 bytes
-    punpckhbw_r2r (mm0, mm6);	// unpack high src4 bytes
+    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
+    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */
 
-    paddw_r2r (mm5, mm1);	// add lows
-    paddw_r2r (mm6, mm2);	// add highs
+    paddw_r2r (mm5, mm1);	/* add lows */
+    paddw_r2r (mm6, mm2);	/* add highs */
 
     /* now have subtotal in mm1 and mm2 */
 
     paddw_m2r (round4, mm1);
-    psraw_i2r (2, mm1);		// /4
+    psraw_i2r (2, mm1);		/* /4 */
     paddw_m2r (round4, mm2);
-    psraw_i2r (2, mm2);		// /4
+    psraw_i2r (2, mm2);		/* /4 */
 
-    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
-    movq_r2m (mm1, *dest);	// store result in dest
+    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
+    movq_r2m (mm1, *dest);	/* store result in dest */
 }
 
 static inline void mmx_interp_average_4_U8 (uint8_t * dest,
-					    uint8_t * src1, uint8_t * src2,
-					    uint8_t * src3, uint8_t * src4)
+					    const uint8_t * src1,
+					    const uint8_t * src2,
+					    const uint8_t * src3,
+					    const uint8_t * src4)
 {
     /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */
 
-    movq_m2r (*src1, mm1);	// load 8 src1 bytes
-    movq_r2r (mm1, mm2);	// copy 8 src1 bytes
+    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
+    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */
 
-    punpcklbw_r2r (mm0, mm1);	// unpack low src1 bytes
-    punpckhbw_r2r (mm0, mm2);	// unpack high src1 bytes
+    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
+    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */
 
-    movq_m2r (*src2, mm3);	// load 8 src2 bytes
-    movq_r2r (mm3, mm4);	// copy 8 src2 bytes
+    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */
 
-    punpcklbw_r2r (mm0, mm3);	// unpack low src2 bytes
-    punpckhbw_r2r (mm0, mm4);	// unpack high src2 bytes
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */
 
-    paddw_r2r (mm3, mm1);	// add lows
-    paddw_r2r (mm4, mm2);	// add highs
+    paddw_r2r (mm3, mm1);	/* add lows */
+    paddw_r2r (mm4, mm2);	/* add highs */
 
     /* now have partials in mm1 and mm2 */
 
-    movq_m2r (*src3, mm3);	// load 8 src3 bytes
-    movq_r2r (mm3, mm4);	// copy 8 src3 bytes
+    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */
 
-    punpcklbw_r2r (mm0, mm3);	// unpack low src3 bytes
-    punpckhbw_r2r (mm0, mm4);	// unpack high src3 bytes
+    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
+    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */
 
-    paddw_r2r (mm3, mm1);	// add lows
-    paddw_r2r (mm4, mm2);	// add highs
+    paddw_r2r (mm3, mm1);	/* add lows */
+    paddw_r2r (mm4, mm2);	/* add highs */
 
-    movq_m2r (*src4, mm5);	// load 8 src4 bytes
-    movq_r2r (mm5, mm6);	// copy 8 src4 bytes
+    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
+    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */
 
-    punpcklbw_r2r (mm0, mm5);	// unpack low src4 bytes
-    punpckhbw_r2r (mm0, mm6);	// unpack high src4 bytes
+    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
+    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */
 
-    paddw_r2r (mm5, mm1);	// add lows
-    paddw_r2r (mm6, mm2);	// add highs
+    paddw_r2r (mm5, mm1);	/* add lows */
+    paddw_r2r (mm6, mm2);	/* add highs */
 
     paddw_m2r (round4, mm1);
-    psraw_i2r (2, mm1);		// /4
+    psraw_i2r (2, mm1);		/* /4 */
     paddw_m2r (round4, mm2);
-    psraw_i2r (2, mm2);		// /4
+    psraw_i2r (2, mm2);		/* /4 */
 
     /* now have subtotal/4 in mm1 and mm2 */
 
-    movq_m2r (*dest, mm3);	// load 8 dest bytes
-    movq_r2r (mm3, mm4);	// copy 8 dest bytes
-
-    punpcklbw_r2r (mm0, mm3);	// unpack low dest bytes
-    punpckhbw_r2r (mm0, mm4);	// unpack high dest bytes
-
-    paddw_r2r (mm3, mm1);	// add lows
-    paddw_r2r (mm4, mm2);	// add highs
+    movq_m2r (*dest, mm3);	/* load 8 dest bytes */
+    movq_r2r (mm3, mm4);	/* copy 8 dest bytes */
 
-    paddw_m2r (round1, mm1);
-    psraw_i2r (1, mm1);		// /2
-    paddw_m2r (round1, mm2);
-    psraw_i2r (1, mm2);		// /2
+    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
+    movq_r2r (mm1,mm2);		/* copy subresult */
 
-    /* now have end value in mm1 and mm2 */
-
-    packuswb_r2r (mm2, mm1);	// pack (w/ saturation)
-    movq_r2m (mm1,*dest);	// store result in dest
+    pxor_r2r (mm1, mm3);	/* xor srcavg and dest */
+    pand_m2r (mask1, mm3);	/* mask lower bits */
+    psrlq_i2r (1, mm3);		/* /2 */
+    por_r2r (mm2, mm4);		/* or srcavg and dest */
+    psubb_r2r (mm3, mm4);	/* subtract subresults */
+    movq_r2m (mm4, *dest);	/* store result in dest */
 }
 
 /*-----------------------------------------------------------------------*/
 
-static inline void MC_avg_mmx (int width, int height,
-			       uint8_t * dest, uint8_t * ref, int stride)
+static inline void MC_avg_mmx (const int width, int height, uint8_t * dest,
+			       const uint8_t * ref, const int stride)
 {
     mmx_zero_reg ();
 
@@ -269,33 +253,33 @@ static inline void MC_avg_mmx (int width, int height,
     } while (--height);
 }
 
-static void MC_avg_16_mmx (uint8_t * dest, uint8_t * ref,
-			   int stride, int height)
+static void MC_avg_o_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     MC_avg_mmx (16, height, dest, ref, stride);
 }
 
-static void MC_avg_8_mmx (uint8_t * dest, uint8_t * ref,
-			  int stride, int height)
+static void MC_avg_o_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
 {
     MC_avg_mmx (8, height, dest, ref, stride);
 }
 
 /*-----------------------------------------------------------------------*/
 
-static inline void MC_put_mmx (int width, int height,
-			       uint8_t * dest, uint8_t * ref, int stride)
+static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
+			       const uint8_t * ref, const int stride)
 {
     mmx_zero_reg ();
 
     do {
-	movq_m2r (* ref, mm1);	// load 8 ref bytes
-	movq_r2m (mm1,* dest);	// store 8 bytes at curr
+	movq_m2r (* ref, mm1);	/* load 8 ref bytes */
+	movq_r2m (mm1,* dest);	/* store 8 bytes at curr */
 
 	if (width == 16)
 	    {
-		movq_m2r (* (ref+8), mm1);	// load 8 ref bytes
-		movq_r2m (mm1,* (dest+8));	// store 8 bytes at curr
+		movq_m2r (* (ref+8), mm1);	/* load 8 ref bytes */
+		movq_r2m (mm1,* (dest+8));	/* store 8 bytes at curr */
 	    }
 
 	dest += stride;
@@ -303,14 +287,14 @@ static inline void MC_put_mmx (int width, int height,
     } while (--height);
 }
 
-static void MC_put_16_mmx (uint8_t * dest, uint8_t * ref,
-			   int stride, int height)
+static void MC_put_o_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     MC_put_mmx (16, height, dest, ref, stride);
 }
 
-static void MC_put_8_mmx (uint8_t * dest, uint8_t * ref,
-			  int stride, int height)
+static void MC_put_o_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
 {
     MC_put_mmx (8, height, dest, ref, stride);
 }
@@ -318,8 +302,8 @@ static void MC_put_8_mmx (uint8_t * dest, uint8_t * ref,
 /*-----------------------------------------------------------------------*/
 
 /* Half pixel interpolation in the x direction */
-static inline void MC_avg_x_mmx (int width, int height,
-				 uint8_t * dest, uint8_t * ref, int stride)
+static inline void MC_avg_x_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
 {
     mmx_zero_reg ();
 
@@ -334,22 +318,22 @@ static inline void MC_avg_x_mmx (int width, int height,
     } while (--height);
 }
 
-static void MC_avg_x16_mmx (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_avg_x_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     MC_avg_x_mmx (16, height, dest, ref, stride);
 }
 
-static void MC_avg_x8_mmx (uint8_t * dest, uint8_t * ref,
-			   int stride, int height)
+static void MC_avg_x_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
 {
     MC_avg_x_mmx (8, height, dest, ref, stride);
 }
 
 /*-----------------------------------------------------------------------*/
 
-static inline void MC_put_x_mmx (int width, int height,
-				 uint8_t * dest, uint8_t * ref, int stride)
+static inline void MC_put_x_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
 {
     mmx_zero_reg ();
 
@@ -364,24 +348,24 @@ static inline void MC_put_x_mmx (int width, int height,
     } while (--height);
 }
 
-static void MC_put_x16_mmx (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_put_x_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     MC_put_x_mmx (16, height, dest, ref, stride);
 }
 
-static void MC_put_x8_mmx (uint8_t * dest, uint8_t * ref,
-			   int stride, int height)
+static void MC_put_x_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
 {
     MC_put_x_mmx (8, height, dest, ref, stride);
 }
 
 /*-----------------------------------------------------------------------*/
 
-static inline void MC_avg_xy_mmx (int width, int height,
-				  uint8_t * dest, uint8_t * ref, int stride)
+static inline void MC_avg_xy_mmx (const int width, int height, uint8_t * dest,
+				  const uint8_t * ref, const int stride)
 {
-    uint8_t * ref_next = ref+stride;
+    const uint8_t * ref_next = ref + stride;
 
     mmx_zero_reg ();
 
@@ -398,24 +382,24 @@ static inline void MC_avg_xy_mmx (int width, int height,
     } while (--height);
 }
 
-static void MC_avg_xy16_mmx (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_avg_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
     MC_avg_xy_mmx (16, height, dest, ref, stride);
 }
 
-static void MC_avg_xy8_mmx (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_avg_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     MC_avg_xy_mmx (8, height, dest, ref, stride);
 }
 
 /*-----------------------------------------------------------------------*/
 
-static inline void MC_put_xy_mmx (int width, int height,
-				  uint8_t * dest, uint8_t * ref, int stride)
+static inline void MC_put_xy_mmx (const int width, int height, uint8_t * dest,
+				  const uint8_t * ref, const int stride)
 {
-    uint8_t * ref_next = ref+stride;
+    const uint8_t * ref_next = ref + stride;
 
     mmx_zero_reg ();
 
@@ -431,24 +415,24 @@ static inline void MC_put_xy_mmx (int width, int height,
     } while (--height);
 }
 
-static void MC_put_xy16_mmx (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_put_xy_16_mmx (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
     MC_put_xy_mmx (16, height, dest, ref, stride);
 }
 
-static void MC_put_xy8_mmx (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_put_xy_8_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     MC_put_xy_mmx (8, height, dest, ref, stride);
 }
 
 /*-----------------------------------------------------------------------*/
 
-static inline void MC_avg_y_mmx (int width, int height,
-				 uint8_t * dest, uint8_t * ref, int stride)
+static inline void MC_avg_y_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
 {
-    uint8_t * ref_next = ref+stride;
+    const uint8_t * ref_next = ref + stride;
 
     mmx_zero_reg ();
 
@@ -464,24 +448,24 @@ static inline void MC_avg_y_mmx (int width, int height,
     } while (--height);
 }
 
-static void MC_avg_y16_mmx (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_avg_y_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     MC_avg_y_mmx (16, height, dest, ref, stride);
 }
 
-static void MC_avg_y8_mmx (uint8_t * dest, uint8_t * ref,
-			   int stride, int height)
+static void MC_avg_y_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
 {
     MC_avg_y_mmx (8, height, dest, ref, stride);
 }
 
 /*-----------------------------------------------------------------------*/
 
-static inline void MC_put_y_mmx (int width, int height,
-				 uint8_t * dest, uint8_t * ref, int stride)
+static inline void MC_put_y_mmx (const int width, int height, uint8_t * dest,
+				 const uint8_t * ref, const int stride)
 {
-    uint8_t * ref_next = ref+stride;
+    const uint8_t * ref_next = ref + stride;
 
     mmx_zero_reg ();
 
@@ -497,20 +481,20 @@ static inline void MC_put_y_mmx (int width, int height,
     } while (--height);
 }
 
-static void MC_put_y16_mmx (uint8_t * dest, uint8_t * ref,
-			    int stride, int height)
+static void MC_put_y_16_mmx (uint8_t * dest, const uint8_t * ref,
+			     int stride, int height)
 {
     MC_put_y_mmx (16, height, dest, ref, stride);
 }
 
-static void MC_put_y8_mmx (uint8_t * dest, uint8_t * ref,
-			   int stride, int height)
+static void MC_put_y_8_mmx (uint8_t * dest, const uint8_t * ref,
+			    int stride, int height)
 {
     MC_put_y_mmx (8, height, dest, ref, stride);
 }
 
 
-MOTION_COMP_EXTERN (mmx)
+MPEG2_MC_EXTERN (mmx)
 
 
 
@@ -540,8 +524,8 @@ do {					\
 /* CPU_MMXEXT code */
 
 
-static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref,
-			      int stride)
+static inline void MC_put1_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride)
 {
     do {
 	movq_m2r (*ref, mm0);
@@ -551,8 +535,8 @@ static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref,
     } while (--height);
 }
 
-static inline void MC_put1_16 (int height, uint8_t * dest, uint8_t * ref,
-			       int stride)
+static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride)
 {
     do {
 	movq_m2r (*ref, mm0);
@@ -564,8 +548,8 @@ static inline void MC_put1_16 (int height, uint8_t * dest, uint8_t * ref,
     } while (--height);
 }
 
-static inline void MC_avg1_8 (int height, uint8_t * dest, uint8_t * ref,
-			      int stride, int cpu)
+static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int cpu)
 {
     do {
 	movq_m2r (*ref, mm0);
@@ -576,8 +560,8 @@ static inline void MC_avg1_8 (int height, uint8_t * dest, uint8_t * ref,
     } while (--height);
 }
 
-static inline void MC_avg1_16 (int height, uint8_t * dest, uint8_t * ref,
-			       int stride, int cpu)
+static inline void MC_avg1_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int cpu)
 {
     do {
 	movq_m2r (*ref, mm0);
@@ -591,8 +575,9 @@ static inline void MC_avg1_16 (int height, uint8_t * dest, uint8_t * ref,
     } while (--height);
 }
 
-static inline void MC_put2_8 (int height, uint8_t * dest, uint8_t * ref,
-			      int stride, int offset, int cpu)
+static inline void MC_put2_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int offset,
+			      const int cpu)
 {
     do {
 	movq_m2r (*ref, mm0);
@@ -603,8 +588,9 @@ static inline void MC_put2_8 (int height, uint8_t * dest, uint8_t * ref,
     } while (--height);
 }
 
-static inline void MC_put2_16 (int height, uint8_t * dest, uint8_t * ref,
-			       int stride, int offset, int cpu)
+static inline void MC_put2_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int offset,
+			       const int cpu)
 {
     do {
 	movq_m2r (*ref, mm0);
@@ -618,8 +604,9 @@ static inline void MC_put2_16 (int height, uint8_t * dest, uint8_t * ref,
     } while (--height);
 }
 
-static inline void MC_avg2_8 (int height, uint8_t * dest, uint8_t * ref,
-			      int stride, int offset, int cpu)
+static inline void MC_avg2_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int offset,
+			      const int cpu)
 {
     do {
 	movq_m2r (*ref, mm0);
@@ -631,8 +618,9 @@ static inline void MC_avg2_8 (int height, uint8_t * dest, uint8_t * ref,
     } while (--height);
 }
 
-static inline void MC_avg2_16 (int height, uint8_t * dest, uint8_t * ref,
-			       int stride, int offset, int cpu)
+static inline void MC_avg2_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int offset,
+			       const int cpu)
 {
     do {
 	movq_m2r (*ref, mm0);
@@ -650,8 +638,8 @@ static inline void MC_avg2_16 (int height, uint8_t * dest, uint8_t * ref,
 
 static mmx_t mask_one = {0x0101010101010101LL};
 
-static inline void MC_put4_8 (int height, uint8_t * dest, uint8_t * ref,
-			      int stride, int cpu)
+static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int cpu)
 {
     movq_m2r (*ref, mm0);
     movq_m2r (*(ref+1), mm1);
@@ -684,13 +672,13 @@ static inline void MC_put4_8 (int height, uint8_t * dest, uint8_t * ref,
 	movq_r2m (mm0, *dest);
 	dest += stride;
 
-	movq_r2r (mm6, mm7);	// unroll !
-	movq_r2r (mm2, mm0);	// unroll !
+	movq_r2r (mm6, mm7);	/* unroll ! */
+	movq_r2r (mm2, mm0);	/* unroll ! */
     } while (--height);
 }
 
-static inline void MC_put4_16 (int height, uint8_t * dest, uint8_t * ref,
-			       int stride, int cpu)
+static inline void MC_put4_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int cpu)
 {
     do {
 	movq_m2r (*ref, mm0);
@@ -735,8 +723,8 @@ static inline void MC_put4_16 (int height, uint8_t * dest, uint8_t * ref,
     } while (--height);
 }
 
-static inline void MC_avg4_8 (int height, uint8_t * dest, uint8_t * ref,
-			      int stride, int cpu)
+static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
+			      const int stride, const int cpu)
 {
     do {
 	movq_m2r (*ref, mm0);
@@ -764,8 +752,8 @@ static inline void MC_avg4_8 (int height, uint8_t * dest, uint8_t * ref,
     } while (--height);
 }
 
-static inline void MC_avg4_16 (int height, uint8_t * dest, uint8_t * ref,
-			       int stride, int cpu)
+static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
+			       const int stride, const int cpu)
 {
     do {
 	movq_m2r (*ref, mm0);
@@ -814,204 +802,204 @@ static inline void MC_avg4_16 (int height, uint8_t * dest, uint8_t * ref,
     } while (--height);
 }
 
-static void MC_avg_16_mmxext (uint8_t * dest, uint8_t * ref,
-			      int stride, int height)
+static void MC_avg_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
 {
     MC_avg1_16 (height, dest, ref, stride, CPU_MMXEXT);
 }
 
-static void MC_avg_8_mmxext (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_avg_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
     MC_avg1_8 (height, dest, ref, stride, CPU_MMXEXT);
 }
 
-static void MC_put_16_mmxext (uint8_t * dest, uint8_t * ref,
-			      int stride, int height)
+static void MC_put_o_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
 {
     MC_put1_16 (height, dest, ref, stride);
 }
 
-static void MC_put_8_mmxext (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_put_o_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
     MC_put1_8 (height, dest, ref, stride);
 }
 
-static void MC_avg_x16_mmxext (uint8_t * dest, uint8_t * ref,
-			       int stride, int height)
+static void MC_avg_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
 {
     MC_avg2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
 }
 
-static void MC_avg_x8_mmxext (uint8_t * dest, uint8_t * ref,
-			      int stride, int height)
+static void MC_avg_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
     MC_avg2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
 }
 
-static void MC_put_x16_mmxext (uint8_t * dest, uint8_t * ref,
-			       int stride, int height)
+static void MC_put_x_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
 {
     MC_put2_16 (height, dest, ref, stride, 1, CPU_MMXEXT);
 }
 
-static void MC_put_x8_mmxext (uint8_t * dest, uint8_t * ref,
-			      int stride, int height)
+static void MC_put_x_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
     MC_put2_8 (height, dest, ref, stride, 1, CPU_MMXEXT);
 }
 
-static void MC_avg_y16_mmxext (uint8_t * dest, uint8_t * ref,
-			       int stride, int height)
+static void MC_avg_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
 {
     MC_avg2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
 }
 
-static void MC_avg_y8_mmxext (uint8_t * dest, uint8_t * ref,
-			      int stride, int height)
+static void MC_avg_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
     MC_avg2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
 }
 
-static void MC_put_y16_mmxext (uint8_t * dest, uint8_t * ref,
-			       int stride, int height)
+static void MC_put_y_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
 {
     MC_put2_16 (height, dest, ref, stride, stride, CPU_MMXEXT);
 }
 
-static void MC_put_y8_mmxext (uint8_t * dest, uint8_t * ref,
-			      int stride, int height)
+static void MC_put_y_8_mmxext (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
     MC_put2_8 (height, dest, ref, stride, stride, CPU_MMXEXT);
 }
 
-static void MC_avg_xy16_mmxext (uint8_t * dest, uint8_t * ref,
-				int stride, int height)
+static void MC_avg_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				 int stride, int height)
 {
     MC_avg4_16 (height, dest, ref, stride, CPU_MMXEXT);
 }
 
-static void MC_avg_xy8_mmxext (uint8_t * dest, uint8_t * ref,
-			       int stride, int height)
+static void MC_avg_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
 {
     MC_avg4_8 (height, dest, ref, stride, CPU_MMXEXT);
 }
 
-static void MC_put_xy16_mmxext (uint8_t * dest, uint8_t * ref,
-				int stride, int height)
+static void MC_put_xy_16_mmxext (uint8_t * dest, const uint8_t * ref,
+				 int stride, int height)
 {
     MC_put4_16 (height, dest, ref, stride, CPU_MMXEXT);
 }
 
-static void MC_put_xy8_mmxext (uint8_t * dest, uint8_t * ref,
-			       int stride, int height)
+static void MC_put_xy_8_mmxext (uint8_t * dest, const uint8_t * ref,
+				int stride, int height)
 {
     MC_put4_8 (height, dest, ref, stride, CPU_MMXEXT);
 }
 
 
-MOTION_COMP_EXTERN (mmxext)
+MPEG2_MC_EXTERN (mmxext)
 
 
 
-static void MC_avg_16_3dnow (uint8_t * dest, uint8_t * ref,
-			      int stride, int height)
+static void MC_avg_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
     MC_avg1_16 (height, dest, ref, stride, CPU_3DNOW);
 }
 
-static void MC_avg_8_3dnow (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_avg_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
     MC_avg1_8 (height, dest, ref, stride, CPU_3DNOW);
 }
 
-static void MC_put_16_3dnow (uint8_t * dest, uint8_t * ref,
-			      int stride, int height)
+static void MC_put_o_16_3dnow (uint8_t * dest, const uint8_t * ref,
+			       int stride, int height)
 {
     MC_put1_16 (height, dest, ref, stride);
 }
 
-static void MC_put_8_3dnow (uint8_t * dest, uint8_t * ref,
-			     int stride, int height)
+static void MC_put_o_8_3dnow (uint8_t * dest, const uint8_t * ref,
+			      int stride, int height)
 {
     MC_put1_8 (height, dest, ref, stride);
 }
 
-static void MC_avg_x16_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_avg_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
 			       int stride, int height)
 {
     MC_avg2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
 }
 
-static void MC_avg_x8_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_avg_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
 			      int stride, int height)
 {
     MC_avg2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
 }
 
-static void MC_put_x16_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_put_x_16_3dnow (uint8_t * dest, const uint8_t * ref,
 			       int stride, int height)
 {
     MC_put2_16 (height, dest, ref, stride, 1, CPU_3DNOW);
 }
 
-static void MC_put_x8_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_put_x_8_3dnow (uint8_t * dest, const uint8_t * ref,
 			      int stride, int height)
 {
     MC_put2_8 (height, dest, ref, stride, 1, CPU_3DNOW);
 }
 
-static void MC_avg_y16_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_avg_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
 			       int stride, int height)
 {
     MC_avg2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
 }
 
-static void MC_avg_y8_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_avg_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
 			      int stride, int height)
 {
     MC_avg2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
 }
 
-static void MC_put_y16_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_put_y_16_3dnow (uint8_t * dest, const uint8_t * ref,
 			       int stride, int height)
 {
     MC_put2_16 (height, dest, ref, stride, stride, CPU_3DNOW);
 }
 
-static void MC_put_y8_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_put_y_8_3dnow (uint8_t * dest, const uint8_t * ref,
 			      int stride, int height)
 {
     MC_put2_8 (height, dest, ref, stride, stride, CPU_3DNOW);
 }
 
-static void MC_avg_xy16_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_avg_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
 				int stride, int height)
 {
     MC_avg4_16 (height, dest, ref, stride, CPU_3DNOW);
 }
 
-static void MC_avg_xy8_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_avg_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
 			       int stride, int height)
 {
     MC_avg4_8 (height, dest, ref, stride, CPU_3DNOW);
 }
 
-static void MC_put_xy16_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_put_xy_16_3dnow (uint8_t * dest, const uint8_t * ref,
 				int stride, int height)
 {
     MC_put4_16 (height, dest, ref, stride, CPU_3DNOW);
 }
 
-static void MC_put_xy8_3dnow (uint8_t * dest, uint8_t * ref,
+static void MC_put_xy_8_3dnow (uint8_t * dest, const uint8_t * ref,
 			       int stride, int height)
 {
     MC_put4_8 (height, dest, ref, stride, CPU_3DNOW);
 }
 
 
-MOTION_COMP_EXTERN (3dnow)
+MPEG2_MC_EXTERN (3dnow)
 
 #endif
diff --git a/libmpeg2/mpeg2.h b/libmpeg2/mpeg2.h
index a1a0ef1681..5016f4d85f 100644
--- a/libmpeg2/mpeg2.h
+++ b/libmpeg2/mpeg2.h
@@ -1,8 +1,10 @@
 /*
  * mpeg2.h
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -19,52 +21,126 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
-/* Structure for the mpeg2dec decoder */
-
-typedef struct mpeg2dec_s {
-//    vo_instance_t * output;
-
-    /* this is where we keep the state of the decoder */
-    struct picture_s * picture;
-    
-    uint32_t shift;
-    int is_display_initialized;
-    int is_sequence_needed;
-    int drop_flag;
-    int drop_frame;
-    int in_slice;
-
-    /* the maximum chunk size is determined by vbv_buffer_size */
-    /* which is 224K for MP@ML streams. */
-    /* (we make no pretenses of decoding anything more than that) */
-    /* allocated in init - gcc has problems allocating such big structures */
-    uint8_t * chunk_buffer;
-    /* pointer to current position in chunk_buffer */
-    uint8_t * chunk_ptr;
-    /* last start code ? */
-    uint8_t code;
-
-    /* ONLY for 0.2.0 release - will not stay there later */
-    int frame_rate_code;
-} mpeg2dec_t ;
-
-
-void mpeg2_init (void);
-//void mpeg2_allocate_image_buffers (picture_t * picture);
-int mpeg2_decode_data (vo_functions_t *, uint8_t * data_start, uint8_t * data_end,int framedrop);
-//void mpeg2_close (vo_functions_t *);
-//void mpeg2_drop (int flag);
-//void mpeg2_free_image_buffers (picture_t * picture)
-
-/* initialize mpegdec with a opaque user pointer */
-//void mpeg2_init (mpeg2dec_t * mpeg2dec, uint32_t mm_accel
-//		 ,vo_instance_t * output
-//		 );
-
-/* destroy everything which was allocated, shutdown the output */
-//void mpeg2_close (mpeg2dec_t * mpeg2dec);
-
-//int mpeg2_decode_data (mpeg2dec_t * mpeg2dec,
-//		       uint8_t * data_start, uint8_t * data_end);
-
-//void mpeg2_drop (mpeg2dec_t * mpeg2dec, int flag);
+#ifndef MPEG2_H
+#define MPEG2_H
+
+#define SEQ_FLAG_MPEG2 1
+#define SEQ_FLAG_CONSTRAINED_PARAMETERS 2
+#define SEQ_FLAG_PROGRESSIVE_SEQUENCE 4
+#define SEQ_FLAG_LOW_DELAY 8
+#define SEQ_FLAG_COLOUR_DESCRIPTION 16
+
+#define SEQ_MASK_VIDEO_FORMAT 0xe0
+#define SEQ_VIDEO_FORMAT_COMPONENT 0
+#define SEQ_VIDEO_FORMAT_PAL 0x20
+#define SEQ_VIDEO_FORMAT_NTSC 0x40
+#define SEQ_VIDEO_FORMAT_SECAM 0x60
+#define SEQ_VIDEO_FORMAT_MAC 0x80
+#define SEQ_VIDEO_FORMAT_UNSPECIFIED 0xa0
+
+typedef struct {
+    unsigned int width, height;
+    unsigned int chroma_width, chroma_height;
+    unsigned int byte_rate;
+    unsigned int vbv_buffer_size;
+    uint32_t flags;
+
+    unsigned int picture_width, picture_height;
+    unsigned int display_width, display_height;
+    unsigned int pixel_width, pixel_height;
+    unsigned int frame_period;
+
+    uint8_t profile_level_id;
+    uint8_t colour_primaries;
+    uint8_t transfer_characteristics;
+    uint8_t matrix_coefficients;
+} sequence_t;
+
+#define PIC_MASK_CODING_TYPE 7
+#define PIC_FLAG_CODING_TYPE_I 1
+#define PIC_FLAG_CODING_TYPE_P 2
+#define PIC_FLAG_CODING_TYPE_B 3
+#define PIC_FLAG_CODING_TYPE_D 4
+
+#define PIC_FLAG_TOP_FIELD_FIRST 8
+#define PIC_FLAG_PROGRESSIVE_FRAME 16
+#define PIC_FLAG_COMPOSITE_DISPLAY 32
+#define PIC_FLAG_SKIP 64
+#define PIC_FLAG_PTS 128
+#define PIC_MASK_COMPOSITE_DISPLAY 0xfffff000
+
+typedef struct {
+    unsigned int temporal_reference;
+    unsigned int nb_fields;
+    uint32_t pts;
+    uint32_t flags;
+    struct {
+	int x, y;
+    } display_offset[3];
+} picture_t;
+
+typedef struct {
+    uint8_t * buf[3];
+    void * id;
+} fbuf_t;
+
+typedef struct {
+    const sequence_t * sequence;
+    const picture_t * current_picture;
+    const picture_t * current_picture_2nd;
+    const fbuf_t * current_fbuf;
+    const picture_t * display_picture;
+    const picture_t * display_picture_2nd;
+    const fbuf_t * display_fbuf;
+    const fbuf_t * discard_fbuf;
+    const uint8_t * user_data;
+    int user_data_len;
+} mpeg2_info_t;
+
+typedef struct mpeg2dec_s mpeg2dec_t;
+typedef struct decoder_s decoder_t;
+
+#define STATE_SEQUENCE 1
+#define STATE_SEQUENCE_REPEATED 2
+#define STATE_GOP 3
+#define STATE_PICTURE 4
+#define STATE_SLICE_1ST 5
+#define STATE_PICTURE_2ND 6
+#define STATE_SLICE 7
+#define STATE_END 8
+#define STATE_INVALID 9
+
+struct convert_init_s;
+void mpeg2_convert (mpeg2dec_t * mpeg2dec,
+		    void (* convert) (int, int, uint32_t, void *,
+				      struct convert_init_s *), void * arg);
+void mpeg2_set_buf (mpeg2dec_t * mpeg2dec, uint8_t * buf[3], void * id);
+void mpeg2_custom_fbuf (mpeg2dec_t * mpeg2dec, int custom_fbuf);
+void mpeg2_init_fbuf (decoder_t * decoder, uint8_t * current_fbuf[3],
+		      uint8_t * forward_fbuf[3], uint8_t * backward_fbuf[3]);
+
+void mpeg2_slice (decoder_t * decoder, int code, const uint8_t * buffer);
+
+#define MPEG2_ACCEL_X86_MMX 1
+#define MPEG2_ACCEL_X86_3DNOW 2
+#define MPEG2_ACCEL_X86_MMXEXT 4
+#define MPEG2_ACCEL_PPC_ALTIVEC 1
+#define MPEG2_ACCEL_ALPHA 1
+#define MPEG2_ACCEL_ALPHA_MVI 2
+#define MPEG2_ACCEL_MLIB 0x40000000
+#define MPEG2_ACCEL_DETECT 0x80000000
+
+uint32_t mpeg2_accel (uint32_t accel);
+mpeg2dec_t * mpeg2_init (void);
+const mpeg2_info_t * mpeg2_info (mpeg2dec_t * mpeg2dec);
+void mpeg2_close (mpeg2dec_t * mpeg2dec);
+
+void mpeg2_buffer (mpeg2dec_t * mpeg2dec, uint8_t * start, uint8_t * end);
+int mpeg2_parse (mpeg2dec_t * mpeg2dec);
+
+void mpeg2_skip (mpeg2dec_t * mpeg2dec, int skip);
+void mpeg2_slice_region (mpeg2dec_t * mpeg2dec, int start, int end);
+
+void mpeg2_pts (mpeg2dec_t * mpeg2dec, uint32_t pts);
+
+#endif /* MPEG2_H */
diff --git a/libmpeg2/mpeg2_internal.h b/libmpeg2/mpeg2_internal.h
index 6f1c48425b..0e364cbf43 100644
--- a/libmpeg2/mpeg2_internal.h
+++ b/libmpeg2/mpeg2_internal.h
@@ -1,8 +1,10 @@
 /*
  * mpeg2_internal.h
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -45,40 +47,38 @@
 #define B_TYPE 3
 #define D_TYPE 4
 
-typedef struct motion_s {
+typedef struct {
     uint8_t * ref[2][3];
+    uint8_t ** ref2[2];
     int pmv[2][2];
     int f_code[2];
 } motion_t;
 
-typedef struct vo_frame_s {
-    uint8_t * base[3];	/* pointer to 3 planes */
-    void (* copy) (struct vo_frame_s * frame, uint8_t ** src);
-    void* vo;
-    void* mpi;
-#ifdef MPEG12_POSTPROC
-#define MPEG2_MBC 120
-#define MPEG2_MBR 72
-    int8_t quant_store[MPEG2_MBR+1][MPEG2_MBC+1];
-#endif
-
-//    int slice;
-//    void (* field) (struct vo_frame_s * frame, int flags);
-//    void (* draw) (struct vo_frame_s * frame);
-//    vo_instance_t * instance;
-} vo_frame_t;
-
-typedef struct picture_s {
+struct decoder_s {
     /* first, state that carries information from one macroblock to the */
-    /* next inside a slice, and is never used outside of slice_process() */
+    /* next inside a slice, and is never used outside of mpeg2_slice() */
 
     /* DCT coefficients - should be kept aligned ! */
     int16_t DCTblock[64];
 
     /* bit parsing stuff */
-    uint32_t bitstream_buf;	/* current 32 bit working set of buffer */
-    int bitstream_bits;		/* used bits in working set */
-    uint8_t * bitstream_ptr;	/* buffer with stream data */
+    uint32_t bitstream_buf;		/* current 32 bit working set */
+    int bitstream_bits;			/* used bits in working set */
+    const uint8_t * bitstream_ptr;	/* buffer with stream data */
+
+    uint8_t * dest[3];
+    uint8_t * picture_dest[3];
+    void (* convert) (void * fbuf_id, uint8_t * const * src,
+		      unsigned int v_offset);
+    void * fbuf_id;
+
+    int offset;
+    int stride;
+    int uv_stride;
+    unsigned int limit_x;
+    unsigned int limit_y_16;
+    unsigned int limit_y_8;
+    unsigned int limit_y;
 
     /* Motion vectors */
     /* The f_ and b_ correspond to the forward and backward motion */
@@ -90,9 +90,8 @@ typedef struct picture_s {
     int16_t dc_dct_pred[3];
 
     int quantizer_scale;	/* remove */
-    int current_field;		/* remove */
-    int v_offset;		/* remove */
-
+    int dmv_offset;		/* remove */
+    unsigned int v_offset;	/* remove */
 
     /* now non-slice-specific information */
 
@@ -101,16 +100,17 @@ typedef struct picture_s {
     uint8_t non_intra_quantizer_matrix [64];
 
     /* The width and height of the picture snapped to macroblock units */
-    int coded_picture_width;
-    int coded_picture_height;
+    int width;
+    int height;
+    int vertical_position_extension;
 
     /* picture header stuff */
 
     /* what type of picture this is (I, P, B, D) */
-    int picture_coding_type;
-	
+    int coding_type;
+
     /* picture coding extension stuff */
-	
+
     /* quantization factor for intra dc coefficients */
     int intra_dc_precision;
     /* top/bottom/both fields */
@@ -130,98 +130,167 @@ typedef struct picture_s {
     /* stuff derived from bitstream */
 
     /* pointer to the zigzag scan we're supposed to be using */
-    uint8_t * scan;
-
-    struct vo_frame_s * current_frame;
-    struct vo_frame_s * forward_reference_frame;
-    struct vo_frame_s * backward_reference_frame;
-    struct vo_frame_s * temp_frame; // B frame
+    const uint8_t * scan;
 
     int second_field;
 
     int mpeg1;
+};
 
-    /* these things are not needed by the decoder */
-    /* this is a temporary interface, we will build a better one later. */
-    int aspect_ratio_information;
-    int frame_rate_code;
-    int progressive_sequence;
-    int repeat_first_field;
-    int progressive_frame;
-    int bitrate;
-    
-    // added by A'rpi/ESP-team
-    int display_picture_width;
-    int display_picture_height;
-    int pp_options;
-    int display_time;
-
-    struct vo_frame_s * display_frame;
-    int slice;
-
-} picture_t;
-
-typedef struct mpeg2_config_s {
-    /* Bit flags that enable various things */
-    uint32_t flags;
-} mpeg2_config_t;
-
-/* The only global variable, */
-/* the config struct */
-extern mpeg2_config_t config;
-
-
+typedef struct {
+    fbuf_t fbuf;
+} fbuf_alloc_t;
+
+struct mpeg2dec_s {
+    decoder_t decoder;
+
+    mpeg2_info_t info;
+
+    uint32_t shift;
+    int is_display_initialized;
+    int (* action) (struct mpeg2dec_s * mpeg2dec);
+    int state;
+    uint32_t ext_state;
+
+    /* allocated in init - gcc has problems allocating such big structures */
+    uint8_t * chunk_buffer;
+    /* pointer to start of the current chunk */
+    uint8_t * chunk_start;
+    /* pointer to current position in chunk_buffer */
+    uint8_t * chunk_ptr;
+    /* last start code ? */
+    uint8_t code;
+
+    /* PTS */
+    uint32_t pts_current, pts_previous;
+    int num_pts;
+    int bytes_since_pts;
+
+    int first;
+    int alloc_index_user;
+    int alloc_index;
+    uint8_t first_decode_slice;
+    uint8_t nb_decode_slices;
+
+    sequence_t new_sequence;
+    sequence_t sequence;
+    picture_t pictures[4];
+    picture_t * picture;
+    /*const*/ fbuf_t * fbuf[3];	/* 0: current fbuf, 1-2: prediction fbufs */
+
+    fbuf_alloc_t fbuf_alloc[3];
+    int custom_fbuf;
+
+    uint8_t * yuv_buf[3][3];
+    int yuv_index;
+    void * convert_id;
+    int convert_size[3];
+    void (* convert_start) (void * id, uint8_t * const * dest, int flags);
+    void (* convert_copy) (void * id, uint8_t * const * src,
+			   unsigned int v_offset);
+
+    uint8_t * buf_start;
+    uint8_t * buf_end;
+
+    int16_t display_offset_x, display_offset_y;
+};
 
-/* slice.c */
-void header_state_init (picture_t * picture);
-int header_process_picture_header (picture_t * picture, uint8_t * buffer);
-int header_process_sequence_header (picture_t * picture, uint8_t * buffer);
-int header_process_extension (picture_t * picture, uint8_t * buffer);
+typedef struct {
+#ifdef ARCH_PPC
+    uint8_t regv[12*16];
+#endif
+    int dummy;
+} cpu_state_t;
+
+/* alloc.c */
+#define ALLOC_MPEG2DEC 0
+#define ALLOC_CHUNK 1
+#define ALLOC_YUV 2
+#define ALLOC_CONVERT_ID 3
+#define ALLOC_CONVERTED 4
+void * mpeg2_malloc (int size, int reason);
+void mpeg2_free (void * buf);
+
+/* cpu_accel.c */
+uint32_t mpeg2_detect_accel (void);
+
+/* cpu_state.c */
+void mpeg2_cpu_state_init (uint32_t accel);
+
+/* decode.c */
+int mpeg2_seek_sequence (mpeg2dec_t * mpeg2dec);
+int mpeg2_seek_header (mpeg2dec_t * mpeg2dec);
+int mpeg2_parse_header (mpeg2dec_t * mpeg2dec);
+
+/* header.c */
+void mpeg2_header_state_init (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_sequence (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_gop (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_picture_start (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_picture (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_extension (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_user_data (mpeg2dec_t * mpeg2dec);
+void mpeg2_header_sequence_finalize (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_slice_start (mpeg2dec_t * mpeg2dec);
+int mpeg2_header_end (mpeg2dec_t * mpeg2dec);
+void mpeg2_set_fbuf (mpeg2dec_t * mpeg2dec, int coding_type);
 
 /* idct.c */
-void idct_init (void);
+void mpeg2_idct_init (uint32_t accel);
 
 /* idct_mlib.c */
-void idct_block_copy_mlib (int16_t * block, uint8_t * dest, int stride);
-void idct_block_add_mlib (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mlib (int last, int16_t * block,
+			  uint8_t * dest, int stride);
+void mpeg2_idct_copy_mlib_non_ieee (int16_t * block, uint8_t * dest,
+				    int stride);
+void mpeg2_idct_add_mlib_non_ieee (int last, int16_t * block,
+				   uint8_t * dest, int stride);
 
 /* idct_mmx.c */
-void idct_block_copy_mmxext (int16_t *block, uint8_t * dest, int stride);
-void idct_block_add_mmxext (int16_t *block, uint8_t * dest, int stride);
-void idct_block_copy_mmx (int16_t *block, uint8_t * dest, int stride);
-void idct_block_add_mmx (int16_t *block, uint8_t * dest, int stride);
-void idct_mmx_init (void);
+void mpeg2_idct_copy_mmxext (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mmxext (int last, int16_t * block,
+			    uint8_t * dest, int stride);
+void mpeg2_idct_copy_mmx (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mmx (int last, int16_t * block,
+			 uint8_t * dest, int stride);
+void mpeg2_idct_mmx_init (void);
+
+/* idct_altivec.c */
+void mpeg2_idct_copy_altivec (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_altivec (int last, int16_t * block,
+			     uint8_t * dest, int stride);
+void mpeg2_idct_altivec_init (void);
+
+/* idct_alpha.c */
+void mpeg2_idct_copy_mvi (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_mvi (int last, int16_t * block,
+			 uint8_t * dest, int stride);
+void mpeg2_idct_copy_alpha (int16_t * block, uint8_t * dest, int stride);
+void mpeg2_idct_add_alpha (int last, int16_t * block,
+			   uint8_t * dest, int stride);
+void mpeg2_idct_alpha_init(int no_mvi);
 
 /* motion_comp.c */
-void motion_comp_init (void);
-
-typedef struct mc_functions_s
-{
-    void (* put [8]) (uint8_t *dst, uint8_t *, int32_t, int32_t);
-    void (* avg [8]) (uint8_t *dst, uint8_t *, int32_t, int32_t);
-} mc_functions_t;
-
-#define MOTION_COMP_EXTERN(x) mc_functions_t mc_functions_##x =		\
-{									\
-    {MC_put_16_##x, MC_put_x16_##x, MC_put_y16_##x, MC_put_xy16_##x,	\
-     MC_put_8_##x,  MC_put_x8_##x,  MC_put_y8_##x,  MC_put_xy8_##x},	\
-    {MC_avg_16_##x, MC_avg_x16_##x, MC_avg_y16_##x, MC_avg_xy16_##x,	\
-     MC_avg_8_##x,  MC_avg_x8_##x,  MC_avg_y8_##x,  MC_avg_xy8_##x}	\
-};
-
-extern mc_functions_t mc_functions_c;
-extern mc_functions_t mc_functions_mmx;
-extern mc_functions_t mc_functions_mmxext;
-extern mc_functions_t mc_functions_3dnow;
-extern mc_functions_t mc_functions_mlib;
+void mpeg2_mc_init (uint32_t accel);
 
-/* slice.c */
-int slice_process (picture_t *picture, uint8_t code, uint8_t * buffer);
+typedef void mpeg2_mc_fct (uint8_t *, const uint8_t *, int, int);
 
-/* stats.c */
-void stats_header (uint8_t code, uint8_t * buffer);
-
-void mpeg2_allocate_image_buffers(picture_t * picture);
-void mpeg2_free_image_buffers (picture_t * picture);
+typedef struct {
+    mpeg2_mc_fct * put [8];
+    mpeg2_mc_fct * avg [8];
+} mpeg2_mc_t;
 
+#define MPEG2_MC_EXTERN(x) mpeg2_mc_t mpeg2_mc_##x = {			  \
+    {MC_put_o_16_##x, MC_put_x_16_##x, MC_put_y_16_##x, MC_put_xy_16_##x, \
+     MC_put_o_8_##x,  MC_put_x_8_##x,  MC_put_y_8_##x,  MC_put_xy_8_##x}, \
+    {MC_avg_o_16_##x, MC_avg_x_16_##x, MC_avg_y_16_##x, MC_avg_xy_16_##x, \
+     MC_avg_o_8_##x,  MC_avg_x_8_##x,  MC_avg_y_8_##x,  MC_avg_xy_8_##x}  \
+};
 
+extern mpeg2_mc_t mpeg2_mc_c;
+extern mpeg2_mc_t mpeg2_mc_mmx;
+extern mpeg2_mc_t mpeg2_mc_mmxext;
+extern mpeg2_mc_t mpeg2_mc_3dnow;
+extern mpeg2_mc_t mpeg2_mc_altivec;
+extern mpeg2_mc_t mpeg2_mc_alpha;
+extern mpeg2_mc_t mpeg2_mc_mlib;
diff --git a/libmpeg2/slice.c b/libmpeg2/slice.c
index 4e289f0d06..7f6a2ed052 100644
--- a/libmpeg2/slice.c
+++ b/libmpeg2/slice.c
@@ -1,8 +1,10 @@
 /*
  * slice.c
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -21,20 +23,18 @@
 
 #include "config.h"
 
-#include <string.h>
 #include <inttypes.h>
 
-#include "video_out.h"
+#include "mpeg2.h"
 #include "mpeg2_internal.h"
 #include "attributes.h"
 
-extern mc_functions_t mc_functions;
-extern void (* idct_block_copy) (int16_t * block, uint8_t * dest, int stride);
-extern void (* idct_block_add) (int16_t * block, uint8_t * dest, int stride);
-
-//#ifdef MPEG12_POSTPROC
-//extern int quant_store[MPEG2_MBR+1][MPEG2_MBC+1]; // [Review]
-//#endif
+extern mpeg2_mc_t mpeg2_mc;
+extern void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride);
+extern void (* mpeg2_idct_add) (int last, int16_t * block,
+				uint8_t * dest, int stride);
+extern void (* mpeg2_cpu_state_save) (cpu_state_t * state);
+extern void (* mpeg2_cpu_state_restore) (cpu_state_t * state);
 
 #include "vlc.h"
 
@@ -45,23 +45,23 @@ static int non_linear_quantizer_scale [] = {
     56, 64, 72, 80, 88, 96, 104, 112
 };
 
-static inline int get_macroblock_modes (picture_t * picture)
+static inline int get_macroblock_modes (decoder_t * const decoder)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
     int macroblock_modes;
-    MBtab * tab;
+    const MBtab * tab;
 
-    switch (picture->picture_coding_type) {
+    switch (decoder->coding_type) {
     case I_TYPE:
 
 	tab = MB_I + UBITS (bit_buf, 1);
 	DUMPBITS (bit_buf, bits, tab->len);
 	macroblock_modes = tab->modes;
 
-	if ((! (picture->frame_pred_frame_dct)) &&
-	    (picture->picture_structure == FRAME_PICTURE)) {
+	if ((! (decoder->frame_pred_frame_dct)) &&
+	    (decoder->picture_structure == FRAME_PICTURE)) {
 	    macroblock_modes |= UBITS (bit_buf, 1) * DCT_TYPE_INTERLACED;
 	    DUMPBITS (bit_buf, bits, 1);
 	}
@@ -74,13 +74,13 @@ static inline int get_macroblock_modes (picture_t * picture)
 	DUMPBITS (bit_buf, bits, tab->len);
 	macroblock_modes = tab->modes;
 
-	if (picture->picture_structure != FRAME_PICTURE) {
+	if (decoder->picture_structure != FRAME_PICTURE) {
 	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD) {
 		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
 		DUMPBITS (bit_buf, bits, 2);
 	    }
 	    return macroblock_modes;
-	} else if (picture->frame_pred_frame_dct) {
+	} else if (decoder->frame_pred_frame_dct) {
 	    if (macroblock_modes & MACROBLOCK_MOTION_FORWARD)
 		macroblock_modes |= MC_FRAME;
 	    return macroblock_modes;
@@ -102,13 +102,13 @@ static inline int get_macroblock_modes (picture_t * picture)
 	DUMPBITS (bit_buf, bits, tab->len);
 	macroblock_modes = tab->modes;
 
-	if (picture->picture_structure != FRAME_PICTURE) {
+	if (decoder->picture_structure != FRAME_PICTURE) {
 	    if (! (macroblock_modes & MACROBLOCK_INTRA)) {
 		macroblock_modes |= UBITS (bit_buf, 2) * MOTION_TYPE_BASE;
 		DUMPBITS (bit_buf, bits, 2);
 	    }
 	    return macroblock_modes;
-	} else if (picture->frame_pred_frame_dct) {
+	} else if (decoder->frame_pred_frame_dct) {
 	    /* if (! (macroblock_modes & MACROBLOCK_INTRA)) */
 	    macroblock_modes |= MC_FRAME;
 	    return macroblock_modes;
@@ -138,18 +138,18 @@ static inline int get_macroblock_modes (picture_t * picture)
 #undef bit_ptr
 }
 
-static inline int get_quantizer_scale (picture_t * picture)
+static inline int get_quantizer_scale (decoder_t * const decoder)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
 
     int quantizer_scale_code;
 
     quantizer_scale_code = UBITS (bit_buf, 5);
     DUMPBITS (bit_buf, bits, 5);
 
-    if (picture->q_scale_type)
+    if (decoder->q_scale_type)
 	return non_linear_quantizer_scale [quantizer_scale_code];
     else
 	return quantizer_scale_code << 1;
@@ -158,15 +158,16 @@ static inline int get_quantizer_scale (picture_t * picture)
 #undef bit_ptr
 }
 
-static inline int get_motion_delta (picture_t * picture, int f_code)
+static inline int get_motion_delta (decoder_t * const decoder,
+				    const int f_code)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
 
     int delta;
     int sign;
-    MVtab * tab;
+    const MVtab * tab;
 
     if (bit_buf & 0x80000000) {
 	DUMPBITS (bit_buf, bits, 1);
@@ -211,30 +212,32 @@ static inline int get_motion_delta (picture_t * picture, int f_code)
 #undef bit_ptr
 }
 
-static inline int bound_motion_vector (int vector, int f_code)
+static inline int bound_motion_vector (const int vector, const int f_code)
 {
-#if 1
-    int limit;
+#if 0
+    unsigned int limit;
+    int sign;
 
     limit = 16 << f_code;
 
-    if (vector >= limit)
-	return vector - 2*limit;
-    else if (vector < -limit)
-	return vector + 2*limit;
-    else return vector;
+    if ((unsigned int)(vector + limit) < 2 * limit)
+	return vector;
+    else {
+	sign = ((int32_t)vector) >> 31;
+	return vector - ((2 * limit) ^ sign) + sign;
+    }
 #else
-    return (vector << (27 - f_code)) >> (27 - f_code);
+    return (int32_t)((uint32_t)vector << (27 - f_code)) >> (27 - f_code);
 #endif
 }
 
-static inline int get_dmv (picture_t * picture)
+static inline int get_dmv (decoder_t * const decoder)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
 
-    DMVtab * tab;
+    const DMVtab * tab;
 
     tab = DMV_2 + UBITS (bit_buf, 2);
     DUMPBITS (bit_buf, bits, tab->len);
@@ -244,19 +247,19 @@ static inline int get_dmv (picture_t * picture)
 #undef bit_ptr
 }
 
-static inline int get_coded_block_pattern (picture_t * picture)
+static inline int get_coded_block_pattern (decoder_t * const decoder)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
 
-    CBPtab * tab;
+    const CBPtab * tab;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
 
     if (bit_buf >= 0x20000000) {
 
-	tab = CBP_7 - 16 + UBITS (bit_buf, 7);
+	tab = CBP_7 + (UBITS (bit_buf, 7) - 16);
 	DUMPBITS (bit_buf, bits, tab->len);
 	return tab->cbp;
 
@@ -272,12 +275,12 @@ static inline int get_coded_block_pattern (picture_t * picture)
 #undef bit_ptr
 }
 
-static inline int get_luma_dc_dct_diff (picture_t * picture)
+static inline int get_luma_dc_dct_diff (decoder_t * const decoder)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
-    DCtab * tab;
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    const DCtab * tab;
     int size;
     int dc_diff;
 
@@ -296,7 +299,7 @@ static inline int get_luma_dc_dct_diff (picture_t * picture)
 	    return 0;
 	}
     } else {
-	tab = DC_long - 0x1e0 + UBITS (bit_buf, 9);
+	tab = DC_long + (UBITS (bit_buf, 9) - 0x1e0);
 	size = tab->size;
 	DUMPBITS (bit_buf, bits, tab->len);
 	NEEDBITS (bit_buf, bits, bit_ptr);
@@ -309,12 +312,12 @@ static inline int get_luma_dc_dct_diff (picture_t * picture)
 #undef bit_ptr
 }
 
-static inline int get_chroma_dc_dct_diff (picture_t * picture)
+static inline int get_chroma_dc_dct_diff (decoder_t * const decoder)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
-    DCtab * tab;
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    const DCtab * tab;
     int size;
     int dc_diff;
 
@@ -333,7 +336,7 @@ static inline int get_chroma_dc_dct_diff (picture_t * picture)
 	    return 0;
 	}
     } else {
-	tab = DC_long - 0x3e0 + UBITS (bit_buf, 10);
+	tab = DC_long + (UBITS (bit_buf, 10) - 0x3e0);
 	size = tab->size;
 	DUMPBITS (bit_buf, bits, tab->len + 1);
 	NEEDBITS (bit_buf, bits, bit_ptr);
@@ -346,41 +349,41 @@ static inline int get_chroma_dc_dct_diff (picture_t * picture)
 #undef bit_ptr
 }
 
-#define SATURATE(val)			\
-do {					\
-    if ((uint32_t)(val + 2048) > 4095)	\
-	val = (val > 0) ? 2047 : -2048;	\
+#define SATURATE(val)					\
+do {							\
+    if (unlikely ((uint32_t)(val + 2048) > 4095))	\
+	val = SBITS (val, 1) ^ 2047;			\
 } while (0)
 
-static void get_intra_block_B14 (picture_t * picture)
+static void get_intra_block_B14 (decoder_t * const decoder)
 {
     int i;
     int j;
     int val;
-    uint8_t * scan = picture->scan;
-    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
-    int quantizer_scale = picture->quantizer_scale;
+    const uint8_t * scan = decoder->scan;
+    const uint8_t * quant_matrix = decoder->intra_quantizer_matrix;
+    int quantizer_scale = decoder->quantizer_scale;
     int mismatch;
-    DCTtab * tab;
+    const DCTtab * tab;
     uint32_t bit_buf;
     int bits;
-    uint8_t * bit_ptr;
+    const uint8_t * bit_ptr;
     int16_t * dest;
 
-    dest = picture->DCTblock;
+    dest = decoder->DCTblock;
     i = 0;
     mismatch = ~dest[0];
 
-    bit_buf = picture->bitstream_buf;
-    bits = picture->bitstream_bits;
-    bit_ptr = picture->bitstream_ptr;
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
 
     while (1) {
 	if (bit_buf >= 0x28000000) {
 
-	    tab = DCT_B14AC_5 - 5 + UBITS (bit_buf, 5);
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
 
 	    i += tab->run;
 	    if (i >= 64)
@@ -406,7 +409,7 @@ static void get_intra_block_B14 (picture_t * picture)
 
 	} else if (bit_buf >= 0x04000000) {
 
-	    tab = DCT_B14_8 - 4 + UBITS (bit_buf, 8);
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
 
 	    i += tab->run;
 	    if (i < 64)
@@ -435,17 +438,17 @@ static void get_intra_block_B14 (picture_t * picture)
 	    continue;
 
 	} else if (bit_buf >= 0x02000000) {
-	    tab = DCT_B14_10 - 8 + UBITS (bit_buf, 10);
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
 	} else if (bit_buf >= 0x00800000) {
-	    tab = DCT_13 - 16 + UBITS (bit_buf, 13);
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
 	} else if (bit_buf >= 0x00200000) {
-	    tab = DCT_15 - 16 + UBITS (bit_buf, 15);
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
@@ -461,40 +464,40 @@ static void get_intra_block_B14 (picture_t * picture)
     }
     dest[63] ^= mismatch & 1;
     DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
-    picture->bitstream_buf = bit_buf;
-    picture->bitstream_bits = bits;
-    picture->bitstream_ptr = bit_ptr;
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
 }
 
-static void get_intra_block_B15 (picture_t * picture)
+static void get_intra_block_B15 (decoder_t * const decoder)
 {
     int i;
     int j;
     int val;
-    uint8_t * scan = picture->scan;
-    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
-    int quantizer_scale = picture->quantizer_scale;
+    const uint8_t * scan = decoder->scan;
+    const uint8_t * quant_matrix = decoder->intra_quantizer_matrix;
+    int quantizer_scale = decoder->quantizer_scale;
     int mismatch;
-    DCTtab * tab;
+    const DCTtab * tab;
     uint32_t bit_buf;
     int bits;
-    uint8_t * bit_ptr;
+    const uint8_t * bit_ptr;
     int16_t * dest;
 
-    dest = picture->DCTblock;
+    dest = decoder->DCTblock;
     i = 0;
     mismatch = ~dest[0];
 
-    bit_buf = picture->bitstream_buf;
-    bits = picture->bitstream_bits;
-    bit_ptr = picture->bitstream_ptr;
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
 
     while (1) {
 	if (bit_buf >= 0x04000000) {
 
-	    tab = DCT_B15_8 - 4 + UBITS (bit_buf, 8);
+	    tab = DCT_B15_8 + (UBITS (bit_buf, 8) - 4);
 
 	    i += tab->run;
 	    if (i < 64) {
@@ -548,17 +551,17 @@ static void get_intra_block_B15 (picture_t * picture)
 
 	    }
 	} else if (bit_buf >= 0x02000000) {
-	    tab = DCT_B15_10 - 8 + UBITS (bit_buf, 10);
+	    tab = DCT_B15_10 + (UBITS (bit_buf, 10) - 8);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
 	} else if (bit_buf >= 0x00800000) {
-	    tab = DCT_13 - 16 + UBITS (bit_buf, 13);
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
 	} else if (bit_buf >= 0x00200000) {
-	    tab = DCT_15 - 16 + UBITS (bit_buf, 15);
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
@@ -574,37 +577,37 @@ static void get_intra_block_B15 (picture_t * picture)
     }
     dest[63] ^= mismatch & 1;
     DUMPBITS (bit_buf, bits, 4);	/* dump end of block code */
-    picture->bitstream_buf = bit_buf;
-    picture->bitstream_bits = bits;
-    picture->bitstream_ptr = bit_ptr;
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
 }
 
-static void get_non_intra_block (picture_t * picture)
+static int get_non_intra_block (decoder_t * const decoder)
 {
     int i;
     int j;
     int val;
-    uint8_t * scan = picture->scan;
-    uint8_t * quant_matrix = picture->non_intra_quantizer_matrix;
-    int quantizer_scale = picture->quantizer_scale;
+    const uint8_t * scan = decoder->scan;
+    const uint8_t * quant_matrix = decoder->non_intra_quantizer_matrix;
+    int quantizer_scale = decoder->quantizer_scale;
     int mismatch;
-    DCTtab * tab;
+    const DCTtab * tab;
     uint32_t bit_buf;
     int bits;
-    uint8_t * bit_ptr;
+    const uint8_t * bit_ptr;
     int16_t * dest;
 
     i = -1;
     mismatch = 1;
-    dest = picture->DCTblock;
+    dest = decoder->DCTblock;
 
-    bit_buf = picture->bitstream_buf;
-    bits = picture->bitstream_bits;
-    bit_ptr = picture->bitstream_ptr;
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
     if (bit_buf >= 0x28000000) {
-	tab = DCT_B14DC_5 - 5 + UBITS (bit_buf, 5);
+	tab = DCT_B14DC_5 + (UBITS (bit_buf, 5) - 5);
 	goto entry_1;
     } else
 	goto entry_2;
@@ -612,7 +615,7 @@ static void get_non_intra_block (picture_t * picture)
     while (1) {
 	if (bit_buf >= 0x28000000) {
 
-	    tab = DCT_B14AC_5 - 5 + UBITS (bit_buf, 5);
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
 
 	entry_1:
 	    i += tab->run;
@@ -642,7 +645,7 @@ static void get_non_intra_block (picture_t * picture)
     entry_2:
 	if (bit_buf >= 0x04000000) {
 
-	    tab = DCT_B14_8 - 4 + UBITS (bit_buf, 8);
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
 
 	    i += tab->run;
 	    if (i < 64)
@@ -671,17 +674,17 @@ static void get_non_intra_block (picture_t * picture)
 	    continue;
 
 	} else if (bit_buf >= 0x02000000) {
-	    tab = DCT_B14_10 - 8 + UBITS (bit_buf, 10);
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
 	} else if (bit_buf >= 0x00800000) {
-	    tab = DCT_13 - 16 + UBITS (bit_buf, 13);
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
 	} else if (bit_buf >= 0x00200000) {
-	    tab = DCT_15 - 16 + UBITS (bit_buf, 15);
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
@@ -697,38 +700,39 @@ static void get_non_intra_block (picture_t * picture)
     }
     dest[63] ^= mismatch & 1;
     DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
-    picture->bitstream_buf = bit_buf;
-    picture->bitstream_bits = bits;
-    picture->bitstream_ptr = bit_ptr;
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+    return i;
 }
 
-static void get_mpeg1_intra_block (picture_t * picture)
+static void get_mpeg1_intra_block (decoder_t * const decoder)
 {
     int i;
     int j;
     int val;
-    uint8_t * scan = picture->scan;
-    uint8_t * quant_matrix = picture->intra_quantizer_matrix;
-    int quantizer_scale = picture->quantizer_scale;
-    DCTtab * tab;
+    const uint8_t * scan = decoder->scan;
+    const uint8_t * quant_matrix = decoder->intra_quantizer_matrix;
+    int quantizer_scale = decoder->quantizer_scale;
+    const DCTtab * tab;
     uint32_t bit_buf;
     int bits;
-    uint8_t * bit_ptr;
+    const uint8_t * bit_ptr;
     int16_t * dest;
 
     i = 0;
-    dest = picture->DCTblock;
+    dest = decoder->DCTblock;
 
-    bit_buf = picture->bitstream_buf;
-    bits = picture->bitstream_bits;
-    bit_ptr = picture->bitstream_ptr;
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
 
     while (1) {
 	if (bit_buf >= 0x28000000) {
 
-	    tab = DCT_B14AC_5 - 5 + UBITS (bit_buf, 5);
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
 
 	    i += tab->run;
 	    if (i >= 64)
@@ -756,7 +760,7 @@ static void get_mpeg1_intra_block (picture_t * picture)
 
 	} else if (bit_buf >= 0x04000000) {
 
-	    tab = DCT_B14_8 - 4 + UBITS (bit_buf, 8);
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
 
 	    i += tab->run;
 	    if (i < 64)
@@ -791,17 +795,17 @@ static void get_mpeg1_intra_block (picture_t * picture)
 	    continue;
 
 	} else if (bit_buf >= 0x02000000) {
-	    tab = DCT_B14_10 - 8 + UBITS (bit_buf, 10);
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
 	} else if (bit_buf >= 0x00800000) {
-	    tab = DCT_13 - 16 + UBITS (bit_buf, 13);
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
 	} else if (bit_buf >= 0x00200000) {
-	    tab = DCT_15 - 16 + UBITS (bit_buf, 15);
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
@@ -816,35 +820,35 @@ static void get_mpeg1_intra_block (picture_t * picture)
 	break;	/* illegal, check needed to avoid buffer overflow */
     }
     DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
-    picture->bitstream_buf = bit_buf;
-    picture->bitstream_bits = bits;
-    picture->bitstream_ptr = bit_ptr;
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
 }
 
-static void get_mpeg1_non_intra_block (picture_t * picture)
+static int get_mpeg1_non_intra_block (decoder_t * const decoder)
 {
     int i;
     int j;
     int val;
-    uint8_t * scan = picture->scan;
-    uint8_t * quant_matrix = picture->non_intra_quantizer_matrix;
-    int quantizer_scale = picture->quantizer_scale;
-    DCTtab * tab;
+    const uint8_t * scan = decoder->scan;
+    const uint8_t * quant_matrix = decoder->non_intra_quantizer_matrix;
+    int quantizer_scale = decoder->quantizer_scale;
+    const DCTtab * tab;
     uint32_t bit_buf;
     int bits;
-    uint8_t * bit_ptr;
+    const uint8_t * bit_ptr;
     int16_t * dest;
 
     i = -1;
-    dest = picture->DCTblock;
+    dest = decoder->DCTblock;
 
-    bit_buf = picture->bitstream_buf;
-    bits = picture->bitstream_bits;
-    bit_ptr = picture->bitstream_ptr;
+    bit_buf = decoder->bitstream_buf;
+    bits = decoder->bitstream_bits;
+    bit_ptr = decoder->bitstream_ptr;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
     if (bit_buf >= 0x28000000) {
-	tab = DCT_B14DC_5 - 5 + UBITS (bit_buf, 5);
+	tab = DCT_B14DC_5 + (UBITS (bit_buf, 5) - 5);
 	goto entry_1;
     } else
 	goto entry_2;
@@ -852,7 +856,7 @@ static void get_mpeg1_non_intra_block (picture_t * picture)
     while (1) {
 	if (bit_buf >= 0x28000000) {
 
-	    tab = DCT_B14AC_5 - 5 + UBITS (bit_buf, 5);
+	    tab = DCT_B14AC_5 + (UBITS (bit_buf, 5) - 5);
 
 	entry_1:
 	    i += tab->run;
@@ -884,7 +888,7 @@ static void get_mpeg1_non_intra_block (picture_t * picture)
     entry_2:
 	if (bit_buf >= 0x04000000) {
 
-	    tab = DCT_B14_8 - 4 + UBITS (bit_buf, 8);
+	    tab = DCT_B14_8 + (UBITS (bit_buf, 8) - 4);
 
 	    i += tab->run;
 	    if (i < 64)
@@ -920,17 +924,17 @@ static void get_mpeg1_non_intra_block (picture_t * picture)
 	    continue;
 
 	} else if (bit_buf >= 0x02000000) {
-	    tab = DCT_B14_10 - 8 + UBITS (bit_buf, 10);
+	    tab = DCT_B14_10 + (UBITS (bit_buf, 10) - 8);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
 	} else if (bit_buf >= 0x00800000) {
-	    tab = DCT_13 - 16 + UBITS (bit_buf, 13);
+	    tab = DCT_13 + (UBITS (bit_buf, 13) - 16);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
 	} else if (bit_buf >= 0x00200000) {
-	    tab = DCT_15 - 16 + UBITS (bit_buf, 15);
+	    tab = DCT_15 + (UBITS (bit_buf, 15) - 16);
 	    i += tab->run;
 	    if (i < 64)
 		goto normal_code;
@@ -945,362 +949,320 @@ static void get_mpeg1_non_intra_block (picture_t * picture)
 	break;	/* illegal, check needed to avoid buffer overflow */
     }
     DUMPBITS (bit_buf, bits, 2);	/* dump end of block code */
-    picture->bitstream_buf = bit_buf;
-    picture->bitstream_bits = bits;
-    picture->bitstream_ptr = bit_ptr;
+    decoder->bitstream_buf = bit_buf;
+    decoder->bitstream_bits = bits;
+    decoder->bitstream_ptr = bit_ptr;
+    return i;
 }
 
-static inline int get_macroblock_address_increment (picture_t * picture)
+static inline void slice_intra_DCT (decoder_t * const decoder, const int cc,
+				    uint8_t * const dest, const int stride)	/* decode one intra block of component cc and pass it to mpeg2_idct_copy for dest */
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
-
-    MBAtab * tab;
-    int mba;
-
-    mba = 0;
-
-    while (1) {
-	if (bit_buf >= 0x10000000) {
-	    tab = MBA_5 - 2 + UBITS (bit_buf, 5);
-	    DUMPBITS (bit_buf, bits, tab->len);
-	    return mba + tab->mba;
-	} else if (bit_buf >= 0x03000000) {
-	    tab = MBA_11 - 24 + UBITS (bit_buf, 11);
-	    DUMPBITS (bit_buf, bits, tab->len);
-	    return mba + tab->mba;
-	} else switch (UBITS (bit_buf, 11)) {
-	case 8:		/* macroblock_escape */
-	    mba += 33;
-	    /* no break here on purpose */
-	case 15:	/* macroblock_stuffing (MPEG1 only) */
-	    DUMPBITS (bit_buf, bits, 11);
-	    NEEDBITS (bit_buf, bits, bit_ptr);
-	    break;
-	default:	/* end of slice, or error */
-//	    printf("MB error: %d  \n",(UBITS (bit_buf, 11))); // FIXME!
-//	    return 0;
-	    return -1;
-	}
-    }
-
-#undef bit_buf
-#undef bits
-#undef bit_ptr
-}
-
-static inline void slice_intra_DCT (picture_t * picture, int cc,
-				    uint8_t * dest, int stride)
-{
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)  
-#define bit_ptr (picture->bitstream_ptr)
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
     NEEDBITS (bit_buf, bits, bit_ptr);
     /* Get the intra DC coefficient and inverse quantize it */
     if (cc == 0)
-	picture->dc_dct_pred[0] += get_luma_dc_dct_diff (picture);
+	decoder->dc_dct_pred[0] += get_luma_dc_dct_diff (decoder);	/* cc 0 accumulates into predictor slot 0 */
     else
-	picture->dc_dct_pred[cc] += get_chroma_dc_dct_diff (picture);
-    picture->DCTblock[0] =
-	picture->dc_dct_pred[cc] << (3 - picture->intra_dc_precision);
-    memset (picture->DCTblock + 1, 0, 63 * sizeof (int16_t));
-
-    if (picture->mpeg1) {
-	if (picture->picture_coding_type != D_TYPE)
-	    get_mpeg1_intra_block (picture);
-    } else if (picture->intra_vlc_format)
-	get_intra_block_B15 (picture);
+	decoder->dc_dct_pred[cc] += get_chroma_dc_dct_diff (decoder);	/* other components keep their own slot */
+    decoder->DCTblock[0] =
+	decoder->dc_dct_pred[cc] << (3 - decoder->intra_dc_precision);	/* DC term = running predictor, shifted by (3 - intra_dc_precision) */
+    /* NOTE(review): DCTblock[1..63] is not cleared here; presumably the block is already zero on entry (e.g. left so by the IDCT) -- confirm */
+    if (decoder->mpeg1) {
+	if (decoder->coding_type != D_TYPE)	/* for D_TYPE nothing beyond the DC term is parsed */
+	    get_mpeg1_intra_block (decoder);
+    } else if (decoder->intra_vlc_format)	/* flag picks the B15 parser over the B14 one */
+	get_intra_block_B15 (decoder);
     else
-	get_intra_block_B14 (picture);
-    idct_block_copy (picture->DCTblock, dest, stride);
+	get_intra_block_B14 (decoder);
+    mpeg2_idct_copy (decoder->DCTblock, dest, stride);
 #undef bit_buf
 #undef bits
 #undef bit_ptr
 }
 
-static inline void slice_non_intra_DCT (picture_t * picture, uint8_t * dest,
-					int stride)
-{
-    memset (picture->DCTblock, 0, 64 * sizeof (int16_t));
-    if (picture->mpeg1)
-	get_mpeg1_non_intra_block (picture);
-    else
-	get_non_intra_block (picture);
-    idct_block_add (picture->DCTblock, dest, stride);
-}
-
-#define MOTION_Y(table,offset_x,offset_y,motion_x,motion_y,		\
-		 dest,src,offset_dest,offset_src,stride,height)		\
-do {									\
-    int xy_half;							\
-    int total_offset;							\
-									\
-    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			\
-    total_offset = ((offset_y + (motion_y >> 1)) * stride +		\
-		    offset_x + (motion_x >> 1) + (offset_src));		\
-    table[xy_half] (dest[0] + offset_x + (offset_dest),			\
-		    src[0] + total_offset, stride, height);		\
-} while (0)
-
-#define MOTION_UV(table,offset_x,offset_y,motion_x,motion_y,		\
-		  dest,src,offset_dest,offset_src,stride,height)	\
-do {									\
-    int xy_half;							\
-    int total_offset;							\
-									\
-    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);			\
-    total_offset = (((offset_y + motion_y) >> 1) * (stride) +		\
-		    ((offset_x + motion_x) >> 1) + (offset_src));	\
-    table[4+xy_half] (dest[1] + (offset_x >> 1) + (offset_dest),	\
-		      src[1] + total_offset, stride, height);		\
-    table[4+xy_half] (dest[2] + (offset_x >> 1) + (offset_dest),	\
-		      src[2] + total_offset, stride, height);		\
-} while (0)
-
-static inline void motion_block (void (** table) (uint8_t *, uint8_t *,
-						  int32_t, int32_t),
-				 int x_offset, int y_offset, int mb_y_8_offset,
-				 int src_field, int dest_field,
-				 int x_pred, int y_pred,
-				 uint8_t * dest[3], uint8_t * src[3],
-				 int stride, int height)
+static inline void slice_non_intra_DCT (decoder_t * const decoder,
+					uint8_t * const dest, const int stride)	/* parse one non-intra block and pass it to mpeg2_idct_add for dest */
 {
-    MOTION_Y (table, x_offset, y_offset, x_pred, y_pred, dest, src,
-	      dest_field + mb_y_8_offset*8*stride, src_field, stride, height);
+    int last;	/* value returned by the block parser (its final i), forwarded to mpeg2_idct_add */
 
-    x_pred /= 2;
-    y_pred /= 2;
-    stride >>= 1;
-    height >>= 1;
-
-    MOTION_UV (table, x_offset, y_offset, x_pred, y_pred, dest, src,
-	       (dest_field >> 1) + mb_y_8_offset*4*stride, src_field >> 1,
-	       stride, height);
+    if (decoder->mpeg1)	/* MPEG-1 streams use their own block parser */
+	last = get_mpeg1_non_intra_block (decoder);
+    else
+	last = get_non_intra_block (decoder);
+    mpeg2_idct_add (last, decoder->DCTblock, dest, stride);	/* NOTE(review): DCTblock is not cleared before parsing; presumably it is already zero on entry -- confirm */
 }
 
-static void motion_mp1 (picture_t * picture, motion_t * motion,
-			uint8_t * dest[3], int offset, int stride,
-			void (** table) (uint8_t *, uint8_t *, int, int))
+#define MOTION(table,ref,motion_x,motion_y,size,y) do { /* one statement */   \
+    pos_x = 2 * decoder->offset + (motion_x);	/* caller declares pos_x.. */ \
+    pos_y = 2 * decoder->v_offset + (motion_y) + 2 * (y);		      \
+    if ((pos_x > decoder->limit_x) || (pos_y > decoder->limit_y_ ## size))    \
+	return;		/* leaves the calling function, not just the macro */ \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);	/* low bits pick the entry */ \
+    (table)[xy_half] (decoder->dest[0] + (y) * decoder->stride +	      \
+		      decoder->offset, (ref)[0] + (pos_x >> 1) +	      \
+		      (pos_y >> 1) * decoder->stride, decoder->stride, size); \
+    (motion_x) /= 2;	(motion_y) /= 2;	/* vector for planes 1, 2 */  \
+    xy_half = (((motion_y) & 1) << 1) | ((motion_x) & 1);		      \
+    offset = (((decoder->offset + (motion_x)) >> 1) +			      \
+	      ((((decoder->v_offset + (motion_y)) >> 1) + (y) / 2) *	      \
+	       decoder->uv_stride));					      \
+    (table)[4+xy_half] (decoder->dest[1] + (y) / 2 * decoder->uv_stride +     \
+			(decoder->offset >> 1), (ref)[1] + offset,	      \
+			decoder->uv_stride, size/2);			      \
+    (table)[4+xy_half] (decoder->dest[2] + (y) / 2 * decoder->uv_stride +     \
+			(decoder->offset >> 1), (ref)[2] + offset,	      \
+			decoder->uv_stride, size/2); } while (0)
+
+#define MOTION_FIELD(table,ref,motion_x,motion_y,dest_field,op,src_field) do {\
+    pos_x = 2 * decoder->offset + (motion_x);	/* caller declares pos_x.. */ \
+    pos_y = decoder->v_offset + (motion_y);				      \
+    if ((pos_x > decoder->limit_x) || (pos_y > decoder->limit_y))	      \
+	return;		/* leaves the calling function, not just the macro */ \
+    xy_half = ((pos_y & 1) << 1) | (pos_x & 1);	/* low bits pick the entry */ \
+    (table)[xy_half] (decoder->dest[0] + (dest_field) * decoder->stride +     \
+		      decoder->offset,					      \
+		      ((ref)[0] + (pos_x >> 1) + /* op: raw tokens, no () */  \
+		       ((pos_y op) + (src_field)) * decoder->stride),	      \
+		      2 * decoder->stride, 8);	/* doubled stride, 8 rows */  \
+    (motion_x) /= 2;	(motion_y) /= 2;	/* vector for planes 1, 2 */  \
+    xy_half = (((motion_y) & 1) << 1) | ((motion_x) & 1);		      \
+    offset = (((decoder->offset + (motion_x)) >> 1) +			      \
+	      (((decoder->v_offset >> 1) + ((motion_y) op) +		      \
+		(src_field)) * decoder->uv_stride));			      \
+    (table)[4+xy_half] (decoder->dest[1] + (dest_field) * decoder->uv_stride +\
+			(decoder->offset >> 1), (ref)[1] + offset,	      \
+			2 * decoder->uv_stride, 4);			      \
+    (table)[4+xy_half] (decoder->dest[2] + (dest_field) * decoder->uv_stride +\
+			(decoder->offset >> 1), (ref)[2] + offset,	      \
+			2 * decoder->uv_stride, 4); } while (0)
+
+static void motion_mp1 (decoder_t * const decoder, motion_t * const motion,
+			mpeg2_mc_fct * const * const table)	/* parse one (x,y) vector into pmv[0] and predict 16 rows via MOTION */
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
     int motion_x, motion_y;
+    unsigned int pos_x, pos_y, xy_half, offset;	/* scratch variables assigned inside MOTION */
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
-						     motion->f_code[0]);
-    motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
+    motion_x = (motion->pmv[0][0] +
+		(get_motion_delta (decoder,
+				   motion->f_code[0]) << motion->f_code[1]));	/* delta is scaled by f_code[1] before being added; NOTE(review): a negative delta would be left-shifted here -- confirm this is intended */
+    motion_x = bound_motion_vector (motion_x,
+				    motion->f_code[0] + motion->f_code[1]);	/* bound uses the sum of both f_code entries */
     motion->pmv[0][0] = motion_x;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = motion->pmv[0][1] + get_motion_delta (picture,
-						     motion->f_code[0]);
-    motion_y = bound_motion_vector (motion_y, motion->f_code[0]);
+    motion_y = (motion->pmv[0][1] +
+		(get_motion_delta (decoder,
+				   motion->f_code[0]) << motion->f_code[1]));	/* same scaling as for x; f_code[0] is used for both components */
+    motion_y = bound_motion_vector (motion_y,
+				    motion->f_code[0] + motion->f_code[1]);
     motion->pmv[0][1] = motion_y;
 
-    if (motion->f_code[1]) {
-	motion_x <<= 1;
-	motion_y <<= 1;
-    }
-
-    motion_block (table, offset, picture->v_offset, 0, 0, 0,
-		  motion_x, motion_y, dest, motion->ref[0], stride, 16);
+    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);	/* may return early on its limit check; also halves motion_x/motion_y */
 #undef bit_buf
 #undef bits
 #undef bit_ptr
 }
 
-static void motion_mp1_reuse (picture_t * picture, motion_t * motion,
-			      uint8_t * dest[3], int offset, int stride,
-			      void (** table) (uint8_t *, uint8_t *, int, int))
+static void motion_fr_frame (decoder_t * const decoder,
+			     motion_t * const motion,
+			     mpeg2_mc_fct * const * const table)	/* parse one (x,y) vector, store it in pmv[0] and pmv[1], predict 16 rows via MOTION */
 {
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
     int motion_x, motion_y;
-
-    motion_x = motion->pmv[0][0];
-    motion_y = motion->pmv[0][1];
-
-    if (motion->f_code[1]) {
-	motion_x <<= 1;
-	motion_y <<= 1;
-    }
-
-    motion_block (table, offset, picture->v_offset, 0, 0, 0,
-		  motion_x, motion_y, dest, motion->ref[0], stride, 16);
-}
-
-static void motion_fr_frame (picture_t * picture, motion_t * motion,
-			     uint8_t * dest[3], int offset, int stride,
-			     void (** table) (uint8_t *, uint8_t *, int, int))
-{
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
-    int motion_x, motion_y;
+    unsigned int pos_x, pos_y, xy_half, offset;	/* scratch variables assigned inside MOTION */
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
 						     motion->f_code[0]);
     motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
     motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = motion->pmv[0][1] + get_motion_delta (picture,
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,
 						     motion->f_code[1]);
     motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
     motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
 
-    motion_block (table, offset, picture->v_offset, 0, 0, 0,
-		  motion_x, motion_y, dest, motion->ref[0], stride, 16);
+    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);	/* may return early on its limit check; also halves motion_x/motion_y */
 #undef bit_buf
 #undef bits
 #undef bit_ptr
 }
 
-static void motion_fr_field (picture_t * picture, motion_t * motion,
-			     uint8_t * dest[3], int offset, int stride,
-			     void (** table) (uint8_t *, uint8_t *, int, int))
+static void motion_fr_field (decoder_t * const decoder,
+			     motion_t * const motion,
+			     mpeg2_mc_fct * const * const table)	/* parse two (field bit, vector) pairs and predict dest fields 0 and 1 via MOTION_FIELD */
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
-    int motion_x, motion_y;
-    int field_select;
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    int motion_x, motion_y, field;
+    unsigned int pos_x, pos_y, xy_half, offset;	/* scratch variables assigned inside MOTION_FIELD */
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    field_select = SBITS (bit_buf, 1);
+    field = UBITS (bit_buf, 1);	/* handed to MOTION_FIELD as src_field, where it is added to the source row */
     DUMPBITS (bit_buf, bits, 1);
 
-    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
 						     motion->f_code[0]);
     motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
     motion->pmv[0][0] = motion_x;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = (motion->pmv[0][1] >> 1) + get_motion_delta (picture,
+    motion_y = (motion->pmv[0][1] >> 1) + get_motion_delta (decoder,
 							    motion->f_code[1]);
     /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
     motion->pmv[0][1] = motion_y << 1;
 
-    motion_block (table, offset, picture->v_offset >> 1,
-		  0, (field_select & stride), 0,
-		  motion_x, motion_y, dest, motion->ref[0], stride * 2, 8);
+    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 0, & ~1, field);	/* dest_field 0; source row is (pos_y & ~1) + field; may return early */
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    field_select = SBITS (bit_buf, 1);
+    field = UBITS (bit_buf, 1);	/* second field bit, for the pmv[1] vector */
     DUMPBITS (bit_buf, bits, 1);
 
-    motion_x = motion->pmv[1][0] + get_motion_delta (picture,
+    motion_x = motion->pmv[1][0] + get_motion_delta (decoder,
 						     motion->f_code[0]);
     motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
     motion->pmv[1][0] = motion_x;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = (motion->pmv[1][1] >> 1) + get_motion_delta (picture,
+    motion_y = (motion->pmv[1][1] >> 1) + get_motion_delta (decoder,
 							    motion->f_code[1]);
     /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
     motion->pmv[1][1] = motion_y << 1;
 
-    motion_block (table, offset, picture->v_offset >> 1,
-		  0, (field_select & stride), stride,
-		  motion_x, motion_y, dest, motion->ref[0], stride * 2, 8);
+    MOTION_FIELD (table, motion->ref[0], motion_x, motion_y, 1, & ~1, field);	/* dest_field 1, same source-row rule */
 #undef bit_buf
 #undef bits
 #undef bit_ptr
 }
 
-static void motion_fr_dmv (picture_t * picture, motion_t * motion,
-			   uint8_t * dest[3], int offset, int stride,
-			   void (** table) (uint8_t *, uint8_t *, int, int))
+static void motion_fr_dmv (decoder_t * const decoder, motion_t * const motion,
+			   mpeg2_mc_fct * const * const table)	/* parse one vector plus dmv_x/dmv_y; put the derived (other_x, other_y) predictions, then avg in the (motion_x, motion_y) one */
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
-    int motion_x, motion_y;
-    int dmv_x, dmv_y;
-    int m;
-    int other_x, other_y;
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    int motion_x, motion_y, dmv_x, dmv_y, m, other_x, other_y;
+    unsigned int pos_x, pos_y, xy_half, offset;	/* scratch: set by MOTION_FIELD, xy_half/offset reused below */
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
 						     motion->f_code[0]);
     motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
     motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
-
     NEEDBITS (bit_buf, bits, bit_ptr);
-    dmv_x = get_dmv (picture);
+    dmv_x = get_dmv (decoder);
 
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = (motion->pmv[0][1] >> 1) + get_motion_delta (picture,
+    motion_y = (motion->pmv[0][1] >> 1) + get_motion_delta (decoder,
 							    motion->f_code[1]);
     /* motion_y = bound_motion_vector (motion_y, motion->f_code[1]); */
     motion->pmv[1][1] = motion->pmv[0][1] = motion_y << 1;
+    dmv_y = get_dmv (decoder);	/* NOTE(review): no NEEDBITS between the dmv_x read and here -- presumably enough bits stay buffered; confirm */
 
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    dmv_y = get_dmv (picture);
-
-    motion_block (mc_functions.put, offset, picture->v_offset >> 1, 0, 0, 0,
-		  motion_x, motion_y, dest, motion->ref[0], stride * 2, 8);
-
-    m = picture->top_field_first ? 1 : 3;
+    m = decoder->top_field_first ? 1 : 3;	/* multiplier applied to the parsed vector for dest field 0 */
     other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;
     other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y - 1;
-    motion_block (mc_functions.avg, offset, picture->v_offset >> 1, 0, stride, 0,
-		  other_x, other_y, dest, motion->ref[0], stride * 2, 8);
-
-    motion_block (mc_functions.put, offset, picture->v_offset >> 1,
-		  0, stride, stride,
-		  motion_x, motion_y, dest, motion->ref[0], stride * 2, 8);
+    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 0, | 1, 0);	/* dest_field 0 from source row (pos_y | 1); may return early */
 
-    m = picture->top_field_first ? 3 : 1;
+    m = decoder->top_field_first ? 3 : 1;	/* the opposite multiplier for dest field 1 */
     other_x = ((motion_x * m + (motion_x > 0)) >> 1) + dmv_x;
     other_y = ((motion_y * m + (motion_y > 0)) >> 1) + dmv_y + 1;
-    motion_block (mc_functions.avg, offset, picture->v_offset >> 1, 0, 0, stride,
-		  other_x, other_y, dest, motion->ref[0], stride * 2, 8);
+    MOTION_FIELD (mpeg2_mc.put, motion->ref[0], other_x, other_y, 1, & ~1, 0);	/* dest_field 1 from source row (pos_y & ~1) */
+    /* avg the (motion_x, motion_y) prediction onto both dest fields; NOTE(review): no limit_x/limit_y test guards these reads, unlike MOTION_FIELD -- confirm */
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);
+    offset = (decoder->offset + (motion_x >> 1) +
+	      (decoder->v_offset + (motion_y & ~1)) * decoder->stride);
+    mpeg2_mc.avg[xy_half]
+	(decoder->dest[0] + decoder->offset,
+	 motion->ref[0][0] + offset, 2 * decoder->stride, 8);
+    mpeg2_mc.avg[xy_half]
+	(decoder->dest[0] + decoder->stride + decoder->offset,
+	 motion->ref[0][0] + decoder->stride + offset, 2 * decoder->stride, 8);	/* second field: dest and source both one row further down */
+    motion_x /= 2;	motion_y /= 2;	/* halved vector for planes 1 and 2 */
+    xy_half = ((motion_y & 1) << 1) | (motion_x & 1);
+    offset = (((decoder->offset + motion_x) >> 1) +
+	      (((decoder->v_offset >> 1) + (motion_y & ~1)) *
+	       decoder->uv_stride));
+    mpeg2_mc.avg[4+xy_half]
+	(decoder->dest[1] + (decoder->offset >> 1),
+	 motion->ref[0][1] + offset, 2 * decoder->uv_stride, 4);
+    mpeg2_mc.avg[4+xy_half]
+	(decoder->dest[1] + decoder->uv_stride + (decoder->offset >> 1),
+	 motion->ref[0][1] + decoder->uv_stride + offset,
+	 2 * decoder->uv_stride, 4);
+    mpeg2_mc.avg[4+xy_half]
+	(decoder->dest[2] + (decoder->offset >> 1),
+	 motion->ref[0][2] + offset, 2 * decoder->uv_stride, 4);
+    mpeg2_mc.avg[4+xy_half]
+	(decoder->dest[2] + decoder->uv_stride + (decoder->offset >> 1),
+	 motion->ref[0][2] + decoder->uv_stride + offset,
+	 2 * decoder->uv_stride, 4);
 #undef bit_buf
 #undef bits
 #undef bit_ptr
 }
 
-/* like motion_frame, but reuse previous motion vectors */
-static void motion_fr_reuse (picture_t * picture, motion_t * motion,
-			     uint8_t * dest[3], int offset, int stride,
-			     void (** table) (uint8_t *, uint8_t *, int, int))
+static inline void motion_reuse (const decoder_t * const decoder,
+				 const motion_t * const motion,
+				 mpeg2_mc_fct * const * const table)	/* repeat the 16-row MOTION prediction from the stored pmv[0]; reads no bits */
 {
-    motion_block (table, offset, picture->v_offset, 0, 0, 0,
-		  motion->pmv[0][0], motion->pmv[0][1],
-		  dest, motion->ref[0], stride, 16);
+    int motion_x, motion_y;
+    unsigned int pos_x, pos_y, xy_half, offset;	/* scratch variables assigned inside MOTION */
+    /* MOTION halves its vector arguments in place, so it is given local copies (motion is const here) */
+    motion_x = motion->pmv[0][0];
+    motion_y = motion->pmv[0][1];
+
+    MOTION (table, motion->ref[0], motion_x, motion_y, 16, 0);	/* may return early on its limit check */
 }
 
-/* like motion_frame, but use null motion vectors */
-static void motion_fr_zero (picture_t * picture, motion_t * motion,
-			    uint8_t * dest[3], int offset, int stride,
-			    void (** table) (uint8_t *, uint8_t *, int, int))
+static inline void motion_zero (const decoder_t * const decoder,
+				const motion_t * const motion,
+				mpeg2_mc_fct * const * const table)	/* prediction with no vector applied: table[0] on plane 0, table[4] on planes 1 and 2 */
 {
-    motion_block (table, offset, picture->v_offset, 0, 0, 0, 0, 0,
-		  dest, motion->ref[0], stride, 16);
+    unsigned int offset;	/* source offset shared by planes 1 and 2 */
+    /* plane 0: 16 rows, source starts at column offset, row v_offset of ref[0][0] */
+    table[0] (decoder->dest[0] + decoder->offset,
+	      (motion->ref[0][0] + decoder->offset +
+	       decoder->v_offset * decoder->stride),
+	      decoder->stride, 16);
+    /* planes 1 and 2: 8 rows, halved offset and v_offset, uv_stride */
+    offset = ((decoder->offset >> 1) +
+	      (decoder->v_offset >> 1) * decoder->uv_stride);
+    table[4] (decoder->dest[1] + (decoder->offset >> 1),
+	      motion->ref[0][1] + offset, decoder->uv_stride, 8);
+    table[4] (decoder->dest[2] + (decoder->offset >> 1),
+	      motion->ref[0][2] + offset, decoder->uv_stride, 8);
 }
 
 /* like motion_frame, but parsing without actual motion compensation */
-static void motion_fr_conceal (picture_t * picture)
+static void motion_fr_conceal (decoder_t * const decoder)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
     int tmp;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    tmp = (picture->f_motion.pmv[0][0] +
-	   get_motion_delta (picture, picture->f_motion.f_code[0]));
-    tmp = bound_motion_vector (tmp, picture->f_motion.f_code[0]);
-    picture->f_motion.pmv[1][0] = picture->f_motion.pmv[0][0] = tmp;
+    tmp = (decoder->f_motion.pmv[0][0] +
+	   get_motion_delta (decoder, decoder->f_motion.f_code[0]));
+    tmp = bound_motion_vector (tmp, decoder->f_motion.f_code[0]);
+    decoder->f_motion.pmv[1][0] = decoder->f_motion.pmv[0][0] = tmp;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    tmp = (picture->f_motion.pmv[0][1] +
-	   get_motion_delta (picture, picture->f_motion.f_code[1]));
-    tmp = bound_motion_vector (tmp, picture->f_motion.f_code[1]);
-    picture->f_motion.pmv[1][1] = picture->f_motion.pmv[0][1] = tmp;
+    tmp = (decoder->f_motion.pmv[0][1] +
+	   get_motion_delta (decoder, decoder->f_motion.f_code[1]));
+    tmp = bound_motion_vector (tmp, decoder->f_motion.f_code[1]);
+    decoder->f_motion.pmv[1][1] = decoder->f_motion.pmv[0][1] = tmp;
 
     DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
 #undef bit_buf
@@ -1308,175 +1270,137 @@ static void motion_fr_conceal (picture_t * picture)
 #undef bit_ptr
 }
 
-static void motion_fi_field (picture_t * picture, motion_t * motion,
-			     uint8_t * dest[3], int offset, int stride,
-			     void (** table) (uint8_t *, uint8_t *, int, int))
+static void motion_fi_field (decoder_t * const decoder,
+			     motion_t * const motion,
+			     mpeg2_mc_fct * const * const table)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
     int motion_x, motion_y;
-    int field_select;
+    uint8_t ** ref_field;
+    unsigned int pos_x, pos_y, xy_half, offset;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    field_select = UBITS (bit_buf, 1);
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];
     DUMPBITS (bit_buf, bits, 1);
 
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
 						     motion->f_code[0]);
     motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
     motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = motion->pmv[0][1] + get_motion_delta (picture,
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,
 						     motion->f_code[1]);
     motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
     motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
 
-    motion_block (table, offset, picture->v_offset, 0, 0, 0,
-		  motion_x, motion_y,
-		  dest, motion->ref[field_select], stride, 16);
+    MOTION (table, ref_field, motion_x, motion_y, 16, 0);
 #undef bit_buf
 #undef bits
 #undef bit_ptr
 }
 
-static void motion_fi_16x8 (picture_t * picture, motion_t * motion,
-			    uint8_t * dest[3], int offset, int stride,
-			    void (** table) (uint8_t *, uint8_t *, int, int))
+static void motion_fi_16x8 (decoder_t * const decoder, motion_t * const motion,
+			    mpeg2_mc_fct * const * const table)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
     int motion_x, motion_y;
-    int field_select;
+    uint8_t ** ref_field;
+    unsigned int pos_x, pos_y, xy_half, offset;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    field_select = UBITS (bit_buf, 1);
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];
     DUMPBITS (bit_buf, bits, 1);
 
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
 						     motion->f_code[0]);
     motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
     motion->pmv[0][0] = motion_x;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = motion->pmv[0][1] + get_motion_delta (picture,
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,
 						     motion->f_code[1]);
     motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
     motion->pmv[0][1] = motion_y;
 
-    motion_block (table, offset, picture->v_offset, 0, 0, 0,
-		  motion_x, motion_y,
-		  dest, motion->ref[field_select], stride, 8);
+    MOTION (table, ref_field, motion_x, motion_y, 8, 0);
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    field_select = UBITS (bit_buf, 1);
+    ref_field = motion->ref2[UBITS (bit_buf, 1)];
     DUMPBITS (bit_buf, bits, 1);
 
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_x = motion->pmv[1][0] + get_motion_delta (picture,
+    motion_x = motion->pmv[1][0] + get_motion_delta (decoder,
 						     motion->f_code[0]);
     motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
     motion->pmv[1][0] = motion_x;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = motion->pmv[1][1] + get_motion_delta (picture,
+    motion_y = motion->pmv[1][1] + get_motion_delta (decoder,
 						     motion->f_code[1]);
     motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
     motion->pmv[1][1] = motion_y;
 
-    motion_block (table, offset, picture->v_offset+8, 1, 0, 0,
-		  motion_x, motion_y,
-		  dest, motion->ref[field_select], stride, 8);
+    MOTION (table, ref_field, motion_x, motion_y, 8, 8);
 #undef bit_buf
 #undef bits
 #undef bit_ptr
 }
 
-static void motion_fi_dmv (picture_t * picture, motion_t * motion,
-			   uint8_t * dest[3], int offset, int stride,
-			   void (** table) (uint8_t *, uint8_t *, int, int))
+static void motion_fi_dmv (decoder_t * const decoder, motion_t * const motion,
+			   mpeg2_mc_fct * const * const table)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
-    int motion_x, motion_y;
-    int dmv_x, dmv_y;
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    int motion_x, motion_y, other_x, other_y;
+    unsigned int pos_x, pos_y, xy_half, offset;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_x = motion->pmv[0][0] + get_motion_delta (picture,
+    motion_x = motion->pmv[0][0] + get_motion_delta (decoder,
 						     motion->f_code[0]);
     motion_x = bound_motion_vector (motion_x, motion->f_code[0]);
     motion->pmv[1][0] = motion->pmv[0][0] = motion_x;
-
     NEEDBITS (bit_buf, bits, bit_ptr);
-    dmv_x = get_dmv (picture);
+    other_x = ((motion_x + (motion_x > 0)) >> 1) + get_dmv (decoder);
 
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    motion_y = motion->pmv[0][1] + get_motion_delta (picture,
+    motion_y = motion->pmv[0][1] + get_motion_delta (decoder,
 						     motion->f_code[1]);
     motion_y = bound_motion_vector (motion_y, motion->f_code[1]);
     motion->pmv[1][1] = motion->pmv[0][1] = motion_y;
+    other_y = (((motion_y + (motion_y > 0)) >> 1) + get_dmv (decoder) +
+	       decoder->dmv_offset);
 
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    dmv_y = get_dmv (picture);
-
-    motion_block (mc_functions.put, offset, picture->v_offset, 0, 0, 0,
-		  motion_x, motion_y,
-		  dest, motion->ref[picture->current_field], stride, 16);
-
-    motion_x = ((motion_x + (motion_x > 0)) >> 1) + dmv_x;
-    motion_y = ((motion_y + (motion_y > 0)) >> 1) + dmv_y +
-	2 * picture->current_field - 1;
-    motion_block (mc_functions.avg, offset, picture->v_offset, 0, 0, 0,
-		  motion_x, motion_y,
-		  dest, motion->ref[!picture->current_field], stride, 16);
+    MOTION (mpeg2_mc.put, motion->ref[0], motion_x, motion_y, 16, 0);
+    MOTION (mpeg2_mc.avg, motion->ref[1], other_x, other_y, 16, 0);
 #undef bit_buf
 #undef bits
 #undef bit_ptr
 }
 
-static void motion_fi_reuse (picture_t * picture, motion_t * motion,
-			     uint8_t * dest[3], int offset, int stride,
-			     void (** table) (uint8_t *, uint8_t *, int, int))
-{
-    motion_block (table, offset, picture->v_offset, 0, 0, 0,
-		  motion->pmv[0][0], motion->pmv[0][1],
-		  dest, motion->ref[picture->current_field], stride, 16);
-}
-
-static void motion_fi_zero (picture_t * picture, motion_t * motion,
-			    uint8_t * dest[3], int offset, int stride,
-			    void (** table) (uint8_t *, uint8_t *, int, int))
-{
-    motion_block (table, offset, picture->v_offset, 0, 0, 0, 0, 0,
-		  dest, motion->ref[picture->current_field], stride, 16);
-}
-
-static void motion_fi_conceal (picture_t * picture)
+static void motion_fi_conceal (decoder_t * const decoder)
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
     int tmp;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
     DUMPBITS (bit_buf, bits, 1); /* remove field_select */
 
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    tmp = (picture->f_motion.pmv[0][0] +
-	   get_motion_delta (picture, picture->f_motion.f_code[0]));
-    tmp = bound_motion_vector (tmp, picture->f_motion.f_code[0]);
-    picture->f_motion.pmv[1][0] = picture->f_motion.pmv[0][0] = tmp;
+    tmp = (decoder->f_motion.pmv[0][0] +
+	   get_motion_delta (decoder, decoder->f_motion.f_code[0]));
+    tmp = bound_motion_vector (tmp, decoder->f_motion.f_code[0]);
+    decoder->f_motion.pmv[1][0] = decoder->f_motion.pmv[0][0] = tmp;
 
     NEEDBITS (bit_buf, bits, bit_ptr);
-    tmp = (picture->f_motion.pmv[0][1] +
-	   get_motion_delta (picture, picture->f_motion.f_code[1]));
-    tmp = bound_motion_vector (tmp, picture->f_motion.f_code[1]);
-    picture->f_motion.pmv[1][1] = picture->f_motion.pmv[0][1] = tmp;
+    tmp = (decoder->f_motion.pmv[0][1] +
+	   get_motion_delta (decoder, decoder->f_motion.f_code[1]));
+    tmp = bound_motion_vector (tmp, decoder->f_motion.f_code[1]);
+    decoder->f_motion.pmv[1][1] = decoder->f_motion.pmv[0][1] = tmp;
 
     DUMPBITS (bit_buf, bits, 1); /* remove marker_bit */
 #undef bit_buf
@@ -1484,334 +1408,388 @@ static void motion_fi_conceal (picture_t * picture)
 #undef bit_ptr
 }
 
-#define MOTION(routine,direction)					\
-do {									\
-    if ((direction) & MACROBLOCK_MOTION_FORWARD)			\
-	routine (picture, &(picture->f_motion), dest, offset, stride,	\
-		 mc_functions.put);					\
-    if ((direction) & MACROBLOCK_MOTION_BACKWARD)			\
-	routine (picture, &(picture->b_motion), dest, offset, stride,	\
-		 ((direction) & MACROBLOCK_MOTION_FORWARD ?		\
-		  mc_functions.avg : mc_functions.put));		\
+#define MOTION_CALL(routine,direction)				\
+do {								\
+    if ((direction) & MACROBLOCK_MOTION_FORWARD)		\
+	routine (decoder, &(decoder->f_motion), mpeg2_mc.put);	\
+    if ((direction) & MACROBLOCK_MOTION_BACKWARD)		\
+	routine (decoder, &(decoder->b_motion),			\
+		 ((direction) & MACROBLOCK_MOTION_FORWARD ?	\
+		  mpeg2_mc.avg : mpeg2_mc.put));		\
 } while (0)
 
-#define CHECK_DISPLAY							\
+#define NEXT_MACROBLOCK							\
 do {									\
-    if (offset == picture->coded_picture_width) {			\
+    decoder->offset += 16;						\
+    if (decoder->offset == decoder->width) {				\
 	do { /* just so we can use the break statement */		\
-	    if (picture->current_frame->copy) {				\
-		picture->current_frame->copy (picture->current_frame,	\
-					      dest);			\
-		if (picture->picture_coding_type == B_TYPE)		\
+	    if (decoder->convert) {					\
+		decoder->convert (decoder->fbuf_id, decoder->dest,	\
+				  decoder->v_offset);			\
+		if (decoder->coding_type == B_TYPE)			\
 		    break;						\
 	    }								\
-	    dest[0] += 16 * stride;					\
-	    dest[1] += 4 * stride;					\
-	    dest[2] += 4 * stride;					\
+	    decoder->dest[0] += 16 * decoder->stride;			\
+	    decoder->dest[1] += 4 * decoder->stride;			\
+	    decoder->dest[2] += 4 * decoder->stride;			\
 	} while (0);							\
- 	if (! (picture->mpeg1))						\
- 	    return 0;							\
- 	picture->v_offset += 16;					\
- 	if (picture->v_offset >= picture->coded_picture_height)		\
- 	    return 0;							\
-	offset = 0; ++code;						\
+	decoder->v_offset += 16;					\
+	if (decoder->v_offset > decoder->limit_y) {			\
+	    if (mpeg2_cpu_state_restore)				\
+		mpeg2_cpu_state_restore (&cpu_state);			\
+	    return;							\
+	}								\
+	decoder->offset = 0;						\
     }									\
 } while (0)
 
-int slice_process (picture_t * picture, uint8_t code, uint8_t * buffer)
+void mpeg2_init_fbuf (decoder_t * decoder, uint8_t * current_fbuf[3],
+		      uint8_t * forward_fbuf[3], uint8_t * backward_fbuf[3])
 {
-#define bit_buf (picture->bitstream_buf)
-#define bits (picture->bitstream_bits)
-#define bit_ptr (picture->bitstream_ptr)
-    int macroblock_modes;
-    int stride;
-    uint8_t * dest[3];
-    int offset;
-    uint8_t ** forward_ref[2];
-
-    stride = picture->coded_picture_width;
-    offset = (code - 1) * stride * 4;
-    picture->v_offset = (code - 1) * 16;
-
-    forward_ref[0] = picture->forward_reference_frame->base;
-    if (picture->picture_structure != FRAME_PICTURE) {
-	forward_ref[1] = picture->forward_reference_frame->base;
-	offset <<= 1;
-	picture->current_field = (picture->picture_structure == BOTTOM_FIELD);
-	if ((picture->second_field) &&
-	    (picture->picture_coding_type != B_TYPE))
-	    forward_ref[picture->picture_structure == TOP_FIELD] =
-		picture->current_frame->base;
-
-	picture->f_motion.ref[1][0] = forward_ref[1][0] + stride;
-	picture->f_motion.ref[1][1] = forward_ref[1][1] + (stride >> 1);
-	picture->f_motion.ref[1][2] = forward_ref[1][2] + (stride >> 1);
-
-	picture->b_motion.ref[1][0] =
-	    picture->backward_reference_frame->base[0] + stride;
-	picture->b_motion.ref[1][1] =
-	    picture->backward_reference_frame->base[1] + (stride >> 1);
-	picture->b_motion.ref[1][2] =
-	    picture->backward_reference_frame->base[2] + (stride >> 1);
-    }
+    int offset, stride, height, bottom_field;
 
-    picture->f_motion.ref[0][0] = forward_ref[0][0];
-    picture->f_motion.ref[0][1] = forward_ref[0][1];
-    picture->f_motion.ref[0][2] = forward_ref[0][2];
+    stride = decoder->width;
+    bottom_field = (decoder->picture_structure == BOTTOM_FIELD);
+    offset = bottom_field ? stride : 0;
+    height = decoder->height;
 
-    picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;
-    picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
+    decoder->picture_dest[0] = current_fbuf[0] + offset;
+    decoder->picture_dest[1] = current_fbuf[1] + (offset >> 1);
+    decoder->picture_dest[2] = current_fbuf[2] + (offset >> 1);
 
-    picture->b_motion.ref[0][0] = picture->backward_reference_frame->base[0];
-    picture->b_motion.ref[0][1] = picture->backward_reference_frame->base[1];
-    picture->b_motion.ref[0][2] = picture->backward_reference_frame->base[2];
+    decoder->f_motion.ref[0][0] = forward_fbuf[0] + offset;
+    decoder->f_motion.ref[0][1] = forward_fbuf[1] + (offset >> 1);
+    decoder->f_motion.ref[0][2] = forward_fbuf[2] + (offset >> 1);
 
-    picture->b_motion.pmv[0][0] = picture->b_motion.pmv[0][1] = 0;
-    picture->b_motion.pmv[1][0] = picture->b_motion.pmv[1][1] = 0;
+    decoder->b_motion.ref[0][0] = backward_fbuf[0] + offset;
+    decoder->b_motion.ref[0][1] = backward_fbuf[1] + (offset >> 1);
+    decoder->b_motion.ref[0][2] = backward_fbuf[2] + (offset >> 1);
 
-    if ((picture->current_frame->copy) &&
-	(picture->picture_coding_type == B_TYPE))
-	offset = 0;
+    if (decoder->picture_structure != FRAME_PICTURE) {
+	decoder->dmv_offset = bottom_field ? 1 : -1;
+	decoder->f_motion.ref2[0] = decoder->f_motion.ref[bottom_field];
+	decoder->f_motion.ref2[1] = decoder->f_motion.ref[!bottom_field];
+	decoder->b_motion.ref2[0] = decoder->b_motion.ref[bottom_field];
+	decoder->b_motion.ref2[1] = decoder->b_motion.ref[!bottom_field];
+	offset = stride - offset;
 
-    dest[0] = picture->current_frame->base[0] + offset * 4;
-    dest[1] = picture->current_frame->base[1] + offset;
-    dest[2] = picture->current_frame->base[2] + offset;
+	if (decoder->second_field && (decoder->coding_type != B_TYPE))
+	    forward_fbuf = current_fbuf;
+
+	decoder->f_motion.ref[1][0] = forward_fbuf[0] + offset;
+	decoder->f_motion.ref[1][1] = forward_fbuf[1] + (offset >> 1);
+	decoder->f_motion.ref[1][2] = forward_fbuf[2] + (offset >> 1);
+
+	decoder->b_motion.ref[1][0] = backward_fbuf[0] + offset;
+	decoder->b_motion.ref[1][1] = backward_fbuf[1] + (offset >> 1);
+	decoder->b_motion.ref[1][2] = backward_fbuf[2] + (offset >> 1);
 
-    switch (picture->picture_structure) {
-    case BOTTOM_FIELD:
-	dest[0] += stride;
-	dest[1] += stride >> 1;
-	dest[2] += stride >> 1;
-	/* follow thru */
-    case TOP_FIELD:
 	stride <<= 1;
+	height >>= 1;
     }
 
-    picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
-	picture->dc_dct_pred[2] = 1 << (picture->intra_dc_precision + 7);
+    decoder->stride = stride;
+    decoder->uv_stride = stride >> 1;
+    decoder->limit_x = 2 * decoder->width - 32;
+    decoder->limit_y_16 = 2 * height - 32;
+    decoder->limit_y_8 = 2 * height - 16;
+    decoder->limit_y = height - 16;
+}
 
-    bitstream_init (picture, buffer);
+static inline int slice_init (decoder_t * const decoder, int code)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    int offset;
+    const MBAtab * mba;
 
-    picture->quantizer_scale = get_quantizer_scale (picture);
+    decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
+	decoder->dc_dct_pred[2] = 128 << decoder->intra_dc_precision;
+
+    decoder->f_motion.pmv[0][0] = decoder->f_motion.pmv[0][1] = 0;
+    decoder->f_motion.pmv[1][0] = decoder->f_motion.pmv[1][1] = 0;
+    decoder->b_motion.pmv[0][0] = decoder->b_motion.pmv[0][1] = 0;
+    decoder->b_motion.pmv[1][0] = decoder->b_motion.pmv[1][1] = 0;
+
+    if (decoder->vertical_position_extension) {
+	code += UBITS (bit_buf, 3) << 7;
+	DUMPBITS (bit_buf, bits, 3);
+    }
+    decoder->v_offset = (code - 1) * 16;
+    offset = 0;
+    if (!(decoder->convert) || decoder->coding_type != B_TYPE)
+	offset = (code - 1) * decoder->stride * 4;
+
+    decoder->dest[0] = decoder->picture_dest[0] + offset * 4;
+    decoder->dest[1] = decoder->picture_dest[1] + offset;
+    decoder->dest[2] = decoder->picture_dest[2] + offset;
+
+    decoder->quantizer_scale = get_quantizer_scale (decoder);
 
     /* ignore intra_slice and all the extra data */
     while (bit_buf & 0x80000000) {
 	DUMPBITS (bit_buf, bits, 9);
 	NEEDBITS (bit_buf, bits, bit_ptr);
     }
-    DUMPBITS (bit_buf, bits, 1);
 
-    NEEDBITS (bit_buf, bits, bit_ptr);
-    offset = get_macroblock_address_increment (picture) << 4;
+    /* decode initial macroblock address increment */
+    offset = 0;
+    while (1) {
+	if (bit_buf >= 0x08000000) {
+	    mba = MBA_5 + (UBITS (bit_buf, 6) - 2);
+	    break;
+	} else if (bit_buf >= 0x01800000) {
+	    mba = MBA_11 + (UBITS (bit_buf, 12) - 24);
+	    break;
+	} else switch (UBITS (bit_buf, 12)) {
+	case 8:		/* macroblock_escape */
+	    offset += 33;
+	    DUMPBITS (bit_buf, bits, 11);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    continue;
+	case 15:	/* macroblock_stuffing (MPEG1 only) */
+	    bit_buf &= 0xfffff;
+	    DUMPBITS (bit_buf, bits, 11);
+	    NEEDBITS (bit_buf, bits, bit_ptr);
+	    continue;
+	default:	/* error */
+	    return 1;
+	}
+    }
+    DUMPBITS (bit_buf, bits, mba->len + 1);
+    decoder->offset = (offset + mba->mba) << 4;
+
+    while (decoder->offset - decoder->width >= 0) {
+	decoder->offset -= decoder->width;
+	if (!(decoder->convert) || decoder->coding_type != B_TYPE) {
+	    decoder->dest[0] += 16 * decoder->stride;
+	    decoder->dest[1] += 4 * decoder->stride;
+	    decoder->dest[2] += 4 * decoder->stride;
+	}
+	decoder->v_offset += 16;
+    }
+    if (decoder->v_offset > decoder->limit_y)
+	return 1;
+
+    return 0;
+#undef bit_buf
+#undef bits
+#undef bit_ptr
+}
+
+void mpeg2_slice (decoder_t * const decoder, const int code,
+		  const uint8_t * const buffer)
+{
+#define bit_buf (decoder->bitstream_buf)
+#define bits (decoder->bitstream_bits)
+#define bit_ptr (decoder->bitstream_ptr)
+    cpu_state_t cpu_state;
+
+    bitstream_init (decoder, buffer);
+
+    if (slice_init (decoder, code))
+	return;
+
+    if (mpeg2_cpu_state_save)
+	mpeg2_cpu_state_save (&cpu_state);
 
     while (1) {
+	int macroblock_modes;
+	int mba_inc;
+	const MBAtab * mba;
+
 	NEEDBITS (bit_buf, bits, bit_ptr);
 
-	macroblock_modes = get_macroblock_modes (picture);
+	macroblock_modes = get_macroblock_modes (decoder);
 
 	/* maybe integrate MACROBLOCK_QUANT test into get_macroblock_modes ? */
 	if (macroblock_modes & MACROBLOCK_QUANT)
-	    picture->quantizer_scale = get_quantizer_scale (picture);
+	    decoder->quantizer_scale = get_quantizer_scale (decoder);
 
 	if (macroblock_modes & MACROBLOCK_INTRA) {
 
 	    int DCT_offset, DCT_stride;
+	    int offset;
+	    uint8_t * dest_y;
 
-	    if (picture->concealment_motion_vectors) {
-		if (picture->picture_structure == FRAME_PICTURE)
-		    motion_fr_conceal (picture);
+	    if (decoder->concealment_motion_vectors) {
+		if (decoder->picture_structure == FRAME_PICTURE)
+		    motion_fr_conceal (decoder);
 		else
-		    motion_fi_conceal (picture);
+		    motion_fi_conceal (decoder);
 	    } else {
-		picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;
-		picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
-		picture->b_motion.pmv[0][0] = picture->b_motion.pmv[0][1] = 0;
-		picture->b_motion.pmv[1][0] = picture->b_motion.pmv[1][1] = 0;
+		decoder->f_motion.pmv[0][0] = decoder->f_motion.pmv[0][1] = 0;
+		decoder->f_motion.pmv[1][0] = decoder->f_motion.pmv[1][1] = 0;
+		decoder->b_motion.pmv[0][0] = decoder->b_motion.pmv[0][1] = 0;
+		decoder->b_motion.pmv[1][0] = decoder->b_motion.pmv[1][1] = 0;
 	    }
 
 	    if (macroblock_modes & DCT_TYPE_INTERLACED) {
-		DCT_offset = stride;
-		DCT_stride = stride * 2;
+		DCT_offset = decoder->stride;
+		DCT_stride = decoder->stride * 2;
 	    } else {
-		DCT_offset = stride * 8;
-		DCT_stride = stride;
+		DCT_offset = decoder->stride * 8;
+		DCT_stride = decoder->stride;
 	    }
 
-	    slice_intra_DCT (picture, 0, dest[0] + offset, DCT_stride);
-	    slice_intra_DCT (picture, 0, dest[0] + offset + 8, DCT_stride);
-	    slice_intra_DCT (picture, 0, dest[0] + offset + DCT_offset,
-			     DCT_stride);
-	    slice_intra_DCT (picture, 0, dest[0] + offset + DCT_offset + 8,
-			     DCT_stride);
-
-	    slice_intra_DCT (picture, 1, dest[1] + (offset >> 1), stride >> 1);
-	    slice_intra_DCT (picture, 2, dest[2] + (offset >> 1), stride >> 1);
-
-	    if (picture->picture_coding_type == D_TYPE) {
+	    offset = decoder->offset;
+	    dest_y = decoder->dest[0] + offset;
+	    slice_intra_DCT (decoder, 0, dest_y, DCT_stride);
+	    slice_intra_DCT (decoder, 0, dest_y + 8, DCT_stride);
+	    slice_intra_DCT (decoder, 0, dest_y + DCT_offset, DCT_stride);
+	    slice_intra_DCT (decoder, 0, dest_y + DCT_offset + 8, DCT_stride);
+	    slice_intra_DCT (decoder, 1, decoder->dest[1] + (offset >> 1),
+			     decoder->uv_stride);
+	    slice_intra_DCT (decoder, 2, decoder->dest[2] + (offset >> 1),
+			     decoder->uv_stride);
+
+	    if (decoder->coding_type == D_TYPE) {
 		NEEDBITS (bit_buf, bits, bit_ptr);
 		DUMPBITS (bit_buf, bits, 1);
 	    }
 	} else {
 
-	    if (picture->mpeg1) {
-		if ((macroblock_modes & MOTION_TYPE_MASK) == MC_FRAME)
-		    MOTION (motion_mp1, macroblock_modes);
-		else {
-		    /* non-intra mb without forward mv in a P picture */
-		    picture->f_motion.pmv[0][0] = 0;
-		    picture->f_motion.pmv[0][1] = 0;
-		    picture->f_motion.pmv[1][0] = 0;
-		    picture->f_motion.pmv[1][1] = 0;
-		    MOTION (motion_fr_zero, MACROBLOCK_MOTION_FORWARD);
-		}
-	    } else if (picture->picture_structure == FRAME_PICTURE)
+	    if (decoder->picture_structure == FRAME_PICTURE)
 		switch (macroblock_modes & MOTION_TYPE_MASK) {
 		case MC_FRAME:
-		    MOTION (motion_fr_frame, macroblock_modes);
+		    if (decoder->mpeg1)
+			MOTION_CALL (motion_mp1, macroblock_modes);
+		    else
+			MOTION_CALL (motion_fr_frame, macroblock_modes);
 		    break;
 
 		case MC_FIELD:
-		    MOTION (motion_fr_field, macroblock_modes);
+		    MOTION_CALL (motion_fr_field, macroblock_modes);
 		    break;
 
 		case MC_DMV:
-		    MOTION (motion_fr_dmv, MACROBLOCK_MOTION_FORWARD);
+		    MOTION_CALL (motion_fr_dmv, MACROBLOCK_MOTION_FORWARD);
 		    break;
 
 		case 0:
 		    /* non-intra mb without forward mv in a P picture */
-		    picture->f_motion.pmv[0][0] = 0;
-		    picture->f_motion.pmv[0][1] = 0;
-		    picture->f_motion.pmv[1][0] = 0;
-		    picture->f_motion.pmv[1][1] = 0;
-		    MOTION (motion_fr_zero, MACROBLOCK_MOTION_FORWARD);
+		    decoder->f_motion.pmv[0][0] = 0;
+		    decoder->f_motion.pmv[0][1] = 0;
+		    decoder->f_motion.pmv[1][0] = 0;
+		    decoder->f_motion.pmv[1][1] = 0;
+		    MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
 		    break;
 		}
 	    else
 		switch (macroblock_modes & MOTION_TYPE_MASK) {
 		case MC_FIELD:
-		    MOTION (motion_fi_field, macroblock_modes);
+		    MOTION_CALL (motion_fi_field, macroblock_modes);
 		    break;
 
 		case MC_16X8:
-		    MOTION (motion_fi_16x8, macroblock_modes);
+		    MOTION_CALL (motion_fi_16x8, macroblock_modes);
 		    break;
 
 		case MC_DMV:
-		    MOTION (motion_fi_dmv, MACROBLOCK_MOTION_FORWARD);
+		    MOTION_CALL (motion_fi_dmv, MACROBLOCK_MOTION_FORWARD);
 		    break;
 
 		case 0:
 		    /* non-intra mb without forward mv in a P picture */
-		    picture->f_motion.pmv[0][0] = 0;
-		    picture->f_motion.pmv[0][1] = 0;
-		    picture->f_motion.pmv[1][0] = 0;
-		    picture->f_motion.pmv[1][1] = 0;
-		    MOTION (motion_fi_zero, MACROBLOCK_MOTION_FORWARD);
+		    decoder->f_motion.pmv[0][0] = 0;
+		    decoder->f_motion.pmv[0][1] = 0;
+		    decoder->f_motion.pmv[1][0] = 0;
+		    decoder->f_motion.pmv[1][1] = 0;
+		    MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
 		    break;
 		}
 
 	    if (macroblock_modes & MACROBLOCK_PATTERN) {
 		int coded_block_pattern;
 		int DCT_offset, DCT_stride;
+		int offset;
+		uint8_t * dest_y;
 
 		if (macroblock_modes & DCT_TYPE_INTERLACED) {
-		    DCT_offset = stride;
-		    DCT_stride = stride * 2;
+		    DCT_offset = decoder->stride;
+		    DCT_stride = decoder->stride * 2;
 		} else {
-		    DCT_offset = stride * 8;
-		    DCT_stride = stride;
+		    DCT_offset = decoder->stride * 8;
+		    DCT_stride = decoder->stride;
 		}
 
-		coded_block_pattern = get_coded_block_pattern (picture);
+		coded_block_pattern = get_coded_block_pattern (decoder);
 
+		offset = decoder->offset;
+		dest_y = decoder->dest[0] + offset;
 		if (coded_block_pattern & 0x20)
-		    slice_non_intra_DCT (picture, dest[0] + offset,
-					 DCT_stride);
+		    slice_non_intra_DCT (decoder, dest_y, DCT_stride);
 		if (coded_block_pattern & 0x10)
-		    slice_non_intra_DCT (picture, dest[0] + offset + 8,
-					 DCT_stride);
+		    slice_non_intra_DCT (decoder, dest_y + 8, DCT_stride);
 		if (coded_block_pattern & 0x08)
-		    slice_non_intra_DCT (picture,
-					 dest[0] + offset + DCT_offset,
+		    slice_non_intra_DCT (decoder, dest_y + DCT_offset,
 					 DCT_stride);
 		if (coded_block_pattern & 0x04)
-		    slice_non_intra_DCT (picture,
-					 dest[0] + offset + DCT_offset + 8,
+		    slice_non_intra_DCT (decoder, dest_y + DCT_offset + 8,
 					 DCT_stride);
-
 		if (coded_block_pattern & 0x2)
-		    slice_non_intra_DCT (picture, dest[1] + (offset >> 1),
-					 stride >> 1);
+		    slice_non_intra_DCT (decoder,
+					 decoder->dest[1] + (offset >> 1),
+					 decoder->uv_stride);
 		if (coded_block_pattern & 0x1)
-		    slice_non_intra_DCT (picture, dest[2] + (offset >> 1),
-					 stride >> 1);
+		    slice_non_intra_DCT (decoder,
+					 decoder->dest[2] + (offset >> 1),
+					 decoder->uv_stride);
 	    }
 
-	    picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
-		picture->dc_dct_pred[2] = 1 << (picture->intra_dc_precision+7);
+	    decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
+		decoder->dc_dct_pred[2] = 128 << decoder->intra_dc_precision;
 	}
 
-#ifdef MPEG12_POSTPROC
-	picture->current_frame->quant_store[code][(offset>>4)+1] = picture->quantizer_scale>>1;
-#endif
-	offset += 16;
-	CHECK_DISPLAY;
+	NEXT_MACROBLOCK;
 
 	NEEDBITS (bit_buf, bits, bit_ptr);
-
-	if (0 /* FIXME */ && (bit_buf & 0x80000000)) {
-	    DUMPBITS (bit_buf, bits, 1);
-	} else {
-	    int mba_inc;
-
-	    mba_inc = get_macroblock_address_increment (picture);
-	    if (!mba_inc)
-		continue;
-	    else if (mba_inc < 0)
+	mba_inc = 0;
+	while (1) {
+	    if (bit_buf >= 0x10000000) {
+		mba = MBA_5 + (UBITS (bit_buf, 5) - 2);
 		break;
+	    } else if (bit_buf >= 0x03000000) {
+		mba = MBA_11 + (UBITS (bit_buf, 11) - 24);
+		break;
+	    } else switch (UBITS (bit_buf, 11)) {
+	    case 8:		/* macroblock_escape */
+		mba_inc += 33;
+		/* fall through */
+	    case 15:	/* macroblock_stuffing (MPEG1 only) */
+		DUMPBITS (bit_buf, bits, 11);
+		NEEDBITS (bit_buf, bits, bit_ptr);
+		continue;
+	    default:	/* end of slice, or error */
+		if (mpeg2_cpu_state_restore)
+		    mpeg2_cpu_state_restore (&cpu_state);
+		return;
+	    }
+	}
+	DUMPBITS (bit_buf, bits, mba->len);
+	mba_inc += mba->mba;
 
-	    picture->dc_dct_pred[0] = picture->dc_dct_pred[1] =
-		picture->dc_dct_pred[2] = 1 << (picture->intra_dc_precision+7);
+	if (mba_inc) {
+	    decoder->dc_dct_pred[0] = decoder->dc_dct_pred[1] =
+		decoder->dc_dct_pred[2] = 128 << decoder->intra_dc_precision;
 
-	    if (picture->picture_coding_type == P_TYPE) {
-		picture->f_motion.pmv[0][0] = picture->f_motion.pmv[0][1] = 0;
-		picture->f_motion.pmv[1][0] = picture->f_motion.pmv[1][1] = 0;
+	    if (decoder->coding_type == P_TYPE) {
+		decoder->f_motion.pmv[0][0] = decoder->f_motion.pmv[0][1] = 0;
+		decoder->f_motion.pmv[1][0] = decoder->f_motion.pmv[1][1] = 0;
 
 		do {
-		    if (picture->picture_structure == FRAME_PICTURE)
-			MOTION (motion_fr_zero, MACROBLOCK_MOTION_FORWARD);
-		    else
-			MOTION (motion_fi_zero, MACROBLOCK_MOTION_FORWARD);
-
-#ifdef MPEG12_POSTPROC
-	picture->current_frame->quant_store[code][(offset>>4)+1] = picture->quantizer_scale>>1;
-#endif
-
-		    offset += 16;
-		    CHECK_DISPLAY;
+		    MOTION_CALL (motion_zero, MACROBLOCK_MOTION_FORWARD);
+		    NEXT_MACROBLOCK;
 		} while (--mba_inc);
 	    } else {
 		do {
-		    if (picture->mpeg1)
-			MOTION (motion_mp1_reuse, macroblock_modes);
-		    else if (picture->picture_structure == FRAME_PICTURE)
-			MOTION (motion_fr_reuse, macroblock_modes);
-		    else
-			MOTION (motion_fi_reuse, macroblock_modes);
-
-#ifdef MPEG12_POSTPROC
-	picture->current_frame->quant_store[code][(offset>>4)+1] = picture->quantizer_scale>>1;
-#endif
-
-		    offset += 16;
-		    CHECK_DISPLAY;
+		    MOTION_CALL (motion_reuse, macroblock_modes);
+		    NEXT_MACROBLOCK;
 		} while (--mba_inc);
 	    }
 	}
     }
-
-    return 0;
 #undef bit_buf
 #undef bits
 #undef bit_ptr
diff --git a/libmpeg2/sse.h b/libmpeg2/sse.h
deleted file mode 100644
index 51540dca08..0000000000
--- a/libmpeg2/sse.h
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * sse.h
- * Copyright (C) 1999 R. Fisher
- *
- * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
- *
- * mpeg2dec is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * mpeg2dec is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-
-typedef	union {
-	float			sf[4];	/* Single-precision (32-bit) value */
-} ATTR_ALIGN(16) sse_t;	/* On a 16 byte (128-bit) boundary */
-
-
-#define	sse_i2r(op, imm, reg) \
-	__asm__ __volatile__ (#op " %0, %%" #reg \
-			      : /* nothing */ \
-			      : "X" (imm) )
-
-#define	sse_m2r(op, mem, reg) \
-	__asm__ __volatile__ (#op " %0, %%" #reg \
-			      : /* nothing */ \
-			      : "X" (mem))
-
-#define	sse_r2m(op, reg, mem) \
-	__asm__ __volatile__ (#op " %%" #reg ", %0" \
-			      : "=X" (mem) \
-			      : /* nothing */ )
-
-#define	sse_r2r(op, regs, regd) \
-	__asm__ __volatile__ (#op " %" #regs ", %" #regd)
-
-#define	sse_r2ri(op, regs, regd, imm) \
-	__asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
-			      : /* nothing */ \
-			      : "X" (imm) )
-
-#define	sse_m2ri(op, mem, reg, subop) \
-	__asm__ __volatile__ (#op " %0, %%" #reg ", " #subop \
-			      : /* nothing */ \
-			      : "X" (mem))
-
-
-#define	movaps_m2r(var, reg)	sse_m2r(movaps, var, reg)
-#define	movaps_r2m(reg, var)	sse_r2m(movaps, reg, var)
-#define	movaps_r2r(regs, regd)	sse_r2r(movaps, regs, regd)
-
-#define	movntps_r2m(xmmreg, var)	sse_r2m(movntps, xmmreg, var)
-
-#define	movups_m2r(var, reg)	sse_m2r(movups, var, reg)
-#define	movups_r2m(reg, var)	sse_r2m(movups, reg, var)
-#define	movups_r2r(regs, regd)	sse_r2r(movups, regs, regd)
-
-#define	movhlps_r2r(regs, regd)	sse_r2r(movhlps, regs, regd)
-
-#define	movlhps_r2r(regs, regd)	sse_r2r(movlhps, regs, regd)
-
-#define	movhps_m2r(var, reg)	sse_m2r(movhps, var, reg)
-#define	movhps_r2m(reg, var)	sse_r2m(movhps, reg, var)
-
-#define	movlps_m2r(var, reg)	sse_m2r(movlps, var, reg)
-#define	movlps_r2m(reg, var)	sse_r2m(movlps, reg, var)
-
-#define	movss_m2r(var, reg)	sse_m2r(movss, var, reg)
-#define	movss_r2m(reg, var)	sse_r2m(movss, reg, var)
-#define	movss_r2r(regs, regd)	sse_r2r(movss, regs, regd)
-
-#define	shufps_m2r(var, reg, index)	sse_m2ri(shufps, var, reg, index)
-#define	shufps_r2r(regs, regd, index)	sse_r2ri(shufps, regs, regd, index)
-
-#define	cvtpi2ps_m2r(var, xmmreg)	sse_m2r(cvtpi2ps, var, xmmreg)
-#define	cvtpi2ps_r2r(mmreg, xmmreg)	sse_r2r(cvtpi2ps, mmreg, xmmreg)
-
-#define	cvtps2pi_m2r(var, mmreg)	sse_m2r(cvtps2pi, var, mmreg)
-#define	cvtps2pi_r2r(xmmreg, mmreg)	sse_r2r(cvtps2pi, mmreg, xmmreg)
-
-#define	cvttps2pi_m2r(var, mmreg)	sse_m2r(cvttps2pi, var, mmreg)
-#define	cvttps2pi_r2r(xmmreg, mmreg)	sse_r2r(cvttps2pi, mmreg, xmmreg)
-
-#define	cvtsi2ss_m2r(var, xmmreg)	sse_m2r(cvtsi2ss, var, xmmreg)
-#define	cvtsi2ss_r2r(reg, xmmreg)	sse_r2r(cvtsi2ss, reg, xmmreg)
-
-#define	cvtss2si_m2r(var, reg)		sse_m2r(cvtss2si, var, reg)
-#define	cvtss2si_r2r(xmmreg, reg)	sse_r2r(cvtss2si, xmmreg, reg)
-
-#define	cvttss2si_m2r(var, reg)		sse_m2r(cvtss2si, var, reg)
-#define	cvttss2si_r2r(xmmreg, reg)	sse_r2r(cvtss2si, xmmreg, reg)
-
-#define	movmskps(xmmreg, reg) \
-	__asm__ __volatile__ ("movmskps %" #xmmreg ", %" #reg)
-
-#define	addps_m2r(var, reg)		sse_m2r(addps, var, reg)
-#define	addps_r2r(regs, regd)		sse_r2r(addps, regs, regd)
-
-#define	addss_m2r(var, reg)		sse_m2r(addss, var, reg)
-#define	addss_r2r(regs, regd)		sse_r2r(addss, regs, regd)
-
-#define	subps_m2r(var, reg)		sse_m2r(subps, var, reg)
-#define	subps_r2r(regs, regd)		sse_r2r(subps, regs, regd)
-
-#define	subss_m2r(var, reg)		sse_m2r(subss, var, reg)
-#define	subss_r2r(regs, regd)		sse_r2r(subss, regs, regd)
-
-#define	mulps_m2r(var, reg)		sse_m2r(mulps, var, reg)
-#define	mulps_r2r(regs, regd)		sse_r2r(mulps, regs, regd)
-
-#define	mulss_m2r(var, reg)		sse_m2r(mulss, var, reg)
-#define	mulss_r2r(regs, regd)		sse_r2r(mulss, regs, regd)
-
-#define	divps_m2r(var, reg)		sse_m2r(divps, var, reg)
-#define	divps_r2r(regs, regd)		sse_r2r(divps, regs, regd)
-
-#define	divss_m2r(var, reg)		sse_m2r(divss, var, reg)
-#define	divss_r2r(regs, regd)		sse_r2r(divss, regs, regd)
-
-#define	rcpps_m2r(var, reg)		sse_m2r(rcpps, var, reg)
-#define	rcpps_r2r(regs, regd)		sse_r2r(rcpps, regs, regd)
-
-#define	rcpss_m2r(var, reg)		sse_m2r(rcpss, var, reg)
-#define	rcpss_r2r(regs, regd)		sse_r2r(rcpss, regs, regd)
-
-#define	rsqrtps_m2r(var, reg)		sse_m2r(rsqrtps, var, reg)
-#define	rsqrtps_r2r(regs, regd)		sse_r2r(rsqrtps, regs, regd)
-
-#define	rsqrtss_m2r(var, reg)		sse_m2r(rsqrtss, var, reg)
-#define	rsqrtss_r2r(regs, regd)		sse_r2r(rsqrtss, regs, regd)
-
-#define	sqrtps_m2r(var, reg)		sse_m2r(sqrtps, var, reg)
-#define	sqrtps_r2r(regs, regd)		sse_r2r(sqrtps, regs, regd)
-
-#define	sqrtss_m2r(var, reg)		sse_m2r(sqrtss, var, reg)
-#define	sqrtss_r2r(regs, regd)		sse_r2r(sqrtss, regs, regd)
-
-#define	andps_m2r(var, reg)		sse_m2r(andps, var, reg)
-#define	andps_r2r(regs, regd)		sse_r2r(andps, regs, regd)
-
-#define	andnps_m2r(var, reg)		sse_m2r(andnps, var, reg)
-#define	andnps_r2r(regs, regd)		sse_r2r(andnps, regs, regd)
-
-#define	orps_m2r(var, reg)		sse_m2r(orps, var, reg)
-#define	orps_r2r(regs, regd)		sse_r2r(orps, regs, regd)
-
-#define	xorps_m2r(var, reg)		sse_m2r(xorps, var, reg)
-#define	xorps_r2r(regs, regd)		sse_r2r(xorps, regs, regd)
-
-#define	maxps_m2r(var, reg)		sse_m2r(maxps, var, reg)
-#define	maxps_r2r(regs, regd)		sse_r2r(maxps, regs, regd)
-
-#define	maxss_m2r(var, reg)		sse_m2r(maxss, var, reg)
-#define	maxss_r2r(regs, regd)		sse_r2r(maxss, regs, regd)
-
-#define	minps_m2r(var, reg)		sse_m2r(minps, var, reg)
-#define	minps_r2r(regs, regd)		sse_r2r(minps, regs, regd)
-
-#define	minss_m2r(var, reg)		sse_m2r(minss, var, reg)
-#define	minss_r2r(regs, regd)		sse_r2r(minss, regs, regd)
-
-#define	cmpps_m2r(var, reg, op)		sse_m2ri(cmpps, var, reg, op)
-#define	cmpps_r2r(regs, regd, op)	sse_r2ri(cmpps, regs, regd, op)
-
-#define	cmpeqps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 0)
-#define	cmpeqps_r2r(regs, regd)		sse_r2ri(cmpps, regs, regd, 0)
-
-#define	cmpltps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 1)
-#define	cmpltps_r2r(regs, regd)		sse_r2ri(cmpps, regs, regd, 1)
-
-#define	cmpleps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 2)
-#define	cmpleps_r2r(regs, regd)		sse_r2ri(cmpps, regs, regd, 2)
-
-#define	cmpunordps_m2r(var, reg)	sse_m2ri(cmpps, var, reg, 3)
-#define	cmpunordps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 3)
-
-#define	cmpneqps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 4)
-#define	cmpneqps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 4)
-
-#define	cmpnltps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 5)
-#define	cmpnltps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 5)
-
-#define	cmpnleps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 6)
-#define	cmpnleps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 6)
-
-#define	cmpordps_m2r(var, reg)		sse_m2ri(cmpps, var, reg, 7)
-#define	cmpordps_r2r(regs, regd)	sse_r2ri(cmpps, regs, regd, 7)
-
-#define	cmpss_m2r(var, reg, op)		sse_m2ri(cmpss, var, reg, op)
-#define	cmpss_r2r(regs, regd, op)	sse_r2ri(cmpss, regs, regd, op)
-
-#define	cmpeqss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 0)
-#define	cmpeqss_r2r(regs, regd)		sse_r2ri(cmpss, regs, regd, 0)
-
-#define	cmpltss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 1)
-#define	cmpltss_r2r(regs, regd)		sse_r2ri(cmpss, regs, regd, 1)
-
-#define	cmpless_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 2)
-#define	cmpless_r2r(regs, regd)		sse_r2ri(cmpss, regs, regd, 2)
-
-#define	cmpunordss_m2r(var, reg)	sse_m2ri(cmpss, var, reg, 3)
-#define	cmpunordss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 3)
-
-#define	cmpneqss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 4)
-#define	cmpneqss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 4)
-
-#define	cmpnltss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 5)
-#define	cmpnltss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 5)
-
-#define	cmpnless_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 6)
-#define	cmpnless_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 6)
-
-#define	cmpordss_m2r(var, reg)		sse_m2ri(cmpss, var, reg, 7)
-#define	cmpordss_r2r(regs, regd)	sse_r2ri(cmpss, regs, regd, 7)
-
-#define	comiss_m2r(var, reg)		sse_m2r(comiss, var, reg)
-#define	comiss_r2r(regs, regd)		sse_r2r(comiss, regs, regd)
-
-#define	ucomiss_m2r(var, reg)		sse_m2r(ucomiss, var, reg)
-#define	ucomiss_r2r(regs, regd)		sse_r2r(ucomiss, regs, regd)
-
-#define	unpcklps_m2r(var, reg)		sse_m2r(unpcklps, var, reg)
-#define	unpcklps_r2r(regs, regd)	sse_r2r(unpcklps, regs, regd)
-
-#define	unpckhps_m2r(var, reg)		sse_m2r(unpckhps, var, reg)
-#define	unpckhps_r2r(regs, regd)	sse_r2r(unpckhps, regs, regd)
-
-#define	fxrstor(mem) \
-	__asm__ __volatile__ ("fxrstor %0" \
-			      : /* nothing */ \
-			      : "X" (mem))
-
-#define	fxsave(mem) \
-	__asm__ __volatile__ ("fxsave %0" \
-			      : /* nothing */ \
-			      : "X" (mem))
-
-#define	stmxcsr(mem) \
-	__asm__ __volatile__ ("stmxcsr %0" \
-			      : /* nothing */ \
-			      : "X" (mem))
-
-#define	ldmxcsr(mem) \
-	__asm__ __volatile__ ("ldmxcsr %0" \
-			      : /* nothing */ \
-			      : "X" (mem))
-
diff --git a/libmpeg2/stats.c b/libmpeg2/stats.c
deleted file mode 100644
index f3456058df..0000000000
--- a/libmpeg2/stats.c
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * stats.c
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
- *
- * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
- *
- * mpeg2dec is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * mpeg2dec is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-#include "config.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-
-#include "mpeg2_internal.h"
-
-static int debug_level = -1;
-
-/* Determine is debug output is required. */
-/* We could potentially have multiple levels of debug info */
-static int debug_is_on (void)
-{
-    char * env_var;
-	
-    if (debug_level < 0) {
-	env_var = getenv ("MPEG2_DEBUG");
-
-	if (env_var)
-	    debug_level = 1;
-	else
-	    debug_level = 0;
-    }
-	
-    return debug_level;
-}
-
-static void stats_picture (uint8_t * buffer)
-{
-    static char * picture_coding_type_str [8] = {
-	"Invalid picture type",
-	"I-type",
-	"P-type",
-	"B-type",
-	"D (very bad)",
-	"Invalid","Invalid","Invalid"
-    };
-
-    int picture_coding_type;
-    int temporal_reference;
-    int vbv_delay;
-
-    temporal_reference = (buffer[0] << 2) | (buffer[1] >> 6);
-    picture_coding_type = (buffer [1] >> 3) & 7;
-    vbv_delay = ((buffer[1] << 13) | (buffer[2] << 5) |
-		 (buffer[3] >> 3)) & 0xffff;
-
-    fprintf (stderr, " (picture) %s temporal_reference %d, vbv_delay %d\n",
-	     picture_coding_type_str [picture_coding_type],
-	     temporal_reference, vbv_delay);
-}
-
-static void stats_user_data (uint8_t * buffer)
-{
-    fprintf (stderr, " (user_data)\n");
-}
-
-static void stats_sequence (uint8_t * buffer)
-{
-    static char * aspect_ratio_information_str[8] = {
-	"Invalid Aspect Ratio",
-	"1:1",
-	"4:3",
-	"16:9",
-	"2.21:1",
-	"Invalid Aspect Ratio",
-	"Invalid Aspect Ratio",
-	"Invalid Aspect Ratio"
-    };
-    static char * frame_rate_str[16] = {
-	"Invalid frame_rate_code",
-	"23.976", "24", "25" , "29.97",
-	"30" , "50", "59.94", "60" ,
-	"Invalid frame_rate_code", "Invalid frame_rate_code",
-	"Invalid frame_rate_code", "Invalid frame_rate_code",
-	"Invalid frame_rate_code", "Invalid frame_rate_code",
-	"Invalid frame_rate_code"
-    };
-
-    int horizontal_size;
-    int vertical_size;
-    int aspect_ratio_information;
-    int frame_rate_code;
-    int bit_rate_value;
-    int vbv_buffer_size_value;
-    int constrained_parameters_flag;
-    int load_intra_quantizer_matrix;
-    int load_non_intra_quantizer_matrix;
-
-    vertical_size = (buffer[0] << 16) | (buffer[1] << 8) | buffer[2];
-    horizontal_size = vertical_size >> 12;
-    vertical_size &= 0xfff;
-    aspect_ratio_information = buffer[3] >> 4;
-    frame_rate_code = buffer[3] & 15;
-    bit_rate_value = (buffer[4] << 10) | (buffer[5] << 2) | (buffer[6] >> 6);
-    vbv_buffer_size_value = ((buffer[6] << 5) | (buffer[7] >> 3)) & 0x3ff;
-    constrained_parameters_flag = buffer[7] & 4;
-    load_intra_quantizer_matrix = buffer[7] & 2;
-    if (load_intra_quantizer_matrix)
-	buffer += 64;
-    load_non_intra_quantizer_matrix = buffer[7] & 1;
-
-    fprintf (stderr, " (seq) %dx%d %s, %s fps, %5.0f kbps, VBV %d kB%s%s%s\n",
-	     horizontal_size, vertical_size,
-	     aspect_ratio_information_str [aspect_ratio_information],
-	     frame_rate_str [frame_rate_code],
-	     bit_rate_value * 400.0 / 1000.0,
-	     2 * vbv_buffer_size_value,
-	     constrained_parameters_flag ? " , CP":"",
-	     load_intra_quantizer_matrix ? " , Custom Intra Matrix":"",
-	     load_non_intra_quantizer_matrix ? " , Custom Non-Intra Matrix":"");
-}
-
-static void stats_sequence_error (uint8_t * buffer)
-{
-    fprintf (stderr, " (sequence_error)\n");
-}
-
-static void stats_sequence_end (uint8_t * buffer)
-{
-    fprintf (stderr, " (sequence_end)\n");
-}
-
-static void stats_group (uint8_t * buffer)
-{
-    fprintf (stderr, " (group)%s%s\n",
-	     (buffer[4] & 0x40) ? " closed_gop" : "",
-	     (buffer[4] & 0x20) ? " broken_link" : "");
-}
-
-static void stats_slice (uint8_t code, uint8_t * buffer)
-{
-    /* fprintf (stderr, " (slice %d)\n", code); */
-}
-
-static void stats_sequence_extension (uint8_t * buffer)
-{
-    static char * chroma_format_str[4] = {
-	"Invalid Chroma Format",
-	"4:2:0 Chroma",
-	"4:2:2 Chroma",
-	"4:4:4 Chroma"
-    };
-
-    int progressive_sequence;
-    int chroma_format;
-
-    progressive_sequence = (buffer[1] >> 3) & 1;
-    chroma_format = (buffer[1] >> 1) & 3;
-
-    fprintf (stderr, " (seq_ext) progressive_sequence %d, %s\n",
-	     progressive_sequence, chroma_format_str [chroma_format]);
-}
-
-static void stats_sequence_display_extension (uint8_t * buffer)
-{
-    fprintf (stderr, " (sequence_display_extension)\n");
-}
-
-static void stats_quant_matrix_extension (uint8_t * buffer)
-{
-    fprintf (stderr, " (quant_matrix_extension)\n");
-}
-
-static void stats_copyright_extension (uint8_t * buffer)
-{
-    fprintf (stderr, " (copyright_extension)\n");
-}
-
-
-static void stats_sequence_scalable_extension (uint8_t * buffer)
-{
-    fprintf (stderr, " (sequence_scalable_extension)\n");
-}
-
-static void stats_picture_display_extension (uint8_t * buffer)
-{
-    fprintf (stderr, " (picture_display_extension)\n");
-}
-
-static void stats_picture_coding_extension (uint8_t * buffer)
-{
-    static char * picture_structure_str[4] = {
-	"Invalid Picture Structure",
-	"Top field",
-	"Bottom field",
-	"Frame Picture"
-    };
-
-    int f_code[2][2];
-    int intra_dc_precision;
-    int picture_structure;
-    int top_field_first;
-    int frame_pred_frame_dct;
-    int concealment_motion_vectors;
-    int q_scale_type;
-    int intra_vlc_format;
-    int alternate_scan;
-    int repeat_first_field;
-    int progressive_frame;
-
-    f_code[0][0] = buffer[0] & 15;
-    f_code[0][1] = buffer[1] >> 4;
-    f_code[1][0] = buffer[1] & 15;
-    f_code[1][1] = buffer[2] >> 4;
-    intra_dc_precision = (buffer[2] >> 2) & 3;
-    picture_structure = buffer[2] & 3;
-    top_field_first = buffer[3] >> 7;
-    frame_pred_frame_dct = (buffer[3] >> 6) & 1;
-    concealment_motion_vectors = (buffer[3] >> 5) & 1;
-    q_scale_type = (buffer[3] >> 4) & 1;
-    intra_vlc_format = (buffer[3] >> 3) & 1;
-    alternate_scan = (buffer[3] >> 2) & 1;
-    repeat_first_field = (buffer[3] >> 1) & 1;
-    progressive_frame = buffer[4] >> 7;
-
-    fprintf (stderr,
-	     " (pic_ext) %s\n", picture_structure_str [picture_structure]);
-    fprintf (stderr,
-	     " (pic_ext) forward horizontal f_code % d, forward vertical f_code % d\n",
-	     f_code[0][0], f_code[0][1]);
-    fprintf (stderr,
-	     " (pic_ext) backward horizontal f_code % d, backward vertical f_code % d\n", 
-	     f_code[1][0], f_code[1][1]);
-    fprintf (stderr,
-	     " (pic_ext) intra_dc_precision %d, top_field_first %d, frame_pred_frame_dct %d\n",
-	     intra_dc_precision, top_field_first, frame_pred_frame_dct);
-    fprintf (stderr,
-	     " (pic_ext) concealment_motion_vectors %d, q_scale_type %d, intra_vlc_format %d\n",
-	     concealment_motion_vectors, q_scale_type, intra_vlc_format);
-    fprintf (stderr,
-	     " (pic_ext) alternate_scan %d, repeat_first_field %d, progressive_frame %d\n",
-	     alternate_scan, repeat_first_field, progressive_frame);
-}
-
-void stats_header (uint8_t code, uint8_t * buffer)
-{
-    if (! (debug_is_on ()))
-	return;
-
-    switch (code) {
-    case 0x00:
-	stats_picture (buffer);
-	break;
-    case 0xb2:
-	stats_user_data (buffer);
-	break;
-    case 0xb3:
-	stats_sequence (buffer);
-	break;
-    case 0xb4:
-	stats_sequence_error (buffer);
-	break;
-    case 0xb5:
-	switch (buffer[0] >> 4) {
-	case 1:
-	    stats_sequence_extension (buffer);
-	    break;
-	case 2:
-	    stats_sequence_display_extension (buffer);
-	    break;
-	case 3:
-	    stats_quant_matrix_extension (buffer);
-	    break;
-	case 4:
-	    stats_copyright_extension (buffer);
-	    break;
-	case 5:
-	    stats_sequence_scalable_extension (buffer);
-	    break;
-	case 7:
-	    stats_picture_display_extension (buffer);
-	    break;
-	case 8:
-	    stats_picture_coding_extension (buffer);
-	    break;
-	default:
-	    fprintf (stderr, " (unknown extension %#x)\n", buffer[0] >> 4);
-	}
-	break;
-    case 0xb7:
-	stats_sequence_end (buffer);
-	break;
-    case 0xb8:
-	stats_group (buffer);
-	break;
-    default:
-	if (code < 0xb0)
-	    stats_slice (code, buffer);
-	else
-	    fprintf (stderr, " (unknown start code %#02x)\n", code);
-    }
-}
diff --git a/libmpeg2/vlc.h b/libmpeg2/vlc.h
index ed2e04f882..aa3dfe1841 100644
--- a/libmpeg2/vlc.h
+++ b/libmpeg2/vlc.h
@@ -1,8 +1,10 @@
 /*
  * vlc.h
- * Copyright (C) 1999-2001 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
  *
  * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
+ * See http://libmpeg2.sourceforge.net/ for updates.
  *
  * mpeg2dec is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -25,17 +27,18 @@ do {								\
     bit_ptr += 2;						\
 } while (0)
 
-static inline void bitstream_init (picture_t * picture, uint8_t * start)
+static inline void bitstream_init (decoder_t * decoder, const uint8_t * start)
 {
-    picture->bitstream_buf = 0;    GETWORD (picture->bitstream_buf, 16, start);
-    picture->bitstream_ptr = start;
-    picture->bitstream_bits = 0;
+    decoder->bitstream_buf =	/* preload 4 bytes, start[0] in the top byte; cast keeps << 24 out of signed int */
+	((uint32_t) start[0] << 24) | (start[1] << 16) | (start[2] << 8) | start[3];
+    decoder->bitstream_ptr = start + 4;	/* next byte to read follows the 4 preloaded ones */
+    decoder->bitstream_bits = -16;	/* NEEDBITS only refills once this counter rises above 0 */
 }
 
 /* make sure that there are at least 16 valid bits in bit_buf */
 #define NEEDBITS(bit_buf,bits,bit_ptr)		\
 do {						\
-    if (bits > 0) {				\
+    if (unlikely (bits > 0)) {			\
 	GETWORD (bit_buf, bits, bit_ptr);	\
 	bits -= 16;				\
     }						\
@@ -94,14 +97,14 @@ typedef struct {
 #define INTRA MACROBLOCK_INTRA
 #define QUANT MACROBLOCK_QUANT
 
-static MBtab MB_I [] = {
+static const MBtab MB_I [] = {
     {INTRA|QUANT, 2}, {INTRA, 1}
 };
 
 #define MC MACROBLOCK_MOTION_FORWARD
 #define CODED MACROBLOCK_PATTERN
 
-static MBtab MB_P [] = {
+static const MBtab MB_P [] = {
     {INTRA|QUANT, 6}, {CODED|QUANT, 5}, {MC|CODED|QUANT, 5}, {INTRA,    5},
     {MC,          3}, {MC,          3}, {MC,             3}, {MC,       3},
     {CODED,       2}, {CODED,       2}, {CODED,          2}, {CODED,    2},
@@ -116,7 +119,7 @@ static MBtab MB_P [] = {
 #define BWD MACROBLOCK_MOTION_BACKWARD
 #define INTER MACROBLOCK_MOTION_FORWARD|MACROBLOCK_MOTION_BACKWARD
 
-static MBtab MB_B [] = {
+static const MBtab MB_B [] = {
     {0,                 0}, {INTRA|QUANT,       6},
     {BWD|CODED|QUANT,   6}, {FWD|CODED|QUANT,   6},
     {INTER|CODED|QUANT, 5}, {INTER|CODED|QUANT, 5},
@@ -146,11 +149,11 @@ static MBtab MB_B [] = {
 #undef INTER
 
 
-static MVtab MV_4 [] = {
+static const MVtab MV_4 [] = {
     { 3, 6}, { 2, 4}, { 1, 3}, { 1, 3}, { 0, 2}, { 0, 2}, { 0, 2}, { 0, 2}
 };
 
-static MVtab MV_10 [] = {
+static const MVtab MV_10 [] = {
     { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10}, { 0,10},
     { 0,10}, { 0,10}, { 0,10}, { 0,10}, {15,10}, {14,10}, {13,10}, {12,10},
     {11,10}, {10,10}, { 9, 9}, { 9, 9}, { 8, 9}, { 8, 9}, { 7, 9}, { 7, 9},
@@ -160,12 +163,12 @@ static MVtab MV_10 [] = {
 };
 
 
-static DMVtab DMV_2 [] = {
+static const DMVtab DMV_2 [] = {
     { 0, 1}, { 0, 1}, { 1, 2}, {-1, 2}
 };
 
 
-static CBPtab CBP_7 [] = {
+static const CBPtab CBP_7 [] = {
     {0x22, 7}, {0x12, 7}, {0x0a, 7}, {0x06, 7},
     {0x21, 7}, {0x11, 7}, {0x09, 7}, {0x05, 7},
     {0x3f, 6}, {0x3f, 6}, {0x03, 6}, {0x03, 6},
@@ -196,7 +199,7 @@ static CBPtab CBP_7 [] = {
     {0x3c, 3}, {0x3c, 3}, {0x3c, 3}, {0x3c, 3}
 };
 
-static CBPtab CBP_9 [] = {
+static const CBPtab CBP_9 [] = {
     {0,    0}, {0x00, 9}, {0x27, 9}, {0x1b, 9},
     {0x3b, 9}, {0x37, 9}, {0x2f, 9}, {0x1f, 9},
     {0x3a, 8}, {0x3a, 8}, {0x36, 8}, {0x36, 8},
@@ -216,21 +219,21 @@ static CBPtab CBP_9 [] = {
 };
 
 
-static DCtab DC_lum_5 [] = {
+static const DCtab DC_lum_5 [] = {
     {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
     {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
     {0, 3}, {0, 3}, {0, 3}, {0, 3}, {3, 3}, {3, 3}, {3, 3}, {3, 3},
     {4, 3}, {4, 3}, {4, 3}, {4, 3}, {5, 4}, {5, 4}, {6, 5}
 };
 
-static DCtab DC_chrom_5 [] = {
+static const DCtab DC_chrom_5 [] = {
     {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2}, {0, 2},
     {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2}, {1, 2},
     {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2}, {2, 2},
     {3, 3}, {3, 3}, {3, 3}, {3, 3}, {4, 4}, {4, 4}, {5, 5}
 };
 
-static DCtab DC_long [] = {
+static const DCtab DC_long [] = {
     {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, { 6, 5}, { 6, 5},
     {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, {6, 5}, { 6, 5}, { 6, 5},
     {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, {7, 6}, { 7, 6}, { 7, 6},
@@ -238,7 +241,7 @@ static DCtab DC_long [] = {
 };
 
 
-static DCTtab DCT_16 [] = {
+static const DCTtab DCT_16 [] = {
     {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
     {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
     {129, 0, 0}, {129, 0, 0}, {129, 0, 0}, {129, 0, 0},
@@ -249,7 +252,7 @@ static DCTtab DCT_16 [] = {
     { 31, 1, 0}, { 30, 1, 0}, { 29, 1, 0}, { 28, 1, 0}
 };
 
-static DCTtab DCT_15 [] = {
+static const DCTtab DCT_15 [] = {
     {  1,40,15}, {  1,39,15}, {  1,38,15}, {  1,37,15},
     {  1,36,15}, {  1,35,15}, {  1,34,15}, {  1,33,15},
     {  1,32,15}, {  2,14,15}, {  2,13,15}, {  2,12,15},
@@ -264,7 +267,7 @@ static DCTtab DCT_15 [] = {
     {  1,17,14}, {  1,17,14}, {  1,16,14}, {  1,16,14}
 };
 
-static DCTtab DCT_13 [] = {
+static const DCTtab DCT_13 [] = {
     { 11, 2,13}, { 10, 2,13}, {  6, 3,13}, {  4, 4,13},
     {  3, 5,13}, {  2, 7,13}, {  2, 6,13}, {  1,15,13},
     {  1,14,13}, {  1,13,13}, {  1,12,13}, { 27, 1,13},
@@ -279,12 +282,12 @@ static DCTtab DCT_13 [] = {
     {  7, 2,12}, {  7, 2,12}, { 18, 1,12}, { 18, 1,12}
 };
 
-static DCTtab DCT_B14_10 [] = {
+static const DCTtab DCT_B14_10 [] = {
     { 17, 1,10}, {  6, 2,10}, {  1, 7,10}, {  3, 3,10},
     {  2, 4,10}, { 16, 1,10}, { 15, 1,10}, {  5, 2,10}
 };
 
-static DCTtab DCT_B14_8 [] = {
+static const DCTtab DCT_B14_8 [] = {
     { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6},
     {  3, 2, 7}, {  3, 2, 7}, { 10, 1, 7}, { 10, 1, 7},
     {  1, 4, 7}, {  1, 4, 7}, {  9, 1, 7}, {  9, 1, 7},
@@ -296,7 +299,7 @@ static DCTtab DCT_B14_8 [] = {
     {  4, 2, 8}, {  2, 3, 8}, {  1, 5, 8}, { 11, 1, 8}
 };
 
-static DCTtab DCT_B14AC_5 [] = {
+static const DCTtab DCT_B14AC_5 [] = {
 		 {  1, 3, 5}, {  5, 1, 5}, {  4, 1, 5},
     {  1, 2, 4}, {  1, 2, 4}, {  3, 1, 4}, {  3, 1, 4},
     {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
@@ -306,7 +309,7 @@ static DCTtab DCT_B14AC_5 [] = {
     {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}, {  1, 1, 2}
 };
 
-static DCTtab DCT_B14DC_5 [] = {
+static const DCTtab DCT_B14DC_5 [] = {
 		 {  1, 3, 5}, {  5, 1, 5}, {  4, 1, 5},
     {  1, 2, 4}, {  1, 2, 4}, {  3, 1, 4}, {  3, 1, 4},
     {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3}, {  2, 1, 3},
@@ -316,12 +319,12 @@ static DCTtab DCT_B14DC_5 [] = {
     {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}, {  1, 1, 1}
 };
 
-static DCTtab DCT_B15_10 [] = {
+static const DCTtab DCT_B15_10 [] = {
     {  6, 2, 9}, {  6, 2, 9}, { 15, 1, 9}, { 15, 1, 9},
     {  3, 4,10}, { 17, 1,10}, { 16, 1, 9}, { 16, 1, 9}
 };
 
-static DCTtab DCT_B15_8 [] = {
+static const DCTtab DCT_B15_8 [] = {
     { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6}, { 65, 0, 6},
     {  8, 1, 7}, {  8, 1, 7}, {  9, 1, 7}, {  9, 1, 7},
     {  7, 1, 7}, {  7, 1, 7}, {  3, 2, 7}, {  3, 2, 7},
@@ -388,14 +391,14 @@ static DCTtab DCT_B15_8 [] = {
 };
 
 
-static MBAtab MBA_5 [] = {
+static const MBAtab MBA_5 [] = {
 		    {6, 5}, {5, 5}, {4, 4}, {4, 4}, {3, 4}, {3, 4},
     {2, 3}, {2, 3}, {2, 3}, {2, 3}, {1, 3}, {1, 3}, {1, 3}, {1, 3},
     {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1},
     {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}
 };
 
-static MBAtab MBA_11 [] = {
+static const MBAtab MBA_11 [] = {
     {32, 11}, {31, 11}, {30, 11}, {29, 11},
     {28, 11}, {27, 11}, {26, 11}, {25, 11},
     {24, 11}, {23, 11}, {22, 11}, {21, 11},