-rw-r--r--  liba52/Makefile             |    3
-rw-r--r--  liba52/a52.h                |   72
-rw-r--r--  liba52/a52_internal.h       |   89
-rw-r--r--  liba52/bit_allocate.c       |   25
-rw-r--r--  liba52/bitstream.c          |   53
-rw-r--r--  liba52/bitstream.h          |   45
-rw-r--r--  liba52/downmix.c            |   24
-rw-r--r--  liba52/imdct.c              | 1016
-rw-r--r--  liba52/imdct_3dnow.h        |    2
-rw-r--r--  liba52/liba52_changes.diff  | 2028
-rw-r--r--  liba52/mm_accel.h           |    4
-rw-r--r--  liba52/parse.c              |  417
-rw-r--r--  liba52/tables.h             |   10
-rw-r--r--  libmpcodecs/ad_liba52.c     |   23
14 files changed, 1521 insertions(+), 2290 deletions(-)
diff --git a/liba52/Makefile b/liba52/Makefile
index 9ddd8e6284..9296b32284 100644
--- a/liba52/Makefile
+++ b/liba52/Makefile
@@ -9,12 +9,11 @@ SRCS = crc.c \
bitstream.c \
downmix.c \
imdct.c \
- imdct_mlib.c \
parse.c \
OBJS = $(SRCS:.c=.o)
-CFLAGS = $(MLIB_INC) $(OPTFLAGS) -I..
+CFLAGS = $(OPTFLAGS) -I..
.SUFFIXES: .c .o
diff --git a/liba52/a52.h b/liba52/a52.h
index f87abe9851..5c7a3d8656 100644
--- a/liba52/a52.h
+++ b/liba52/a52.h
@@ -1,6 +1,6 @@
/*
* a52.h
- * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
*
* This file is part of a52dec, a free ATSC A-52 stream decoder.
@@ -34,69 +34,7 @@ typedef float sample_t;
typedef double sample_t;
#endif
-typedef struct a52_ba_s {
- uint16_t fsnroffst; /* fine SNR offset */
- uint16_t fgaincod; /* fast gain */
- uint16_t deltbae; /* delta bit allocation exists */
- int8_t deltba[50]; /* per-band delta bit allocation */
-} a52_ba_t;
-
-typedef struct a52_state_s {
- uint8_t fscod; /* sample rate */
- uint8_t halfrate; /* halfrate factor */
- uint8_t acmod; /* coded channels */
- sample_t clev; /* centre channel mix level */
- sample_t slev; /* surround channels mix level */
- uint8_t lfeon; /* coded lfe channel */
-
- int output; /* type of output */
- sample_t level; /* output level */
- sample_t bias; /* output bias */
-
- int dynrnge; /* apply dynamic range */
- sample_t dynrng; /* dynamic range */
- void * dynrngdata; /* dynamic range callback funtion and data */
- sample_t (* dynrngcall) (sample_t range, void * dynrngdata);
-
- uint16_t cplinu; /* coupling in use */
- uint16_t chincpl[5]; /* channel coupled */
- uint16_t phsflginu; /* phase flags in use (stereo only) */
- uint16_t cplbndstrc[18]; /* coupling band structure */
- uint16_t cplstrtmant; /* coupling channel start mantissa */
- uint16_t cplendmant; /* coupling channel end mantissa */
- sample_t cplco[5][18]; /* coupling coordinates */
-
- /* derived information */
- uint16_t cplstrtbnd; /* coupling start band (for bit allocation) */
- uint16_t ncplbnd; /* number of coupling bands */
-
- uint16_t rematflg[4]; /* stereo rematrixing */
-
- uint16_t endmant[5]; /* channel end mantissa */
-
- uint8_t cpl_exp[256]; /* decoded coupling channel exponents */
- uint8_t fbw_exp[5][256]; /* decoded channel exponents */
- uint8_t lfe_exp[7]; /* decoded lfe channel exponents */
-
- uint16_t sdcycod; /* slow decay */
- uint16_t fdcycod; /* fast decay */
- uint16_t sgaincod; /* slow gain */
- uint16_t dbpbcod; /* dB per bit - encodes the dbknee value */
- uint16_t floorcod; /* masking floor */
-
- uint16_t csnroffst; /* coarse SNR offset */
- a52_ba_t cplba; /* coupling bit allocation parameters */
- a52_ba_t ba[5]; /* channel bit allocation parameters */
- a52_ba_t lfeba; /* lfe bit allocation parameters */
-
- uint16_t cplfleak; /* coupling fast leak init */
- uint16_t cplsleak; /* coupling slow leak init */
-
- /* derived bit allocation information */
- int8_t fbw_bap[5][256];
- int8_t cpl_bap[256];
- int8_t lfe_bap[7];
-} a52_state_t;
+typedef struct a52_state_s a52_state_t;
#define A52_CHANNEL 0
#define A52_MONO 1
@@ -114,14 +52,16 @@ typedef struct a52_state_s {
#define A52_LFE 16
#define A52_ADJUST_LEVEL 32
-sample_t * a52_init (uint32_t mm_accel);
+a52_state_t * a52_init (uint32_t mm_accel);
+sample_t * a52_samples (a52_state_t * state);
int a52_syncinfo (uint8_t * buf, int * flags,
int * sample_rate, int * bit_rate);
int a52_frame (a52_state_t * state, uint8_t * buf, int * flags,
sample_t * level, sample_t bias);
void a52_dynrng (a52_state_t * state,
sample_t (* call) (sample_t, void *), void * data);
-int a52_block (a52_state_t * state, sample_t * samples);
+int a52_block (a52_state_t * state);
+void a52_free (a52_state_t * state);
void* a52_resample_init(uint32_t mm_accel,int flags,int chans);
extern int (* a52_resample) (float * _f, int16_t * s16);
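
With this change the decoder state becomes opaque at the a52.h level: a52_init() now returns the a52_state_t, the output buffer is obtained through a52_samples(), a52_block() no longer takes a samples pointer, and a52_free() releases the state. A minimal caller against the new interface would look roughly like the sketch below (illustrative code, not part of the patch; level/bias are just the usual liba52 defaults):

    #include <inttypes.h>
    #include "a52.h"

    static int decode_one_frame (a52_state_t * state, uint8_t * buf)
    {
        /* 256 samples per block and output channel, buffer owned by state */
        sample_t * samples = a52_samples (state);
        int flags = A52_STEREO | A52_ADJUST_LEVEL;
        sample_t level = 1, bias = 384;
        int i;

        if (a52_frame (state, buf, &flags, &level, bias))
            return -1;                    /* parse error */
        for (i = 0; i < 6; i++) {         /* an A-52 frame carries 6 audio blocks */
            if (a52_block (state))
                return -1;
            /* consume samples[] here */
        }
        return 0;
    }

    /* setup / teardown:
     *     a52_state_t * state = a52_init (mm_accel);
     *     ...
     *     a52_free (state);
     */
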
diff --git a/liba52/a52_internal.h b/liba52/a52_internal.h
index 91fc54a300..d420803699 100644
--- a/liba52/a52_internal.h
+++ b/liba52/a52_internal.h
@@ -1,6 +1,6 @@
/*
* a52_internal.h
- * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
*
* This file is part of a52dec, a free ATSC A-52 stream decoder.
@@ -25,6 +25,72 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+typedef struct {
+ uint8_t bai; /* fine SNR offset, fast gain */
+ uint8_t deltbae; /* delta bit allocation exists */
+ int8_t deltba[50]; /* per-band delta bit allocation */
+} ba_t;
+
+typedef struct {
+ uint8_t exp[256]; /* decoded channel exponents */
+ int8_t bap[256]; /* derived channel bit allocation */
+} expbap_t;
+
+struct a52_state_s {
+ uint8_t fscod; /* sample rate */
+ uint8_t halfrate; /* halfrate factor */
+ uint8_t acmod; /* coded channels */
+ uint8_t lfeon; /* coded lfe channel */
+ sample_t clev; /* centre channel mix level */
+ sample_t slev; /* surround channels mix level */
+
+ int output; /* type of output */
+ sample_t level; /* output level */
+ sample_t bias; /* output bias */
+
+ int dynrnge; /* apply dynamic range */
+ sample_t dynrng; /* dynamic range */
+ void * dynrngdata; /* dynamic range callback funtion and data */
+ sample_t (* dynrngcall) (sample_t range, void * dynrngdata);
+
+ uint8_t chincpl; /* channel coupled */
+ uint8_t phsflginu; /* phase flags in use (stereo only) */
+ uint8_t cplstrtmant; /* coupling channel start mantissa */
+ uint8_t cplendmant; /* coupling channel end mantissa */
+ uint32_t cplbndstrc; /* coupling band structure */
+ sample_t cplco[5][18]; /* coupling coordinates */
+
+ /* derived information */
+ uint8_t cplstrtbnd; /* coupling start band (for bit allocation) */
+ uint8_t ncplbnd; /* number of coupling bands */
+
+ uint8_t rematflg; /* stereo rematrixing */
+
+ uint8_t endmant[5]; /* channel end mantissa */
+
+ uint16_t bai; /* bit allocation information */
+
+ uint32_t * buffer_start;
+ uint16_t lfsr_state; /* dither state */
+ uint32_t bits_left;
+ uint32_t current_word;
+
+ uint8_t csnroffst; /* coarse SNR offset */
+ ba_t cplba; /* coupling bit allocation parameters */
+ ba_t ba[5]; /* channel bit allocation parameters */
+ ba_t lfeba; /* lfe bit allocation parameters */
+
+ uint8_t cplfleak; /* coupling fast leak init */
+ uint8_t cplsleak; /* coupling slow leak init */
+
+ expbap_t cpl_expbap;
+ expbap_t fbw_expbap[5];
+ expbap_t lfe_expbap;
+
+ sample_t * samples;
+ int downmixed;
+};
+
#define LEVEL_PLUS6DB 2.0
#define LEVEL_PLUS3DB 1.4142135623730951
#define LEVEL_3DB 0.7071067811865476
@@ -55,21 +121,20 @@
# define REG_BP "ebp"
#endif
-void bit_allocate (a52_state_t * state, a52_ba_t * ba, int bndstart,
+void a52_bit_allocate (a52_state_t * state, ba_t * ba, int bndstart,
int start, int end, int fastleak, int slowleak,
- uint8_t * exp, int8_t * bap);
+ expbap_t * expbap);
-int downmix_init (int input, int flags, sample_t * level,
+int a52_downmix_init (int input, int flags, sample_t * level,
sample_t clev, sample_t slev);
void downmix_accel_init(uint32_t mm_accel);
-int downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
+int a52_downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
sample_t clev, sample_t slev);
-extern void (*downmix) (sample_t * samples, int acmod, int output, sample_t bias,
+extern void (*a52_downmix) (sample_t * samples, int acmod, int output, sample_t bias,
sample_t clev, sample_t slev);
-extern void (*upmix) (sample_t * samples, int acmod, int output);
+extern void (*a52_upmix) (sample_t * samples, int acmod, int output);
-void imdct_init (uint32_t mm_accel);
-extern void (* imdct_256) (sample_t * data, sample_t * delay, sample_t bias);
-extern void (* imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
-void imdct_do_256_mlib (sample_t * data, sample_t * delay, sample_t bias);
-void imdct_do_512_mlib (sample_t * data, sample_t * delay, sample_t bias);
+void a52_imdct_init (uint32_t mm_accel);
+void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias);
+extern void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
+void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias);
diff --git a/liba52/bit_allocate.c b/liba52/bit_allocate.c
index d0dd602156..a5f3b77024 100644
--- a/liba52/bit_allocate.c
+++ b/liba52/bit_allocate.c
@@ -1,6 +1,6 @@
/*
* bit_allocate.c
- * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
*
* This file is part of a52dec, a free ATSC A-52 stream decoder.
@@ -121,9 +121,9 @@ do { \
mask -= floor; \
} while (0)
-void bit_allocate (a52_state_t * state, a52_ba_t * ba, int bndstart,
+void a52_bit_allocate (a52_state_t * state, ba_t * ba, int bndstart,
int start, int end, int fastleak, int slowleak,
- uint8_t * exp, int8_t * bap)
+ expbap_t * expbap)
{
static int slowgain[4] = {0x540, 0x4d8, 0x478, 0x410};
static int dbpbtab[4] = {0xc00, 0x500, 0x300, 0x100};
@@ -131,6 +131,8 @@ void bit_allocate (a52_state_t * state, a52_ba_t * ba, int bndstart,
0xa10, 0xa90, 0xb10, 0x1400};
int i, j;
+ uint8_t * exp;
+ int8_t * bap;
int fdecay, fgain, sdecay, sgain, dbknee, floor, snroffset;
int psd, mask;
int8_t * deltba;
@@ -138,21 +140,24 @@ void bit_allocate (a52_state_t * state, a52_ba_t * ba, int bndstart,
int halfrate;
halfrate = state->halfrate;
- fdecay = (63 + 20 * state->fdcycod) >> halfrate;
- fgain = 128 + 128 * ba->fgaincod;
- sdecay = (15 + 2 * state->sdcycod) >> halfrate;
- sgain = slowgain[state->sgaincod];
- dbknee = dbpbtab[state->dbpbcod];
+ fdecay = (63 + 20 * ((state->bai >> 7) & 3)) >> halfrate; /* fdcycod */
+ fgain = 128 + 128 * (ba->bai & 7); /* fgaincod */
+ sdecay = (15 + 2 * (state->bai >> 9)) >> halfrate; /* sdcycod */
+ sgain = slowgain[(state->bai >> 5) & 3]; /* sgaincod */
+ dbknee = dbpbtab[(state->bai >> 3) & 3]; /* dbpbcod */
hth = hthtab[state->fscod];
/*
* if there is no delta bit allocation, make deltba point to an area
* known to contain zeroes. baptab+156 here.
*/
deltba = (ba->deltbae == DELTA_BIT_NONE) ? baptab + 156 : ba->deltba;
- floor = floortab[state->floorcod];
- snroffset = 960 - 64 * state->csnroffst - 4 * ba->fsnroffst + floor;
+ floor = floortab[state->bai & 7]; /* floorcod */
+ snroffset = 960 - 64 * state->csnroffst - 4 * (ba->bai >> 3) + floor;
floor >>= 5;
+ exp = expbap->exp;
+ bap = expbap->bap;
+
i = bndstart;
j = start;
if (start == 0) { /* not the coupling channel */
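
The former fdcycod/sdcycod/sgaincod/dbpbcod/floorcod fields are now carried in the packed state->bai word, and fgaincod/fsnroffst in the per-channel ba->bai byte, so a52_bit_allocate() unpacks them with the shifts above. The layout those shifts imply is summarised in this hypothetical packing helper (the parser is expected to build the words this way when it reads the bit allocation info; the helper itself is not part of the patch):

    /* state->bai: bits 0-2 floorcod, 3-4 dbpbcod, 5-6 sgaincod,
     *             7-8 fdcycod, 9-10 sdcycod
     * ba->bai:    bits 0-2 fgaincod, 3-6 fsnroffst (4 bits in the bitstream) */
    static uint16_t pack_state_bai (int sdcycod, int fdcycod, int sgaincod,
                                    int dbpbcod, int floorcod)
    {
        return (uint16_t) ((sdcycod << 9) | (fdcycod << 7) | (sgaincod << 5) |
                           (dbpbcod << 3) | floorcod);
    }

    static uint8_t pack_ba_bai (int fsnroffst, int fgaincod)
    {
        return (uint8_t) ((fsnroffst << 3) | fgaincod);
    }
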
diff --git a/liba52/bitstream.c b/liba52/bitstream.c
index 3a25001c57..6c275109d1 100644
--- a/liba52/bitstream.c
+++ b/liba52/bitstream.c
@@ -1,6 +1,6 @@
/*
* bitstream.c
- * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
*
* This file is part of a52dec, a free ATSC A-52 stream decoder.
@@ -37,34 +37,27 @@
#ifdef ALT_BITSTREAM_READER
int indx=0;
-uint32_t * buffer_start;
-#else
-static uint32_t * buffer_start;
#endif
-uint32_t bits_left;
-uint32_t current_word;
-
-void bitstream_set_ptr (uint8_t * buf)
+void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf)
{
int align;
- align = (int)buf & 3;
- buffer_start = (uint32_t *) (buf - align);
- bits_left = 0;
+ align = (long)buf & 3;
+ state->buffer_start = (uint32_t *) (buf - align);
+ state->bits_left = 0;
#ifdef ALT_BITSTREAM_READER
indx=0;
#endif
- bitstream_get (align * 8);
+ bitstream_get (state, align * 8);
}
-static inline void
-bitstream_fill_current(void)
+static inline void bitstream_fill_current (a52_state_t * state)
{
uint32_t tmp;
- tmp = *(buffer_start++);
- current_word = swab32 (tmp);
+ tmp = *(state->buffer_start++);
+ state->current_word = swab32 (tmp);
}
/*
@@ -76,38 +69,38 @@ bitstream_fill_current(void)
* -ah
*/
-uint32_t
-bitstream_get_bh(uint32_t num_bits)
+uint32_t a52_bitstream_get_bh (a52_state_t * state, uint32_t num_bits)
{
uint32_t result;
- num_bits -= bits_left;
- result = (current_word << (32 - bits_left)) >> (32 - bits_left);
+ num_bits -= state->bits_left;
+ result = ((state->current_word << (32 - state->bits_left)) >>
+ (32 - state->bits_left));
- bitstream_fill_current();
+ bitstream_fill_current (state);
if(num_bits != 0)
- result = (result << num_bits) | (current_word >> (32 - num_bits));
+ result = (result << num_bits) | (state->current_word >> (32 - num_bits));
- bits_left = 32 - num_bits;
+ state->bits_left = 32 - num_bits;
return result;
}
-int32_t
-bitstream_get_bh_2(uint32_t num_bits)
+int32_t a52_bitstream_get_bh_2 (a52_state_t * state, uint32_t num_bits)
{
int32_t result;
- num_bits -= bits_left;
- result = (((int32_t)current_word) << (32 - bits_left)) >> (32 - bits_left);
+ num_bits -= state->bits_left;
+ result = ((((int32_t)state->current_word) << (32 - state->bits_left)) >>
+ (32 - state->bits_left));
- bitstream_fill_current();
+ bitstream_fill_current(state);
if(num_bits != 0)
- result = (result << num_bits) | (current_word >> (32 - num_bits));
+ result = (result << num_bits) | (state->current_word >> (32 - num_bits));
- bits_left = 32 - num_bits;
+ state->bits_left = 32 - num_bits;
return result;
}
diff --git a/liba52/bitstream.h b/liba52/bitstream.h
index 7e4ff676c8..8576f8b282 100644
--- a/liba52/bitstream.h
+++ b/liba52/bitstream.h
@@ -1,6 +1,6 @@
/*
* bitstream.h
- * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
*
* This file is part of a52dec, a free ATSC A-52 stream decoder.
@@ -77,7 +77,7 @@ static inline uint32_t unaligned32(const void *v) {
# if defined (__i386__)
# define swab32(x) __i386_swab32(x)
- static always_inline const uint32_t __i386_swab32(uint32_t x)
+ static inline const uint32_t __i386_swab32(uint32_t x)
{
__asm__("bswap %0" : "=r" (x) : "0" (x));
return x;
@@ -95,23 +95,17 @@ static inline uint32_t unaligned32(const void *v) {
#endif
#ifdef ALT_BITSTREAM_READER
-extern uint32_t *buffer_start;
extern int indx;
-#else
-extern uint32_t bits_left;
-extern uint32_t current_word;
#endif
-void bitstream_set_ptr (uint8_t * buf);
-uint32_t bitstream_get_bh(uint32_t num_bits);
-int32_t bitstream_get_bh_2(uint32_t num_bits);
-
+void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf);
+uint32_t a52_bitstream_get_bh (a52_state_t * state, uint32_t num_bits);
+int32_t a52_bitstream_get_bh_2 (a52_state_t * state, uint32_t num_bits);
-static inline uint32_t
-bitstream_get(uint32_t num_bits) // note num_bits is practically a constant due to inlineing
+static inline uint32_t bitstream_get (a52_state_t * state, uint32_t num_bits)
{
#ifdef ALT_BITSTREAM_READER
- uint32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) );
+ uint32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) );
result<<= (indx&0x07);
result>>= 32 - num_bits;
@@ -121,30 +115,29 @@ bitstream_get(uint32_t num_bits) // note num_bits is practically a constant due
#else
uint32_t result;
- if(num_bits < bits_left) {
- result = (current_word << (32 - bits_left)) >> (32 - num_bits);
- bits_left -= num_bits;
+ if (num_bits < state->bits_left) {
+ result = (state->current_word << (32 - state->bits_left)) >> (32 - num_bits);
+ state->bits_left -= num_bits;
return result;
}
- return bitstream_get_bh(num_bits);
+ return a52_bitstream_get_bh (state, num_bits);
#endif
}
-static inline void bitstream_skip(int num_bits)
+static inline void bitstream_skip(a52_state_t * state, int num_bits)
{
#ifdef ALT_BITSTREAM_READER
indx+= num_bits;
#else
- bitstream_get(num_bits);
+ bitstream_get(state, num_bits);
#endif
}
-static inline int32_t
-bitstream_get_2(uint32_t num_bits)
+static inline int32_t bitstream_get_2 (a52_state_t * state, uint32_t num_bits)
{
#ifdef ALT_BITSTREAM_READER
- int32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) );
+ int32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) );
result<<= (indx&0x07);
result>>= 32 - num_bits;
@@ -154,12 +147,12 @@ bitstream_get_2(uint32_t num_bits)
#else
int32_t result;
- if(num_bits < bits_left) {
- result = (((int32_t)current_word) << (32 - bits_left)) >> (32 - num_bits);
- bits_left -= num_bits;
+ if (num_bits < state->bits_left) {
+ result = (((int32_t)state->current_word) << (32 - state->bits_left)) >> (32 - num_bits);
+ state->bits_left -= num_bits;
return result;
}
- return bitstream_get_bh_2(num_bits);
+ return a52_bitstream_get_bh_2 (state, num_bits);
#endif
}
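
The reader no longer keeps buffer_start, current_word and bits_left in globals; they live in the a52_state_t and every accessor takes the state explicitly. A call site that used to read a field with bitstream_get(3) now does it as in this fragment (illustrative only, not taken from parse.c):

    /* before: acmod = bitstream_get (3);
     * after:  the state carries the read position */
    static int read_acmod (a52_state_t * state)
    {
        return bitstream_get (state, 3);   /* e.g. the 3-bit acmod field */
    }
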
diff --git a/liba52/downmix.c b/liba52/downmix.c
index 67eee7a89e..5c61cee475 100644
--- a/liba52/downmix.c
+++ b/liba52/downmix.c
@@ -1,6 +1,6 @@
/*
* downmix.c
- * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
*
* This file is part of a52dec, a free ATSC A-52 stream decoder.
@@ -40,9 +40,9 @@
#define CONVERT(acmod,output) (((output) << 3) + (acmod))
-void (*downmix)(sample_t * samples, int acmod, int output, sample_t bias,
+void (*a52_downmix)(sample_t * samples, int acmod, int output, sample_t bias,
sample_t clev, sample_t slev)= NULL;
-void (*upmix)(sample_t * samples, int acmod, int output)= NULL;
+void (*a52_upmix)(sample_t * samples, int acmod, int output)= NULL;
static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
sample_t clev, sample_t slev);
@@ -55,16 +55,16 @@ static void upmix_C (sample_t * samples, int acmod, int output);
void downmix_accel_init(uint32_t mm_accel)
{
- upmix= upmix_C;
- downmix= downmix_C;
+ a52_upmix= upmix_C;
+ a52_downmix= downmix_C;
#if defined(ARCH_X86) || defined(ARCH_X86_64)
- if(mm_accel & MM_ACCEL_X86_MMX) upmix= upmix_MMX;
- if(mm_accel & MM_ACCEL_X86_SSE) downmix= downmix_SSE;
- if(mm_accel & MM_ACCEL_X86_3DNOW) downmix= downmix_3dnow;
+ if(mm_accel & MM_ACCEL_X86_MMX) a52_upmix= upmix_MMX;
+ if(mm_accel & MM_ACCEL_X86_SSE) a52_downmix= downmix_SSE;
+ if(mm_accel & MM_ACCEL_X86_3DNOW) a52_downmix= downmix_3dnow;
#endif
}
-int downmix_init (int input, int flags, sample_t * level,
+int a52_downmix_init (int input, int flags, sample_t * level,
sample_t clev, sample_t slev)
{
static uint8_t table[11][8] = {
@@ -183,7 +183,7 @@ int downmix_init (int input, int flags, sample_t * level,
return output;
}
-int downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
+int a52_downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
sample_t clev, sample_t slev)
{
switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
@@ -478,7 +478,7 @@ static void zero (sample_t * samples)
samples[i] = 0;
}
-static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
+void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
sample_t clev, sample_t slev)
{
switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
@@ -619,7 +619,7 @@ static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
}
}
-static void upmix_C (sample_t * samples, int acmod, int output)
+void upmix_C (sample_t * samples, int acmod, int output)
{
switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
diff --git a/liba52/imdct.c b/liba52/imdct.c
index a535823584..4909fc5ce9 100644
--- a/liba52/imdct.c
+++ b/liba52/imdct.c
@@ -1,8 +1,11 @@
/*
* imdct.c
- * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
*
+ * The ifft algorithms in this file have been largely inspired by Dan
+ * Bernstein's work, djbfft, available at http://cr.yp.to/djbfft.html
+ *
* This file is part of a52dec, a free ATSC A-52 stream decoder.
* See http://liba52.sourceforge.net/ for updates.
*
@@ -35,6 +38,9 @@
#include <math.h>
#include <stdio.h>
+#ifdef LIBA52_DJBFFT
+#include <fftc4.h>
+#endif
#ifndef M_PI
#define M_PI 3.1415926535897932384626433832795029
#endif
@@ -45,22 +51,17 @@
#include "mm_accel.h"
#include "mangle.h"
+void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
+
#ifdef RUNTIME_CPUDETECT
#undef HAVE_3DNOWEX
#endif
-#define USE_AC3_C
-
-void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
-void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
-
typedef struct complex_s {
sample_t real;
sample_t imag;
} complex_t;
-static void fft_128p(complex_t *a);
-
static const int pm128[128] attribute_used __attribute__((aligned(16))) =
{
0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
@@ -73,7 +74,6 @@ static const int pm128[128] attribute_used __attribute__((aligned(16))) =
7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
};
-/* 128 point bit-reverse LUT */
static uint8_t attribute_used bit_reverse_512[] = {
0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
@@ -92,20 +92,36 @@ static uint8_t attribute_used bit_reverse_512[] = {
0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
-static uint8_t bit_reverse_256[] = {
- 0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38,
- 0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c,
- 0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a,
- 0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e,
- 0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
- 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
- 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
- 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
+static uint8_t fftorder[] = {
+ 0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176,
+ 8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88,
+ 4,132, 68,196, 36,164,228,100, 20,148, 84,212,244,116, 52,180,
+ 252,124, 60,188, 28,156,220, 92, 12,140, 76,204,236,108, 44,172,
+ 2,130, 66,194, 34,162,226, 98, 18,146, 82,210,242,114, 50,178,
+ 10,138, 74,202, 42,170,234,106,250,122, 58,186, 26,154,218, 90,
+ 254,126, 62,190, 30,158,222, 94, 14,142, 78,206,238,110, 46,174,
+ 6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86
+};
+
+static complex_t __attribute__((aligned(16))) buf[128];
+
+/* Twiddle factor LUT */
+static complex_t __attribute__((aligned(16))) w_1[1];
+static complex_t __attribute__((aligned(16))) w_2[2];
+static complex_t __attribute__((aligned(16))) w_4[4];
+static complex_t __attribute__((aligned(16))) w_8[8];
+static complex_t __attribute__((aligned(16))) w_16[16];
+static complex_t __attribute__((aligned(16))) w_32[32];
+static complex_t __attribute__((aligned(16))) w_64[64];
+static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
+
+/* Twiddle factors for IMDCT */
+static sample_t __attribute__((aligned(16))) xcos1[128];
+static sample_t __attribute__((aligned(16))) xsin1[128];
#if defined(ARCH_X86) || defined(ARCH_X86_64)
// NOTE: SSE needs 16byte alignment or it will segfault
//
-static complex_t __attribute__((aligned(16))) buf[128];
static float __attribute__((aligned(16))) sseSinCos1c[256];
static float __attribute__((aligned(16))) sseSinCos1d[256];
static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
@@ -119,274 +135,234 @@ static float __attribute__((aligned(16))) sseW6[256];
static float __attribute__((aligned(16))) *sseW[7]=
{NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
static float __attribute__((aligned(16))) sseWindow[512];
-#else
-static complex_t __attribute__((aligned(16))) buf[128];
#endif
-/* Twiddle factor LUT */
-static complex_t __attribute__((aligned(16))) w_1[1];
-static complex_t __attribute__((aligned(16))) w_2[2];
-static complex_t __attribute__((aligned(16))) w_4[4];
-static complex_t __attribute__((aligned(16))) w_8[8];
-static complex_t __attribute__((aligned(16))) w_16[16];
-static complex_t __attribute__((aligned(16))) w_32[32];
-static complex_t __attribute__((aligned(16))) w_64[64];
-static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
+/* Root values for IFFT */
+static sample_t roots16[3];
+static sample_t roots32[7];
+static sample_t roots64[15];
+static sample_t roots128[31];
/* Twiddle factors for IMDCT */
-static sample_t __attribute__((aligned(16))) xcos1[128];
-static sample_t __attribute__((aligned(16))) xsin1[128];
-static sample_t __attribute__((aligned(16))) xcos2[64];
-static sample_t __attribute__((aligned(16))) xsin2[64];
-
-/* Windowing function for Modified DCT - Thank you acroread */
-sample_t imdct_window[] = {
- 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
- 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
- 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
- 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
- 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
- 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
- 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
- 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
- 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
- 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
- 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
- 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
- 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
- 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
- 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
- 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
- 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
- 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
- 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
- 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
- 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
- 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
- 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
- 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
- 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
- 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
- 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
- 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
- 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
- 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
- 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
- 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
-
-
-static inline void swap_cmplx(complex_t *a, complex_t *b)
-{
- complex_t tmp;
+static complex_t pre1[128];
+static complex_t post1[64];
+static complex_t pre2[64];
+static complex_t post2[32];
- tmp = *a;
- *a = *b;
- *b = tmp;
-}
+static sample_t a52_imdct_window[256];
+static void (* ifft128) (complex_t * buf);
+static void (* ifft64) (complex_t * buf);
+static inline void ifft2 (complex_t * buf)
+{
+ double r, i;
+
+ r = buf[0].real;
+ i = buf[0].imag;
+ buf[0].real += buf[1].real;
+ buf[0].imag += buf[1].imag;
+ buf[1].real = r - buf[1].real;
+ buf[1].imag = i - buf[1].imag;
+}
-static inline complex_t cmplx_mult(complex_t a, complex_t b)
+static inline void ifft4 (complex_t * buf)
{
- complex_t ret;
+ double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+
+ tmp1 = buf[0].real + buf[1].real;
+ tmp2 = buf[3].real + buf[2].real;
+ tmp3 = buf[0].imag + buf[1].imag;
+ tmp4 = buf[2].imag + buf[3].imag;
+ tmp5 = buf[0].real - buf[1].real;
+ tmp6 = buf[0].imag - buf[1].imag;
+ tmp7 = buf[2].imag - buf[3].imag;
+ tmp8 = buf[3].real - buf[2].real;
+
+ buf[0].real = tmp1 + tmp2;
+ buf[0].imag = tmp3 + tmp4;
+ buf[2].real = tmp1 - tmp2;
+ buf[2].imag = tmp3 - tmp4;
+ buf[1].real = tmp5 + tmp7;
+ buf[1].imag = tmp6 + tmp8;
+ buf[3].real = tmp5 - tmp7;
+ buf[3].imag = tmp6 - tmp8;
+}
- ret.real = a.real * b.real - a.imag * b.imag;
- ret.imag = a.real * b.imag + a.imag * b.real;
+/* the basic split-radix ifft butterfly */
+
+#define BUTTERFLY(a0,a1,a2,a3,wr,wi) do { \
+ tmp5 = a2.real * wr + a2.imag * wi; \
+ tmp6 = a2.imag * wr - a2.real * wi; \
+ tmp7 = a3.real * wr - a3.imag * wi; \
+ tmp8 = a3.imag * wr + a3.real * wi; \
+ tmp1 = tmp5 + tmp7; \
+ tmp2 = tmp6 + tmp8; \
+ tmp3 = tmp6 - tmp8; \
+ tmp4 = tmp7 - tmp5; \
+ a2.real = a0.real - tmp1; \
+ a2.imag = a0.imag - tmp2; \
+ a3.real = a1.real - tmp3; \
+ a3.imag = a1.imag - tmp4; \
+ a0.real += tmp1; \
+ a0.imag += tmp2; \
+ a1.real += tmp3; \
+ a1.imag += tmp4; \
+} while (0)
+
+/* split-radix ifft butterfly, specialized for wr=1 wi=0 */
+
+#define BUTTERFLY_ZERO(a0,a1,a2,a3) do { \
+ tmp1 = a2.real + a3.real; \
+ tmp2 = a2.imag + a3.imag; \
+ tmp3 = a2.imag - a3.imag; \
+ tmp4 = a3.real - a2.real; \
+ a2.real = a0.real - tmp1; \
+ a2.imag = a0.imag - tmp2; \
+ a3.real = a1.real - tmp3; \
+ a3.imag = a1.imag - tmp4; \
+ a0.real += tmp1; \
+ a0.imag += tmp2; \
+ a1.real += tmp3; \
+ a1.imag += tmp4; \
+} while (0)
+
+/* split-radix ifft butterfly, specialized for wr=wi */
+
+#define BUTTERFLY_HALF(a0,a1,a2,a3,w) do { \
+ tmp5 = (a2.real + a2.imag) * w; \
+ tmp6 = (a2.imag - a2.real) * w; \
+ tmp7 = (a3.real - a3.imag) * w; \
+ tmp8 = (a3.imag + a3.real) * w; \
+ tmp1 = tmp5 + tmp7; \
+ tmp2 = tmp6 + tmp8; \
+ tmp3 = tmp6 - tmp8; \
+ tmp4 = tmp7 - tmp5; \
+ a2.real = a0.real - tmp1; \
+ a2.imag = a0.imag - tmp2; \
+ a3.real = a1.real - tmp3; \
+ a3.imag = a1.imag - tmp4; \
+ a0.real += tmp1; \
+ a0.imag += tmp2; \
+ a1.real += tmp3; \
+ a1.imag += tmp4; \
+} while (0)
+
+static inline void ifft8 (complex_t * buf)
+{
+ double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
- return ret;
+ ifft4 (buf);
+ ifft2 (buf + 4);
+ ifft2 (buf + 6);
+ BUTTERFLY_ZERO (buf[0], buf[2], buf[4], buf[6]);
+ BUTTERFLY_HALF (buf[1], buf[3], buf[5], buf[7], roots16[1]);
}
-void
-imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
+static void ifft_pass (complex_t * buf, sample_t * weight, int n)
{
+ complex_t * buf1;
+ complex_t * buf2;
+ complex_t * buf3;
+ double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
int i;
-#ifndef USE_AC3_C
- int k;
- int p,q;
- int m;
- int two_m;
- int two_m_plus_one;
- sample_t tmp_b_i;
- sample_t tmp_b_r;
-#endif
- sample_t tmp_a_i;
- sample_t tmp_a_r;
+ buf++;
+ buf1 = buf + n;
+ buf2 = buf + 2 * n;
+ buf3 = buf + 3 * n;
- sample_t *data_ptr;
- sample_t *delay_ptr;
- sample_t *window_ptr;
-
- /* 512 IMDCT with source and dest data in 'data' */
-
- /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
- for( i=0; i < 128; i++) {
- /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
-#ifdef USE_AC3_C
- int j= pm128[i];
-#else
- int j= bit_reverse_512[i];
-#endif
- buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
- buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
- }
+ BUTTERFLY_ZERO (buf[-1], buf1[-1], buf2[-1], buf3[-1]);
- /* FFT Merge */
-/* unoptimized variant
- for (m=1; m < 7; m++) {
- if(m)
- two_m = (1 << m);
- else
- two_m = 1;
+ i = n - 1;
- two_m_plus_one = (1 << (m+1));
+ do {
+ BUTTERFLY (buf[0], buf1[0], buf2[0], buf3[0], weight[n], weight[2*i]);
+ buf++;
+ buf1++;
+ buf2++;
+ buf3++;
+ weight++;
+ } while (--i);
+}
- for(i = 0; i < 128; i += two_m_plus_one) {
- for(k = 0; k < two_m; k++) {
- p = k + i;
- q = p + two_m;
- tmp_a_r = buf[p].real;
- tmp_a_i = buf[p].imag;
- tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
- tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
- buf[p].real = tmp_a_r + tmp_b_r;
- buf[p].imag = tmp_a_i + tmp_b_i;
- buf[q].real = tmp_a_r - tmp_b_r;
- buf[q].imag = tmp_a_i - tmp_b_i;
- }
- }
+static void ifft16 (complex_t * buf)
+{
+ ifft8 (buf);
+ ifft4 (buf + 8);
+ ifft4 (buf + 12);
+ ifft_pass (buf, roots16 - 4, 4);
}
-*/
-#ifdef USE_AC3_C
- fft_128p (&buf[0]);
-#else
- /* 1. iteration */
- for(i = 0; i < 128; i += 2) {
- tmp_a_r = buf[i].real;
- tmp_a_i = buf[i].imag;
- tmp_b_r = buf[i+1].real;
- tmp_b_i = buf[i+1].imag;
- buf[i].real = tmp_a_r + tmp_b_r;
- buf[i].imag = tmp_a_i + tmp_b_i;
- buf[i+1].real = tmp_a_r - tmp_b_r;
- buf[i+1].imag = tmp_a_i - tmp_b_i;
- }
-
- /* 2. iteration */
- // Note w[1]={{1,0}, {0,-1}}
- for(i = 0; i < 128; i += 4) {
- tmp_a_r = buf[i].real;
- tmp_a_i = buf[i].imag;
- tmp_b_r = buf[i+2].real;
- tmp_b_i = buf[i+2].imag;
- buf[i].real = tmp_a_r + tmp_b_r;
- buf[i].imag = tmp_a_i + tmp_b_i;
- buf[i+2].real = tmp_a_r - tmp_b_r;
- buf[i+2].imag = tmp_a_i - tmp_b_i;
- tmp_a_r = buf[i+1].real;
- tmp_a_i = buf[i+1].imag;
- tmp_b_r = buf[i+3].imag;
- tmp_b_i = buf[i+3].real;
- buf[i+1].real = tmp_a_r + tmp_b_r;
- buf[i+1].imag = tmp_a_i - tmp_b_i;
- buf[i+3].real = tmp_a_r - tmp_b_r;
- buf[i+3].imag = tmp_a_i + tmp_b_i;
+static void ifft32 (complex_t * buf)
+{
+ ifft16 (buf);
+ ifft8 (buf + 16);
+ ifft8 (buf + 24);
+ ifft_pass (buf, roots32 - 8, 8);
}
- /* 3. iteration */
- for(i = 0; i < 128; i += 8) {
- tmp_a_r = buf[i].real;
- tmp_a_i = buf[i].imag;
- tmp_b_r = buf[i+4].real;
- tmp_b_i = buf[i+4].imag;
- buf[i].real = tmp_a_r + tmp_b_r;
- buf[i].imag = tmp_a_i + tmp_b_i;
- buf[i+4].real = tmp_a_r - tmp_b_r;
- buf[i+4].imag = tmp_a_i - tmp_b_i;
- tmp_a_r = buf[1+i].real;
- tmp_a_i = buf[1+i].imag;
- tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
- tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
- buf[1+i].real = tmp_a_r + tmp_b_r;
- buf[1+i].imag = tmp_a_i + tmp_b_i;
- buf[i+5].real = tmp_a_r - tmp_b_r;
- buf[i+5].imag = tmp_a_i - tmp_b_i;
- tmp_a_r = buf[i+2].real;
- tmp_a_i = buf[i+2].imag;
- tmp_b_r = buf[i+6].imag;
- tmp_b_i = - buf[i+6].real;
- buf[i+2].real = tmp_a_r + tmp_b_r;
- buf[i+2].imag = tmp_a_i + tmp_b_i;
- buf[i+6].real = tmp_a_r - tmp_b_r;
- buf[i+6].imag = tmp_a_i - tmp_b_i;
- tmp_a_r = buf[i+3].real;
- tmp_a_i = buf[i+3].imag;
- tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
- tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
- buf[i+3].real = tmp_a_r + tmp_b_r;
- buf[i+3].imag = tmp_a_i + tmp_b_i;
- buf[i+7].real = tmp_a_r - tmp_b_r;
- buf[i+7].imag = tmp_a_i - tmp_b_i;
+static void ifft64_c (complex_t * buf)
+{
+ ifft32 (buf);
+ ifft16 (buf + 32);
+ ifft16 (buf + 48);
+ ifft_pass (buf, roots64 - 16, 16);
+ }
+
+static void ifft128_c (complex_t * buf)
+{
+ ifft32 (buf);
+ ifft16 (buf + 32);
+ ifft16 (buf + 48);
+ ifft_pass (buf, roots64 - 16, 16);
+
+ ifft32 (buf + 64);
+ ifft32 (buf + 96);
+ ifft_pass (buf, roots128 - 32, 32);
}
- /* 4-7. iterations */
- for (m=3; m < 7; m++) {
- two_m = (1 << m);
-
- two_m_plus_one = two_m<<1;
+void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias)
+{
+ int i, k;
+ sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2;
+ const sample_t * window = a52_imdct_window;
+ complex_t buf[128];
- for(i = 0; i < 128; i += two_m_plus_one) {
- for(k = 0; k < two_m; k++) {
- int p = k + i;
- int q = p + two_m;
- tmp_a_r = buf[p].real;
- tmp_a_i = buf[p].imag;
- tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
- tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
- buf[p].real = tmp_a_r + tmp_b_r;
- buf[p].imag = tmp_a_i + tmp_b_i;
- buf[q].real = tmp_a_r - tmp_b_r;
- buf[q].imag = tmp_a_i - tmp_b_i;
- }
- }
- }
-#endif
- /* Post IFFT complex multiply plus IFFT complex conjugate*/
for( i=0; i < 128; i++) {
- /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
- tmp_a_r = buf[i].real;
- tmp_a_i = -1.0 * buf[i].imag;
- buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]);
- buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]);
- }
-
- data_ptr = data;
- delay_ptr = delay;
- window_ptr = imdct_window;
-
- /* Window and convert to real valued signal */
- for(i=0; i< 64; i++) {
- *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
- *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
- }
+ k = fftorder[i];
+ t_r = pre1[i].real;
+ t_i = pre1[i].imag;
- for(i=0; i< 64; i++) {
- *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
- *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
+ buf[i].real = t_i * data[255-k] + t_r * data[k];
+ buf[i].imag = t_r * data[255-k] - t_i * data[k];
}
- /* The trailing edge of the window goes into the delay line */
- delay_ptr = delay;
+ ifft128 (buf);
+ /* Post IFFT complex multiply plus IFFT complex conjugate*/
+ /* Window and convert to real valued signal */
for(i=0; i< 64; i++) {
- *delay_ptr++ = -buf[64+i].real * *--window_ptr;
- *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
- }
+ /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
+ t_r = post1[i].real;
+ t_i = post1[i].imag;
- for(i=0; i<64; i++) {
- *delay_ptr++ = buf[i].imag * *--window_ptr;
- *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
+ a_r = t_r * buf[i].real + t_i * buf[i].imag;
+ a_i = t_i * buf[i].real - t_r * buf[i].imag;
+ b_r = t_i * buf[127-i].real + t_r * buf[127-i].imag;
+ b_i = t_r * buf[127-i].real - t_i * buf[127-i].imag;
+
+ w_1 = window[2*i];
+ w_2 = window[255-2*i];
+ data[2*i] = delay[2*i] * w_2 - a_r * w_1 + bias;
+ data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias;
+ delay[2*i] = a_i;
+
+ w_1 = window[2*i+1];
+ w_2 = window[254-2*i];
+ data[2*i+1] = delay[2*i+1] * w_2 + b_r * w_1 + bias;
+ data[254-2*i] = delay[2*i+1] * w_1 - b_r * w_2 + bias;
+ delay[2*i+1] = b_i;
}
}
@@ -717,7 +693,7 @@ imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
data_ptr = data;
delay_ptr = delay;
- window_ptr = imdct_window;
+ window_ptr = a52_imdct_window;
/* Window and convert to real valued signal */
for(i=0; i< 64; i++) {
@@ -995,7 +971,7 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
data_ptr = data;
delay_ptr = delay;
- window_ptr = imdct_window;
+ window_ptr = a52_imdct_window;
/* Window and convert to real valued signal */
asm volatile(
@@ -1098,166 +1074,141 @@ imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
}
#endif // ARCH_X86 || ARCH_X86_64
-void
-imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
+void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias)
{
int i,k;
- int p,q;
- int m;
- int two_m;
- int two_m_plus_one;
-
- sample_t tmp_a_i;
- sample_t tmp_a_r;
- sample_t tmp_b_i;
- sample_t tmp_b_r;
-
- sample_t *data_ptr;
- sample_t *delay_ptr;
- sample_t *window_ptr;
-
- complex_t *buf_1, *buf_2;
-
- buf_1 = &buf[0];
- buf_2 = &buf[64];
+ sample_t t_r, t_i, a_r, a_i, b_r, b_i, c_r, c_i, d_r, d_i, w_1, w_2;
+ const sample_t * window = a52_imdct_window;
+ complex_t buf1[64], buf2[64];
/* Pre IFFT complex multiply plus IFFT cmplx conjugate */
- for(k=0; k<64; k++) {
- /* X1[k] = X[2*k] */
- /* X2[k] = X[2*k+1] */
-
- p = 2 * (128-2*k-1);
- q = 2 * (2 * k);
-
- /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
- buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k];
- buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]);
- /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
- buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k];
- buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]);
- }
+ for (i = 0; i < 64; i++) {
+ k = fftorder[i];
+ t_r = pre2[i].real;
+ t_i = pre2[i].imag;
- /* IFFT Bit reversed shuffling */
- for(i=0; i<64; i++) {
- k = bit_reverse_256[i];
- if (k < i) {
- swap_cmplx(&buf_1[i],&buf_1[k]);
- swap_cmplx(&buf_2[i],&buf_2[k]);
- }
+ buf1[i].real = t_i * data[254-k] + t_r * data[k];
+ buf1[i].imag = t_r * data[254-k] - t_i * data[k];
+
+ buf2[i].real = t_i * data[255-k] + t_r * data[k+1];
+ buf2[i].imag = t_r * data[255-k] - t_i * data[k+1];
}
- /* FFT Merge */
- for (m=0; m < 6; m++) {
- two_m = (1 << m);
- two_m_plus_one = (1 << (m+1));
+ ifft64 (buf1);
+ ifft64 (buf2);
- /* FIXME */
- if(m)
- two_m = (1 << m);
- else
- two_m = 1;
-
- for(k = 0; k < two_m; k++) {
- for(i = 0; i < 64; i += two_m_plus_one) {
- p = k + i;
- q = p + two_m;
- /* Do block 1 */
- tmp_a_r = buf_1[p].real;
- tmp_a_i = buf_1[p].imag;
- tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag;
- tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag;
- buf_1[p].real = tmp_a_r + tmp_b_r;
- buf_1[p].imag = tmp_a_i + tmp_b_i;
- buf_1[q].real = tmp_a_r - tmp_b_r;
- buf_1[q].imag = tmp_a_i - tmp_b_i;
-
- /* Do block 2 */
- tmp_a_r = buf_2[p].real;
- tmp_a_i = buf_2[p].imag;
- tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag;
- tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag;
- buf_2[p].real = tmp_a_r + tmp_b_r;
- buf_2[p].imag = tmp_a_i + tmp_b_i;
- buf_2[q].real = tmp_a_r - tmp_b_r;
- buf_2[q].imag = tmp_a_i - tmp_b_i;
- }
+ /* Post IFFT complex multiply */
+ /* Window and convert to real valued signal */
+ for (i = 0; i < 32; i++) {
+ /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
+ t_r = post2[i].real;
+ t_i = post2[i].imag;
+
+ a_r = t_r * buf1[i].real + t_i * buf1[i].imag;
+ a_i = t_i * buf1[i].real - t_r * buf1[i].imag;
+ b_r = t_i * buf1[63-i].real + t_r * buf1[63-i].imag;
+ b_i = t_r * buf1[63-i].real - t_i * buf1[63-i].imag;
+
+ c_r = t_r * buf2[i].real + t_i * buf2[i].imag;
+ c_i = t_i * buf2[i].real - t_r * buf2[i].imag;
+ d_r = t_i * buf2[63-i].real + t_r * buf2[63-i].imag;
+ d_i = t_r * buf2[63-i].real - t_i * buf2[63-i].imag;
+
+ w_1 = window[2*i];
+ w_2 = window[255-2*i];
+ data[2*i] = delay[2*i] * w_2 - a_r * w_1 + bias;
+ data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias;
+ delay[2*i] = c_i;
+
+ w_1 = window[128+2*i];
+ w_2 = window[127-2*i];
+ data[128+2*i] = delay[127-2*i] * w_2 + a_i * w_1 + bias;
+ data[127-2*i] = delay[127-2*i] * w_1 - a_i * w_2 + bias;
+ delay[127-2*i] = c_r;
+
+ w_1 = window[2*i+1];
+ w_2 = window[254-2*i];
+ data[2*i+1] = delay[2*i+1] * w_2 - b_i * w_1 + bias;
+ data[254-2*i] = delay[2*i+1] * w_1 + b_i * w_2 + bias;
+ delay[2*i+1] = d_r;
+
+ w_1 = window[129+2*i];
+ w_2 = window[126-2*i];
+ data[129+2*i] = delay[126-2*i] * w_2 + b_r * w_1 + bias;
+ data[126-2*i] = delay[126-2*i] * w_1 - b_r * w_2 + bias;
+ delay[126-2*i] = d_i;
}
}
- /* Post IFFT complex multiply */
- for( i=0; i < 64; i++) {
- /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
- tmp_a_r = buf_1[i].real;
- tmp_a_i = -buf_1[i].imag;
- buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
- buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
- /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
- tmp_a_r = buf_2[i].real;
- tmp_a_i = -buf_2[i].imag;
- buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
- buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
+static double besselI0 (double x)
+{
+ double bessel = 1;
+ int i = 100;
+
+ do
+ bessel = bessel * x / (i * i) + 1;
+ while (--i);
+ return bessel;
}
- data_ptr = data;
- delay_ptr = delay;
- window_ptr = imdct_window;
-
- /* Window and convert to real valued signal */
- for(i=0; i< 64; i++) {
- *data_ptr++ = -buf_1[i].imag * *window_ptr++ + *delay_ptr++ + bias;
- *data_ptr++ = buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
+void a52_imdct_init (uint32_t mm_accel)
+{
+ int i, j, k;
+ double sum;
+
+ /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */
+ sum = 0;
+ for (i = 0; i < 256; i++) {
+ sum += besselI0 (i * (256 - i) * (5 * M_PI / 256) * (5 * M_PI / 256));
+ a52_imdct_window[i] = sum;
}
+ sum++;
+ for (i = 0; i < 256; i++)
+ a52_imdct_window[i] = sqrt (a52_imdct_window[i] / sum);
+
+ for (i = 0; i < 3; i++)
+ roots16[i] = cos ((M_PI / 8) * (i + 1));
+
+ for (i = 0; i < 7; i++)
+ roots32[i] = cos ((M_PI / 16) * (i + 1));
+
+ for (i = 0; i < 15; i++)
+ roots64[i] = cos ((M_PI / 32) * (i + 1));
+
+ for (i = 0; i < 31; i++)
+ roots128[i] = cos ((M_PI / 64) * (i + 1));
for(i=0; i< 64; i++) {
- *data_ptr++ = -buf_1[i].real * *window_ptr++ + *delay_ptr++ + bias;
- *data_ptr++ = buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
+ k = fftorder[i] / 2 + 64;
+ pre1[i].real = cos ((M_PI / 256) * (k - 0.25));
+ pre1[i].imag = sin ((M_PI / 256) * (k - 0.25));
}
- delay_ptr = delay;
+ for (i = 64; i < 128; i++) {
+ k = fftorder[i] / 2 + 64;
+ pre1[i].real = -cos ((M_PI / 256) * (k - 0.25));
+ pre1[i].imag = -sin ((M_PI / 256) * (k - 0.25));
+ }
for(i=0; i< 64; i++) {
- *delay_ptr++ = -buf_2[i].real * *--window_ptr;
- *delay_ptr++ = buf_2[64-i-1].imag * *--window_ptr;
+ post1[i].real = cos ((M_PI / 256) * (i + 0.5));
+ post1[i].imag = sin ((M_PI / 256) * (i + 0.5));
}
for(i=0; i< 64; i++) {
- *delay_ptr++ = buf_2[i].imag * *--window_ptr;
- *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr;
- }
+ k = fftorder[i] / 4;
+ pre2[i].real = cos ((M_PI / 128) * (k - 0.25));
+ pre2[i].imag = sin ((M_PI / 128) * (k - 0.25));
}
-void imdct_init (uint32_t mm_accel)
-{
-#ifdef LIBA52_MLIB
- if (mm_accel & MM_ACCEL_MLIB) {
- fprintf (stderr, "Using mlib for IMDCT transform\n");
- imdct_512 = imdct_do_512_mlib;
- imdct_256 = imdct_do_256_mlib;
- } else
-#endif
- {
- int i, j, k;
-
- /* Twiddle factors to turn IFFT into IMDCT */
+ for (i = 0; i < 32; i++) {
+ post2[i].real = cos ((M_PI / 128) * (i + 0.5));
+ post2[i].imag = sin ((M_PI / 128) * (i + 0.5));
+ }
for (i = 0; i < 128; i++) {
xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
}
-#if defined(ARCH_X86) || defined(ARCH_X86_64)
- for (i = 0; i < 128; i++) {
- sseSinCos1c[2*i+0]= xcos1[i];
- sseSinCos1c[2*i+1]= -xcos1[i];
- sseSinCos1d[2*i+0]= xsin1[i];
- sseSinCos1d[2*i+1]= xsin1[i];
- }
-#endif
-
- /* More twiddle factors to turn IFFT into IMDCT */
- for (i = 0; i < 64; i++) {
- xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
- xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));
- }
-
for (i = 0; i < 7; i++) {
j = 1 << i;
for (k = 0; k < j; k++) {
@@ -1266,6 +1217,12 @@ void imdct_init (uint32_t mm_accel)
}
}
#if defined(ARCH_X86) || defined(ARCH_X86_64)
+ for (i = 0; i < 128; i++) {
+ sseSinCos1c[2*i+0]= xcos1[i];
+ sseSinCos1c[2*i+1]= -xcos1[i];
+ sseSinCos1d[2*i+0]= xsin1[i];
+ sseSinCos1d[2*i+1]= xsin1[i];
+ }
for (i = 1; i < 7; i++) {
j = 1 << i;
for (k = 0; k < j; k+=2) {
@@ -1297,37 +1254,39 @@ void imdct_init (uint32_t mm_accel)
for(i=0; i<128; i++)
{
- sseWindow[2*i+0]= -imdct_window[2*i+0];
- sseWindow[2*i+1]= imdct_window[2*i+1];
+ sseWindow[2*i+0]= -a52_imdct_window[2*i+0];
+ sseWindow[2*i+1]= a52_imdct_window[2*i+1];
}
for(i=0; i<64; i++)
{
- sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
- sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
- sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
- sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
+ sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1];
+ sseWindow[256 + 2*i+1]= a52_imdct_window[254 - 2*i+0];
+ sseWindow[384 + 2*i+0]= a52_imdct_window[126 - 2*i+1];
+ sseWindow[384 + 2*i+1]= -a52_imdct_window[126 - 2*i+0];
}
-#endif // ARCH_X86 || ARCH_X86_64
+#endif
+ a52_imdct_512 = imdct_do_512;
+ ifft128 = ifft128_c;
+ ifft64 = ifft64_c;
- imdct_512 = imdct_do_512;
#if defined(ARCH_X86) || defined(ARCH_X86_64)
if(mm_accel & MM_ACCEL_X86_SSE)
{
fprintf (stderr, "Using SSE optimized IMDCT transform\n");
- imdct_512 = imdct_do_512_sse;
+ a52_imdct_512 = imdct_do_512_sse;
}
else
if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
{
fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
- imdct_512 = imdct_do_512_3dnowex;
+ a52_imdct_512 = imdct_do_512_3dnowex;
}
else
if(mm_accel & MM_ACCEL_X86_3DNOW)
{
fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
- imdct_512 = imdct_do_512_3dnow;
+ a52_imdct_512 = imdct_do_512_3dnow;
}
else
#endif // ARCH_X86 || ARCH_X86_64
@@ -1335,264 +1294,19 @@ void imdct_init (uint32_t mm_accel)
if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
{
fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
- imdct_512 = imdct_do_512_altivec;
+ a52_imdct_512 = imdct_do_512_altivec;
}
else
#endif
- fprintf (stderr, "No accelerated IMDCT transform found\n");
- imdct_256 = imdct_do_256;
- }
-}
-
-static void fft_asmb(int k, complex_t *x, complex_t *wTB,
- const complex_t *d, const complex_t *d_3)
-{
- register complex_t *x2k, *x3k, *x4k, *wB;
- register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
-
- x2k = x + 2 * k;
- x3k = x2k + 2 * k;
- x4k = x3k + 2 * k;
- wB = wTB + 2 * k;
-
- TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
- TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
-
- --k;
- for(;;) {
- TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
- TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
- if (!--k) break;
- x += 2;
- x2k += 2;
- x3k += 2;
- x4k += 2;
- d += 2;
- d_3 += 2;
- wTB += 2;
- wB += 2;
- }
-
-}
-
-static void fft_asmb16(complex_t *x, complex_t *wTB)
-{
- register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
- int k = 2;
-
- /* transform x[0], x[8], x[4], x[12] */
- TRANSZERO(x[0],x[4],x[8],x[12]);
-
- /* transform x[1], x[9], x[5], x[13] */
- TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
-
- /* transform x[2], x[10], x[6], x[14] */
- TRANSHALF_16(x[2],x[6],x[10],x[14]);
-
- /* transform x[3], x[11], x[7], x[15] */
- TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
-
-}
-
-static void fft_4(complex_t *x)
-{
- /* delta_p = 1 here */
- /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
- */
-
- register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
-
- yt_r = x[0].real;
- yb_r = yt_r - x[2].real;
- yt_r += x[2].real;
- u_r = x[1].real;
- vi_i = x[3].real - u_r;
- u_r += x[3].real;
-
- u_i = x[1].imag;
- vi_r = u_i - x[3].imag;
- u_i += x[3].imag;
-
- yt_i = yt_r;
- yt_i += u_r;
- x[0].real = yt_i;
- yt_r -= u_r;
- x[2].real = yt_r;
- yt_i = yb_r;
- yt_i += vi_r;
- x[1].real = yt_i;
- yb_r -= vi_r;
- x[3].real = yb_r;
-
- yt_i = x[0].imag;
- yb_i = yt_i - x[2].imag;
- yt_i += x[2].imag;
-
- yt_r = yt_i;
- yt_r += u_i;
- x[0].imag = yt_r;
- yt_i -= u_i;
- x[2].imag = yt_i;
- yt_r = yb_i;
- yt_r += vi_i;
- x[1].imag = yt_r;
- yb_i -= vi_i;
- x[3].imag = yb_i;
-}
-
-
-static void fft_8(complex_t *x)
+#ifdef LIBA52_DJBFFT
+ if (mm_accel & MM_ACCEL_DJBFFT) {
+ fprintf (stderr, "Using djbfft for IMDCT transform\n");
+ ifft128 = (void (*) (complex_t *)) fftc4_un128;
+ ifft64 = (void (*) (complex_t *)) fftc4_un64;
+ } else
+#endif
{
- /* delta_p = diag{1, sqrt(i)} here */
- /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
- */
- register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
-
- wT1_r = x[1].real;
- wT1_i = x[1].imag;
- wB1_r = x[3].real;
- wB1_i = x[3].imag;
-
- x[1] = x[2];
- x[2] = x[4];
- x[3] = x[6];
- fft_4(&x[0]);
-
-
- /* x[0] x[4] */
- wT2_r = x[5].real;
- wT2_r += x[7].real;
- wT2_r += wT1_r;
- wT2_r += wB1_r;
- wT2_i = wT2_r;
- wT2_r += x[0].real;
- wT2_i = x[0].real - wT2_i;
- x[0].real = wT2_r;
- x[4].real = wT2_i;
-
- wT2_i = x[5].imag;
- wT2_i += x[7].imag;
- wT2_i += wT1_i;
- wT2_i += wB1_i;
- wT2_r = wT2_i;
- wT2_r += x[0].imag;
- wT2_i = x[0].imag - wT2_i;
- x[0].imag = wT2_r;
- x[4].imag = wT2_i;
-
- /* x[2] x[6] */
- wT2_r = x[5].imag;
- wT2_r -= x[7].imag;
- wT2_r += wT1_i;
- wT2_r -= wB1_i;
- wT2_i = wT2_r;
- wT2_r += x[2].real;
- wT2_i = x[2].real - wT2_i;
- x[2].real = wT2_r;
- x[6].real = wT2_i;
-
- wT2_i = x[5].real;
- wT2_i -= x[7].real;
- wT2_i += wT1_r;
- wT2_i -= wB1_r;
- wT2_r = wT2_i;
- wT2_r += x[2].imag;
- wT2_i = x[2].imag - wT2_i;
- x[2].imag = wT2_i;
- x[6].imag = wT2_r;
-
-
- /* x[1] x[5] */
- wT2_r = wT1_r;
- wT2_r += wB1_i;
- wT2_r -= x[5].real;
- wT2_r -= x[7].imag;
- wT2_i = wT1_i;
- wT2_i -= wB1_r;
- wT2_i -= x[5].imag;
- wT2_i += x[7].real;
-
- wB2_r = wT2_r;
- wB2_r += wT2_i;
- wT2_i -= wT2_r;
- wB2_r *= HSQRT2;
- wT2_i *= HSQRT2;
- wT2_r = wB2_r;
- wB2_r += x[1].real;
- wT2_r = x[1].real - wT2_r;
-
- wB2_i = x[5].real;
- x[1].real = wB2_r;
- x[5].real = wT2_r;
-
- wT2_r = wT2_i;
- wT2_r += x[1].imag;
- wT2_i = x[1].imag - wT2_i;
- wB2_r = x[5].imag;
- x[1].imag = wT2_r;
- x[5].imag = wT2_i;
-
- /* x[3] x[7] */
- wT1_r -= wB1_i;
- wT1_i += wB1_r;
- wB1_r = wB2_i - x[7].imag;
- wB1_i = wB2_r + x[7].real;
- wT1_r -= wB1_r;
- wT1_i -= wB1_i;
- wB1_r = wT1_r + wT1_i;
- wB1_r *= HSQRT2;
- wT1_i -= wT1_r;
- wT1_i *= HSQRT2;
- wB2_r = x[3].real;
- wB2_i = wB2_r + wT1_i;
- wB2_r -= wT1_i;
- x[3].real = wB2_i;
- x[7].real = wB2_r;
- wB2_i = x[3].imag;
- wB2_r = wB2_i + wB1_r;
- wB2_i -= wB1_r;
- x[3].imag = wB2_i;
- x[7].imag = wB2_r;
+ fprintf (stderr, "No accelerated IMDCT transform found\n");
}
-
-
-static void fft_128p(complex_t *a)
-{
- fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
- fft_asmb16(&a[0], &a[8]);
-
- fft_8(&a[16]), fft_8(&a[24]);
- fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
-
- fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
- fft_asmb16(&a[32], &a[40]);
-
- fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
- fft_asmb16(&a[48], &a[56]);
-
- fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
-
- fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);
- /* fft_16(&a[64]); */
- fft_asmb16(&a[64], &a[72]);
-
- fft_8(&a[80]); fft_8(&a[88]);
-
- /* fft_32(&a[64]); */
- fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
-
- fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);
- /* fft_16(&a[96]); */
- fft_asmb16(&a[96], &a[104]);
-
- fft_8(&a[112]), fft_8(&a[120]);
- /* fft_32(&a[96]); */
- fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
-
- /* fft_128(&a[0]); */
- fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
}
-
-
-
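The hard-coded 256-entry imdct_window[] table is gone; a52_imdct_init() now derives the Kaiser-Bessel-derived window at run time. besselI0() evaluates a truncated series by Horner's rule, and the constants in the init loop match the textbook KBD kernel (this identification is an observation about the code, not something stated in the patch):

    besselI0(x) = sum_{k=0..100} x^k / (k!)^2  =  I0(2*sqrt(x))
    with x = i*(256-i)*(5*pi/256)^2:
    2*sqrt(x) = 5*pi * sqrt(1 - (i/128 - 1)^2)

i.e. the zeroth-order modified Bessel function evaluated on the standard KBD argument with alpha = 5 and window length 256. The running sum over i followed by the final sqrt(partial_sum / (total_sum + 1)) step is the usual KBD normalisation, so a52_imdct_window[] ends up equivalent to the old static table without shipping it.
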
diff --git a/liba52/imdct_3dnow.h b/liba52/imdct_3dnow.h
index b45878913b..db703eed67 100644
--- a/liba52/imdct_3dnow.h
+++ b/liba52/imdct_3dnow.h
@@ -469,7 +469,7 @@ imdct_do_512_3dnow
data_ptr = data;
delay_ptr = delay;
- window_ptr = imdct_window;
+ window_ptr = a52_imdct_window;
/* Window and convert to real valued signal */
#if 1
diff --git a/liba52/liba52_changes.diff b/liba52/liba52_changes.diff
index ceb1de2576..09eefbd617 100644
--- a/liba52/liba52_changes.diff
+++ b/liba52/liba52_changes.diff
@@ -1,71 +1,81 @@
---- include/a52.h 2005-03-22 19:58:53.000000000 +0100
-+++ a52.h 2004-03-19 01:15:49.000000000 +0100
-@@ -19,6 +25,9 @@
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
+--- liba52-0.7.4/a52.h 2006-06-12 15:04:57.000000000 +0200
++++ liba52/a52.h 2006-06-05 02:23:02.000000000 +0200
+@@ -59,4 +63,9 @@
+ int a52_block (a52_state_t * state);
+ void a52_free (a52_state_t * state);
-+#ifndef A52_H
-+#define A52_H
-+
- #ifndef LIBA52_DOUBLE
- typedef float sample_t;
- #else
-@@ -113,3 +122,10 @@
- void a52_dynrng (a52_state_t * state,
- sample_t (* call) (sample_t, void *), void * data);
- int a52_block (a52_state_t * state, sample_t * samples);
-+
+void* a52_resample_init(uint32_t mm_accel,int flags,int chans);
+extern int (* a52_resample) (float * _f, int16_t * s16);
+
+uint16_t crc16_block(uint8_t *data,uint32_t num_bytes);
+
-+#endif /* A52_H */
---- liba52/a52_internal.h 2005-03-22 19:59:35.000000000 +0100
-+++ a52_internal.h 2004-03-19 01:15:49.000000000 +0100
-@@ -41,11 +43,12 @@
+ #endif /* A52_H */
+--- liba52-0.7.4/a52_internal.h 2006-06-12 15:05:07.000000000 +0200
++++ liba52/a52_internal.h 2006-06-05 02:23:02.000000000 +0200
+@@ -103,18 +107,34 @@
+ #define DELTA_BIT_NONE (2)
+ #define DELTA_BIT_RESERVED (3)
+
++#ifdef ARCH_X86_64
++# define REG_a "rax"
++# define REG_d "rdx"
++# define REG_S "rsi"
++# define REG_D "rdi"
++# define REG_BP "rbp"
++#else
++# define REG_a "eax"
++# define REG_d "edx"
++# define REG_S "esi"
++# define REG_D "edi"
++# define REG_BP "ebp"
++#endif
++
+ void a52_bit_allocate (a52_state_t * state, ba_t * ba, int bndstart,
+ int start, int end, int fastleak, int slowleak,
+ expbap_t * expbap);
- int downmix_init (int input, int flags, sample_t * level,
+ int a52_downmix_init (int input, int flags, sample_t * level,
sample_t clev, sample_t slev);
+void downmix_accel_init(uint32_t mm_accel);
- int downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
+ int a52_downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
sample_t clev, sample_t slev);
--void downmix (sample_t * samples, int acmod, int output, sample_t bias,
-+extern void (*downmix) (sample_t * samples, int acmod, int output, sample_t bias,
+-void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias,
++extern void (*a52_downmix) (sample_t * samples, int acmod, int output, sample_t bias,
sample_t clev, sample_t slev);
--void upmix (sample_t * samples, int acmod, int output);
-+extern void (*upmix) (sample_t * samples, int acmod, int output);
-
- void imdct_init (uint32_t mm_accel);
- extern void (* imdct_256) (sample_t * data, sample_t * delay, sample_t bias);
---- liba52/bitstream.c 2005-03-22 19:59:35.000000000 +0100
-+++ bitstream.c 2004-03-19 01:15:49.000000000 +0100
-@@ -29,7 +35,12 @@
+-void a52_upmix (sample_t * samples, int acmod, int output);
++extern void (*a52_upmix) (sample_t * samples, int acmod, int output);
+
+ void a52_imdct_init (uint32_t mm_accel);
+ void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias);
+-void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias);
++extern void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
++void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias);
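The REG_a/REG_d/REG_S/REG_D/REG_BP macros added to a52_internal.h above are what let the patch's hand-written inline assembly build unchanged on 32-bit and 64-bit x86: the preprocessor pastes the right register name into each asm template string. A minimal stand-alone sketch of the pattern (the clear_1024() helper is hypothetical and only illustrates the string pasting, it is not part of liba52):

    #if defined(__x86_64__)
    # define REG_S "rsi"
    #else
    # define REG_S "esi"
    #endif

    /* Zero 1024 bytes at p by walking a negative index register up to 0,
     * the same loop shape the SIMD kernels in this patch use. */
    static void clear_1024 (unsigned char * p)
    {
        __asm__ volatile(
            "mov $-1024, %%"REG_S"        \n\t"
            "1:                           \n\t"
            "movb $0, 1024(%0, %%"REG_S") \n\t"
            "add $1, %%"REG_S"            \n\t"
            "jnz 1b                       \n\t"
            :: "r" (p)
            : "%"REG_S, "memory");
    }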
+--- liba52-0.7.4/bitstream.c 2006-06-12 15:05:07.000000000 +0200
++++ liba52/bitstream.c 2006-06-05 02:23:02.000000000 +0200
+@@ -31,6 +35,10 @@
#define BUFFER_SIZE 4096
+#ifdef ALT_BITSTREAM_READER
+int indx=0;
-+uint32_t * buffer_start;
-+#else
- static uint32_t * buffer_start;
+#endif
-
- uint32_t bits_left;
- uint32_t current_word;
-@@ -41,6 +52,9 @@
- align = (int)buf & 3;
- buffer_start = (uint32_t *) (buf - align);
- bits_left = 0;
++
+ void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf)
+ {
+ int align;
+@@ -38,6 +46,9 @@
+ align = (long)buf & 3;
+ state->buffer_start = (uint32_t *) (buf - align);
+ state->bits_left = 0;
+#ifdef ALT_BITSTREAM_READER
+ indx=0;
+#endif
- bitstream_get (align * 8);
+ bitstream_get (state, align * 8);
}
---- liba52/bitstream.h 2005-03-22 19:59:35.000000000 +0100
-+++ bitstream.h 2004-03-19 01:15:49.000000000 +0100
-@@ -19,6 +25,48 @@
+--- liba52-0.7.4/bitstream.h 2006-06-12 15:05:07.000000000 +0200
++++ liba52/bitstream.h 2006-06-05 02:23:02.000000000 +0200
+@@ -21,6 +25,48 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
@@ -114,16 +124,16 @@
/* (stolen from the kernel) */
#ifdef WORDS_BIGENDIAN
-@@ -29,7 +77,7 @@
- # if defined (__i386__)
+@@ -28,7 +74,7 @@
+
+ #else
+
+-# if 0 && defined (__i386__)
++# if defined (__i386__)
# define swab32(x) __i386_swab32(x)
-- static inline const uint32_t __i386_swab32(uint32_t x)
-+ static always_inline const uint32_t __i386_swab32(uint32_t x)
- {
- __asm__("bswap %0" : "=r" (x) : "0" (x));
- return x;
-@@ -37,25 +85,42 @@
+ static inline const uint32_t __i386_swab32(uint32_t x)
+@@ -39,19 +85,34 @@
# else
@@ -141,24 +151,17 @@
#endif
+#ifdef ALT_BITSTREAM_READER
-+extern uint32_t *buffer_start;
+extern int indx;
-+#else
- extern uint32_t bits_left;
- extern uint32_t current_word;
+#endif
-
- void bitstream_set_ptr (uint8_t * buf);
- uint32_t bitstream_get_bh(uint32_t num_bits);
- int32_t bitstream_get_bh_2(uint32_t num_bits);
-
+
- static inline uint32_t
--bitstream_get(uint32_t num_bits)
-+bitstream_get(uint32_t num_bits) // note num_bits is practically a constant due to inlineing
+ void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf);
+ uint32_t a52_bitstream_get_bh (a52_state_t * state, uint32_t num_bits);
+ int32_t a52_bitstream_get_bh_2 (a52_state_t * state, uint32_t num_bits);
+
+ static inline uint32_t bitstream_get (a52_state_t * state, uint32_t num_bits)
{
+#ifdef ALT_BITSTREAM_READER
-+ uint32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) );
++ uint32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) );
+
+ result<<= (indx&0x07);
+ result>>= 32 - num_bits;
@@ -167,32 +170,28 @@
+ return result;
+#else
uint32_t result;
--
-+
- if(num_bits < bits_left) {
- result = (current_word << (32 - bits_left)) >> (32 - num_bits);
- bits_left -= num_bits;
-@@ -63,11 +128,30 @@
+
+ if (num_bits < state->bits_left) {
+@@ -61,10 +122,29 @@
}
- return bitstream_get_bh(num_bits);
+ return a52_bitstream_get_bh (state, num_bits);
+#endif
+}
+
-+static inline void bitstream_skip(int num_bits)
++static inline void bitstream_skip(a52_state_t * state, int num_bits)
+{
+#ifdef ALT_BITSTREAM_READER
+ indx+= num_bits;
+#else
-+ bitstream_get(num_bits);
++ bitstream_get(state, num_bits);
+#endif
}
- static inline int32_t
- bitstream_get_2(uint32_t num_bits)
+ static inline int32_t bitstream_get_2 (a52_state_t * state, uint32_t num_bits)
{
+#ifdef ALT_BITSTREAM_READER
-+ int32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) );
++ int32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) );
+
+ result<<= (indx&0x07);
+ result>>= 32 - num_bits;
@@ -202,16 +201,16 @@
+#else
int32_t result;
- if(num_bits < bits_left) {
-@@ -77,4 +161,5 @@
+ if (num_bits < state->bits_left) {
+@@ -74,4 +154,5 @@
}
- return bitstream_get_bh_2(num_bits);
+ return a52_bitstream_get_bh_2 (state, num_bits);
+#endif
}
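The ALT_BITSTREAM_READER variant added above drops the cached current_word/bits_left pair and keeps only a bit index into the frame buffer: every read does an unaligned 32-bit load at the byte containing the index, byte-swaps it on little-endian hosts, and shifts the wanted bits down. A rough stand-alone sketch of that idea (assuming a little-endian GCC/Clang host, reads of at most 25 bits as in A-52 syntax, a few readable slack bytes past the buffer end, and the index passed by pointer instead of the patch's global indx):

    #include <stdint.h>
    #include <string.h>

    static uint32_t read_bits (const uint8_t * buf, int * indx, unsigned num_bits)
    {
        uint32_t word;

        memcpy (&word, buf + (*indx >> 3), sizeof (word)); /* unaligned32() */
        word = __builtin_bswap32 (word);    /* swab32() on little endian */
        word <<= (*indx & 7);               /* drop bits already consumed */
        word >>= 32 - num_bits;             /* keep the top num_bits */
        *indx += num_bits;
        return word;
    }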
---- liba52/downmix.c 2005-03-22 19:59:35.000000000 +0100
-+++ downmix.c 2004-04-12 18:42:14.000000000 +0200
-@@ -17,18 +23,46 @@
+--- liba52-0.7.4/downmix.c 2006-06-12 15:17:53.000000000 +0200
++++ liba52/downmix.c 2006-06-05 02:23:02.000000000 +0200
+@@ -23,18 +23,47 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@@ -220,10 +219,10 @@
*/
#include "config.h"
++#include "asmalign.h"
--#include <inttypes.h>
#include <string.h>
-+#include <inttypes.h>
+ #include <inttypes.h>
#include "a52.h"
#include "a52_internal.h"
@@ -232,9 +231,9 @@
#define CONVERT(acmod,output) (((output) << 3) + (acmod))
+
-+void (*downmix)(sample_t * samples, int acmod, int output, sample_t bias,
++void (*a52_downmix)(sample_t * samples, int acmod, int output, sample_t bias,
+ sample_t clev, sample_t slev)= NULL;
-+void (*upmix)(sample_t * samples, int acmod, int output)= NULL;
++void (*a52_upmix)(sample_t * samples, int acmod, int output)= NULL;
+
+static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
+ sample_t clev, sample_t slev);
@@ -247,50 +246,28 @@
+
+void downmix_accel_init(uint32_t mm_accel)
+{
-+ upmix= upmix_C;
-+ downmix= downmix_C;
-+#ifdef ARCH_X86
-+ if(mm_accel & MM_ACCEL_X86_MMX) upmix= upmix_MMX;
-+ if(mm_accel & MM_ACCEL_X86_SSE) downmix= downmix_SSE;
-+ if(mm_accel & MM_ACCEL_X86_3DNOW) downmix= downmix_3dnow;
++ a52_upmix= upmix_C;
++ a52_downmix= downmix_C;
++#if defined(ARCH_X86) || defined(ARCH_X86_64)
++ if(mm_accel & MM_ACCEL_X86_MMX) a52_upmix= upmix_MMX;
++ if(mm_accel & MM_ACCEL_X86_SSE) a52_downmix= downmix_SSE;
++ if(mm_accel & MM_ACCEL_X86_3DNOW) a52_downmix= downmix_3dnow;
+#endif
+}
+
- int downmix_init (int input, int flags, sample_t * level,
+ int a52_downmix_init (int input, int flags, sample_t * level,
sample_t clev, sample_t slev)
{
-@@ -61,7 +95,7 @@
- output = flags & A52_CHANNEL_MASK;
- if (output > A52_DOLBY)
- return -1;
--
-+
- output = table[output][input & 7];
-
- if ((output == A52_STEREO) &&
-@@ -145,7 +179,6 @@
- *level *= 1 / (1 + 3 * LEVEL_3DB);
- break;
- }
--
- return output;
- }
-
-@@ -440,12 +473,11 @@
- static void zero (sample_t * samples)
- {
- int i;
--
- for (i = 0; i < 256; i++)
+@@ -451,7 +480,7 @@
samples[i] = 0;
}
--void downmix (sample_t * samples, int acmod, int output, sample_t bias,
-+static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
+-void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias,
++void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
sample_t clev, sample_t slev)
{
switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
-@@ -557,7 +589,7 @@
+@@ -563,7 +592,7 @@
break;
case CONVERT (A52_3F2R, A52_2F1R):
@@ -299,7 +276,7 @@
move2to1 (samples + 768, samples + 512, bias);
break;
-@@ -581,12 +613,12 @@
+@@ -587,12 +616,12 @@
break;
case CONVERT (A52_3F1R, A52_3F2R):
@@ -309,37 +286,37 @@
}
}
--void upmix (sample_t * samples, int acmod, int output)
-+static void upmix_C (sample_t * samples, int acmod, int output)
+-void a52_upmix (sample_t * samples, int acmod, int output)
++void upmix_C (sample_t * samples, int acmod, int output)
{
switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
-@@ -651,3 +683,1137 @@
+@@ -657,3 +686,1137 @@
goto mix_31to21;
}
}
+
-+#ifdef ARCH_X86
++#if defined(ARCH_X86) || defined(ARCH_X86_64)
+static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias)
+{
+ asm volatile(
+ "movlps %2, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps (%0, %%esi), %%xmm0 \n\t"
-+ "movaps 16(%0, %%esi), %%xmm1 \n\t"
-+ "addps (%1, %%esi), %%xmm0 \n\t"
-+ "addps 16(%1, %%esi), %%xmm1 \n\t"
++ "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
++ "movaps 16(%0, %%"REG_S"), %%xmm1\n\t"
++ "addps (%1, %%"REG_S"), %%xmm0 \n\t"
++ "addps 16(%1, %%"REG_S"), %%xmm1\n\t"
+ "addps %%xmm7, %%xmm0 \n\t"
+ "addps %%xmm7, %%xmm1 \n\t"
-+ "movaps %%xmm0, (%1, %%esi) \n\t"
-+ "movaps %%xmm1, 16(%1, %%esi) \n\t"
-+ "addl $32, %%esi \n\t"
++ "movaps %%xmm0, (%1, %%"REG_S") \n\t"
++ "movaps %%xmm1, 16(%1, %%"REG_S")\n\t"
++ "add $32, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (src+256), "r" (dest+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -348,19 +325,19 @@
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps (%0, %%esi), %%xmm0 \n\t"
-+ "movaps 1024(%0, %%esi), %%xmm1 \n\t"
-+ "addps 2048(%0, %%esi), %%xmm0 \n\t"
++ "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
++ "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
++ "addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
+ "addps %%xmm7, %%xmm1 \n\t"
+ "addps %%xmm1, %%xmm0 \n\t"
-+ "movaps %%xmm0, (%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movaps %%xmm0, (%0, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -369,20 +346,20 @@
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps (%0, %%esi), %%xmm0 \n\t"
-+ "movaps 1024(%0, %%esi), %%xmm1 \n\t"
-+ "addps 2048(%0, %%esi), %%xmm0 \n\t"
-+ "addps 3072(%0, %%esi), %%xmm1 \n\t"
++ "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
++ "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
++ "addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
++ "addps 3072(%0, %%"REG_S"), %%xmm1\n\t"
+ "addps %%xmm7, %%xmm0 \n\t"
+ "addps %%xmm1, %%xmm0 \n\t"
-+ "movaps %%xmm0, (%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movaps %%xmm0, (%0, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -391,21 +368,21 @@
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps (%0, %%esi), %%xmm0 \n\t"
-+ "movaps 1024(%0, %%esi), %%xmm1 \n\t"
-+ "addps 2048(%0, %%esi), %%xmm0 \n\t"
-+ "addps 3072(%0, %%esi), %%xmm1 \n\t"
++ "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
++ "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
++ "addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
++ "addps 3072(%0, %%"REG_S"), %%xmm1\n\t"
+ "addps %%xmm7, %%xmm0 \n\t"
-+ "addps 4096(%0, %%esi), %%xmm1 \n\t"
++ "addps 4096(%0, %%"REG_S"), %%xmm1\n\t"
+ "addps %%xmm1, %%xmm0 \n\t"
-+ "movaps %%xmm0, (%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movaps %%xmm0, (%0, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -414,21 +391,21 @@
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps 1024(%0, %%esi), %%xmm0 \n\t"
++ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
+ "addps %%xmm7, %%xmm0 \n\t" //common
-+ "movaps (%0, %%esi), %%xmm1 \n\t"
-+ "movaps 2048(%0, %%esi), %%xmm2 \n\t"
++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
++ "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t"
+ "addps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
-+ "movaps %%xmm1, (%0, %%esi) \n\t"
-+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movaps %%xmm1, (%0, %%"REG_S") \n\t"
++ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -437,21 +414,21 @@
+ asm volatile(
+ "movlps %2, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps 1024(%1, %%esi), %%xmm0 \n\t"
++ "movaps 1024(%1, %%"REG_S"), %%xmm0\n\t"
+ "addps %%xmm7, %%xmm0 \n\t" //common
-+ "movaps (%0, %%esi), %%xmm1 \n\t"
-+ "movaps (%1, %%esi), %%xmm2 \n\t"
++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
++ "movaps (%1, %%"REG_S"), %%xmm2 \n\t"
+ "addps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
-+ "movaps %%xmm1, (%0, %%esi) \n\t"
-+ "movaps %%xmm2, (%1, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movaps %%xmm1, (%0, %%"REG_S") \n\t"
++ "movaps %%xmm2, (%1, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (left+256), "r" (right+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -460,22 +437,22 @@
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps 2048(%0, %%esi), %%xmm0 \n\t" // surround
-+ "movaps (%0, %%esi), %%xmm1 \n\t"
-+ "movaps 1024(%0, %%esi), %%xmm2 \n\t"
++ "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" // surround
++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
++ "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t"
+ "addps %%xmm7, %%xmm1 \n\t"
+ "addps %%xmm7, %%xmm2 \n\t"
+ "subps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
-+ "movaps %%xmm1, (%0, %%esi) \n\t"
-+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movaps %%xmm1, (%0, %%"REG_S") \n\t"
++ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -484,22 +461,22 @@
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps 1024(%0, %%esi), %%xmm0 \n\t"
-+ "addps 3072(%0, %%esi), %%xmm0 \n\t"
++ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
++ "addps 3072(%0, %%"REG_S"), %%xmm0\n\t"
+ "addps %%xmm7, %%xmm0 \n\t" // common
-+ "movaps (%0, %%esi), %%xmm1 \n\t"
-+ "movaps 2048(%0, %%esi), %%xmm2 \n\t"
++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
++ "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t"
+ "addps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
-+ "movaps %%xmm1, (%0, %%esi) \n\t"
-+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movaps %%xmm1, (%0, %%"REG_S") \n\t"
++ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -508,24 +485,24 @@
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps 1024(%0, %%esi), %%xmm0 \n\t"
-+ "movaps 3072(%0, %%esi), %%xmm3 \n\t" // surround
++ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
++ "movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround
+ "addps %%xmm7, %%xmm0 \n\t" // common
-+ "movaps (%0, %%esi), %%xmm1 \n\t"
-+ "movaps 2048(%0, %%esi), %%xmm2 \n\t"
++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
++ "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t"
+ "addps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
+ "subps %%xmm3, %%xmm1 \n\t"
+ "addps %%xmm3, %%xmm2 \n\t"
-+ "movaps %%xmm1, (%0, %%esi) \n\t"
-+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movaps %%xmm1, (%0, %%"REG_S") \n\t"
++ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -534,23 +511,23 @@
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps 2048(%0, %%esi), %%xmm0 \n\t"
-+ "addps 3072(%0, %%esi), %%xmm0 \n\t" // surround
-+ "movaps (%0, %%esi), %%xmm1 \n\t"
-+ "movaps 1024(%0, %%esi), %%xmm2 \n\t"
++ "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t"
++ "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround
++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
++ "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t"
+ "addps %%xmm7, %%xmm1 \n\t"
+ "addps %%xmm7, %%xmm2 \n\t"
+ "subps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm2 \n\t"
-+ "movaps %%xmm1, (%0, %%esi) \n\t"
-+ "movaps %%xmm2, 1024(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movaps %%xmm1, (%0, %%"REG_S") \n\t"
++ "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -559,22 +536,22 @@
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps 1024(%0, %%esi), %%xmm0 \n\t"
++ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
+ "addps %%xmm7, %%xmm0 \n\t" // common
+ "movaps %%xmm0, %%xmm1 \n\t" // common
-+ "addps (%0, %%esi), %%xmm0 \n\t"
-+ "addps 2048(%0, %%esi), %%xmm1 \n\t"
-+ "addps 3072(%0, %%esi), %%xmm0 \n\t"
-+ "addps 4096(%0, %%esi), %%xmm1 \n\t"
-+ "movaps %%xmm0, (%0, %%esi) \n\t"
-+ "movaps %%xmm1, 1024(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "addps (%0, %%"REG_S"), %%xmm0 \n\t"
++ "addps 2048(%0, %%"REG_S"), %%xmm1\n\t"
++ "addps 3072(%0, %%"REG_S"), %%xmm0\n\t"
++ "addps 4096(%0, %%"REG_S"), %%xmm1\n\t"
++ "movaps %%xmm0, (%0, %%"REG_S") \n\t"
++ "movaps %%xmm1, 1024(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -583,25 +560,25 @@
+ asm volatile(
+ "movlps %1, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps 1024(%0, %%esi), %%xmm0 \n\t"
-+ "movaps 3072(%0, %%esi), %%xmm2 \n\t"
++ "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
++ "movaps 3072(%0, %%"REG_S"), %%xmm2\n\t"
+ "addps %%xmm7, %%xmm0 \n\t" // common
-+ "addps 4096(%0, %%esi), %%xmm2 \n\t" // surround
-+ "movaps (%0, %%esi), %%xmm1 \n\t"
-+ "movaps 2048(%0, %%esi), %%xmm3 \n\t"
++ "addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround
++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
++ "movaps 2048(%0, %%"REG_S"), %%xmm3\n\t"
+ "subps %%xmm2, %%xmm1 \n\t"
+ "addps %%xmm2, %%xmm3 \n\t"
+ "addps %%xmm0, %%xmm1 \n\t"
+ "addps %%xmm0, %%xmm3 \n\t"
-+ "movaps %%xmm1, (%0, %%esi) \n\t"
-+ "movaps %%xmm3, 1024(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movaps %%xmm1, (%0, %%"REG_S") \n\t"
++ "movaps %%xmm3, 1024(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -610,40 +587,40 @@
+ asm volatile(
+ "movlps %2, %%xmm7 \n\t"
+ "shufps $0x00, %%xmm7, %%xmm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps (%0, %%esi), %%xmm0 \n\t"
-+ "movaps 16(%0, %%esi), %%xmm1 \n\t"
-+ "addps 1024(%0, %%esi), %%xmm0 \n\t"
-+ "addps 1040(%0, %%esi), %%xmm1 \n\t"
++ "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
++ "movaps 16(%0, %%"REG_S"), %%xmm1\n\t"
++ "addps 1024(%0, %%"REG_S"), %%xmm0\n\t"
++ "addps 1040(%0, %%"REG_S"), %%xmm1\n\t"
+ "addps %%xmm7, %%xmm0 \n\t"
+ "addps %%xmm7, %%xmm1 \n\t"
-+ "movaps %%xmm0, (%1, %%esi) \n\t"
-+ "movaps %%xmm1, 16(%1, %%esi) \n\t"
-+ "addl $32, %%esi \n\t"
++ "movaps %%xmm0, (%1, %%"REG_S") \n\t"
++ "movaps %%xmm1, 16(%1, %%"REG_S")\n\t"
++ "add $32, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (src+256), "r" (dest+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
+static void zero_MMX(sample_t * samples)
+{
+ asm volatile(
-+ "movl $-1024, %%esi \n\t"
++ "mov $-1024, %%"REG_S" \n\t"
+ "pxor %%mm0, %%mm0 \n\t"
-+ ".balign 16\n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq %%mm0, (%0, %%esi) \n\t"
-+ "movq %%mm0, 8(%0, %%esi) \n\t"
-+ "movq %%mm0, 16(%0, %%esi) \n\t"
-+ "movq %%mm0, 24(%0, %%esi) \n\t"
-+ "addl $32, %%esi \n\t"
++ "movq %%mm0, (%0, %%"REG_S") \n\t"
++ "movq %%mm0, 8(%0, %%"REG_S") \n\t"
++ "movq %%mm0, 16(%0, %%"REG_S") \n\t"
++ "movq %%mm0, 24(%0, %%"REG_S") \n\t"
++ "add $32, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ "emms"
+ :: "r" (samples+256)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -892,29 +869,29 @@
+ asm volatile(
+ "movd %2, %%mm7 \n\t"
+ "punpckldq %2, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq (%0, %%esi), %%mm0 \n\t"
-+ "movq 8(%0, %%esi), %%mm1 \n\t"
-+ "movq 16(%0, %%esi), %%mm2 \n\t"
-+ "movq 24(%0, %%esi), %%mm3 \n\t"
-+ "pfadd (%1, %%esi), %%mm0 \n\t"
-+ "pfadd 8(%1, %%esi), %%mm1 \n\t"
-+ "pfadd 16(%1, %%esi), %%mm2 \n\t"
-+ "pfadd 24(%1, %%esi), %%mm3 \n\t"
++ "movq (%0, %%"REG_S"), %%mm0 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm1 \n\t"
++ "movq 16(%0, %%"REG_S"), %%mm2 \n\t"
++ "movq 24(%0, %%"REG_S"), %%mm3 \n\t"
++ "pfadd (%1, %%"REG_S"), %%mm0 \n\t"
++ "pfadd 8(%1, %%"REG_S"), %%mm1 \n\t"
++ "pfadd 16(%1, %%"REG_S"), %%mm2 \n\t"
++ "pfadd 24(%1, %%"REG_S"), %%mm3 \n\t"
+ "pfadd %%mm7, %%mm0 \n\t"
+ "pfadd %%mm7, %%mm1 \n\t"
+ "pfadd %%mm7, %%mm2 \n\t"
+ "pfadd %%mm7, %%mm3 \n\t"
-+ "movq %%mm0, (%1, %%esi) \n\t"
-+ "movq %%mm1, 8(%1, %%esi) \n\t"
-+ "movq %%mm2, 16(%1, %%esi) \n\t"
-+ "movq %%mm3, 24(%1, %%esi) \n\t"
-+ "addl $32, %%esi \n\t"
++ "movq %%mm0, (%1, %%"REG_S") \n\t"
++ "movq %%mm1, 8(%1, %%"REG_S") \n\t"
++ "movq %%mm2, 16(%1, %%"REG_S") \n\t"
++ "movq %%mm3, 24(%1, %%"REG_S") \n\t"
++ "add $32, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (src+256), "r" (dest+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -923,25 +900,25 @@
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq (%0, %%esi), %%mm0 \n\t"
-+ "movq 8(%0, %%esi), %%mm1 \n\t"
-+ "movq 1024(%0, %%esi), %%mm2 \n\t"
-+ "movq 1032(%0, %%esi), %%mm3 \n\t"
-+ "pfadd 2048(%0, %%esi), %%mm0 \n\t"
-+ "pfadd 2056(%0, %%esi), %%mm1 \n\t"
++ "movq (%0, %%"REG_S"), %%mm0 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm1 \n\t"
++ "movq 1024(%0, %%"REG_S"), %%mm2\n\t"
++ "movq 1032(%0, %%"REG_S"), %%mm3\n\t"
++ "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t"
++ "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
+ "pfadd %%mm7, %%mm0 \n\t"
+ "pfadd %%mm7, %%mm1 \n\t"
+ "pfadd %%mm2, %%mm0 \n\t"
+ "pfadd %%mm3, %%mm1 \n\t"
-+ "movq %%mm0, (%0, %%esi) \n\t"
-+ "movq %%mm1, 8(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movq %%mm0, (%0, %%"REG_S") \n\t"
++ "movq %%mm1, 8(%0, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -950,27 +927,27 @@
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq (%0, %%esi), %%mm0 \n\t"
-+ "movq 8(%0, %%esi), %%mm1 \n\t"
-+ "movq 1024(%0, %%esi), %%mm2 \n\t"
-+ "movq 1032(%0, %%esi), %%mm3 \n\t"
-+ "pfadd 2048(%0, %%esi), %%mm0 \n\t"
-+ "pfadd 2056(%0, %%esi), %%mm1 \n\t"
-+ "pfadd 3072(%0, %%esi), %%mm2 \n\t"
-+ "pfadd 3080(%0, %%esi), %%mm3 \n\t"
++ "movq (%0, %%"REG_S"), %%mm0 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm1 \n\t"
++ "movq 1024(%0, %%"REG_S"), %%mm2\n\t"
++ "movq 1032(%0, %%"REG_S"), %%mm3\n\t"
++ "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t"
++ "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
++ "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t"
++ "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t"
+ "pfadd %%mm7, %%mm0 \n\t"
+ "pfadd %%mm7, %%mm1 \n\t"
+ "pfadd %%mm2, %%mm0 \n\t"
+ "pfadd %%mm3, %%mm1 \n\t"
-+ "movq %%mm0, (%0, %%esi) \n\t"
-+ "movq %%mm1, 8(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movq %%mm0, (%0, %%"REG_S") \n\t"
++ "movq %%mm1, 8(%0, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -979,29 +956,29 @@
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq (%0, %%esi), %%mm0 \n\t"
-+ "movq 8(%0, %%esi), %%mm1 \n\t"
-+ "movq 1024(%0, %%esi), %%mm2 \n\t"
-+ "movq 1032(%0, %%esi), %%mm3 \n\t"
-+ "pfadd 2048(%0, %%esi), %%mm0 \n\t"
-+ "pfadd 2056(%0, %%esi), %%mm1 \n\t"
-+ "pfadd 3072(%0, %%esi), %%mm2 \n\t"
-+ "pfadd 3080(%0, %%esi), %%mm3 \n\t"
++ "movq (%0, %%"REG_S"), %%mm0 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm1 \n\t"
++ "movq 1024(%0, %%"REG_S"), %%mm2\n\t"
++ "movq 1032(%0, %%"REG_S"), %%mm3\n\t"
++ "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t"
++ "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
++ "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t"
++ "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t"
+ "pfadd %%mm7, %%mm0 \n\t"
+ "pfadd %%mm7, %%mm1 \n\t"
-+ "pfadd 4096(%0, %%esi), %%mm2 \n\t"
-+ "pfadd 4104(%0, %%esi), %%mm3 \n\t"
++ "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t"
++ "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t"
+ "pfadd %%mm2, %%mm0 \n\t"
+ "pfadd %%mm3, %%mm1 \n\t"
-+ "movq %%mm0, (%0, %%esi) \n\t"
-+ "movq %%mm1, 8(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movq %%mm0, (%0, %%"REG_S") \n\t"
++ "movq %%mm1, 8(%0, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -1010,29 +987,29 @@
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq 1024(%0, %%esi), %%mm0 \n\t"
-+ "movq 1032(%0, %%esi), %%mm1 \n\t"
++ "movq 1024(%0, %%"REG_S"), %%mm0\n\t"
++ "movq 1032(%0, %%"REG_S"), %%mm1\n\t"
+ "pfadd %%mm7, %%mm0 \n\t" //common
+ "pfadd %%mm7, %%mm1 \n\t" //common
-+ "movq (%0, %%esi), %%mm2 \n\t"
-+ "movq 8(%0, %%esi), %%mm3 \n\t"
-+ "movq 2048(%0, %%esi), %%mm4 \n\t"
-+ "movq 2056(%0, %%esi), %%mm5 \n\t"
++ "movq (%0, %%"REG_S"), %%mm2 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm3 \n\t"
++ "movq 2048(%0, %%"REG_S"), %%mm4\n\t"
++ "movq 2056(%0, %%"REG_S"), %%mm5\n\t"
+ "pfadd %%mm0, %%mm2 \n\t"
+ "pfadd %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
-+ "movq %%mm2, (%0, %%esi) \n\t"
-+ "movq %%mm3, 8(%0, %%esi) \n\t"
-+ "movq %%mm4, 1024(%0, %%esi) \n\t"
-+ "movq %%mm5, 1032(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movq %%mm2, (%0, %%"REG_S") \n\t"
++ "movq %%mm3, 8(%0, %%"REG_S") \n\t"
++ "movq %%mm4, 1024(%0, %%"REG_S")\n\t"
++ "movq %%mm5, 1032(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -1041,29 +1018,29 @@
+ asm volatile(
+ "movd %2, %%mm7 \n\t"
+ "punpckldq %2, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq 1024(%1, %%esi), %%mm0 \n\t"
-+ "movq 1032(%1, %%esi), %%mm1 \n\t"
++ "movq 1024(%1, %%"REG_S"), %%mm0\n\t"
++ "movq 1032(%1, %%"REG_S"), %%mm1\n\t"
+ "pfadd %%mm7, %%mm0 \n\t" //common
+ "pfadd %%mm7, %%mm1 \n\t" //common
-+ "movq (%0, %%esi), %%mm2 \n\t"
-+ "movq 8(%0, %%esi), %%mm3 \n\t"
-+ "movq (%1, %%esi), %%mm4 \n\t"
-+ "movq 8(%1, %%esi), %%mm5 \n\t"
++ "movq (%0, %%"REG_S"), %%mm2 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm3 \n\t"
++ "movq (%1, %%"REG_S"), %%mm4 \n\t"
++ "movq 8(%1, %%"REG_S"), %%mm5 \n\t"
+ "pfadd %%mm0, %%mm2 \n\t"
+ "pfadd %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
-+ "movq %%mm2, (%0, %%esi) \n\t"
-+ "movq %%mm3, 8(%0, %%esi) \n\t"
-+ "movq %%mm4, (%1, %%esi) \n\t"
-+ "movq %%mm5, 8(%1, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movq %%mm2, (%0, %%"REG_S") \n\t"
++ "movq %%mm3, 8(%0, %%"REG_S") \n\t"
++ "movq %%mm4, (%1, %%"REG_S") \n\t"
++ "movq %%mm5, 8(%1, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (left+256), "r" (right+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -1072,15 +1049,15 @@
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq 2048(%0, %%esi), %%mm0 \n\t" // surround
-+ "movq 2056(%0, %%esi), %%mm1 \n\t" // surround
-+ "movq (%0, %%esi), %%mm2 \n\t"
-+ "movq 8(%0, %%esi), %%mm3 \n\t"
-+ "movq 1024(%0, %%esi), %%mm4 \n\t"
-+ "movq 1032(%0, %%esi), %%mm5 \n\t"
++ "movq 2048(%0, %%"REG_S"), %%mm0\n\t" // surround
++ "movq 2056(%0, %%"REG_S"), %%mm1\n\t" // surround
++ "movq (%0, %%"REG_S"), %%mm2 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm3 \n\t"
++ "movq 1024(%0, %%"REG_S"), %%mm4\n\t"
++ "movq 1032(%0, %%"REG_S"), %%mm5\n\t"
+ "pfadd %%mm7, %%mm2 \n\t"
+ "pfadd %%mm7, %%mm3 \n\t"
+ "pfadd %%mm7, %%mm4 \n\t"
@@ -1089,14 +1066,14 @@
+ "pfsub %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
-+ "movq %%mm2, (%0, %%esi) \n\t"
-+ "movq %%mm3, 8(%0, %%esi) \n\t"
-+ "movq %%mm4, 1024(%0, %%esi) \n\t"
-+ "movq %%mm5, 1032(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movq %%mm2, (%0, %%"REG_S") \n\t"
++ "movq %%mm3, 8(%0, %%"REG_S") \n\t"
++ "movq %%mm4, 1024(%0, %%"REG_S")\n\t"
++ "movq %%mm5, 1032(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -1105,31 +1082,31 @@
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq 1024(%0, %%esi), %%mm0 \n\t"
-+ "movq 1032(%0, %%esi), %%mm1 \n\t"
-+ "pfadd 3072(%0, %%esi), %%mm0 \n\t"
-+ "pfadd 3080(%0, %%esi), %%mm1 \n\t"
++ "movq 1024(%0, %%"REG_S"), %%mm0\n\t"
++ "movq 1032(%0, %%"REG_S"), %%mm1\n\t"
++ "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t"
++ "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t"
+ "pfadd %%mm7, %%mm0 \n\t" // common
+ "pfadd %%mm7, %%mm1 \n\t" // common
-+ "movq (%0, %%esi), %%mm2 \n\t"
-+ "movq 8(%0, %%esi), %%mm3 \n\t"
-+ "movq 2048(%0, %%esi), %%mm4 \n\t"
-+ "movq 2056(%0, %%esi), %%mm5 \n\t"
++ "movq (%0, %%"REG_S"), %%mm2 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm3 \n\t"
++ "movq 2048(%0, %%"REG_S"), %%mm4\n\t"
++ "movq 2056(%0, %%"REG_S"), %%mm5\n\t"
+ "pfadd %%mm0, %%mm2 \n\t"
+ "pfadd %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
-+ "movq %%mm2, (%0, %%esi) \n\t"
-+ "movq %%mm3, 8(%0, %%esi) \n\t"
-+ "movq %%mm4, 1024(%0, %%esi) \n\t"
-+ "movq %%mm5, 1032(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movq %%mm2, (%0, %%"REG_S") \n\t"
++ "movq %%mm3, 8(%0, %%"REG_S") \n\t"
++ "movq %%mm4, 1024(%0, %%"REG_S")\n\t"
++ "movq %%mm5, 1032(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -1138,35 +1115,35 @@
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq 1024(%0, %%esi), %%mm0 \n\t"
-+ "movq 1032(%0, %%esi), %%mm1 \n\t"
++ "movq 1024(%0, %%"REG_S"), %%mm0\n\t"
++ "movq 1032(%0, %%"REG_S"), %%mm1\n\t"
+ "pfadd %%mm7, %%mm0 \n\t" // common
+ "pfadd %%mm7, %%mm1 \n\t" // common
-+ "movq (%0, %%esi), %%mm2 \n\t"
-+ "movq 8(%0, %%esi), %%mm3 \n\t"
-+ "movq 2048(%0, %%esi), %%mm4 \n\t"
-+ "movq 2056(%0, %%esi), %%mm5 \n\t"
++ "movq (%0, %%"REG_S"), %%mm2 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm3 \n\t"
++ "movq 2048(%0, %%"REG_S"), %%mm4\n\t"
++ "movq 2056(%0, %%"REG_S"), %%mm5\n\t"
+ "pfadd %%mm0, %%mm2 \n\t"
+ "pfadd %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
-+ "movq 3072(%0, %%esi), %%mm0 \n\t" // surround
-+ "movq 3080(%0, %%esi), %%mm1 \n\t" // surround
++ "movq 3072(%0, %%"REG_S"), %%mm0\n\t" // surround
++ "movq 3080(%0, %%"REG_S"), %%mm1\n\t" // surround
+ "pfsub %%mm0, %%mm2 \n\t"
+ "pfsub %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
-+ "movq %%mm2, (%0, %%esi) \n\t"
-+ "movq %%mm3, 8(%0, %%esi) \n\t"
-+ "movq %%mm4, 1024(%0, %%esi) \n\t"
-+ "movq %%mm5, 1032(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movq %%mm2, (%0, %%"REG_S") \n\t"
++ "movq %%mm3, 8(%0, %%"REG_S") \n\t"
++ "movq %%mm4, 1024(%0, %%"REG_S")\n\t"
++ "movq %%mm5, 1032(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -1175,17 +1152,17 @@
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq 2048(%0, %%esi), %%mm0 \n\t"
-+ "movq 2056(%0, %%esi), %%mm1 \n\t"
-+ "pfadd 3072(%0, %%esi), %%mm0 \n\t" // surround
-+ "pfadd 3080(%0, %%esi), %%mm1 \n\t" // surround
-+ "movq (%0, %%esi), %%mm2 \n\t"
-+ "movq 8(%0, %%esi), %%mm3 \n\t"
-+ "movq 1024(%0, %%esi), %%mm4 \n\t"
-+ "movq 1032(%0, %%esi), %%mm5 \n\t"
++ "movq 2048(%0, %%"REG_S"), %%mm0\n\t"
++ "movq 2056(%0, %%"REG_S"), %%mm1\n\t"
++ "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // surround
++ "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" // surround
++ "movq (%0, %%"REG_S"), %%mm2 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm3 \n\t"
++ "movq 1024(%0, %%"REG_S"), %%mm4\n\t"
++ "movq 1032(%0, %%"REG_S"), %%mm5\n\t"
+ "pfadd %%mm7, %%mm2 \n\t"
+ "pfadd %%mm7, %%mm3 \n\t"
+ "pfadd %%mm7, %%mm4 \n\t"
@@ -1194,14 +1171,14 @@
+ "pfsub %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm4 \n\t"
+ "pfadd %%mm1, %%mm5 \n\t"
-+ "movq %%mm2, (%0, %%esi) \n\t"
-+ "movq %%mm3, 8(%0, %%esi) \n\t"
-+ "movq %%mm4, 1024(%0, %%esi) \n\t"
-+ "movq %%mm5, 1032(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movq %%mm2, (%0, %%"REG_S") \n\t"
++ "movq %%mm3, 8(%0, %%"REG_S") \n\t"
++ "movq %%mm4, 1024(%0, %%"REG_S")\n\t"
++ "movq %%mm5, 1032(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -1210,31 +1187,31 @@
+ asm volatile(
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq 1024(%0, %%esi), %%mm0 \n\t"
-+ "movq 1032(%0, %%esi), %%mm1 \n\t"
++ "movq 1024(%0, %%"REG_S"), %%mm0\n\t"
++ "movq 1032(%0, %%"REG_S"), %%mm1\n\t"
+ "pfadd %%mm7, %%mm0 \n\t" // common
+ "pfadd %%mm7, %%mm1 \n\t" // common
+ "movq %%mm0, %%mm2 \n\t" // common
+ "movq %%mm1, %%mm3 \n\t" // common
-+ "pfadd (%0, %%esi), %%mm0 \n\t"
-+ "pfadd 8(%0, %%esi), %%mm1 \n\t"
-+ "pfadd 2048(%0, %%esi), %%mm2 \n\t"
-+ "pfadd 2056(%0, %%esi), %%mm3 \n\t"
-+ "pfadd 3072(%0, %%esi), %%mm0 \n\t"
-+ "pfadd 3080(%0, %%esi), %%mm1 \n\t"
-+ "pfadd 4096(%0, %%esi), %%mm2 \n\t"
-+ "pfadd 4104(%0, %%esi), %%mm3 \n\t"
-+ "movq %%mm0, (%0, %%esi) \n\t"
-+ "movq %%mm1, 8(%0, %%esi) \n\t"
-+ "movq %%mm2, 1024(%0, %%esi) \n\t"
-+ "movq %%mm3, 1032(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "pfadd (%0, %%"REG_S"), %%mm0 \n\t"
++ "pfadd 8(%0, %%"REG_S"), %%mm1 \n\t"
++ "pfadd 2048(%0, %%"REG_S"), %%mm2\n\t"
++ "pfadd 2056(%0, %%"REG_S"), %%mm3\n\t"
++ "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t"
++ "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t"
++ "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t"
++ "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t"
++ "movq %%mm0, (%0, %%"REG_S") \n\t"
++ "movq %%mm1, 8(%0, %%"REG_S") \n\t"
++ "movq %%mm2, 1024(%0, %%"REG_S")\n\t"
++ "movq %%mm3, 1032(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -1242,23 +1219,23 @@
+static void mix32toS_3dnow (sample_t * samples, sample_t bias)
+{
+ asm volatile(
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
+ "movd %1, %%mm7 \n\t"
+ "punpckldq %1, %%mm7 \n\t"
-+ "movq 1024(%0, %%esi), %%mm0 \n\t"
-+ "movq 1032(%0, %%esi), %%mm1 \n\t"
-+ "movq 3072(%0, %%esi), %%mm4 \n\t"
-+ "movq 3080(%0, %%esi), %%mm5 \n\t"
++ "movq 1024(%0, %%"REG_S"), %%mm0\n\t"
++ "movq 1032(%0, %%"REG_S"), %%mm1\n\t"
++ "movq 3072(%0, %%"REG_S"), %%mm4\n\t"
++ "movq 3080(%0, %%"REG_S"), %%mm5\n\t"
+ "pfadd %%mm7, %%mm0 \n\t" // common
+ "pfadd %%mm7, %%mm1 \n\t" // common
-+ "pfadd 4096(%0, %%esi), %%mm4 \n\t" // surround
-+ "pfadd 4104(%0, %%esi), %%mm5 \n\t" // surround
-+ "movq (%0, %%esi), %%mm2 \n\t"
-+ "movq 8(%0, %%esi), %%mm3 \n\t"
-+ "movq 2048(%0, %%esi), %%mm6 \n\t"
-+ "movq 2056(%0, %%esi), %%mm7 \n\t"
++ "pfadd 4096(%0, %%"REG_S"), %%mm4\n\t" // surround
++ "pfadd 4104(%0, %%"REG_S"), %%mm5\n\t" // surround
++ "movq (%0, %%"REG_S"), %%mm2 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm3 \n\t"
++ "movq 2048(%0, %%"REG_S"), %%mm6\n\t"
++ "movq 2056(%0, %%"REG_S"), %%mm7\n\t"
+ "pfsub %%mm4, %%mm2 \n\t"
+ "pfsub %%mm5, %%mm3 \n\t"
+ "pfadd %%mm4, %%mm6 \n\t"
@@ -1267,14 +1244,14 @@
+ "pfadd %%mm1, %%mm3 \n\t"
+ "pfadd %%mm0, %%mm6 \n\t"
+ "pfadd %%mm1, %%mm7 \n\t"
-+ "movq %%mm2, (%0, %%esi) \n\t"
-+ "movq %%mm3, 8(%0, %%esi) \n\t"
-+ "movq %%mm6, 1024(%0, %%esi) \n\t"
-+ "movq %%mm7, 1032(%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movq %%mm2, (%0, %%"REG_S") \n\t"
++ "movq %%mm3, 8(%0, %%"REG_S") \n\t"
++ "movq %%mm6, 1024(%0, %%"REG_S")\n\t"
++ "movq %%mm7, 1032(%0, %%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (samples+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -1283,29 +1260,29 @@
+ asm volatile(
+ "movd %2, %%mm7 \n\t"
+ "punpckldq %2, %%mm7 \n\t"
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16\n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movq (%0, %%esi), %%mm0 \n\t"
-+ "movq 8(%0, %%esi), %%mm1 \n\t"
-+ "movq 16(%0, %%esi), %%mm2 \n\t"
-+ "movq 24(%0, %%esi), %%mm3 \n\t"
-+ "pfadd 1024(%0, %%esi), %%mm0 \n\t"
-+ "pfadd 1032(%0, %%esi), %%mm1 \n\t"
-+ "pfadd 1040(%0, %%esi), %%mm2 \n\t"
-+ "pfadd 1048(%0, %%esi), %%mm3 \n\t"
++ "movq (%0, %%"REG_S"), %%mm0 \n\t"
++ "movq 8(%0, %%"REG_S"), %%mm1 \n\t"
++ "movq 16(%0, %%"REG_S"), %%mm2 \n\t"
++ "movq 24(%0, %%"REG_S"), %%mm3 \n\t"
++ "pfadd 1024(%0, %%"REG_S"), %%mm0\n\t"
++ "pfadd 1032(%0, %%"REG_S"), %%mm1\n\t"
++ "pfadd 1040(%0, %%"REG_S"), %%mm2\n\t"
++ "pfadd 1048(%0, %%"REG_S"), %%mm3\n\t"
+ "pfadd %%mm7, %%mm0 \n\t"
+ "pfadd %%mm7, %%mm1 \n\t"
+ "pfadd %%mm7, %%mm2 \n\t"
+ "pfadd %%mm7, %%mm3 \n\t"
-+ "movq %%mm0, (%1, %%esi) \n\t"
-+ "movq %%mm1, 8(%1, %%esi) \n\t"
-+ "movq %%mm2, 16(%1, %%esi) \n\t"
-+ "movq %%mm3, 24(%1, %%esi) \n\t"
-+ "addl $32, %%esi \n\t"
++ "movq %%mm0, (%1, %%"REG_S") \n\t"
++ "movq %%mm1, 8(%1, %%"REG_S") \n\t"
++ "movq %%mm2, 16(%1, %%"REG_S") \n\t"
++ "movq %%mm3, 24(%1, %%"REG_S") \n\t"
++ "add $32, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (src+256), "r" (dest+256), "m" (bias)
-+ : "%esi"
++ : "%"REG_S
+ );
+}
+
@@ -1451,10 +1428,10 @@
+ __asm __volatile("femms":::"memory");
+}
+
-+#endif //ARCH_X86
---- liba52/imdct.c 2005-03-22 19:59:35.000000000 +0100
-+++ imdct.c 2004-04-26 22:00:57.000000000 +0200
-@@ -17,17 +23,32 @@
++#endif // ARCH_X86 || ARCH_X86_64
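All of the SSE and 3DNow! downmix kernels above share one shape: the bias is splatted across a vector register, the sample pointer is pre-offset by 256 floats so a single negative index register can walk the 1024-byte channel blocks, and the per-channel adds run two or four floats per iteration. As a plain-C reference for one of them, mix3to2 folds the bias-adjusted centre channel into left and right; a sketch of the arithmetic (illustrative only, assuming the default float sample_t build, not the routine the patch actually compiles):

    /* On entry: samples[0..255] = left, [256..511] = centre, [512..767] = right.
     * On exit the downmixed left/right occupy the first two blocks, matching
     * the SSE/3DNow! versions above. */
    static void mix3to2_ref (float * samples, float bias)
    {
        int i;

        for (i = 0; i < 256; i++) {
            float common = samples[i + 256] + bias;         /* centre + bias */
            samples[i]       += common;                     /* new left      */
            samples[i + 256]  = samples[i + 512] + common;  /* new right     */
        }
    }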
+--- liba52-0.7.4/imdct.c 2006-06-12 15:18:27.000000000 +0200
++++ liba52/imdct.c 2006-06-12 19:18:39.000000000 +0200
+@@ -26,9 +26,15 @@
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
@@ -1466,34 +1443,27 @@
*/
#include "config.h"
++#include "asmalign.h"
--#include <inttypes.h>
#include <math.h>
#include <stdio.h>
-+#ifndef M_PI
-+#define M_PI 3.1415926535897932384626433832795029
-+#endif
-+#include <inttypes.h>
-
+@@ -43,12 +49,49 @@
#include "a52.h"
#include "a52_internal.h"
#include "mm_accel.h"
+#include "mangle.h"
+
++void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
++
+#ifdef RUNTIME_CPUDETECT
+#undef HAVE_3DNOWEX
+#endif
-+
-+#define USE_AC3_C
- void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
- void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
-@@ -37,9 +58,22 @@
+ typedef struct complex_s {
+ sample_t real;
sample_t imag;
} complex_t;
-+static void fft_128p(complex_t *a);
-+
+static const int pm128[128] attribute_used __attribute__((aligned(16))) =
+{
+ 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
@@ -1505,22 +1475,51 @@
+ 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
+ 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
+};
-
- /* 128 point bit-reverse LUT */
--static uint8_t bit_reverse_512[] = {
++
+static uint8_t attribute_used bit_reverse_512[] = {
- 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
- 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
- 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
-@@ -67,23 +101,42 @@
- 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
- 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
++ 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
++ 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
++ 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
++ 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
++ 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
++ 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
++ 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
++ 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
++ 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
++ 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
++ 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
++ 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
++ 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
++ 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
++ 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
++ 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
++
+ static uint8_t fftorder[] = {
+ 0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176,
+ 8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88,
+@@ -60,6 +103,40 @@
+ 6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86
+ };
--static complex_t buf[128];
-+#ifdef ARCH_X86
++static complex_t __attribute__((aligned(16))) buf[128];
++
++/* Twiddle factor LUT */
++static complex_t __attribute__((aligned(16))) w_1[1];
++static complex_t __attribute__((aligned(16))) w_2[2];
++static complex_t __attribute__((aligned(16))) w_4[4];
++static complex_t __attribute__((aligned(16))) w_8[8];
++static complex_t __attribute__((aligned(16))) w_16[16];
++static complex_t __attribute__((aligned(16))) w_32[32];
++static complex_t __attribute__((aligned(16))) w_64[64];
++static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
++
++/* Twiddle factors for IMDCT */
++static sample_t __attribute__((aligned(16))) xcos1[128];
++static sample_t __attribute__((aligned(16))) xsin1[128];
++
++#if defined(ARCH_X86) || defined(ARCH_X86_64)
+// NOTE: SSE needs 16byte alignment or it will segfault
+//
-+static complex_t __attribute__((aligned(16))) buf[128];
+static float __attribute__((aligned(16))) sseSinCos1c[256];
+static float __attribute__((aligned(16))) sseSinCos1d[256];
+static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
@@ -1534,234 +1533,21 @@
+static float __attribute__((aligned(16))) *sseW[7]=
+ {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
+static float __attribute__((aligned(16))) sseWindow[512];
-+#else
-+static complex_t __attribute__((aligned(16))) buf[128];
-+#endif
-
- /* Twiddle factor LUT */
--static complex_t w_1[1];
--static complex_t w_2[2];
--static complex_t w_4[4];
--static complex_t w_8[8];
--static complex_t w_16[16];
--static complex_t w_32[32];
--static complex_t w_64[64];
--static complex_t * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
-+static complex_t __attribute__((aligned(16))) w_1[1];
-+static complex_t __attribute__((aligned(16))) w_2[2];
-+static complex_t __attribute__((aligned(16))) w_4[4];
-+static complex_t __attribute__((aligned(16))) w_8[8];
-+static complex_t __attribute__((aligned(16))) w_16[16];
-+static complex_t __attribute__((aligned(16))) w_32[32];
-+static complex_t __attribute__((aligned(16))) w_64[64];
-+static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
-
- /* Twiddle factors for IMDCT */
--static sample_t xcos1[128];
--static sample_t xsin1[128];
--static sample_t xcos2[64];
--static sample_t xsin2[64];
-+static sample_t __attribute__((aligned(16))) xcos1[128];
-+static sample_t __attribute__((aligned(16))) xsin1[128];
-+static sample_t __attribute__((aligned(16))) xcos2[64];
-+static sample_t __attribute__((aligned(16))) xsin2[64];
-
- /* Windowing function for Modified DCT - Thank you acroread */
- sample_t imdct_window[] = {
-@@ -145,16 +198,19 @@
- void
- imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
- {
-- int i,k;
-+ int i;
-+#ifndef USE_AC3_C
-+ int k;
- int p,q;
- int m;
- int two_m;
- int two_m_plus_one;
-
-- sample_t tmp_a_i;
-- sample_t tmp_a_r;
- sample_t tmp_b_i;
- sample_t tmp_b_r;
-+#endif
-+ sample_t tmp_a_i;
-+ sample_t tmp_a_r;
-
- sample_t *data_ptr;
- sample_t *delay_ptr;
-@@ -162,22 +218,21 @@
-
- /* 512 IMDCT with source and dest data in 'data' */
-
-- /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
-+ /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
- for( i=0; i < 128; i++) {
- /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
-- buf[i].real = (data[256-2*i-1] * xcos1[i]) - (data[2*i] * xsin1[i]);
-- buf[i].imag = -1.0 * ((data[2*i] * xcos1[i]) + (data[256-2*i-1] * xsin1[i]));
-- }
--
-- /* Bit reversed shuffling */
-- for(i=0; i<128; i++) {
-- k = bit_reverse_512[i];
-- if (k < i)
-- swap_cmplx(&buf[i],&buf[k]);
-+#ifdef USE_AC3_C
-+ int j= pm128[i];
-+#else
-+ int j= bit_reverse_512[i];
-+#endif
-+ buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
-+ buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
- }
-
- /* FFT Merge */
-- for (m=0; m < 7; m++) {
-+/* unoptimized variant
-+ for (m=1; m < 7; m++) {
- if(m)
- two_m = (1 << m);
- else
-@@ -185,8 +240,8 @@
-
- two_m_plus_one = (1 << (m+1));
-
-- for(k = 0; k < two_m; k++) {
-- for(i = 0; i < 128; i += two_m_plus_one) {
-+ for(i = 0; i < 128; i += two_m_plus_one) {
-+ for(k = 0; k < two_m; k++) {
- p = k + i;
- q = p + two_m;
- tmp_a_r = buf[p].real;
-@@ -200,7 +255,102 @@
- }
- }
- }
-+*/
-+#ifdef USE_AC3_C
-+ fft_128p (&buf[0]);
-+#else
-+
-+ /* 1. iteration */
-+ for(i = 0; i < 128; i += 2) {
-+ tmp_a_r = buf[i].real;
-+ tmp_a_i = buf[i].imag;
-+ tmp_b_r = buf[i+1].real;
-+ tmp_b_i = buf[i+1].imag;
-+ buf[i].real = tmp_a_r + tmp_b_r;
-+ buf[i].imag = tmp_a_i + tmp_b_i;
-+ buf[i+1].real = tmp_a_r - tmp_b_r;
-+ buf[i+1].imag = tmp_a_i - tmp_b_i;
-+ }
-+
-+ /* 2. iteration */
-+ // Note w[1]={{1,0}, {0,-1}}
-+ for(i = 0; i < 128; i += 4) {
-+ tmp_a_r = buf[i].real;
-+ tmp_a_i = buf[i].imag;
-+ tmp_b_r = buf[i+2].real;
-+ tmp_b_i = buf[i+2].imag;
-+ buf[i].real = tmp_a_r + tmp_b_r;
-+ buf[i].imag = tmp_a_i + tmp_b_i;
-+ buf[i+2].real = tmp_a_r - tmp_b_r;
-+ buf[i+2].imag = tmp_a_i - tmp_b_i;
-+ tmp_a_r = buf[i+1].real;
-+ tmp_a_i = buf[i+1].imag;
-+ tmp_b_r = buf[i+3].imag;
-+ tmp_b_i = buf[i+3].real;
-+ buf[i+1].real = tmp_a_r + tmp_b_r;
-+ buf[i+1].imag = tmp_a_i - tmp_b_i;
-+ buf[i+3].real = tmp_a_r - tmp_b_r;
-+ buf[i+3].imag = tmp_a_i + tmp_b_i;
-+ }
-
-+ /* 3. iteration */
-+ for(i = 0; i < 128; i += 8) {
-+ tmp_a_r = buf[i].real;
-+ tmp_a_i = buf[i].imag;
-+ tmp_b_r = buf[i+4].real;
-+ tmp_b_i = buf[i+4].imag;
-+ buf[i].real = tmp_a_r + tmp_b_r;
-+ buf[i].imag = tmp_a_i + tmp_b_i;
-+ buf[i+4].real = tmp_a_r - tmp_b_r;
-+ buf[i+4].imag = tmp_a_i - tmp_b_i;
-+ tmp_a_r = buf[1+i].real;
-+ tmp_a_i = buf[1+i].imag;
-+ tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
-+ tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
-+ buf[1+i].real = tmp_a_r + tmp_b_r;
-+ buf[1+i].imag = tmp_a_i + tmp_b_i;
-+ buf[i+5].real = tmp_a_r - tmp_b_r;
-+ buf[i+5].imag = tmp_a_i - tmp_b_i;
-+ tmp_a_r = buf[i+2].real;
-+ tmp_a_i = buf[i+2].imag;
-+ tmp_b_r = buf[i+6].imag;
-+ tmp_b_i = - buf[i+6].real;
-+ buf[i+2].real = tmp_a_r + tmp_b_r;
-+ buf[i+2].imag = tmp_a_i + tmp_b_i;
-+ buf[i+6].real = tmp_a_r - tmp_b_r;
-+ buf[i+6].imag = tmp_a_i - tmp_b_i;
-+ tmp_a_r = buf[i+3].real;
-+ tmp_a_i = buf[i+3].imag;
-+ tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
-+ tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
-+ buf[i+3].real = tmp_a_r + tmp_b_r;
-+ buf[i+3].imag = tmp_a_i + tmp_b_i;
-+ buf[i+7].real = tmp_a_r - tmp_b_r;
-+ buf[i+7].imag = tmp_a_i - tmp_b_i;
-+ }
-+
-+ /* 4-7. iterations */
-+ for (m=3; m < 7; m++) {
-+ two_m = (1 << m);
-+
-+ two_m_plus_one = two_m<<1;
-+
-+ for(i = 0; i < 128; i += two_m_plus_one) {
-+ for(k = 0; k < two_m; k++) {
-+ int p = k + i;
-+ int q = p + two_m;
-+ tmp_a_r = buf[p].real;
-+ tmp_a_i = buf[p].imag;
-+ tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
-+ tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
-+ buf[p].real = tmp_a_r + tmp_b_r;
-+ buf[p].imag = tmp_a_i + tmp_b_i;
-+ buf[q].real = tmp_a_r - tmp_b_r;
-+ buf[q].imag = tmp_a_i - tmp_b_i;
-+ }
-+ }
-+ }
+#endif
- /* Post IFFT complex multiply plus IFFT complex conjugate*/
- for( i=0; i < 128; i++) {
- /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
-@@ -219,12 +369,12 @@
- *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
- *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
- }
--
+
- for(i=0; i< 64; i++) {
- *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
- *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
+ /* Root values for IFFT */
+ static sample_t roots16[3];
+ static sample_t roots32[7];
+@@ -245,7 +322,7 @@
+ ifft_pass (buf, roots128 - 32, 32);
}
--
-+
- /* The trailing edge of the window goes into the delay line */
- delay_ptr = delay;
-@@ -232,13 +382,717 @@
- *delay_ptr++ = -buf[64+i].real * *--window_ptr;
- *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
- }
--
-+
- for(i=0; i<64; i++) {
- *delay_ptr++ = buf[i].imag * *--window_ptr;
- *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
+-void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias)
++void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias)
+ {
+ int i, k;
+ sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2;
+@@ -289,6 +366,714 @@
}
}
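The rename of a52_imdct_512() to imdct_do_512() above pairs with the function-pointer declaration added near the top of imdct.c: a52_imdct_512 becomes a pointer that a52_imdct_init() aims at the C, 3DNow! or SSE transform once the CPU has been probed. A simplified sketch of that dispatch (the imdct_do_512_sse name and the exact selection order are assumptions inferred from the surrounding patch, not quoted from it):

    #include <inttypes.h>
    #include "a52.h"        /* sample_t */
    #include "mm_accel.h"   /* MM_ACCEL_X86_* flags */

    extern void imdct_do_512       (sample_t * data, sample_t * delay, sample_t bias);
    extern void imdct_do_512_3dnow (sample_t * data, sample_t * delay, sample_t bias);
    extern void imdct_do_512_sse   (sample_t * data, sample_t * delay, sample_t bias);

    void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);

    static void imdct_pick_512 (uint32_t mm_accel)
    {
        a52_imdct_512 = imdct_do_512;              /* portable C fallback */
    #if defined(ARCH_X86) || defined(ARCH_X86_64)
        if (mm_accel & MM_ACCEL_X86_3DNOW)
            a52_imdct_512 = imdct_do_512_3dnow;
        if (mm_accel & MM_ACCEL_X86_SSE)
            a52_imdct_512 = imdct_do_512_sse;      /* SSE preferred when both set */
    #endif
    }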
@@ -1818,8 +1604,8 @@
+ int k;
+ int p,q;
+ int m;
-+ int two_m;
-+ int two_m_plus_one;
++ long two_m;
++ long two_m_plus_one;
+
+ sample_t tmp_b_i;
+ sample_t tmp_b_r;
@@ -2092,7 +1878,7 @@
+
+ data_ptr = data;
+ delay_ptr = delay;
-+ window_ptr = imdct_window;
++ window_ptr = a52_imdct_window;
+
+ /* Window and convert to real valued signal */
+ for(i=0; i< 64; i++) {
@@ -2123,7 +1909,7 @@
+
+// Stuff below this line is borrowed from libac3
+#include "srfftp.h"
-+#ifdef ARCH_X86
++#if defined(ARCH_X86) || defined(ARCH_X86_64)
+#ifndef HAVE_3DNOW
+#define HAVE_3DNOW 1
+#endif
@@ -2144,8 +1930,10 @@
+/* int i,k;
+ int p,q;*/
+ int m;
-+ int two_m;
-+ int two_m_plus_one;
++ long two_m;
++ long two_m_plus_one;
++ long two_m_plus_one_shl3;
++ complex_t *buf_offset;
+
+/* sample_t tmp_a_i;
+ sample_t tmp_a_r;
@@ -2162,33 +1950,33 @@
+ /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
+ /* Bit reversed shuffling */
+ asm volatile(
-+ "xorl %%esi, %%esi \n\t"
-+ "leal "MANGLE(bit_reverse_512)", %%eax \n\t"
-+ "movl $1008, %%edi \n\t"
-+ "pushl %%ebp \n\t" //use ebp without telling gcc
-+ ".balign 16 \n\t"
++ "xor %%"REG_S", %%"REG_S" \n\t"
++ "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
++ "mov $1008, %%"REG_D" \n\t"
++ "push %%"REG_BP" \n\t" //use ebp without telling gcc
++ ASMALIGN16
+ "1: \n\t"
-+ "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI
-+ "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI
-+ "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi
-+ "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi
++ "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI
++ "movhps 8(%0, %%"REG_D"), %%xmm0 \n\t" // RXXI
++ "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // XXXi
++ "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi
+ "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
-+ "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t"
++ "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
+ "mulps %%xmm0, %%xmm2 \n\t"
+ "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
-+ "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
++ "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
+ "subps %%xmm0, %%xmm2 \n\t"
-+ "movzbl (%%eax), %%edx \n\t"
-+ "movzbl 1(%%eax), %%ebp \n\t"
-+ "movlps %%xmm2, (%1, %%edx,8) \n\t"
-+ "movhps %%xmm2, (%1, %%ebp,8) \n\t"
-+ "addl $16, %%esi \n\t"
-+ "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap
-+ "subl $16, %%edi \n\t"
++ "movzb (%%"REG_a"), %%"REG_d" \n\t"
++ "movzb 1(%%"REG_a"), %%"REG_BP" \n\t"
++ "movlps %%xmm2, (%1, %%"REG_d", 8) \n\t"
++ "movhps %%xmm2, (%1, %%"REG_BP", 8) \n\t"
++ "add $16, %%"REG_S" \n\t"
++ "add $2, %%"REG_a" \n\t" // avoid complex addressing for P4 crap
++ "sub $16, %%"REG_D" \n\t"
+ " jnc 1b \n\t"
-+ "popl %%ebp \n\t"//no we didnt touch ebp *g*
++ "pop %%"REG_BP" \n\t"//no we didnt touch ebp *g*
+ :: "b" (data), "c" (buf)
-+ : "%esi", "%edi", "%eax", "%edx"
++ : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
+ );
+
+
@@ -2224,44 +2012,44 @@
+ asm volatile(
+ "xorps %%xmm1, %%xmm1 \n\t"
+ "xorps %%xmm2, %%xmm2 \n\t"
-+ "movl %0, %%esi \n\t"
-+ ".balign 16 \n\t"
++ "mov %0, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movlps (%%esi), %%xmm0 \n\t" //buf[p]
-+ "movlps 8(%%esi), %%xmm1\n\t" //buf[q]
-+ "movhps (%%esi), %%xmm0 \n\t" //buf[p]
-+ "movhps 8(%%esi), %%xmm2\n\t" //buf[q]
++ "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
++ "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
++ "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
++ "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
+ "addps %%xmm1, %%xmm0 \n\t"
+ "subps %%xmm2, %%xmm0 \n\t"
-+ "movaps %%xmm0, (%%esi) \n\t"
-+ "addl $16, %%esi \n\t"
-+ "cmpl %1, %%esi \n\t"
++ "movaps %%xmm0, (%%"REG_S")\n\t"
++ "add $16, %%"REG_S" \n\t"
++ "cmp %1, %%"REG_S" \n\t"
+ " jb 1b \n\t"
+ :: "g" (buf), "r" (buf + 128)
-+ : "%esi"
++ : "%"REG_S
+ );
+
+ /* 2. iteration */
+ // Note w[1]={{1,0}, {0,-1}}
+ asm volatile(
+ "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
-+ "movl %0, %%esi \n\t"
-+ ".balign 16 \n\t"
++ "mov %0, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3
++ "movaps 16(%%"REG_S"), %%xmm2 \n\t" //r2,i2,r3,i3
+ "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
+ "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
-+ "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
-+ "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1
++ "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
++ "movaps (%%"REG_S"), %%xmm1 \n\t" //r0,i0,r1,i1
+ "addps %%xmm2, %%xmm0 \n\t"
+ "subps %%xmm2, %%xmm1 \n\t"
-+ "movaps %%xmm0, (%%esi) \n\t"
-+ "movaps %%xmm1, 16(%%esi) \n\t"
-+ "addl $32, %%esi \n\t"
-+ "cmpl %1, %%esi \n\t"
++ "movaps %%xmm0, (%%"REG_S") \n\t"
++ "movaps %%xmm1, 16(%%"REG_S") \n\t"
++ "add $32, %%"REG_S" \n\t"
++ "cmp %1, %%"REG_S" \n\t"
+ " jb 1b \n\t"
+ :: "g" (buf), "r" (buf + 128)
-+ : "%esi"
++ : "%"REG_S
+ );
+
+ /* 3. iteration */
@@ -2276,11 +2064,11 @@
+ "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
+ "xorps %%xmm5, %%xmm5 \n\t"
+ "xorps %%xmm2, %%xmm2 \n\t"
-+ "movl %0, %%esi \n\t"
-+ ".balign 16 \n\t"
++ "mov %0, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5
-+ "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7
++ "movaps 32(%%"REG_S"), %%xmm2 \n\t" //r4,i4,r5,i5
++ "movaps 48(%%"REG_S"), %%xmm3 \n\t" //r6,i6,r7,i7
+ "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
+ "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
+ "mulps %%xmm2, %%xmm4 \n\t"
@@ -2289,8 +2077,8 @@
+ "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
+ "mulps %%xmm6, %%xmm3 \n\t"
+ "mulps %%xmm7, %%xmm2 \n\t"
-+ "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
-+ "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3
++ "movaps (%%"REG_S"), %%xmm0 \n\t" //r0,i0,r1,i1
++ "movaps 16(%%"REG_S"), %%xmm1 \n\t" //r2,i2,r3,i3
+ "addps %%xmm4, %%xmm2 \n\t"
+ "addps %%xmm5, %%xmm3 \n\t"
+ "movaps %%xmm2, %%xmm4 \n\t"
@@ -2299,125 +2087,127 @@
+ "addps %%xmm1, %%xmm3 \n\t"
+ "subps %%xmm4, %%xmm0 \n\t"
+ "subps %%xmm5, %%xmm1 \n\t"
-+ "movaps %%xmm2, (%%esi) \n\t"
-+ "movaps %%xmm3, 16(%%esi) \n\t"
-+ "movaps %%xmm0, 32(%%esi) \n\t"
-+ "movaps %%xmm1, 48(%%esi) \n\t"
-+ "addl $64, %%esi \n\t"
-+ "cmpl %1, %%esi \n\t"
++ "movaps %%xmm2, (%%"REG_S") \n\t"
++ "movaps %%xmm3, 16(%%"REG_S") \n\t"
++ "movaps %%xmm0, 32(%%"REG_S") \n\t"
++ "movaps %%xmm1, 48(%%"REG_S") \n\t"
++ "add $64, %%"REG_S" \n\t"
++ "cmp %1, %%"REG_S" \n\t"
+ " jb 1b \n\t"
+ :: "g" (buf), "r" (buf + 128)
-+ : "%esi"
++ : "%"REG_S
+ );
+
+ /* 4-7. iterations */
+ for (m=3; m < 7; m++) {
+ two_m = (1 << m);
+ two_m_plus_one = two_m<<1;
++ two_m_plus_one_shl3 = (two_m_plus_one<<3);
++ buf_offset = buf+128;
+ asm volatile(
-+ "movl %0, %%esi \n\t"
-+ ".balign 16 \n\t"
++ "mov %0, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "xorl %%edi, %%edi \n\t" // k
-+ "leal (%%esi, %3), %%edx \n\t"
++ "xor %%"REG_D", %%"REG_D" \n\t" // k
++ "lea (%%"REG_S", %3), %%"REG_d" \n\t"
+ "2: \n\t"
-+ "movaps (%%edx, %%edi), %%xmm1 \n\t"
-+ "movaps (%4, %%edi, 2), %%xmm2 \n\t"
++ "movaps (%%"REG_d", %%"REG_D"), %%xmm1 \n\t"
++ "movaps (%4, %%"REG_D", 2), %%xmm2 \n\t"
+ "mulps %%xmm1, %%xmm2 \n\t"
+ "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
-+ "mulps 16(%4, %%edi, 2), %%xmm1 \n\t"
-+ "movaps (%%esi, %%edi), %%xmm0 \n\t"
++ "mulps 16(%4, %%"REG_D", 2), %%xmm1 \n\t"
++ "movaps (%%"REG_S", %%"REG_D"), %%xmm0 \n\t"
+ "addps %%xmm2, %%xmm1 \n\t"
+ "movaps %%xmm1, %%xmm2 \n\t"
+ "addps %%xmm0, %%xmm1 \n\t"
+ "subps %%xmm2, %%xmm0 \n\t"
-+ "movaps %%xmm1, (%%esi, %%edi) \n\t"
-+ "movaps %%xmm0, (%%edx, %%edi) \n\t"
-+ "addl $16, %%edi \n\t"
-+ "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0
++ "movaps %%xmm1, (%%"REG_S", %%"REG_D") \n\t"
++ "movaps %%xmm0, (%%"REG_d", %%"REG_D") \n\t"
++ "add $16, %%"REG_D" \n\t"
++ "cmp %3, %%"REG_D" \n\t" //FIXME (opt) count against 0
+ " jb 2b \n\t"
-+ "addl %2, %%esi \n\t"
-+ "cmpl %1, %%esi \n\t"
++ "add %2, %%"REG_S" \n\t"
++ "cmp %1, %%"REG_S" \n\t"
+ " jb 1b \n\t"
-+ :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3),
++ :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
+ "r" (sseW[m])
-+ : "%esi", "%edi", "%edx"
++ : "%"REG_S, "%"REG_D, "%"REG_d
+ );
+ }
+
+ /* Post IFFT complex multiply plus IFFT complex conjugate*/
+ asm volatile(
-+ "movl $-1024, %%esi \n\t"
-+ ".balign 16 \n\t"
++ "mov $-1024, %%"REG_S" \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movaps (%0, %%esi), %%xmm0 \n\t"
-+ "movaps (%0, %%esi), %%xmm1 \n\t"
++ "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
++ "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
+ "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
-+ "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t"
-+ "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
++ "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
++ "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
+ "addps %%xmm1, %%xmm0 \n\t"
-+ "movaps %%xmm0, (%0, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
++ "movaps %%xmm0, (%0, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
+ " jnz 1b \n\t"
+ :: "r" (buf+128)
-+ : "%esi"
++ : "%"REG_S
+ );
+
+
+ data_ptr = data;
+ delay_ptr = delay;
-+ window_ptr = imdct_window;
++ window_ptr = a52_imdct_window;
+
+ /* Window and convert to real valued signal */
+ asm volatile(
-+ "xorl %%edi, %%edi \n\t" // 0
-+ "xorl %%esi, %%esi \n\t" // 0
++ "xor %%"REG_D", %%"REG_D" \n\t" // 0
++ "xor %%"REG_S", %%"REG_S" \n\t" // 0
+ "movss %3, %%xmm2 \n\t" // bias
+ "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
-+ ".balign 16 \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
-+ "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
-+ "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
-+ "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
++ "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
++ "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
++ "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
++ "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
+ "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
-+ "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
-+ "addps (%2, %%esi), %%xmm0 \n\t"
++ "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
++ "addps (%2, %%"REG_S"), %%xmm0 \n\t"
+ "addps %%xmm2, %%xmm0 \n\t"
-+ "movaps %%xmm0, (%1, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
-+ "subl $16, %%edi \n\t"
-+ "cmpl $512, %%esi \n\t"
++ "movaps %%xmm0, (%1, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
++ "sub $16, %%"REG_D" \n\t"
++ "cmp $512, %%"REG_S" \n\t"
+ " jb 1b \n\t"
+ :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
-+ : "%esi", "%edi"
++ : "%"REG_S, "%"REG_D
+ );
+ data_ptr+=128;
+ delay_ptr+=128;
+// window_ptr+=128;
+
+ asm volatile(
-+ "movl $1024, %%edi \n\t" // 512
-+ "xorl %%esi, %%esi \n\t" // 0
++ "mov $1024, %%"REG_D" \n\t" // 512
++ "xor %%"REG_S", %%"REG_S" \n\t" // 0
+ "movss %3, %%xmm2 \n\t" // bias
+ "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
-+ ".balign 16 \n\t"
++ ASMALIGN16
+ "1: \n\t"
-+ "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
-+ "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
-+ "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
-+ "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
++ "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
++ "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
++ "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
++ "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
+ "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
-+ "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
-+ "addps (%2, %%esi), %%xmm0 \n\t"
++ "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
++ "addps (%2, %%"REG_S"), %%xmm0 \n\t"
+ "addps %%xmm2, %%xmm0 \n\t"
-+ "movaps %%xmm0, (%1, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
-+ "subl $16, %%edi \n\t"
-+ "cmpl $512, %%esi \n\t"
++ "movaps %%xmm0, (%1, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
++ "sub $16, %%"REG_D" \n\t"
++ "cmp $512, %%"REG_S" \n\t"
+ " jb 1b \n\t"
+ :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
-+ : "%esi", "%edi"
++ : "%"REG_S, "%"REG_D
+ );
+ data_ptr+=128;
+// window_ptr+=128;
@@ -2426,79 +2216,83 @@
+ delay_ptr = delay;
+
+ asm volatile(
-+ "xorl %%edi, %%edi \n\t" // 0
-+ "xorl %%esi, %%esi \n\t" // 0
-+ ".balign 16 \n\t"
++ "xor %%"REG_D", %%"REG_D" \n\t" // 0
++ "xor %%"REG_S", %%"REG_S" \n\t" // 0
++ ASMALIGN16
+ "1: \n\t"
-+ "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
-+ "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
-+ "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
-+ "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
++ "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? ? A
++ "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? ? C
++ "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // D ? ? C
++ "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // B ? ? A
+ "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
-+ "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
-+ "movaps %%xmm0, (%1, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
-+ "subl $16, %%edi \n\t"
-+ "cmpl $512, %%esi \n\t"
++ "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
++ "movaps %%xmm0, (%1, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
++ "sub $16, %%"REG_D" \n\t"
++ "cmp $512, %%"REG_S" \n\t"
+ " jb 1b \n\t"
+ :: "r" (buf+64), "r" (delay_ptr)
-+ : "%esi", "%edi"
++ : "%"REG_S, "%"REG_D
+ );
+ delay_ptr+=128;
+// window_ptr-=128;
+
+ asm volatile(
-+ "movl $1024, %%edi \n\t" // 1024
-+ "xorl %%esi, %%esi \n\t" // 0
-+ ".balign 16 \n\t"
++ "mov $1024, %%"REG_D" \n\t" // 1024
++ "xor %%"REG_S", %%"REG_S" \n\t" // 0
++ ASMALIGN16
+ "1: \n\t"
-+ "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
-+ "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
-+ "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
-+ "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
++ "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
++ "movlps 8(%0, %%"REG_S"), %%xmm1 \n\t" // ? ? C ?
++ "movhps -16(%0, %%"REG_D"), %%xmm1 \n\t" // ? D C ?
++ "movhps -8(%0, %%"REG_D"), %%xmm0 \n\t" // ? B A ?
+ "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
-+ "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
-+ "movaps %%xmm0, (%1, %%esi) \n\t"
-+ "addl $16, %%esi \n\t"
-+ "subl $16, %%edi \n\t"
-+ "cmpl $512, %%esi \n\t"
++ "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
++ "movaps %%xmm0, (%1, %%"REG_S") \n\t"
++ "add $16, %%"REG_S" \n\t"
++ "sub $16, %%"REG_D" \n\t"
++ "cmp $512, %%"REG_S" \n\t"
+ " jb 1b \n\t"
+ :: "r" (buf), "r" (delay_ptr)
-+ : "%esi", "%edi"
++ : "%"REG_S, "%"REG_D
+ );
+}
-+#endif //arch_x86
++#endif // ARCH_X86 || ARCH_X86_64
+
- void
- imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
+ void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias)
{
-@@ -379,13 +1233,19 @@
- {
- int i, j, k;
+ int i, k;
+@@ -368,7 +1153,7 @@
-- fprintf (stderr, "No accelerated IMDCT transform found\n");
--
- /* Twiddle factors to turn IFFT into IMDCT */
- for (i = 0; i < 128; i++) {
- xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
- xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
+ void a52_imdct_init (uint32_t mm_accel)
+ {
+- int i, k;
++ int i, j, k;
+ double sum;
+
+ /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */
+@@ -420,6 +1205,99 @@
+ post2[i].real = cos ((M_PI / 128) * (i + 0.5));
+ post2[i].imag = sin ((M_PI / 128) * (i + 0.5));
}
-+#ifdef ARCH_X86
++ for (i = 0; i < 128; i++) {
++ xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
++ xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
++ }
++ for (i = 0; i < 7; i++) {
++ j = 1 << i;
++ for (k = 0; k < j; k++) {
++ w[i][k].real = cos (-M_PI * k / j);
++ w[i][k].imag = sin (-M_PI * k / j);
++ }
++ }
++#if defined(ARCH_X86) || defined(ARCH_X86_64)
+ for (i = 0; i < 128; i++) {
+ sseSinCos1c[2*i+0]= xcos1[i];
+ sseSinCos1c[2*i+1]= -xcos1[i];
+ sseSinCos1d[2*i+0]= xsin1[i];
+ sseSinCos1d[2*i+1]= xsin1[i];
+ }
-+#endif
-
- /* More twiddle factors to turn IFFT into IMDCT */
- for (i = 0; i < 64; i++) {
-@@ -400,7 +1260,334 @@
- w[i][k].imag = sin (-M_PI * k / j);
- }
- }
-+#ifdef ARCH_X86
+ for (i = 1; i < 7; i++) {
+ j = 1 << i;
+ for (k = 0; k < j; k+=2) {
@@ -2530,351 +2324,64 @@
+
+ for(i=0; i<128; i++)
+ {
-+ sseWindow[2*i+0]= -imdct_window[2*i+0];
-+ sseWindow[2*i+1]= imdct_window[2*i+1];
++ sseWindow[2*i+0]= -a52_imdct_window[2*i+0];
++ sseWindow[2*i+1]= a52_imdct_window[2*i+1];
+ }
+
+ for(i=0; i<64; i++)
+ {
-+ sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
-+ sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
-+ sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
-+ sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
++ sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1];
++ sseWindow[256 + 2*i+1]= a52_imdct_window[254 - 2*i+0];
++ sseWindow[384 + 2*i+0]= a52_imdct_window[126 - 2*i+1];
++ sseWindow[384 + 2*i+1]= -a52_imdct_window[126 - 2*i+0];
+ }
-+#endif // arch_x86
++#endif
++ a52_imdct_512 = imdct_do_512;
++ ifft128 = ifft128_c;
++ ifft64 = ifft64_c;
+
- imdct_512 = imdct_do_512;
-+#ifdef ARCH_X86
++#if defined(ARCH_X86) || defined(ARCH_X86_64)
+ if(mm_accel & MM_ACCEL_X86_SSE)
+ {
+ fprintf (stderr, "Using SSE optimized IMDCT transform\n");
-+ imdct_512 = imdct_do_512_sse;
++ a52_imdct_512 = imdct_do_512_sse;
+ }
+ else
+ if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
+ {
+ fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
-+ imdct_512 = imdct_do_512_3dnowex;
++ a52_imdct_512 = imdct_do_512_3dnowex;
+ }
+ else
+ if(mm_accel & MM_ACCEL_X86_3DNOW)
+ {
+ fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
-+ imdct_512 = imdct_do_512_3dnow;
++ a52_imdct_512 = imdct_do_512_3dnow;
+ }
+ else
-+#endif // arch_x86
++#endif // ARCH_X86 || ARCH_X86_64
+#ifdef HAVE_ALTIVEC
+ if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
+ {
+ fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
-+ imdct_512 = imdct_do_512_altivec;
++ a52_imdct_512 = imdct_do_512_altivec;
+ }
+ else
+#endif
-+ fprintf (stderr, "No accelerated IMDCT transform found\n");
- imdct_256 = imdct_do_256;
+
+ #ifdef LIBA52_DJBFFT
+ if (mm_accel & MM_ACCEL_DJBFFT) {
+@@ -430,7 +1308,5 @@
+ #endif
+ {
+ fprintf (stderr, "No accelerated IMDCT transform found\n");
+- ifft128 = ifft128_c;
+- ifft64 = ifft64_c;
}
}
-+
-+static void fft_asmb(int k, complex_t *x, complex_t *wTB,
-+ const complex_t *d, const complex_t *d_3)
-+{
-+ register complex_t *x2k, *x3k, *x4k, *wB;
-+ register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
-+
-+ x2k = x + 2 * k;
-+ x3k = x2k + 2 * k;
-+ x4k = x3k + 2 * k;
-+ wB = wTB + 2 * k;
-+
-+ TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
-+ TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
-+
-+ --k;
-+ for(;;) {
-+ TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
-+ TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
-+ if (!--k) break;
-+ x += 2;
-+ x2k += 2;
-+ x3k += 2;
-+ x4k += 2;
-+ d += 2;
-+ d_3 += 2;
-+ wTB += 2;
-+ wB += 2;
-+ }
-+
-+}
-+
-+static void fft_asmb16(complex_t *x, complex_t *wTB)
-+{
-+ register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
-+ int k = 2;
-+
-+ /* transform x[0], x[8], x[4], x[12] */
-+ TRANSZERO(x[0],x[4],x[8],x[12]);
-+
-+ /* transform x[1], x[9], x[5], x[13] */
-+ TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
-+
-+ /* transform x[2], x[10], x[6], x[14] */
-+ TRANSHALF_16(x[2],x[6],x[10],x[14]);
-+
-+ /* transform x[3], x[11], x[7], x[15] */
-+ TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
-+
-+}
-+
-+static void fft_4(complex_t *x)
-+{
-+ /* delta_p = 1 here */
-+ /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
-+ */
-+
-+ register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
-+
-+ yt_r = x[0].real;
-+ yb_r = yt_r - x[2].real;
-+ yt_r += x[2].real;
-+
-+ u_r = x[1].real;
-+ vi_i = x[3].real - u_r;
-+ u_r += x[3].real;
-+
-+ u_i = x[1].imag;
-+ vi_r = u_i - x[3].imag;
-+ u_i += x[3].imag;
-+
-+ yt_i = yt_r;
-+ yt_i += u_r;
-+ x[0].real = yt_i;
-+ yt_r -= u_r;
-+ x[2].real = yt_r;
-+ yt_i = yb_r;
-+ yt_i += vi_r;
-+ x[1].real = yt_i;
-+ yb_r -= vi_r;
-+ x[3].real = yb_r;
-+
-+ yt_i = x[0].imag;
-+ yb_i = yt_i - x[2].imag;
-+ yt_i += x[2].imag;
-+
-+ yt_r = yt_i;
-+ yt_r += u_i;
-+ x[0].imag = yt_r;
-+ yt_i -= u_i;
-+ x[2].imag = yt_i;
-+ yt_r = yb_i;
-+ yt_r += vi_i;
-+ x[1].imag = yt_r;
-+ yb_i -= vi_i;
-+ x[3].imag = yb_i;
-+}
-+
-+
-+static void fft_8(complex_t *x)
-+{
-+ /* delta_p = diag{1, sqrt(i)} here */
-+ /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
-+ */
-+ register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
-+
-+ wT1_r = x[1].real;
-+ wT1_i = x[1].imag;
-+ wB1_r = x[3].real;
-+ wB1_i = x[3].imag;
-+
-+ x[1] = x[2];
-+ x[2] = x[4];
-+ x[3] = x[6];
-+ fft_4(&x[0]);
-+
-+
-+ /* x[0] x[4] */
-+ wT2_r = x[5].real;
-+ wT2_r += x[7].real;
-+ wT2_r += wT1_r;
-+ wT2_r += wB1_r;
-+ wT2_i = wT2_r;
-+ wT2_r += x[0].real;
-+ wT2_i = x[0].real - wT2_i;
-+ x[0].real = wT2_r;
-+ x[4].real = wT2_i;
-+
-+ wT2_i = x[5].imag;
-+ wT2_i += x[7].imag;
-+ wT2_i += wT1_i;
-+ wT2_i += wB1_i;
-+ wT2_r = wT2_i;
-+ wT2_r += x[0].imag;
-+ wT2_i = x[0].imag - wT2_i;
-+ x[0].imag = wT2_r;
-+ x[4].imag = wT2_i;
-+
-+ /* x[2] x[6] */
-+ wT2_r = x[5].imag;
-+ wT2_r -= x[7].imag;
-+ wT2_r += wT1_i;
-+ wT2_r -= wB1_i;
-+ wT2_i = wT2_r;
-+ wT2_r += x[2].real;
-+ wT2_i = x[2].real - wT2_i;
-+ x[2].real = wT2_r;
-+ x[6].real = wT2_i;
-+
-+ wT2_i = x[5].real;
-+ wT2_i -= x[7].real;
-+ wT2_i += wT1_r;
-+ wT2_i -= wB1_r;
-+ wT2_r = wT2_i;
-+ wT2_r += x[2].imag;
-+ wT2_i = x[2].imag - wT2_i;
-+ x[2].imag = wT2_i;
-+ x[6].imag = wT2_r;
-+
-+
-+ /* x[1] x[5] */
-+ wT2_r = wT1_r;
-+ wT2_r += wB1_i;
-+ wT2_r -= x[5].real;
-+ wT2_r -= x[7].imag;
-+ wT2_i = wT1_i;
-+ wT2_i -= wB1_r;
-+ wT2_i -= x[5].imag;
-+ wT2_i += x[7].real;
-+
-+ wB2_r = wT2_r;
-+ wB2_r += wT2_i;
-+ wT2_i -= wT2_r;
-+ wB2_r *= HSQRT2;
-+ wT2_i *= HSQRT2;
-+ wT2_r = wB2_r;
-+ wB2_r += x[1].real;
-+ wT2_r = x[1].real - wT2_r;
-+
-+ wB2_i = x[5].real;
-+ x[1].real = wB2_r;
-+ x[5].real = wT2_r;
-+
-+ wT2_r = wT2_i;
-+ wT2_r += x[1].imag;
-+ wT2_i = x[1].imag - wT2_i;
-+ wB2_r = x[5].imag;
-+ x[1].imag = wT2_r;
-+ x[5].imag = wT2_i;
-+
-+ /* x[3] x[7] */
-+ wT1_r -= wB1_i;
-+ wT1_i += wB1_r;
-+ wB1_r = wB2_i - x[7].imag;
-+ wB1_i = wB2_r + x[7].real;
-+ wT1_r -= wB1_r;
-+ wT1_i -= wB1_i;
-+ wB1_r = wT1_r + wT1_i;
-+ wB1_r *= HSQRT2;
-+ wT1_i -= wT1_r;
-+ wT1_i *= HSQRT2;
-+ wB2_r = x[3].real;
-+ wB2_i = wB2_r + wT1_i;
-+ wB2_r -= wT1_i;
-+ x[3].real = wB2_i;
-+ x[7].real = wB2_r;
-+ wB2_i = x[3].imag;
-+ wB2_r = wB2_i + wB1_r;
-+ wB2_i -= wB1_r;
-+ x[3].imag = wB2_i;
-+ x[7].imag = wB2_r;
-+}
-+
-+
-+static void fft_128p(complex_t *a)
-+{
-+ fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
-+ fft_asmb16(&a[0], &a[8]);
-+
-+ fft_8(&a[16]), fft_8(&a[24]);
-+ fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
-+
-+ fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
-+ fft_asmb16(&a[32], &a[40]);
-+
-+ fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
-+ fft_asmb16(&a[48], &a[56]);
-+
-+ fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
-+
-+ fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);
-+ /* fft_16(&a[64]); */
-+ fft_asmb16(&a[64], &a[72]);
-+
-+ fft_8(&a[80]); fft_8(&a[88]);
-+
-+ /* fft_32(&a[64]); */
-+ fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
-+
-+ fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);
-+ /* fft_16(&a[96]); */
-+ fft_asmb16(&a[96], &a[104]);
-+
-+ fft_8(&a[112]), fft_8(&a[120]);
-+ /* fft_32(&a[96]); */
-+ fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
-+
-+ /* fft_128(&a[0]); */
-+ fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
-+}
-+
-+
-+
---- liba52/imdct_mlib.c 2005-03-22 19:59:35.000000000 +0100
-+++ imdct_mlib.c 2004-03-19 01:15:51.000000000 +0100
-@@ -23,11 +29,11 @@
-
- #ifdef LIBA52_MLIB
-
--#include <inttypes.h>
--#include <string.h>
- #include <mlib_types.h>
- #include <mlib_status.h>
- #include <mlib_signal.h>
-+#include <string.h>
-+#include <inttypes.h>
-
- #include "a52.h"
- #include "a52_internal.h"
-@@ -42,7 +48,7 @@
- sample_t *data_ptr;
- sample_t *delay_ptr;
- sample_t *window_ptr;
-- sample_t tmp[256] __attribute__ ((__aligned__ (16)));
-+ sample_t tmp[256] __attribute__((aligned(16)));
- int i;
-
- memcpy(tmp, data, 256 * sizeof(sample_t));
-@@ -91,7 +97,7 @@
- sample_t *data_ptr;
- sample_t *delay_ptr;
- sample_t *window_ptr;
-- sample_t tmp[256] __attribute__ ((__aligned__ (16)));
-+ sample_t tmp[256] __attribute__((aligned(16)));
- int i;
-
- memcpy(tmp, data, 256 * sizeof(sample_t));
---- include/mm_accel.h 2005-03-22 19:58:53.000000000 +0100
-+++ mm_accel.h 2004-03-19 01:15:52.000000000 +0100
-@@ -19,12 +25,22 @@
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-+#ifndef MM_ACCEL_H
-+#define MM_ACCEL_H
-+
- /* generic accelerations */
- #define MM_ACCEL_MLIB 0x00000001
-
+--- liba52-0.7.4/mm_accel.h 2006-06-12 15:05:00.000000000 +0200
++++ liba52/mm_accel.h 2006-06-05 02:23:04.000000000 +0200
+@@ -30,7 +34,12 @@
/* x86 accelerations */
#define MM_ACCEL_X86_MMX 0x80000000
#define MM_ACCEL_X86_3DNOW 0x40000000
@@ -2886,20 +2393,18 @@
+#define MM_ACCEL_PPC_ALTIVEC 0x00010000
uint32_t mm_accel (void);
-+
-+#endif /* MM_ACCEL_H */
---- liba52/parse.c 2005-03-22 19:59:35.000000000 +0100
-+++ parse.c 2004-04-01 15:41:29.000000000 +0200
-@@ -21,21 +27,19 @@
+--- liba52-0.7.4/parse.c 2006-06-12 15:05:07.000000000 +0200
++++ liba52/parse.c 2006-06-12 14:51:33.000000000 +0200
+@@ -24,6 +28,7 @@
#include "config.h"
--#include <inttypes.h>
#include <stdlib.h>
++#include <stdio.h>
#include <string.h>
-+#include <inttypes.h>
+ #include <inttypes.h>
- #include "a52.h"
+@@ -31,13 +36,11 @@
#include "a52_internal.h"
#include "bitstream.h"
#include "tables.h"
@@ -2914,110 +2419,97 @@
#endif
typedef struct {
-@@ -54,12 +58,28 @@
- sample_t * samples;
- int i;
+@@ -61,6 +64,21 @@
+ return NULL;
-- imdct_init (mm_accel);
--
- samples = memalign (16, 256 * 12 * sizeof (sample_t));
+ state->samples = memalign (16, 256 * 12 * sizeof (sample_t));
+#if defined(__MINGW32__) && defined(HAVE_SSE)
+ for(i=0;i<10;i++){
-+ if((int)samples%16){
++ if((int)state->samples%16){
+ sample_t* samplestmp=malloc(256 * 12 * sizeof (sample_t));
-+ free(samples);
-+ samples = samplestmp;
++ free(state->samples);
++ state->samples = samplestmp;
+ }
+ else break;
+ }
+#endif
-+ if(((int)samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){
++ if(((int)state->samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){
+ mm_accel &=~MM_ACCEL_X86_SSE;
-+ printf("liba52: unable to get 16 byte aligned memory disabling usage of SSE instructions\n");
++ fprintf(stderr, "liba52: unable to get 16 byte aligned memory disabling usage of SSE instructions\n");
+ }
+
- if (samples == NULL)
-- return NULL;
--
-+ return NULL;
-+
-+ imdct_init (mm_accel);
+ if (state->samples == NULL) {
+ free (state);
+ return NULL;
+@@ -74,6 +92,7 @@
+ state->lfsr_state = 1;
+
+ a52_imdct_init (mm_accel);
+ downmix_accel_init(mm_accel);
-+
- for (i = 0; i < 256 * 12; i++)
- samples[i] = 0;
-@@ -124,7 +144,7 @@
+ return state;
+ }
+@@ -141,7 +160,7 @@
state->acmod = acmod = buf[6] >> 5;
- bitstream_set_ptr (buf + 6);
-- bitstream_get (3); /* skip acmod we already parsed */
-+ bitstream_skip (3); /* skip acmod we already parsed */
+ a52_bitstream_set_ptr (state, buf + 6);
+- bitstream_get (state, 3); /* skip acmod we already parsed */
++ bitstream_skip (state, 3); /* skip acmod we already parsed */
- if ((acmod == 2) && (bitstream_get (2) == 2)) /* dsurmod */
+ if ((acmod == 2) && (bitstream_get (state, 2) == 2)) /* dsurmod */
acmod = A52_DOLBY;
-@@ -144,7 +164,7 @@
- if (state->lfeon && (*flags & A52_LFE))
- state->output |= A52_LFE;
- *flags = state->output;
-- // the 2* compensates for differences in imdct
-+ /* the 2* compensates for differences in imdct */
- state->dynrng = state->level = 2 * *level;
- state->bias = bias;
- state->dynrnge = 1;
-@@ -152,28 +172,28 @@
+@@ -172,28 +191,28 @@
chaninfo = !acmod;
do {
-- bitstream_get (5); /* dialnorm */
-+ bitstream_skip (5); /* dialnorm */
- if (bitstream_get (1)) /* compre */
-- bitstream_get (8); /* compr */
-+ bitstream_skip (8); /* compr */
- if (bitstream_get (1)) /* langcode */
-- bitstream_get (8); /* langcod */
-+ bitstream_skip (8); /* langcod */
- if (bitstream_get (1)) /* audprodie */
-- bitstream_get (7); /* mixlevel + roomtyp */
-+ bitstream_skip (7); /* mixlevel + roomtyp */
+- bitstream_get (state, 5); /* dialnorm */
++ bitstream_skip (state, 5); /* dialnorm */
+ if (bitstream_get (state, 1)) /* compre */
+- bitstream_get (state, 8); /* compr */
++ bitstream_skip (state, 8); /* compr */
+ if (bitstream_get (state, 1)) /* langcode */
+- bitstream_get (state, 8); /* langcod */
++ bitstream_skip (state, 8); /* langcod */
+ if (bitstream_get (state, 1)) /* audprodie */
+- bitstream_get (state, 7); /* mixlevel + roomtyp */
++ bitstream_skip (state, 7); /* mixlevel + roomtyp */
} while (chaninfo--);
-- bitstream_get (2); /* copyrightb + origbs */
-+ bitstream_skip (2); /* copyrightb + origbs */
+- bitstream_get (state, 2); /* copyrightb + origbs */
++ bitstream_skip (state, 2); /* copyrightb + origbs */
- if (bitstream_get (1)) /* timecod1e */
-- bitstream_get (14); /* timecod1 */
-+ bitstream_skip (14); /* timecod1 */
- if (bitstream_get (1)) /* timecod2e */
-- bitstream_get (14); /* timecod2 */
-+ bitstream_skip (14); /* timecod2 */
+ if (bitstream_get (state, 1)) /* timecod1e */
+- bitstream_get (state, 14); /* timecod1 */
++ bitstream_skip (state, 14); /* timecod1 */
+ if (bitstream_get (state, 1)) /* timecod2e */
+- bitstream_get (state, 14); /* timecod2 */
++ bitstream_skip (state, 14); /* timecod2 */
- if (bitstream_get (1)) { /* addbsie */
+ if (bitstream_get (state, 1)) { /* addbsie */
int addbsil;
- addbsil = bitstream_get (6);
+ addbsil = bitstream_get (state, 6);
do {
-- bitstream_get (8); /* addbsi */
-+ bitstream_skip (8); /* addbsi */
+- bitstream_get (state, 8); /* addbsi */
++ bitstream_skip (state, 8); /* addbsi */
} while (addbsil--);
}
-@@ -647,7 +667,7 @@
- if (parse_exponents (chexpstr[i], nchgrps, state->fbw_exp[i][0],
- state->fbw_exp[i] + 1))
+@@ -680,7 +699,7 @@
+ state->fbw_expbap[i].exp[0],
+ state->fbw_expbap[i].exp + 1))
return 1;
-- bitstream_get (2); /* gainrng */
-+ bitstream_skip (2); /* gainrng */
+- bitstream_get (state, 2); /* gainrng */
++ bitstream_skip (state, 2); /* gainrng */
}
if (lfeexpstr != EXP_REUSE) {
do_bit_alloc |= 32;
-@@ -729,7 +749,7 @@
- if (bitstream_get (1)) { /* skiple */
- i = bitstream_get (9); /* skipl */
+@@ -755,7 +774,7 @@
+ if (bitstream_get (state, 1)) { /* skiple */
+ i = bitstream_get (state, 9); /* skipl */
while (i--)
-- bitstream_get (8);
-+ bitstream_skip (8);
+- bitstream_get (state, 8);
++ bitstream_skip (state, 8);
}
- if (state->output & A52_LFE)
-
+ samples = state->samples;
diff --git a/liba52/mm_accel.h b/liba52/mm_accel.h
index a20162e23d..e38f00136f 100644
--- a/liba52/mm_accel.h
+++ b/liba52/mm_accel.h
@@ -1,6 +1,6 @@
/*
* mm_accel.h
- * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
*
* This file is part of a52dec, a free ATSC A-52 stream decoder.
@@ -29,7 +29,7 @@
#define MM_ACCEL_H
/* generic accelerations */
-#define MM_ACCEL_MLIB 0x00000001
+#define MM_ACCEL_DJBFFT 0x00000001
/* x86 accelerations */
#define MM_ACCEL_X86_MMX 0x80000000
diff --git a/liba52/parse.c b/liba52/parse.c
index cabfee8ecd..0791123366 100644
--- a/liba52/parse.c
+++ b/liba52/parse.c
@@ -1,6 +1,6 @@
/*
* parse.c
- * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
*
* This file is part of a52dec, a free ATSC A-52 stream decoder.
@@ -28,6 +28,7 @@
#include "config.h"
#include <stdlib.h>
+#include <stdio.h>
#include <string.h>
#include <inttypes.h>
@@ -53,37 +54,52 @@ typedef struct {
static uint8_t halfrate[12] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3};
-sample_t * a52_init (uint32_t mm_accel)
+a52_state_t * a52_init (uint32_t mm_accel)
{
- sample_t * samples;
+ a52_state_t * state;
int i;
- samples = memalign (16, 256 * 12 * sizeof (sample_t));
+ state = malloc (sizeof (a52_state_t));
+ if (state == NULL)
+ return NULL;
+
+ state->samples = memalign (16, 256 * 12 * sizeof (sample_t));
#if defined(__MINGW32__) && defined(HAVE_SSE)
for(i=0;i<10;i++){
- if((int)samples%16){
+ if((int)state->samples%16){
sample_t* samplestmp=malloc(256 * 12 * sizeof (sample_t));
- free(samples);
- samples = samplestmp;
+ free(state->samples);
+ state->samples = samplestmp;
}
else break;
}
#endif
- if(((int)samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){
+ if(((int)state->samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){
mm_accel &=~MM_ACCEL_X86_SSE;
- printf("liba52: unable to get 16 byte aligned memory disabling usage of SSE instructions\n");
+ fprintf(stderr, "liba52: unable to get 16 byte aligned memory disabling usage of SSE instructions\n");
}
- if (samples == NULL)
+ if (state->samples == NULL) {
+ free (state);
return NULL;
+ }
+
+ for (i = 0; i < 256 * 12; i++)
+ state->samples[i] = 0;
+
+ state->downmixed = 1;
+
+ state->lfsr_state = 1;
- imdct_init (mm_accel);
+ a52_imdct_init (mm_accel);
downmix_accel_init(mm_accel);
- for (i = 0; i < 256 * 12; i++)
- samples[i] = 0;
+ return state;
+}
- return samples;
+sample_t * a52_samples (a52_state_t * state)
+{
+ return state->samples;
}
int a52_syncinfo (uint8_t * buf, int * flags,
@@ -117,7 +133,7 @@ int a52_syncinfo (uint8_t * buf, int * flags,
*bit_rate = (bitrate * 1000) >> half;
switch (buf[4] & 0xc0) {
- case 0: /* 48 KHz */
+ case 0:
*sample_rate = 48000 >> half;
return 4 * bitrate;
case 0x40:
@@ -143,21 +159,21 @@ int a52_frame (a52_state_t * state, uint8_t * buf, int * flags,
state->halfrate = halfrate[buf[5] >> 3];
state->acmod = acmod = buf[6] >> 5;
- bitstream_set_ptr (buf + 6);
- bitstream_skip (3); /* skip acmod we already parsed */
+ a52_bitstream_set_ptr (state, buf + 6);
+ bitstream_skip (state, 3); /* skip acmod we already parsed */
- if ((acmod == 2) && (bitstream_get (2) == 2)) /* dsurmod */
+ if ((acmod == 2) && (bitstream_get (state, 2) == 2)) /* dsurmod */
acmod = A52_DOLBY;
if ((acmod & 1) && (acmod != 1))
- state->clev = clev[bitstream_get (2)]; /* cmixlev */
+ state->clev = clev[bitstream_get (state, 2)]; /* cmixlev */
if (acmod & 4)
- state->slev = slev[bitstream_get (2)]; /* surmixlev */
+ state->slev = slev[bitstream_get (state, 2)]; /* surmixlev */
- state->lfeon = bitstream_get (1);
+ state->lfeon = bitstream_get (state, 1);
- state->output = downmix_init (acmod, *flags, level,
+ state->output = a52_downmix_init (acmod, *flags, level,
state->clev, state->slev);
if (state->output < 0)
return 1;
@@ -169,31 +185,34 @@ int a52_frame (a52_state_t * state, uint8_t * buf, int * flags,
state->bias = bias;
state->dynrnge = 1;
state->dynrngcall = NULL;
+ state->cplba.deltbae = DELTA_BIT_NONE;
+ state->ba[0].deltbae = state->ba[1].deltbae = state->ba[2].deltbae =
+ state->ba[3].deltbae = state->ba[4].deltbae = DELTA_BIT_NONE;
chaninfo = !acmod;
do {
- bitstream_skip (5); /* dialnorm */
- if (bitstream_get (1)) /* compre */
- bitstream_skip (8); /* compr */
- if (bitstream_get (1)) /* langcode */
- bitstream_skip (8); /* langcod */
- if (bitstream_get (1)) /* audprodie */
- bitstream_skip (7); /* mixlevel + roomtyp */
+ bitstream_skip (state, 5); /* dialnorm */
+ if (bitstream_get (state, 1)) /* compre */
+ bitstream_skip (state, 8); /* compr */
+ if (bitstream_get (state, 1)) /* langcode */
+ bitstream_skip (state, 8); /* langcod */
+ if (bitstream_get (state, 1)) /* audprodie */
+ bitstream_skip (state, 7); /* mixlevel + roomtyp */
} while (chaninfo--);
- bitstream_skip (2); /* copyrightb + origbs */
+ bitstream_skip (state, 2); /* copyrightb + origbs */
- if (bitstream_get (1)) /* timecod1e */
- bitstream_skip (14); /* timecod1 */
- if (bitstream_get (1)) /* timecod2e */
- bitstream_skip (14); /* timecod2 */
+ if (bitstream_get (state, 1)) /* timecod1e */
+ bitstream_skip (state, 14); /* timecod1 */
+ if (bitstream_get (state, 1)) /* timecod2e */
+ bitstream_skip (state, 14); /* timecod2 */
- if (bitstream_get (1)) { /* addbsie */
+ if (bitstream_get (state, 1)) { /* addbsie */
int addbsil;
- addbsil = bitstream_get (6);
+ addbsil = bitstream_get (state, 6);
do {
- bitstream_skip (8); /* addbsi */
+ bitstream_skip (state, 8); /* addbsi */
} while (addbsil--);
}
@@ -211,13 +230,13 @@ void a52_dynrng (a52_state_t * state,
}
}
-static int parse_exponents (int expstr, int ngrps, uint8_t exponent,
- uint8_t * dest)
+static int parse_exponents (a52_state_t * state, int expstr, int ngrps,
+ uint8_t exponent, uint8_t * dest)
{
int exps;
while (ngrps--) {
- exps = bitstream_get (7);
+ exps = bitstream_get (state, 7);
exponent += exp_1[exps];
if (exponent > 24)
@@ -265,18 +284,18 @@ static int parse_exponents (int expstr, int ngrps, uint8_t exponent,
return 0;
}
-static int parse_deltba (int8_t * deltba)
+static int parse_deltba (a52_state_t * state, int8_t * deltba)
{
int deltnseg, deltlen, delta, j;
memset (deltba, 0, 50);
- deltnseg = bitstream_get (3);
+ deltnseg = bitstream_get (state, 3);
j = 0;
do {
- j += bitstream_get (5);
- deltlen = bitstream_get (4);
- delta = bitstream_get (3);
+ j += bitstream_get (state, 5);
+ deltlen = bitstream_get (state, 4);
+ delta = bitstream_get (state, 3);
delta -= (delta >= 4) ? 3 : 4;
if (!deltlen)
continue;
@@ -293,37 +312,42 @@ static inline int zero_snr_offsets (int nfchans, a52_state_t * state)
{
int i;
- if ((state->csnroffst) || (state->cplinu && state->cplba.fsnroffst) ||
- (state->lfeon && state->lfeba.fsnroffst))
+ if ((state->csnroffst) ||
+ (state->chincpl && state->cplba.bai >> 3) || /* cplinu, fsnroffst */
+ (state->lfeon && state->lfeba.bai >> 3)) /* fsnroffst */
return 0;
for (i = 0; i < nfchans; i++)
- if (state->ba[i].fsnroffst)
+ if (state->ba[i].bai >> 3) /* fsnroffst */
return 0;
return 1;
}
-static inline int16_t dither_gen (void)
+static inline int16_t dither_gen (a52_state_t * state)
{
- static uint16_t lfsr_state = 1;
- int16_t state;
+ int16_t nstate;
- state = dither_lut[lfsr_state >> 8] ^ (lfsr_state << 8);
+ nstate = dither_lut[state->lfsr_state >> 8] ^ (state->lfsr_state << 8);
- lfsr_state = (uint16_t) state;
+ state->lfsr_state = (uint16_t) nstate;
- return state;
+ return nstate;
}
-static void coeff_get (sample_t * coeff, uint8_t * exp, int8_t * bap,
- quantizer_t * quantizer, sample_t level,
- int dither, int end)
+static void coeff_get (a52_state_t * state, sample_t * coeff,
+ expbap_t * expbap, quantizer_t * quantizer,
+ sample_t level, int dither, int end)
{
int i;
+ uint8_t * exp;
+ int8_t * bap;
sample_t factor[25];
for (i = 0; i <= 24; i++)
factor[i] = scale_factor[i] * level;
+ exp = expbap->exp;
+ bap = expbap->bap;
+
for (i = 0; i < end; i++) {
int bapi;
@@ -331,7 +355,7 @@ static void coeff_get (sample_t * coeff, uint8_t * exp, int8_t * bap,
switch (bapi) {
case 0:
if (dither) {
- coeff[i] = dither_gen() * LEVEL_3DB * factor[exp[i]];
+ coeff[i] = dither_gen (state) * LEVEL_3DB * factor[exp[i]];
continue;
} else {
coeff[i] = 0;
@@ -345,7 +369,7 @@ static void coeff_get (sample_t * coeff, uint8_t * exp, int8_t * bap,
} else {
int code;
- code = bitstream_get (5);
+ code = bitstream_get (state, 5);
quantizer->q1_ptr = 1;
quantizer->q1[0] = q_1_2[code];
@@ -361,7 +385,7 @@ static void coeff_get (sample_t * coeff, uint8_t * exp, int8_t * bap,
} else {
int code;
- code = bitstream_get (7);
+ code = bitstream_get (state, 7);
quantizer->q2_ptr = 1;
quantizer->q2[0] = q_2_2[code];
@@ -371,7 +395,7 @@ static void coeff_get (sample_t * coeff, uint8_t * exp, int8_t * bap,
}
case 3:
- coeff[i] = q_3[bitstream_get (3)] * factor[exp[i]];
+ coeff[i] = q_3[bitstream_get (state, 3)] * factor[exp[i]];
continue;
case -3:
@@ -382,7 +406,7 @@ static void coeff_get (sample_t * coeff, uint8_t * exp, int8_t * bap,
} else {
int code;
- code = bitstream_get (7);
+ code = bitstream_get (state, 7);
quantizer->q4_ptr = 0;
quantizer->q4 = q_4_1[code];
@@ -391,11 +415,11 @@ static void coeff_get (sample_t * coeff, uint8_t * exp, int8_t * bap,
}
case 4:
- coeff[i] = q_5[bitstream_get (4)] * factor[exp[i]];
+ coeff[i] = q_5[bitstream_get (state, 4)] * factor[exp[i]];
continue;
default:
- coeff[i] = ((bitstream_get_2 (bapi) << (16 - bapi)) *
+ coeff[i] = ((bitstream_get_2 (state, bapi) << (16 - bapi)) *
factor[exp[i]]);
}
}
@@ -405,19 +429,23 @@ static void coeff_get_coupling (a52_state_t * state, int nfchans,
sample_t * coeff, sample_t (* samples)[256],
quantizer_t * quantizer, uint8_t dithflag[5])
{
- int sub_bnd, bnd, i, i_end, ch;
- int8_t * bap;
+ int cplbndstrc, bnd, i, i_end, ch;
uint8_t * exp;
+ int8_t * bap;
sample_t cplco[5];
- bap = state->cpl_bap;
- exp = state->cpl_exp;
- sub_bnd = bnd = 0;
+ exp = state->cpl_expbap.exp;
+ bap = state->cpl_expbap.bap;
+ bnd = 0;
+ cplbndstrc = state->cplbndstrc;
i = state->cplstrtmant;
while (i < state->cplendmant) {
i_end = i + 12;
- while (state->cplbndstrc[sub_bnd++])
+ while (cplbndstrc & 1) {
+ cplbndstrc >>= 1;
i_end += 12;
+ }
+ cplbndstrc >>= 1;
for (ch = 0; ch < nfchans; ch++)
cplco[ch] = state->cplco[ch][bnd] * coeff[ch];
bnd++;
@@ -431,10 +459,10 @@ static void coeff_get_coupling (a52_state_t * state, int nfchans,
case 0:
cplcoeff = LEVEL_3DB * scale_factor[exp[i]];
for (ch = 0; ch < nfchans; ch++)
- if (state->chincpl[ch]) {
+ if ((state->chincpl >> ch) & 1) {
if (dithflag[ch])
samples[ch][i] = (cplcoeff * cplco[ch] *
- dither_gen ());
+ dither_gen (state));
else
samples[ch][i] = 0;
}
@@ -448,7 +476,7 @@ static void coeff_get_coupling (a52_state_t * state, int nfchans,
} else {
int code;
- code = bitstream_get (5);
+ code = bitstream_get (state, 5);
quantizer->q1_ptr = 1;
quantizer->q1[0] = q_1_2[code];
@@ -464,7 +492,7 @@ static void coeff_get_coupling (a52_state_t * state, int nfchans,
} else {
int code;
- code = bitstream_get (7);
+ code = bitstream_get (state, 7);
quantizer->q2_ptr = 1;
quantizer->q2[0] = q_2_2[code];
@@ -474,7 +502,7 @@ static void coeff_get_coupling (a52_state_t * state, int nfchans,
}
case 3:
- cplcoeff = q_3[bitstream_get (3)];
+ cplcoeff = q_3[bitstream_get (state, 3)];
break;
case -3:
@@ -485,7 +513,7 @@ static void coeff_get_coupling (a52_state_t * state, int nfchans,
} else {
int code;
- code = bitstream_get (7);
+ code = bitstream_get (state, 7);
quantizer->q4_ptr = 0;
quantizer->q4 = q_4_1[code];
@@ -494,23 +522,23 @@ static void coeff_get_coupling (a52_state_t * state, int nfchans,
}
case 4:
- cplcoeff = q_5[bitstream_get (4)];
+ cplcoeff = q_5[bitstream_get (state, 4)];
break;
default:
- cplcoeff = bitstream_get_2 (bapi) << (16 - bapi);
+ cplcoeff = bitstream_get_2 (state, bapi) << (16 - bapi);
}
cplcoeff *= scale_factor[exp[i]];
for (ch = 0; ch < nfchans; ch++)
- if (state->chincpl[ch])
+ if ((state->chincpl >> ch) & 1)
samples[ch][i] = cplcoeff * cplco[ch];
i++;
}
}
}
-int a52_block (a52_state_t * state, sample_t * samples)
+int a52_block (a52_state_t * state)
{
static const uint8_t nfchans_tbl[] = {2, 1, 2, 3, 3, 4, 4, 5, 1, 1, 2};
static int rematrix_band[4] = {25, 37, 61, 253};
@@ -520,21 +548,22 @@ int a52_block (a52_state_t * state, sample_t * samples)
sample_t coeff[5];
int chanbias;
quantizer_t quantizer;
+ sample_t * samples;
nfchans = nfchans_tbl[state->acmod];
for (i = 0; i < nfchans; i++)
- blksw[i] = bitstream_get (1);
+ blksw[i] = bitstream_get (state, 1);
for (i = 0; i < nfchans; i++)
- dithflag[i] = bitstream_get (1);
+ dithflag[i] = bitstream_get (state, 1);
- chaninfo = !(state->acmod);
+ chaninfo = !state->acmod;
do {
- if (bitstream_get (1)) { /* dynrnge */
+ if (bitstream_get (state, 1)) { /* dynrnge */
int dynrng;
- dynrng = bitstream_get_2 (8);
+ dynrng = bitstream_get_2 (state, 8);
if (state->dynrnge) {
sample_t range;
@@ -547,25 +576,25 @@ int a52_block (a52_state_t * state, sample_t * samples)
}
} while (chaninfo--);
- if (bitstream_get (1)) { /* cplstre */
- state->cplinu = bitstream_get (1);
- if (state->cplinu) {
- static int bndtab[16] = {31, 35, 37, 39, 41, 42, 43, 44,
+ if (bitstream_get (state, 1)) { /* cplstre */
+ state->chincpl = 0;
+ if (bitstream_get (state, 1)) { /* cplinu */
+ static uint8_t bndtab[16] = {31, 35, 37, 39, 41, 42, 43, 44,
45, 45, 46, 46, 47, 47, 48, 48};
int cplbegf;
int cplendf;
int ncplsubnd;
for (i = 0; i < nfchans; i++)
- state->chincpl[i] = bitstream_get (1);
+ state->chincpl |= bitstream_get (state, 1) << i;
switch (state->acmod) {
case 0: case 1:
return 1;
case 2:
- state->phsflginu = bitstream_get (1);
+ state->phsflginu = bitstream_get (state, 1);
}
- cplbegf = bitstream_get (4);
- cplendf = bitstream_get (4);
+ cplbegf = bitstream_get (state, 4);
+ cplendf = bitstream_get (state, 4);
if (cplendf + 3 - cplbegf < 0)
return 1;
@@ -574,28 +603,29 @@ int a52_block (a52_state_t * state, sample_t * samples)
state->cplstrtmant = cplbegf * 12 + 37;
state->cplendmant = cplendf * 12 + 73;
- for (i = 0; i < ncplsubnd - 1; i++) {
- state->cplbndstrc[i] = bitstream_get (1);
- state->ncplbnd -= state->cplbndstrc[i];
+ state->cplbndstrc = 0;
+ for (i = 0; i < ncplsubnd - 1; i++)
+ if (bitstream_get (state, 1)) {
+ state->cplbndstrc |= 1 << i;
+ state->ncplbnd--;
}
- state->cplbndstrc[i] = 0; /* last value is a sentinel */
}
}
- if (state->cplinu) {
+ if (state->chincpl) { /* cplinu */
int j, cplcoe;
cplcoe = 0;
for (i = 0; i < nfchans; i++)
- if (state->chincpl[i])
- if (bitstream_get (1)) { /* cplcoe */
+ if ((state->chincpl) >> i & 1)
+ if (bitstream_get (state, 1)) { /* cplcoe */
int mstrcplco, cplcoexp, cplcomant;
cplcoe = 1;
- mstrcplco = 3 * bitstream_get (2);
+ mstrcplco = 3 * bitstream_get (state, 2);
for (j = 0; j < state->ncplbnd; j++) {
- cplcoexp = bitstream_get (4);
- cplcomant = bitstream_get (4);
+ cplcoexp = bitstream_get (state, 4);
+ cplcomant = bitstream_get (state, 4);
if (cplcoexp == 15)
cplcomant <<= 14;
else
@@ -606,37 +636,38 @@ int a52_block (a52_state_t * state, sample_t * samples)
}
if ((state->acmod == 2) && state->phsflginu && cplcoe)
for (j = 0; j < state->ncplbnd; j++)
- if (bitstream_get (1)) /* phsflg */
+ if (bitstream_get (state, 1)) /* phsflg */
state->cplco[1][j] = -state->cplco[1][j];
}
- if ((state->acmod == 2) && (bitstream_get (1))) { /* rematstr */
+ if ((state->acmod == 2) && (bitstream_get (state, 1))) { /* rematstr */
int end;
- end = (state->cplinu) ? state->cplstrtmant : 253;
+ state->rematflg = 0;
+ end = (state->chincpl) ? state->cplstrtmant : 253; /* cplinu */
i = 0;
do
- state->rematflg[i] = bitstream_get (1);
+ state->rematflg |= bitstream_get (state, 1) << i;
while (rematrix_band[i++] < end);
}
cplexpstr = EXP_REUSE;
lfeexpstr = EXP_REUSE;
- if (state->cplinu)
- cplexpstr = bitstream_get (2);
+ if (state->chincpl) /* cplinu */
+ cplexpstr = bitstream_get (state, 2);
for (i = 0; i < nfchans; i++)
- chexpstr[i] = bitstream_get (2);
+ chexpstr[i] = bitstream_get (state, 2);
if (state->lfeon)
- lfeexpstr = bitstream_get (1);
+ lfeexpstr = bitstream_get (state, 1);
for (i = 0; i < nfchans; i++)
if (chexpstr[i] != EXP_REUSE) {
- if (state->cplinu && state->chincpl[i])
+ if ((state->chincpl >> i) & 1)
state->endmant[i] = state->cplstrtmant;
else {
int chbwcod;
- chbwcod = bitstream_get (6);
+ chbwcod = bitstream_get (state, 6);
if (chbwcod > 60)
return 1;
state->endmant[i] = chbwcod * 3 + 73;
@@ -651,9 +682,9 @@ int a52_block (a52_state_t * state, sample_t * samples)
do_bit_alloc = 64;
ncplgrps = ((state->cplendmant - state->cplstrtmant) /
(3 << (cplexpstr - 1)));
- cplabsexp = bitstream_get (4) << 1;
- if (parse_exponents (cplexpstr, ncplgrps, cplabsexp,
- state->cpl_exp + state->cplstrtmant))
+ cplabsexp = bitstream_get (state, 4) << 1;
+ if (parse_exponents (state, cplexpstr, ncplgrps, cplabsexp,
+ state->cpl_expbap.exp + state->cplstrtmant))
return 1;
}
for (i = 0; i < nfchans; i++)
@@ -663,99 +694,94 @@ int a52_block (a52_state_t * state, sample_t * samples)
do_bit_alloc |= 1 << i;
grp_size = 3 << (chexpstr[i] - 1);
nchgrps = (state->endmant[i] + grp_size - 4) / grp_size;
- state->fbw_exp[i][0] = bitstream_get (4);
- if (parse_exponents (chexpstr[i], nchgrps, state->fbw_exp[i][0],
- state->fbw_exp[i] + 1))
+ state->fbw_expbap[i].exp[0] = bitstream_get (state, 4);
+ if (parse_exponents (state, chexpstr[i], nchgrps,
+ state->fbw_expbap[i].exp[0],
+ state->fbw_expbap[i].exp + 1))
return 1;
- bitstream_skip (2); /* gainrng */
+ bitstream_skip (state, 2); /* gainrng */
}
if (lfeexpstr != EXP_REUSE) {
do_bit_alloc |= 32;
- state->lfe_exp[0] = bitstream_get (4);
- if (parse_exponents (lfeexpstr, 2, state->lfe_exp[0],
- state->lfe_exp + 1))
+ state->lfe_expbap.exp[0] = bitstream_get (state, 4);
+ if (parse_exponents (state, lfeexpstr, 2, state->lfe_expbap.exp[0],
+ state->lfe_expbap.exp + 1))
return 1;
}
- if (bitstream_get (1)) { /* baie */
+ if (bitstream_get (state, 1)) { /* baie */
do_bit_alloc = -1;
- state->sdcycod = bitstream_get (2);
- state->fdcycod = bitstream_get (2);
- state->sgaincod = bitstream_get (2);
- state->dbpbcod = bitstream_get (2);
- state->floorcod = bitstream_get (3);
+ state->bai = bitstream_get (state, 11);
}
- if (bitstream_get (1)) { /* snroffste */
+ if (bitstream_get (state, 1)) { /* snroffste */
do_bit_alloc = -1;
- state->csnroffst = bitstream_get (6);
- if (state->cplinu) {
- state->cplba.fsnroffst = bitstream_get (4);
- state->cplba.fgaincod = bitstream_get (3);
- }
- for (i = 0; i < nfchans; i++) {
- state->ba[i].fsnroffst = bitstream_get (4);
- state->ba[i].fgaincod = bitstream_get (3);
- }
- if (state->lfeon) {
- state->lfeba.fsnroffst = bitstream_get (4);
- state->lfeba.fgaincod = bitstream_get (3);
- }
+ state->csnroffst = bitstream_get (state, 6);
+ if (state->chincpl) /* cplinu */
+ state->cplba.bai = bitstream_get (state, 7);
+ for (i = 0; i < nfchans; i++)
+ state->ba[i].bai = bitstream_get (state, 7);
+ if (state->lfeon)
+ state->lfeba.bai = bitstream_get (state, 7);
}
- if ((state->cplinu) && (bitstream_get (1))) { /* cplleake */
+ if ((state->chincpl) && (bitstream_get (state, 1))) { /* cplleake */
do_bit_alloc |= 64;
- state->cplfleak = 2304 - (bitstream_get (3) << 8);
- state->cplsleak = 2304 - (bitstream_get (3) << 8);
+ state->cplfleak = 9 - bitstream_get (state, 3);
+ state->cplsleak = 9 - bitstream_get (state, 3);
}
- if (bitstream_get (1)) { /* deltbaie */
+ if (bitstream_get (state, 1)) { /* deltbaie */
do_bit_alloc = -1;
- if (state->cplinu)
- state->cplba.deltbae = bitstream_get (2);
+ if (state->chincpl) /* cplinu */
+ state->cplba.deltbae = bitstream_get (state, 2);
for (i = 0; i < nfchans; i++)
- state->ba[i].deltbae = bitstream_get (2);
- if (state->cplinu && (state->cplba.deltbae == DELTA_BIT_NEW) &&
- parse_deltba (state->cplba.deltba))
+ state->ba[i].deltbae = bitstream_get (state, 2);
+ if (state->chincpl && /* cplinu */
+ (state->cplba.deltbae == DELTA_BIT_NEW) &&
+ parse_deltba (state, state->cplba.deltba))
return 1;
for (i = 0; i < nfchans; i++)
if ((state->ba[i].deltbae == DELTA_BIT_NEW) &&
- parse_deltba (state->ba[i].deltba))
+ parse_deltba (state, state->ba[i].deltba))
return 1;
}
if (do_bit_alloc) {
if (zero_snr_offsets (nfchans, state)) {
- memset (state->cpl_bap, 0, sizeof (state->cpl_bap));
- memset (state->fbw_bap, 0, sizeof (state->fbw_bap));
- memset (state->lfe_bap, 0, sizeof (state->lfe_bap));
+ memset (state->cpl_expbap.bap, 0, sizeof (state->cpl_expbap.bap));
+ for (i = 0; i < nfchans; i++)
+ memset (state->fbw_expbap[i].bap, 0,
+ sizeof (state->fbw_expbap[i].bap));
+ memset (state->lfe_expbap.bap, 0, sizeof (state->lfe_expbap.bap));
} else {
- if (state->cplinu && (do_bit_alloc & 64))
- bit_allocate (state, &state->cplba, state->cplstrtbnd,
+ if (state->chincpl && (do_bit_alloc & 64)) /* cplinu */
+ a52_bit_allocate (state, &state->cplba, state->cplstrtbnd,
state->cplstrtmant, state->cplendmant,
- state->cplfleak, state->cplsleak,
- state->cpl_exp, state->cpl_bap);
+ state->cplfleak << 8, state->cplsleak << 8,
+ &state->cpl_expbap);
for (i = 0; i < nfchans; i++)
if (do_bit_alloc & (1 << i))
- bit_allocate (state, state->ba + i, 0, 0,
- state->endmant[i], 0, 0, state->fbw_exp[i],
- state->fbw_bap[i]);
+ a52_bit_allocate (state, state->ba + i, 0, 0,
+ state->endmant[i], 0, 0,
+ state->fbw_expbap +i);
if (state->lfeon && (do_bit_alloc & 32)) {
state->lfeba.deltbae = DELTA_BIT_NONE;
- bit_allocate (state, &state->lfeba, 0, 0, 7, 0, 0,
- state->lfe_exp, state->lfe_bap);
+ a52_bit_allocate (state, &state->lfeba, 0, 0, 7, 0, 0,
+ &state->lfe_expbap);
}
}
}
- if (bitstream_get (1)) { /* skiple */
- i = bitstream_get (9); /* skipl */
+ if (bitstream_get (state, 1)) { /* skiple */
+ i = bitstream_get (state, 9); /* skipl */
while (i--)
- bitstream_skip (8);
+ bitstream_skip (state, 8);
}
+ samples = state->samples;
if (state->output & A52_LFE)
samples += 256; /* shift for LFE channel */
- chanbias = downmix_coeff (coeff, state->acmod, state->output,
+ chanbias = a52_downmix_coeff (coeff, state->acmod, state->output,
state->dynrng, state->clev, state->slev);
quantizer.q1_ptr = quantizer.q2_ptr = quantizer.q4_ptr = -1;
@@ -764,10 +790,10 @@ int a52_block (a52_state_t * state, sample_t * samples)
for (i = 0; i < nfchans; i++) {
int j;
- coeff_get (samples + 256 * i, state->fbw_exp[i], state->fbw_bap[i],
- &quantizer, coeff[i], dithflag[i], state->endmant[i]);
+ coeff_get (state, samples + 256 * i, state->fbw_expbap +i, &quantizer,
+ coeff[i], dithflag[i], state->endmant[i]);
- if (state->cplinu && state->chincpl[i]) {
+ if ((state->chincpl >> i) & 1) {
if (!done_cpl) {
done_cpl = 1;
coeff_get_coupling (state, nfchans, coeff,
@@ -783,18 +809,21 @@ int a52_block (a52_state_t * state, sample_t * samples)
}
if (state->acmod == 2) {
- int j, end, band;
+ int j, end, band, rematflg;
end = ((state->endmant[0] < state->endmant[1]) ?
state->endmant[0] : state->endmant[1]);
i = 0;
j = 13;
+ rematflg = state->rematflg;
do {
- if (!state->rematflg[i]) {
+ if (! (rematflg & 1)) {
+ rematflg >>= 1;
j = rematrix_band[i++];
continue;
}
+ rematflg >>= 1;
band = rematrix_band[i++];
if (band > end)
band = end;
@@ -811,15 +840,15 @@ int a52_block (a52_state_t * state, sample_t * samples)
if (state->lfeon) {
if (state->output & A52_LFE) {
- coeff_get (samples - 256, state->lfe_exp, state->lfe_bap,
- &quantizer, state->dynrng, 0, 7);
+ coeff_get (state, samples - 256, &state->lfe_expbap, &quantizer,
+ state->dynrng, 0, 7);
for (i = 7; i < 256; i++)
(samples-256)[i] = 0;
- imdct_512 (samples - 256, samples + 1536 - 256, state->bias);
+ a52_imdct_512 (samples - 256, samples + 1536 - 256, state->bias);
} else {
/* just skip the LFE coefficients */
- coeff_get (samples + 1280, state->lfe_exp, state->lfe_bap,
- &quantizer, 0, 0, 7);
+ coeff_get (state, samples + 1280, &state->lfe_expbap, &quantizer,
+ 0, 0, 7);
}
}
@@ -830,9 +859,9 @@ int a52_block (a52_state_t * state, sample_t * samples)
break;
if (i < nfchans) {
- if (samples[2 * 1536 - 1] == (sample_t)0x776b6e21) {
- samples[2 * 1536 - 1] = 0;
- upmix (samples + 1536, state->acmod, state->output);
+ if (state->downmixed) {
+ state->downmixed = 0;
+ a52_upmix (samples + 1536, state->acmod, state->output);
}
for (i = 0; i < nfchans; i++) {
@@ -844,10 +873,10 @@ int a52_block (a52_state_t * state, sample_t * samples)
if (coeff[i]) {
if (blksw[i])
- imdct_256 (samples + 256 * i, samples + 1536 + 256 * i,
+ a52_imdct_256 (samples + 256 * i, samples + 1536 + 256 * i,
bias);
else
- imdct_512 (samples + 256 * i, samples + 1536 + 256 * i,
+ a52_imdct_512 (samples + 256 * i, samples + 1536 + 256 * i,
bias);
} else {
int j;
@@ -857,29 +886,35 @@ int a52_block (a52_state_t * state, sample_t * samples)
}
}
- downmix (samples, state->acmod, state->output, state->bias,
+ a52_downmix (samples, state->acmod, state->output, state->bias,
state->clev, state->slev);
} else {
nfchans = nfchans_tbl[state->output & A52_CHANNEL_MASK];
- downmix (samples, state->acmod, state->output, 0,
+ a52_downmix (samples, state->acmod, state->output, 0,
state->clev, state->slev);
- if (samples[2 * 1536 - 1] != (sample_t)0x776b6e21) {
- downmix (samples + 1536, state->acmod, state->output, 0,
+ if (!state->downmixed) {
+ state->downmixed = 1;
+ a52_downmix (samples + 1536, state->acmod, state->output, 0,
state->clev, state->slev);
- samples[2 * 1536 - 1] = (sample_t)0x776b6e21;
}
if (blksw[0])
for (i = 0; i < nfchans; i++)
- imdct_256 (samples + 256 * i, samples + 1536 + 256 * i,
+ a52_imdct_256 (samples + 256 * i, samples + 1536 + 256 * i,
state->bias);
else
for (i = 0; i < nfchans; i++)
- imdct_512 (samples + 256 * i, samples + 1536 + 256 * i,
+ a52_imdct_512 (samples + 256 * i, samples + 1536 + 256 * i,
state->bias);
}
return 0;
}
+
+void a52_free (a52_state_t * state)
+{
+ free (state->samples);
+ free (state);
+}
diff --git a/liba52/tables.h b/liba52/tables.h
index 7dc5ed731d..a35543db7c 100644
--- a/liba52/tables.h
+++ b/liba52/tables.h
@@ -1,6 +1,6 @@
/*
* tables.h
- * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
+ * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
* Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
*
* This file is part of a52dec, a free ATSC A-52 stream decoder.
@@ -21,7 +21,7 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-static int8_t exp_1[128] = {
+static const int8_t exp_1[128] = {
-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -29,7 +29,7 @@ static int8_t exp_1[128] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
25,25,25
};
-static int8_t exp_2[128] = {
+static const int8_t exp_2[128] = {
-2,-2,-2,-2,-2,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
-2,-2,-2,-2,-2,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
-2,-2,-2,-2,-2,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
@@ -37,7 +37,7 @@ static int8_t exp_2[128] = {
-2,-2,-2,-2,-2,-1,-1,-1,-1,-1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
25,25,25
};
-static int8_t exp_3[128] = {
+static const int8_t exp_3[128] = {
-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,
-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,
-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,-2,-1, 0, 1, 2,
@@ -182,7 +182,7 @@ static const sample_t q_5[16] = {
0
};
-static sample_t scale_factor[25] = {
+static const sample_t scale_factor[25] = {
0.000030517578125,
0.0000152587890625,
0.00000762939453125,
diff --git a/libmpcodecs/ad_liba52.c b/libmpcodecs/ad_liba52.c
index dbdb89a619..82e21a9610 100644
--- a/libmpcodecs/ad_liba52.c
+++ b/libmpcodecs/ad_liba52.c
@@ -19,8 +19,7 @@
#include "liba52/a52.h"
#include "liba52/mm_accel.h"
-static sample_t * a52_samples;
-static a52_state_t a52_state;
+static a52_state_t *a52_state;
static uint32_t a52_flags=0;
/** Used by a52_resample_float, it defines the mapping between liba52
* channels and output channels. The ith nibble from the right in the
@@ -169,8 +168,8 @@ static int init(sh_audio_t *sh_audio)
if(gCpuCaps.has3DNow) a52_accel|=MM_ACCEL_X86_3DNOW;
if(gCpuCaps.has3DNowExt) a52_accel|=MM_ACCEL_X86_3DNOWEXT;
if(gCpuCaps.hasAltiVec) a52_accel|=MM_ACCEL_PPC_ALTIVEC;
- a52_samples=a52_init (a52_accel);
- if (a52_samples == NULL) {
+ a52_state=a52_init (a52_accel);
+ if (a52_state == NULL) {
mp_msg(MSGT_DECAUDIO,MSGL_ERR,"A52 init failed\n");
return 0;
}
@@ -210,7 +209,7 @@ while(sh_audio->channels>0){
/* test:*/
flags=a52_flags|A52_ADJUST_LEVEL;
mp_msg(MSGT_DECAUDIO,MSGL_V,"A52 flags before a52_frame: 0x%X\n",flags);
- if (a52_frame (&a52_state, sh_audio->a_in_buffer, &flags, &level, bias)){
+ if (a52_frame (a52_state, sh_audio->a_in_buffer, &flags, &level, bias)){
mp_msg(MSGT_DECAUDIO,MSGL_ERR,"a52: error decoding frame -> nosound\n");
return 0;
}
@@ -288,16 +287,12 @@ static int decode_audio(sh_audio_t *sh_audio,unsigned char *buf,int minlen,int m
sample_t level=a52_level, bias=384;
int flags=a52_flags|A52_ADJUST_LEVEL;
int i,len=-1;
- if (maxlen / sh_audio->samplesize / 256 / sh_audio->channels < 6) {
- mp_msg(MSGT_DECAUDIO, MSGL_V, "maxlen too small in decode_audio\n");
- return len;
- }
if (sh_audio->sample_format == AF_FORMAT_FLOAT_NE)
bias = 0;
if(!sh_audio->a_in_buffer_len)
if(a52_fillbuff(sh_audio)<0) return len; /* EOF */
sh_audio->a_in_buffer_len=0;
- if (a52_frame (&a52_state, sh_audio->a_in_buffer, &flags, &level, bias)){
+ if (a52_frame (a52_state, sh_audio->a_in_buffer, &flags, &level, bias)){
mp_msg(MSGT_DECAUDIO,MSGL_WARN,"a52: error decoding frame\n");
return len;
}
@@ -305,18 +300,18 @@ static int decode_audio(sh_audio_t *sh_audio,unsigned char *buf,int minlen,int m
/* handle dynrng */
if (a52_drc_action != DRC_NO_ACTION) {
if (a52_drc_action == DRC_NO_COMPRESSION)
- a52_dynrng(&a52_state, NULL, NULL);
+ a52_dynrng(a52_state, NULL, NULL);
else
- a52_dynrng(&a52_state, dynrng_call, NULL);
+ a52_dynrng(a52_state, dynrng_call, NULL);
}
len=0;
for (i = 0; i < 6; i++) {
- if (a52_block (&a52_state, a52_samples)){
+ if (a52_block (a52_state)){
mp_msg(MSGT_DECAUDIO,MSGL_WARN,"a52: error at resampling\n");
break;
}
- len+=2*a52_resample(a52_samples,(int16_t *)&buf[len]);
+ len+=2*a52_resample(a52_samples(a52_state),(int16_t *)&buf[len]);
}
assert(len <= maxlen);
return len;