/*************************************************************************** * Arithmetic Encoding and Decoding Library * * File : arcode.c * Purpose : Use arithmetic coding to compress/decompress file streams * Author : Michael Dipperstein * Date : April 2, 2004 * **************************************************************************** * UPDATES * * $Id: arcode.c,v 1.5 2007/09/08 15:47:02 michael Exp $ * $Log: arcode.c,v $ * Revision 1.5 2007/09/08 15:47:02 michael * Changes required for LGPL v3. * * Revision 1.4 2006/03/02 06:43:37 michael * Expanded tabs * * Revision 1.3 2006/01/12 07:39:24 michael * Use BitFileGetBitsIntBit and FilePutBitsInt for reading and writing the * header. This makes the code a little cleaner, but the new header is not * compatible with the old header. * * Revision 1.2 2004/08/13 13:10:27 michael * Add support for adaptive encoding * * Use binary search when trying to find decoded symbol * * Revision 1.1.1.1 2004/04/04 14:54:13 michael * Initial version * **************************************************************************** * * Arcode: An ANSI C Arithmetic Encoding/Decoding Routines * Copyright (C) 2004, 2006-2007 by Michael Dipperstein (mdipper@cs.ucsb.edu) * * This file is part of the arcode library. * * The arcode library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License as * published by the Free Software Foundation; either version 3 of the * License, or (at your option) any later version. * * The arcode library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser * General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . * ***************************************************************************/ /*************************************************************************** * INCLUDED FILES ***************************************************************************/ #include #include #include #include "arcode.h" #include "bitfile.h" /* compile-time options */ #undef BUILD_DEBUG_OUTPUT /* debugging output */ #if !(USHRT_MAX < ULONG_MAX) #error "Implementation requires USHRT_MAX < ULONG_MAX" #endif /*************************************************************************** * TYPE DEFINITIONS ***************************************************************************/ typedef unsigned short probability_t; /* probability count type */ /*************************************************************************** * CONSTANTS ***************************************************************************/ #define EOF_CHAR (UCHAR_MAX + 1) /* number of bits used to compute running code values */ #define PRECISION (8 * sizeof(probability_t)) /* 2 bits less than precision. keeps lower and upper bounds from crossing. */ #define MAX_PROBABILITY (1 << (PRECISION - 2)) /*************************************************************************** * MACROS ***************************************************************************/ /* set bit x to 1 in probability_t. Bit 0 is MSB */ #define MASK_BIT(x) (probability_t)(1 << (PRECISION - (1 + (x)))) /* indices for a symbol's lower and upper cumulative probability ranges */ #define LOWER(c) (c) #define UPPER(c) ((c) + 1) /*************************************************************************** * GLOBAL VARIABLES ***************************************************************************/ probability_t lower; /* lower bound of current code range */ probability_t upper; /* upper bound of current code range */ probability_t code; /* current MSBs of encode input stream */ unsigned char underflowBits; /* current underflow bit count */ /* probability ranges for each symbol: [ranges[LOWER(c)], ranges[UPPER(c)]) */ probability_t ranges[UPPER(EOF_CHAR) + 1]; probability_t cumulativeProb; /* cumulative probability of all ranges */ /*************************************************************************** * PROTOTYPES ***************************************************************************/ /* read write file headers */ void WriteHeader(bit_file_t *bfpOut); int ReadHeader(bit_file_t *bfpIn); /* applies symbol's ranges to current upper and lower range bounds */ void ApplySymbolRange(int symbol, char staticModel); /* routines for encoding*/ void WriteEncodedBits(bit_file_t *bfpOut); void WriteRemaining(bit_file_t *bfpOut); int BuildProbabilityRangeList(FILE *fpIn); void InitializeAdaptiveProbabilityRangeList(void); /* routines for decoding */ void InitializeDecoder(bit_file_t *bfpOut, char staticModel); probability_t GetUnscaledCode(void); int GetSymbolFromProbability(probability_t probability); void ReadEncodedBits(bit_file_t *bfpIn); /*************************************************************************** * FUNCTIONS ***************************************************************************/ /*************************************************************************** * Function : ArEncodeFile * Description: This routine generates a list of arithmetic code ranges for * a file and then uses them to write out an encoded version * of that file. * Parameters : inFile - Name of file to encode * outFile - Name of file to write encoded output to * staticModel - TRUE if encoding with a static model * Effects : File is arithmetically encoded * Returned : TRUE for success, otherwise FALSE. ***************************************************************************/ int ArEncodeFile(char *inFile, char *outFile, char staticModel) { int c; FILE *fpIn; /* uncoded input */ bit_file_t *bfpOut; /* encoded output */ /* open input and output files */ if ((fpIn = fopen(inFile, "rb")) == NULL) { perror(inFile); return FALSE; } if (outFile == NULL) { bfpOut = MakeBitFile(stdout, BF_WRITE); } else { if ((bfpOut = BitFileOpen(outFile, BF_WRITE)) == NULL) { fclose(fpIn); perror(outFile); return FALSE; } } if (staticModel) { /* count symbols in file and come up with a list of probability ranges */ if (!BuildProbabilityRangeList(fpIn)) { fclose(fpIn); BitFileClose(bfpOut); fprintf(stderr, "Error determining frequency ranges.\n"); return FALSE; } rewind(fpIn); /* write information required to decode file to encoded file */ WriteHeader(bfpOut); } else { /* initialize probability ranges asumming uniform distribution */ InitializeAdaptiveProbabilityRangeList(); } /* initialize coder start with full probability range [0%, 100%) */ lower = 0; upper = ~0; /* all ones */ underflowBits = 0; /* encode symbols one at a time */ while ((c = fgetc(fpIn)) != EOF) { ApplySymbolRange(c, staticModel); WriteEncodedBits(bfpOut); } fclose(fpIn); ApplySymbolRange(EOF_CHAR, staticModel); /* encode an EOF */ WriteEncodedBits(bfpOut); WriteRemaining(bfpOut); /* write out least significant bits */ BitFileClose(bfpOut); return TRUE; } /*************************************************************************** * Function : SymbolCountToProbabilityRanges * Description: This routine converts the ranges array containing only * symbol counts to an array containing the upper and lower * probability ranges for each symbol. * Parameters : None * Effects : ranges array containing symbol counts in the upper field * for each symbol is converted to a list of upper and lower * probability bounds for each symbol. * Returned : None ***************************************************************************/ void SymbolCountToProbabilityRanges(void) { int c; ranges[0] = 0; /* absolute lower bound is 0 */ ranges[UPPER(EOF_CHAR)] = 1; /* add 1 EOF character */ cumulativeProb++; /* assign upper and lower probability ranges */ for (c = 1; c <= UPPER(EOF_CHAR); c++) { ranges[c] += ranges[c - 1]; } #ifdef BUILD_DEBUG_OUTPUT /* dump list of ranges */ for (c = 0; c < UPPER(EOF_CHAR); c++) { printf("%02X\t%d\t%d\n", c, ranges[LOWER(c)], ranges[UPPER(c)]); } #endif return; } /*************************************************************************** * Function : BuildProbabilityRangeList * Description: This routine reads the input file and builds the global * list of upper and lower probability ranges for each * symbol. * Parameters : fpIn - file to build range list for * Effects : ranges array is made to contain probability ranges for * each symbol. * Returned : TRUE for success, otherwise FALSE. ***************************************************************************/ int BuildProbabilityRangeList(FILE *fpIn) { int c; /*********************************************************************** * unsigned long is used to hold the largest counts we can have without * any rescaling. Rescaling may take place before probability ranges * are computed. ***********************************************************************/ unsigned long countArray[EOF_CHAR]; unsigned long totalCount = 0; unsigned long rescaleValue; if (fpIn == NULL) { return FALSE; } /* start with no symbols counted */ for (c = 0; c < EOF_CHAR; c++) { countArray[c] = 0; } while ((c = fgetc(fpIn)) != EOF) { if (totalCount == ULONG_MAX) { fprintf(stderr, "Error: file too large\n"); return FALSE; } countArray[c]++; totalCount++; } /* rescale counts to be < MAX_PROBABILITY */ if (totalCount >= MAX_PROBABILITY) { rescaleValue = (totalCount / MAX_PROBABILITY) + 1; for (c = 0; c < EOF_CHAR; c++) { if (countArray[c] > rescaleValue) { countArray[c] /= rescaleValue; } else if (countArray[c] != 0) { countArray[c] = 1; } } } /* copy scaled symbol counts to range list */ ranges[0] = 0; cumulativeProb = 0; for (c = 0; c < EOF_CHAR; c++) { ranges[UPPER(c)] = countArray[c]; cumulativeProb += countArray[c]; } /* convert counts to a range of probabilities */ SymbolCountToProbabilityRanges(); return TRUE; } /*************************************************************************** * Function : WriteHeader * Description: This function writes each symbol contained in the encoded * file as well as its rescaled number of occurrences. A * decoding algorithm may use these numbers to reconstruct * the probability range list used to encode the file. * Parameters : bfpOut - pointer to open binary file to write to. * Effects : Symbol values and symbol counts are written to a file. * Returned : None ***************************************************************************/ void WriteHeader(bit_file_t *bfpOut) { int c; probability_t previous = 0; /* symbol count so far */ #if BUILD_DEBUG_OUTPUT printf("HEADER:\n"); #endif for(c = 0; c <= (EOF_CHAR - 1); c++) { if (ranges[UPPER(c)] > previous) { /* some of these symbols will be encoded */ BitFilePutChar((char)c, bfpOut); previous = (ranges[UPPER(c)] - previous); /* symbol count */ #if BUILD_DEBUG_OUTPUT printf("%02X\t%d\n", c, previous); #endif /* write out PRECISION - 2 bit count */ BitFilePutBitsInt(bfpOut, &previous, (PRECISION - 2), sizeof(probability_t)); /* current upper range is previous for the next character */ previous = ranges[UPPER(c)]; } } /* now write end of table (zero count) */ BitFilePutChar(0x00, bfpOut); previous = 0; BitFilePutBits(bfpOut, (void *)&previous, PRECISION - 2); } /*************************************************************************** * Function : InitializeAdaptiveProbabilityRangeList * Description: This routine builds the initial global list of upper and * lower probability ranges for each symbol. This routine * is used by both adaptive encoding and decoding. * Currently it provides a uniform symbol distribution. * Other distributions might be better suited for known data * types (such as English text). * Parameters : NONE * Effects : ranges array is made to contain initial probability ranges * for each symbol. * Returned : NONE ***************************************************************************/ void InitializeAdaptiveProbabilityRangeList(void) { int c; cumulativeProb = 0; ranges[0] = 0; /* absolute lower range */ /* assign upper and lower probability ranges assuming */ for (c = 1; c <= UPPER(EOF_CHAR); c++) { ranges[c] = ranges[c - 1] + 1; cumulativeProb++; } #ifdef BUILD_DEBUG_OUTPUT /* dump list of ranges */ for (c = 0; c < UPPER(EOF_CHAR); c++) { printf("%02X\t%d\t%d\n", c, ranges[LOWER(c)], ranges[UPPER(c)]); } #endif return; } /*************************************************************************** * Function : ApplySymbolRange * Description: This function is used for both encoding and decoding. It * applies the range restrictions of a new symbol to the * current upper and lower range bounds of an encoded stream. * If an adaptive model is being used, the probability range * list will be updated after the effect of the symbol is * applied. * Parameters : symbol - The symbol to be added to the current code range * staticModel - TRUE if encoding/decoding with a static * model. * Effects : The current upper and lower range bounds are adjusted to * include the range effects of adding another symbol to the * encoded stream. If an adaptive model is being used, the * probability range list will be updated. * Returned : None ***************************************************************************/ void ApplySymbolRange(int symbol, char staticModel) { unsigned long range; /* must be able to hold max upper + 1 */ unsigned long rescaled; /* range rescaled for range of new symbol */ /* must hold range * max upper */ /* for updating dynamic models */ int i; probability_t original; /* range value prior to rescale */ probability_t delta; /* range for individual symbol */ /*********************************************************************** * Calculate new upper and lower ranges. Since the new upper range is * dependant of the old lower range, compute the upper range first. ***********************************************************************/ range = (unsigned long)(upper - lower) + 1; /* current range */ /* scale upper range of the symbol being coded */ rescaled = (unsigned long)ranges[UPPER(symbol)] * range; rescaled /= (unsigned long)cumulativeProb; /* new upper = old lower + rescaled new upper - 1*/ upper = lower + (probability_t)rescaled - 1; /* scale lower range of the symbol being coded */ rescaled = (unsigned long)ranges[LOWER(symbol)] * range; rescaled /= (unsigned long)cumulativeProb; /* new lower = old lower + rescaled new upper */ lower = lower + (probability_t)rescaled; if (!staticModel) { /* add new symbol to model */ cumulativeProb++; for (i = UPPER(symbol); i <= UPPER(EOF_CHAR); i++) { ranges[i] += 1; } /* half current values if cumulativeProb is too large */ if (cumulativeProb >= MAX_PROBABILITY) { cumulativeProb = 0; original = 0; for (i = 1; i <= UPPER(EOF_CHAR); i++) { delta = ranges[i] - original; if (delta <= 2) { /* prevent probability from being 0 */ original = ranges[i]; ranges[i] = ranges[i - 1] + 1; } else { original = ranges[i]; ranges[i] = ranges[i - 1] + (delta / 2); } cumulativeProb += (ranges[i] - ranges[i - 1]); } } } #ifdef BUILD_DEBUG_OUTPUT if (lower > upper) { /* compile this in when testing new models. */ fprintf(stderr, "Panic: lower (%X)> upper (%X)\n", lower, upper); } #endif } /*************************************************************************** * Function : WriteEncodedBits * Description: This function attempts to shift out as many code bits as * possible, writing the shifted bits to the encoded output * file. Only bits that will be unchanged when additional * symbols are encoded may be written out. * * If the n most significant bits of the lower and upper range * bounds match, they will not be changed when additional * symbols are encoded, so they may be shifted out. * * Adjustments are also made to prevent possible underflows * that occur when the upper and lower ranges are so close * that encoding another symbol won't change their values. * Parameters : bfpOut - pointer to open binary file to write to. * Effects : The upper and lower code bounds are adjusted so that they * only contain only bits that may be affected by the * addition of a new symbol to the encoded stream. * Returned : None ***************************************************************************/ void WriteEncodedBits(bit_file_t *bfpOut) { for (;;) { if ((upper & MASK_BIT(0)) == (lower & MASK_BIT(0))) { /* MSBs match, write them to output file */ BitFilePutBit((upper & MASK_BIT(0)) != 0, bfpOut); /* we can write out underflow bits too */ while (underflowBits > 0) { BitFilePutBit((upper & MASK_BIT(0)) == 0, bfpOut); underflowBits--; } } else if ((lower & MASK_BIT(1)) && !(upper & MASK_BIT(1))) { /*************************************************************** * Possible underflow condition: neither MSBs nor second MSBs * match. It must be the case that lower and upper have MSBs of * 01 and 10. Remove 2nd MSB from lower and upper. ***************************************************************/ underflowBits += 1; lower &= ~(MASK_BIT(0) | MASK_BIT(1)); upper |= MASK_BIT(1); /*************************************************************** * The shifts below make the rest of the bit removal work. If * you don't believe me try it yourself. ***************************************************************/ } else { /* nothing left to do */ return ; } /******************************************************************* * Shift out old MSB and shift in new LSB. Remember that lower has * all 0s beyond it's end and upper has all 1s beyond it's end. *******************************************************************/ lower <<= 1; upper <<= 1; upper |= 1; } } /*************************************************************************** * Function : WriteRemaining * Description: This function writes out all remaining significant bits * in the upper and lower ranges and the underflow bits once * the last symbol has been encoded. * Parameters : bfpOut - pointer to open binary file to write to. * Effects : Remaining significant range bits are written to the output * file. * Returned : None ***************************************************************************/ void WriteRemaining(bit_file_t *bfpOut) { BitFilePutBit((lower & MASK_BIT(1)) != 0, bfpOut); /* write out any unwritten underflow bits */ for (underflowBits++; underflowBits > 0; underflowBits--) { BitFilePutBit((lower & MASK_BIT(1)) == 0, bfpOut); } } /*************************************************************************** * Function : ArDecodeFile * Description: This routine opens an arithmetically encoded file, reads * it's header, and builds a list of probability ranges which * it then uses to decode the rest of the file. * Parameters : inFile - Name of file to decode * outFile - Name of file to write decoded output to * staticModel - TRUE if decoding with a static model * Effects : Encoded file is decoded * Returned : TRUE for success, otherwise FALSE. ***************************************************************************/ int ArDecodeFile(char *inFile, char *outFile, char staticModel) { int c; probability_t unscaled; bit_file_t *bfpIn; FILE *fpOut; /* open input and output files */ if ((bfpIn = BitFileOpen(inFile, BF_READ)) == NULL) { perror(inFile); return FALSE; } if (outFile == NULL) { fpOut = stdout; } else { if ((fpOut = fopen(outFile, "wb")) == NULL) { BitFileClose(bfpIn); perror(outFile); return FALSE; } } if (staticModel) { /* build probability ranges from header in file */ if (ReadHeader(bfpIn) == FALSE) { BitFileClose(bfpIn); fclose(fpOut); return FALSE; } } /* read start of code and initialize bounds, and adaptive ranges */ InitializeDecoder(bfpIn, staticModel); /* decode one symbol at a time */ for (;;) { /* get the unscaled probability of the current symbol */ unscaled = GetUnscaledCode(); /* figure out which symbol has the above probability */ if((c = GetSymbolFromProbability(unscaled)) == -1) { /* error: unknown symbol */ break; } if (c == EOF_CHAR) { /* no more symbols */ break; } fputc((char)c, fpOut); /* factor out symbol */ ApplySymbolRange(c, staticModel); ReadEncodedBits(bfpIn); } fclose(fpOut); BitFileClose(bfpIn); return TRUE; } /**************************************************************************** * Function : ReadHeader * Description: This function reads the header information stored by * WriteHeader. The header can then be used to build a * probability range list matching the list that was used to * encode the file. * Parameters : bfpIn - file to read from * Effects : Probability range list is built. * Returned : TRUE for success, otherwise FALSE ****************************************************************************/ int ReadHeader(bit_file_t *bfpIn) { int c; probability_t count; #if BUILD_DEBUG_OUTPUT printf("HEADER:\n"); #endif cumulativeProb = 0; for (c = 0; c <= UPPER(EOF_CHAR); c++) { ranges[UPPER(c)] = 0; } /* read [character, probability] sets */ for (;;) { c = BitFileGetChar(bfpIn); count = 0; /* read (PRECISION - 2) bit count */ if (BitFileGetBitsInt(bfpIn, &count, (PRECISION - 2), sizeof(probability_t)) == EOF) { /* premature EOF */ fprintf(stderr, "Error: unexpected EOF\n"); return FALSE; } #if BUILD_DEBUG_OUTPUT printf("%02X\t%d\n", c, count); #endif if (count == 0) { /* 0 count means end of header */ break; } ranges[UPPER(c)] = count; cumulativeProb += count; } /* convert counts to range list */ SymbolCountToProbabilityRanges(); return TRUE; } /**************************************************************************** * Function : InitializeDecoder * Description: This function starts the upper and lower ranges at their * max/min values and reads in the most significant encoded * bits. * Parameters : bfpIn - file to read from * staticModel - TRUE if decoding using a staticModel * Effects : upper, lower, and code are initialized. The probability * range list will also be initialized if an adaptive model * will be used. * Returned : TRUE for success, otherwise FALSE ****************************************************************************/ void InitializeDecoder(bit_file_t *bfpIn, char staticModel) { int i; if (!staticModel) { /* initialize ranges for adaptive model */ InitializeAdaptiveProbabilityRangeList(); } code = 0; /* read PERCISION MSBs of code one bit at a time */ for (i = 0; i < PRECISION; i++) { code <<= 1; /* treat EOF like 0 */ if(BitFileGetBit(bfpIn) == 1) { code |= 1; } } /* start with full probability range [0%, 100%) */ lower = 0; upper = ~0; /* all ones */ } /**************************************************************************** * Function : GetUnscaledCode * Description: This function undoes the scaling that ApplySymbolRange * performed before bits were shifted out. The value returned * is the probability of the encoded symbol. * Parameters : None * Effects : None * Returned : The probability of the current symbol ****************************************************************************/ probability_t GetUnscaledCode(void) { unsigned long range; /* must be able to hold max upper + 1 */ unsigned long unscaled; range = (unsigned long)(upper - lower) + 1; /* reverse the scaling operations from ApplySymbolRange */ unscaled = (unsigned long)(code - lower) + 1; unscaled = unscaled * (unsigned long)cumulativeProb - 1; unscaled /= range; return ((probability_t)unscaled); } /**************************************************************************** * Function : GetSymbolFromProbability * Description: Given a probability, this function will return the symbol * whose range includes that probability. Symbol is found * binary search on probability ranges. * Parameters : probability - probability of symbol. * Effects : None * Returned : -1 for failure, otherwise encoded symbol ****************************************************************************/ int GetSymbolFromProbability(probability_t probability) { int first, last, middle; /* indicies for binary search */ first = 0; last = UPPER(EOF_CHAR); middle = last / 2; /* binary search */ while (last >= first) { if (probability < ranges[LOWER(middle)]) { /* lower bound is higher than probability */ last = middle - 1; middle = first + ((last - first) / 2); continue; } if (probability >= ranges[UPPER(middle)]) { /* upper bound is lower than probability */ first = middle + 1; middle = first + ((last - first) / 2); continue; } /* we must have found the right value */ return middle; } /* error: none of the ranges include the probability */ fprintf(stderr, "Unknown Symbol: %d (max: %d)\n", probability, ranges[UPPER(EOF_CHAR)]); return -1; } /*************************************************************************** * Function : ReadEncodedBits * Description: This function attempts to shift out as many code bits as * possible, as bits are shifted out the coded input is * populated with bits from the encoded file. Only bits * that will be unchanged when additional symbols are decoded * may be shifted out. * * If the n most significant bits of the lower and upper range * bounds match, they will not be changed when additional * symbols are decoded, so they may be shifted out. * * Adjustments are also made to prevent possible underflows * that occur when the upper and lower ranges are so close * that decoding another symbol won't change their values. * Parameters : bfpOut - pointer to open binary file to read from. * Effects : The upper and lower code bounds are adjusted so that they * only contain only bits that will be affected by the * addition of a new symbol. Replacements are read from the * encoded stream. * Returned : None ***************************************************************************/ void ReadEncodedBits(bit_file_t *bfpIn) { int nextBit; /* next bit from encoded input */ for (;;) { if (( upper & MASK_BIT(0)) == (lower & MASK_BIT(0))) { /* MSBs match, allow them to be shifted out*/ } else if ((lower & MASK_BIT(1)) && !(upper & MASK_BIT(1))) { /*************************************************************** * Possible underflow condition: neither MSBs nor second MSBs * match. It must be the case that lower and upper have MSBs of * 01 and 10. Remove 2nd MSB from lower and upper. ***************************************************************/ lower &= ~(MASK_BIT(0) | MASK_BIT(1)); upper |= MASK_BIT(1); code ^= MASK_BIT(1); /* the shifts below make the rest of the bit removal work */ } else { /* nothing to shift out */ return; } /******************************************************************* * Shift out old MSB and shift in new LSB. Remember that lower has * all 0s beyond it's end and upper has all 1s beyond it's end. *******************************************************************/ lower <<= 1; upper <<= 1; upper |= 1; code <<= 1; if ((nextBit = BitFileGetBit(bfpIn)) == EOF) { /* either all bits are shifted out or error occurred */ } else { code |= nextBit; /* add next encoded bit to code */ } } return; }