/***************************************************************************
* Lempel-Ziv-Welch Encoding Functions
*
* File : lzwencode.c
* Purpose : Provides a function for Lempel-Ziv-Welch encoding of file
* streams
* Author : Michael Dipperstein
* Date : January 30, 2005
*
****************************************************************************
* UPDATES
*
* $Id: lzwencode.c,v 1.3 2007/09/29 01:28:09 michael Exp $
* $Log: lzwencode.c,v $
* Revision 1.3 2007/09/29 01:28:09 michael
* Changes required for LGPL v3.
*
* Revision 1.2 2005/04/21 04:26:57 michael
* Encoding dictionary is now built using a binary tree.
*
* Revision 1.1 2005/04/09 03:11:22 michael
* Separated encode and decode routines into two different files in order to
* make future enhancements easier.
*
* Revision 1.4 2005/03/27 15:56:47 michael
* Use variable length code words.
* Include new e-mail addres.
*
* Revision 1.3 2005/03/10 14:26:58 michael
* User variable names that match discription in web page.
*
* Revision 1.2 2005/03/10 05:44:02 michael
* Minimize the size of the dictionary.
*
* Revision 1.1.1.1 2005/02/21 03:35:34 michael
* Initial Release
*
****************************************************************************
*
* LZW: An ANSI C Lempel-Ziv-Welch Encoding/Decoding Routines
* Copyright (C) 2005, 2007 by
* Michael Dipperstein (mdipper@alumni.engr.ucsb.edu)
*
* This file is part of the lzw library.
*
* The lzw library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 3 of the
* License, or (at your option) any later version.
*
* The lzw library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
* General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see .
*
***************************************************************************/
/***************************************************************************
* INCLUDED FILES
***************************************************************************/
#include
#include
#include
#include
#include "lzw.h"
#include "bitfile.h"
/***************************************************************************
* TYPE DEFINITIONS
***************************************************************************/
/* node in dictionary tree */
typedef struct dict_node_t
{
unsigned int codeWord; /* code word for this entry */
unsigned char suffixChar; /* last char in encoded string */
unsigned int prefixCode; /* code for remaining chars in string */
/* pointer to child nodes */
struct dict_node_t *left; /* child with < key */
struct dict_node_t *right; /* child with >= key */
} dict_node_t;
/***************************************************************************
* CONSTANTS
***************************************************************************/
#define MIN_CODE_LEN 9 /* min # bits in a code word */
#define MAX_CODE_LEN 15 /* max # bits in a code word */
#define FIRST_CODE (1 << CHAR_BIT) /* value of 1st string code */
#define MAX_CODES (1 << MAX_CODE_LEN)
#if (MIN_CODE_LEN <= CHAR_BIT)
#error Code words must be larger than 1 character
#endif
#if (MAX_CODE_LEN > (2 * CHAR_BIT))
#error Code words larger than 2 characters require a re-write of GetCodeWord\
and PutCodeWord
#endif
#if ((MAX_CODES - 1) > INT_MAX)
#error There cannot be more codes than can fit in an integer
#endif
/***************************************************************************
* MACROS
***************************************************************************/
#define CODE_MS_BITS(BITS) ((BITS) - CHAR_BIT)
#define MS_BITS_MASK(BITS) (UCHAR_MAX << (CHAR_BIT - CODE_MS_BITS(BITS)))
#define CURRENT_MAX_CODES(BITS) (1 << (BITS))
/***************************************************************************
* GLOBAL VARIABLES
***************************************************************************/
/* dictionary of string codes (encode process uses a hash key) */
/* XL hack: reuse that of lzwdecode.c */
extern char currentCodeLen;
/***************************************************************************
* PROTOTYPES
***************************************************************************/
/* dictionary tree node create/free */
dict_node_t *MakeNode(unsigned int codeWord,
unsigned int prefixCode, unsigned char suffixChar);
void FreeTree(dict_node_t *node);
/* searches tree for matching dictionary entry */
dict_node_t *FindDictionaryEntry(dict_node_t *root, int prefixCode,
unsigned char c);
/* makes key from prefix code and character */
unsigned int MakeKey(unsigned int prefixCode, unsigned char suffixChar);
/* write encoded data */
void PutCodeWord(int code, bit_file_t *bfpOut);
/***************************************************************************
* FUNCTIONS
***************************************************************************/
/***************************************************************************
* Function : LZWEncodeFile
* Description: This routine reads an input file 1 character at a time and
* writes out an LZW encoded version of that file.
* Parameters : inFile - Name of file to encode
* outFile - Name of file to write encoded output to
* Effects : File is encoded using the LZW algorithm with CODE_LEN
* codes.
* Returned : TRUE for success, otherwise FALSE.
***************************************************************************/
int LZWEncodeFile(char *inFile, char *outFile)
{
FILE *fpIn; /* uncoded input */
bit_file_t *bfpOut; /* encoded output */
int code; /* code for current string */
unsigned int nextCode; /* next available code index */
int c; /* character to add to string */
dict_node_t *dictRoot; /* root of dictionary tree */
dict_node_t *node; /* node of dictionary tree */
/* open input and output files */
if (NULL == (fpIn = fopen(inFile, "rb")))
{
perror(inFile);
return FALSE;
}
if (NULL == outFile)
{
bfpOut = MakeBitFile(stdout, BF_WRITE);
}
else
{
if (NULL == (bfpOut = BitFileOpen(outFile, BF_WRITE)))
{
fclose(fpIn);
perror(outFile);
return FALSE;
}
}
/* initialize dictionary as empty */
dictRoot = NULL;
/* start with 9 bit code words */
currentCodeLen = 9;
nextCode = FIRST_CODE; /* code for next (first) string */
/* now start the actual encoding process */
code = fgetc(fpIn); /* start with code string = first character */
/* create a tree root from 1st 2 character string */
if ((c = fgetc(fpIn)) != EOF)
{
/* special case for NULL root */
dictRoot = MakeNode(nextCode, code, c);
nextCode++;
/* write code for 1st char */
PutCodeWord(code, bfpOut);
/* new code is just 2nd char */
code = c;
}
/* now encode normally */
while ((c = fgetc(fpIn)) != EOF)
{
/* look for code + c in the dictionary */
node = FindDictionaryEntry(dictRoot, code, c);
if ((node->prefixCode == code) &&
(node->suffixChar == c))
{
/* code + c is in the dictionary, make it's code the new code */
code = node->codeWord;
}
else
{
/* code + c is not in the dictionary, add it if there's room */
if (nextCode < MAX_CODES)
{
dict_node_t *tmp;
tmp = MakeNode(nextCode, code, c);
nextCode++;
if (MakeKey(code, c) <
MakeKey(node->prefixCode, node->suffixChar))
{
node->left = tmp;
}
else
{
node->right = tmp;
}
}
/* are we using enough bits to write out this code word? */
if ((code >= (CURRENT_MAX_CODES(currentCodeLen) - 1)) &&
(currentCodeLen < MAX_CODE_LEN))
{
/* mark need for bigger code word with all ones */
PutCodeWord((CURRENT_MAX_CODES(currentCodeLen) - 1), bfpOut);
currentCodeLen++;
}
/* write out code for the string before c was added */
PutCodeWord(code, bfpOut);
/* new code is just c */
code = c;
}
}
/* no more input. write out last of the code. */
PutCodeWord(code, bfpOut);
fclose(fpIn);
BitFileClose(bfpOut);
/* free the dictionary */
if (dictRoot != NULL)
{
FreeTree(dictRoot);
}
return TRUE;
}
/***************************************************************************
* Function : MakeKey
* Description: This routine creates a simple key from a prefix code and
* an appended character. The key may be used to establish
* an order when building/searching a dictionary tree.
* Parameters : prefixCode - code for all but the last character of a
* string.
* suffixChar - the last character of a string
* Effects : None
* Returned : Key built from string represented as a prefix + char. Key
* format is {ms nibble of c} + prefix + {ls nibble of c}
***************************************************************************/
unsigned int MakeKey(unsigned int prefixCode, unsigned char suffixChar)
{
unsigned int key;
/* position ms nibble */
key = suffixChar & 0xF0;
key <<= MAX_CODE_LEN;
/* include prefix code */
key |= (prefixCode << 4);
/* inclulde ls nibble */
key |= (suffixChar & 0x0F);
return key;
}
/***************************************************************************
* Function : MakeNode
* Description: This routine creates and initializes a dictionary entry
* for a string and the code word that encodes it.
* Parameters : codeWord - code word used to encode the string prefixCode +
* suffixChar
* prefixCode - code for all but the last character of a
* string.
* suffixChar - the last character of a string
* Effects : Node is allocated for new dictionary entry
* Returned : Pointer to newly allocated node
***************************************************************************/
dict_node_t *MakeNode(unsigned int codeWord,
unsigned int prefixCode, unsigned char suffixChar)
{
dict_node_t *node;
node = malloc(sizeof(dict_node_t));
if (NULL != node)
{
node->codeWord = codeWord;
node->prefixCode = prefixCode;
node->suffixChar = suffixChar;
node->left = NULL;
node->right = NULL;
}
else
{
perror("allocating dictionary node");
exit(EXIT_FAILURE);
}
return node;
}
/***************************************************************************
* Function : FreeTree
* Description: This routine will free all nodes of a tree rooted at the
* node past as a parameter.
* Parameters : node - root of tree to free
* Effects : frees allocated tree node from initial parameter down.
* Returned : none
***************************************************************************/
void FreeTree(dict_node_t *node)
{
/* free left branch */
if (node->left != NULL)
{
FreeTree(node->left);
}
/* free right branch */
if (node->right != NULL)
{
FreeTree(node->right);
}
/* free root */
free(node);
}
/***************************************************************************
* Function : FindDictionaryEntry
* Description: This routine searches the dictionary tree for an entry
* with a matching string (prefix code + suffix character).
* If one isn't found, the parent node for that string is
* returned.
* Parameters : prefixCode - code for the prefix of string
* c - last character in string
* Effects : None
* Returned : If string is in dictionary, pointer to node containing
* string, otherwise pointer to suitable parent node. NULL
* is returned for an empty tree.
***************************************************************************/
dict_node_t *FindDictionaryEntry(dict_node_t *root, int prefixCode,
unsigned char c)
{
unsigned int searchKey, key;
if (NULL == root)
{
return NULL;
}
searchKey = MakeKey(prefixCode, c); /* key of string to find */
while (1)
{
/* key of current node */
key = MakeKey(root->prefixCode, root->suffixChar);
if (key == searchKey)
{
/* current node contains string */
return root;
}
else if (searchKey < key)
{
if (NULL != root->left)
{
/* check left branch for string */
root = root->left;
}
else
{
/* string isn't in tree, it can be added as a left child */
return root;
}
}
else
{
if (NULL != root->right)
{
/* check right branch for string */
root = root->right;
}
else
{
/* string isn't in tree, it can be added as a right child */
return root;
}
}
}
}
/***************************************************************************
* Function : PutCodeWord
* Description: This function writes a code word from to an encoded file.
* In order to deal with endian issue the code word is
* written least significant byte followed by the remaining
* bits.
* Parameters : bfpOut - bit file containing the encoded data
* code - code word to add to the encoded data
* Effects : code word is written to the encoded output
* Returned : None
*
* NOTE: If the code word contains more than 16 bits, this routine should
* be modified to write out all the bytes from least significant to
* most significant followed by any left over bits.
***************************************************************************/
void PutCodeWord(int code, bit_file_t *bfpOut)
{
unsigned char byte;
/* write LS character */
byte = code & 0xFF;
BitFilePutChar(byte, bfpOut);
/* write remaining bits */
byte = (code >> CODE_MS_BITS(currentCodeLen)) &
MS_BITS_MASK(currentCodeLen);
BitFilePutBits(bfpOut, &byte, CODE_MS_BITS(currentCodeLen));
}