blob: f9f3bb5251314bc6b058721110385e1f71f98dab [file] [log] [blame]
/*************************************************************************
* Name: huffman.c
* Author: Marcus Geelnard
* Description: Huffman coder/decoder implementation.
* Reentrant: Yes
* $Id: huffman.c,v 1.6 2004/12/14 18:59:40 marcus256 Exp $
*
* This is a very straight forward implementation of a Huffman coder and
* decoder.
*
* Primary flaws with this primitive implementation are:
* - Slow bit stream implementation
* - Fairly slow decoding (slower than encoding)
* - Maximum tree depth of 32 (the coder aborts if any code exceeds a
* size of 32 bits). If I'm not mistaking, this should not be possible
* unless the input buffer is larger than 2^32 bytes, which is not
* supported by the coder anyway (max 2^32-1 bytes can be specified with
* an unsigned 32-bit integer).
*
* On the other hand, there are a few advantages of this implementation:
* - The Huffman tree is stored in a very compact form, requiring only
* 12 bits per symbol (for 8 bit symbols), meaning a maximum of 384
* bytes overhead.
* - The Huffman coder does quite well in situations where the data is
* noisy, in which case most dictionary based coders run into problems.
*
* Possible improvements (probably not worth it):
* - Partition the input data stream into blocks, where each block has
* its own Huffman tree. With variable block sizes, it should be
* possible to find locally optimal Huffman trees, which in turn could
* reduce the total size.
* - Allow for a few different predefined Huffman trees, which could
* reduce the size of a block even further.
*-------------------------------------------------------------------------
* Copyright (c) 2003-2004 Marcus Geelnard
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would
* be appreciated but is not required.
*
* 2. Altered source versions must be plainly marked as such, and must not
* be misrepresented as being the original software.
*
* 3. This notice may not be removed or altered from any source
* distribution.
*
* Marcus Geelnard
* marcus.geelnard at home.se
*************************************************************************/
/* Modified May 06 by Julian Seward for use in Valgrind.
- changed integral types to V's versions (UInt, UChar etc)
- added initialisation in _Huffman_WriteBits, as described in
comment in that function.
*/
/*************************************************************************
* Types used for Huffman coding
*************************************************************************/
typedef struct {
UInt Symbol;
UInt Count;
UInt Code;
UInt Bits;
} huff_sym_t;
typedef struct {
UChar *BytePtr;
UInt BitPos;
} huff_bitstream_t;
/*************************************************************************
* INTERNAL FUNCTIONS *
*************************************************************************/
/*************************************************************************
* _Huffman_InitBitstream() - Initialize a bitstream.
*************************************************************************/
static void _Huffman_InitBitstream( huff_bitstream_t *stream,
UChar *buf )
{
stream->BytePtr = buf;
stream->BitPos = 0;
}
/*************************************************************************
* _Huffman_ReadBits() - Read bits from a bitstream.
*************************************************************************/
static UInt _Huffman_ReadBits( huff_bitstream_t *stream,
UInt bits )
{
UInt x, bit, count;
UChar *buf;
/* Get current stream state */
buf = stream->BytePtr;
bit = stream->BitPos;
/* Extract bits */
x = 0;
for( count = 0; count < bits; ++ count )
{
x = (x<<1) + (*buf & (1<<(7-bit)) ? 1 : 0);
bit = (bit+1) & 7;
if( !bit )
{
++ buf;
}
}
/* Store new stream state */
stream->BytePtr = buf;
stream->BitPos = bit;
return x;
}
/*************************************************************************
* _Huffman_WriteBits() - Write bits to a bitstream.
*************************************************************************/
static void _Huffman_WriteBits( huff_bitstream_t *stream, UInt x,
UInt bits )
{
UInt bit, count;
UChar *buf;
UInt mask;
/* Get current stream state */
buf = stream->BytePtr;
bit = stream->BitPos;
/* Append bits */
mask = 1 << (bits-1);
for( count = 0; count < bits; ++ count )
{
/* If we're starting a new byte, zero it out, so that the
resulting byte sequence looks completely defined from
Valgrind's point of view. If this doesn't happen then the
last byte in the stream may look partially undefined. */
if (bit == 0)
*buf = 0;
*buf = (*buf & (0xff^(1<<(7-bit)))) +
((x & mask ? 1 : 0) << (7-bit));
x <<= 1;
bit = (bit+1) & 7;
if( !bit )
{
++ buf;
}
}
/* Store new stream state */
stream->BytePtr = buf;
stream->BitPos = bit;
}
/*************************************************************************
* _Huffman_Hist() - Calculate (sorted) histogram for a block of data.
*************************************************************************/
static void _Huffman_Hist( UChar *in, huff_sym_t *sym,
UInt size )
{
Int k, swaps;
huff_sym_t tmp;
/* Clear/init histogram */
for( k = 0; k < 256; ++ k )
{
sym[k].Symbol = k;
sym[k].Count = 0;
sym[k].Code = 0;
sym[k].Bits = 0;
}
/* Build histogram */
for( k = size; k; -- k )
{
sym[ *in ++ ].Count ++;
}
/* Sort histogram - most frequent symbol first (bubble sort) */
do
{
swaps = 0;
for( k = 0; k < 255; ++ k )
{
if( sym[k].Count < sym[k+1].Count )
{
tmp = sym[k];
sym[k] = sym[k+1];
sym[k+1] = tmp;
swaps = 1;
}
}
}
while( swaps );
}
/*************************************************************************
* _Huffman_MakeTree() - Generate a Huffman tree.
*************************************************************************/
static void _Huffman_MakeTree( huff_sym_t *sym, huff_bitstream_t *stream,
UInt code, UInt bits, UInt first,
UInt last )
{
UInt k, size, size_a, size_b, last_a, first_b;
/* Is this a leaf node? */
if( first == last )
{
/* Append symbol to tree description */
_Huffman_WriteBits( stream, 1, 1 );
_Huffman_WriteBits( stream, sym[first].Symbol, 8 );
/* Store code info in symbol array */
sym[first].Code = code;
sym[first].Bits = bits;
return;
}
else
{
/* This was not a leaf node */
_Huffman_WriteBits( stream, 0, 1 );
}
/* Total size of interval */
size = 0;
for( k = first; k <= last; ++ k )
{
size += sym[k].Count;
}
/* Find size of branch a */
size_a = 0;
for( k = first; size_a < ((size+1)>>1) && k < last; ++ k )
{
size_a += sym[k].Count;
}
/* Non-empty branch? */
if( size_a > 0 )
{
/* Continue branching */
_Huffman_WriteBits( stream, 1, 1 );
/* Branch a cut in histogram */
last_a = k-1;
/* Create branch a */
_Huffman_MakeTree( sym, stream, (code<<1)+0, bits+1,
first, last_a );
}
else
{
/* This was an empty branch */
_Huffman_WriteBits( stream, 0, 1 );
}
/* Size of branch b */
size_b = size - size_a;
/* Non-empty branch? */
if( size_b > 0 )
{
/* Continue branching */
_Huffman_WriteBits( stream, 1, 1 );
/* Branch b cut in histogram */
first_b = k;
/* Create branch b */
_Huffman_MakeTree( sym, stream, (code<<1)+1, bits+1,
first_b, last );
}
else
{
/* This was an empty branch */
_Huffman_WriteBits( stream, 0, 1 );
}
}
/*************************************************************************
* _Huffman_RecoverTree() - Recover a Huffman tree from a bitstream.
*************************************************************************/
static void _Huffman_RecoverTree( huff_sym_t *sym,
huff_bitstream_t *stream, UInt code, UInt bits,
UInt *symnum )
{
UInt symbol;
/* Is this a leaf node? */
if( _Huffman_ReadBits( stream, 1 ) )
{
/* Get symbol from tree description */
symbol = _Huffman_ReadBits( stream, 8 );
/* Store code info in symbol array */
sym[*symnum].Symbol = symbol;
sym[*symnum].Code = code;
sym[*symnum].Bits = bits;
/* Increase symbol counter */
*symnum = *symnum + 1;
return;
}
/* Non-empty branch? */
if( _Huffman_ReadBits( stream, 1 ) )
{
/* Create branch a */
_Huffman_RecoverTree( sym, stream, (code<<1)+0, bits+1,
symnum );
}
/* Non-empty branch? */
if( _Huffman_ReadBits( stream, 1 ) )
{
/* Create branch b */
_Huffman_RecoverTree( sym, stream, (code<<1)+1, bits+1,
symnum );
}
}
/*************************************************************************
* PUBLIC FUNCTIONS *
*************************************************************************/
/*************************************************************************
* Huffman_Compress() - Compress a block of data using a Huffman coder.
* in - Input (uncompressed) buffer.
* out - Output (compressed) buffer. This buffer must be 384 bytes
* larger than the input buffer.
* insize - Number of input bytes.
* The function returns the size of the compressed data.
*************************************************************************/
static
Int Huffman_Compress( UChar *in, UChar *out,
UInt insize )
{
huff_sym_t sym[ 256 ], tmp;
huff_bitstream_t stream;
UInt k, total_bytes, swaps, symbol, last_symbol;
/* Do we have anything to compress? */
if( insize < 1 ) return 0;
/* Initialize bitstream */
_Huffman_InitBitstream( &stream, out );
/* Calculate and sort histogram for input data */
_Huffman_Hist( in, sym, insize );
/* Find number of used symbols */
for( last_symbol = 255; sym[last_symbol].Count == 0; -- last_symbol );
/* Special case: In order to build a correct tree, we need at least
two symbols (otherwise we get zero-bit representations). */
if( last_symbol == 0 ) ++ last_symbol;
/* Build Huffman tree */
_Huffman_MakeTree( sym, &stream, 0, 0, 0, last_symbol );
/* Was any code > 32 bits? (we do not handle that at present) */
for( k = 0; k < 255; ++ k )
{
if( sym[k].Bits > 32 )
{
return 0;
}
}
/* Sort histogram - first symbol first (bubble sort) */
do
{
swaps = 0;
for( k = 0; k < 255; ++ k )
{
if( sym[k].Symbol > sym[k+1].Symbol )
{
tmp = sym[k];
sym[k] = sym[k+1];
sym[k+1] = tmp;
swaps = 1;
}
}
}
while( swaps );
/* Encode input stream */
for( k = 0; k < insize; ++ k )
{
symbol = in[ k ];
_Huffman_WriteBits( &stream, sym[symbol].Code,
sym[symbol].Bits );
}
/* Calculate size of output data */
total_bytes = (Int)(stream.BytePtr - out);
if( stream.BitPos > 0 )
{
++ total_bytes;
}
return total_bytes;
}
/*************************************************************************
* Huffman_Uncompress() - Uncompress a block of data using a Huffman
* decoder.
* in - Input (compressed) buffer.
* out - Output (uncompressed) buffer. This buffer must be large
* enough to hold the uncompressed data.
* insize - Number of input bytes.
* outsize - Number of output bytes.
*************************************************************************/
static
void Huffman_Uncompress( UChar *in, UChar *out,
UInt insize, UInt outsize )
{
huff_sym_t sym[ 256 ], tmp;
huff_bitstream_t stream;
UInt k, m, symbol_count, swaps;
UChar *buf;
UInt bits, delta_bits, new_bits, code;
/* Do we have anything to decompress? */
if( insize < 1 ) return;
/* Initialize bitstream */
_Huffman_InitBitstream( &stream, in );
/* Clear tree/histogram */
for( k = 0; k < 256; ++ k )
{
sym[k].Bits = 0x7fffffff;
}
/* Recover Huffman tree */
symbol_count = 0;
_Huffman_RecoverTree( sym, &stream, 0, 0, &symbol_count );
/* Sort histogram - shortest code first (bubble sort) */
do
{
swaps = 0;
for( k = 0; k < symbol_count-1; ++ k )
{
if( sym[k].Bits > sym[k+1].Bits )
{
tmp = sym[k];
sym[k] = sym[k+1];
sym[k+1] = tmp;
swaps = 1;
}
}
}
while( swaps );
/* Decode input stream */
buf = out;
for( k = 0; k < outsize; ++ k )
{
/* Search tree for matching code */
bits = 0;
code = 0;
for( m = 0; m < symbol_count; ++ m )
{
delta_bits = sym[m].Bits - bits;
if( delta_bits )
{
new_bits = _Huffman_ReadBits( &stream, delta_bits );
code = code | (new_bits << (32-bits-delta_bits));
bits = sym[m].Bits;
}
if( code == (sym[m].Code << (32-sym[m].Bits)) )
{
*buf ++ = (UChar) sym[m].Symbol;
break;
}
}
}
}