RFC 1951 “Deflate” compression












3














This is a little project I did over Christmas, I wanted to understand a bit more about how RFC 1951 (https://www.ietf.org/rfc/rfc1951.txt) compression worked, mostly out of curiosity really. RFC 1951 is used to compress PNG images, zip files and portions of PDF files ( and probably other things as well ). I have tried to keep the C# code fairly simple, concise and hopefully comprehensible, rather than being too concerned with efficiency. Here's the code:



using Generic = System.Collections.Generic; 

class Encoder : Generic.List<byte> // Data compression per RFC 1950, RFC 1951.
{
public Generic.List<byte> Deflate( byte inp )
{
Clear();
Add( 0x78); Add( 0x9C ); // RFC 1950 bytes.
ReadInp( inp );
while ( DoOutput( 1 ) == 0 );
FlushBitBuf();
Put32( Adler32( inp ) ); // RFC 1950 checksum.
// System.Console.WriteLine( "Encoder.Deflate, input size=" + inp.Length + " output size=" + this.Count );
return this;
}

class PosList{ public int Pos; public PosList Next; } // List of 3-byte match positions.

void ReadInp( byte input ) // LZ77 compression, per RFC 1951.
{
Generic.Dictionary<int,PosList> dict = new Generic.Dictionary<int,PosList>();
int n = input.Length;
SetIBufSize( n );

int w = 0; // Holds last 3 bytes of input.
int todo = 0; // Number of bytes in w that have not yet been output to IBuf, can be negative when a match is found.
int pendingMatchLen = 0, pendingDist = 0;

for ( int i = 0; i < 2 && i < n; i += 1 ) { w = ( w << 8 ) | input[ i ]; todo += 1; }

for ( int i = 2; i < n; i += 1 )
{
w = ( ( w << 8 ) | input[ i ] ) & 0xffffff; todo += 1;
PosList e, x = new PosList(); x.Pos = i;
int bestMatchLen = 0, bestDist = 0;
if ( dict.TryGetValue( w, out e ) )
{
x.Next = e;
PosList p = x;
if ( todo >= 3 ) while ( e != null )
{
int dist = i - e.Pos; if ( dist > 32768 ) { p.Next = null; break; }
int matchLen = MatchLen( input, dist, i );
if ( matchLen > bestMatchLen ) { bestMatchLen = matchLen; bestDist = dist; }
p = e; e = e.Next;
}
}
dict[ w ] = x; ISpace();

// "Lazy matching" RFC 1951 p.15 : if there are overlapping matches, there is a choice over which of the match to use.
// Example: "abc012bc345.... abc345". Here abc345 can be encoded as either [abc][345] or as a[bc345].
// Since a range typically needs more bits to encode than a literal, choose the latter.

if ( pendingMatchLen > 0 )
{
if ( bestMatchLen > pendingMatchLen || bestMatchLen == pendingMatchLen && bestDist < pendingDist )
{ IPut( input[ i-3 ] ); todo -= 1; }
else // Save the pending match, suppress bestMatch if any.
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
bestMatchLen = 0;
}
pendingMatchLen = 0;
}
if ( bestMatchLen > 0 ) { pendingMatchLen = bestMatchLen; pendingDist = bestDist; }
else if ( todo == 3 ) { IPut( w >> 16 ); todo = 2; }
} // End of main input loop.
if ( pendingMatchLen > 0 )
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
}
while ( todo > 0 ){ todo -= 1; IPut( (byte)( w >> (todo*8) ) ); }
} // end ReadInp

int MatchLen( byte input, int dist, int i )
{
// We have a match of 3 bytes, this function computes total match.
int end = input.Length; if ( end - i > 256 ) end = i + 256; // Maximum match is 258.
int x = i + 1; while ( x < end && input[ x ] == input[ x-dist ] ) x += 1;
return x - i + 2;
}

ushort IBuf; // Intermediate circular buffer, holds output from LZ77 algorithm.
const int IBufSizeMax = 0x40000;
int IBufSize, IBufI, IBufJ;
void IPut( int x ) { IBuf[ IBufI++ ] = (ushort)x; if ( IBufI == IBufSize ) IBufI = 0; }
int IGet(){ int result = IBuf[ IBufJ++ ]; if ( IBufJ == IBufSize ) IBufJ = 0; return result; }
int ICount(){ if ( IBufI >= IBufJ ) return IBufI - IBufJ; else return IBufI + IBufSize - IBufJ; } // Number of values in IBuf.
void ISpace(){ while ( ICount() > IBufSize - 10 ) DoOutput( 0 ); } // Ensure IBuf has space for at least 10 values.
void SetIBufSize( int x ) { x += 20; if ( x > IBufSizeMax ) x = IBufSizeMax; if ( IBufSize < x ) { IBufSize = x; IBuf = new ushort[ x ]; } }

byte DoOutput( byte lastBlock ) // While DoBlock fails, retry with a smaller amount of input.
{
int n = ICount();
while ( !DoBlock( n, lastBlock ) ) { lastBlock = 0; n -= n / 20; }
return lastBlock;
}

// RFC 1951 encoding constants.
static byte ClenAlphabet = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
static byte MatchExtra = { 0,0,0,0, 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5, 0 };
static ushort MatchOff = { 3,4,5,6, 7,8,9,10, 11,13,15,17, 19,23,27,31, 35,43,51,59, 67,83,99,115, 131,163,195,227, 258, 0xffff };
static byte DistExtra = { 0,0,0,0, 1,1,2,2, 3,3,4,4, 5,5,6,6, 7,7,8,8, 9,9,10,10, 11,11,12,12, 13,13 };
static ushort DistOff = { 1,2,3,4, 5,7,9,13, 17,25,33,49, 65,97,129,193, 257,385,513,769, 1025,1537,2049,3073,
4097,6145,8193,12289, 16385,24577, 0xffff };

int LitFreq = new int [ 288 ], DistFreq = new int [ 32 ], LenFreq = new int [ 19 ];
byte LitLen = new byte [ 288 ], DistLen = new byte [ 32 ], LenLen = new byte [ 19 ];
ushort LitCode = new ushort[ 288 ], DistCode = new ushort[ 32 ], LenCode = new ushort[ 19 ];

bool DoBlock( int n, byte lastBlock )
{
// First compute symbol frequencies.
Clear( LitFreq ); Clear( DistFreq );
int saveIBufJ = IBufJ;
int got = 0; while ( got < n )
{
int x = IGet(); got += 1;
if ( x < 256 ) LitFreq[ x ] += 1;
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
LitFreq[ 257 + mc ] += 1;
DistFreq[ dc ] += 1;
}
}
LitFreq[ 256 ] += 1; // End of block code.
IBufJ = saveIBufJ;

// Now compute Huffman codes.
int nLitCode = Huff.ComputeCode( 15, LitFreq, LitLen, LitCode ); if ( nLitCode < 0 ) return false;
int nDistCode = Huff.ComputeCode( 15, DistFreq, DistLen, DistCode ); if ( nDistCode < 0 ) return false;
if ( nDistCode == 0 ) nDistCode = 1;

Clear( LenFreq );
LenPass = 1; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );

if ( Huff.ComputeCode( 7, LenFreq, LenLen, LenCode ) < 0 ) return false;

int nLenCode = 19; while ( nLenCode > 4 && LenLen[ ClenAlphabet[ nLenCode - 1 ] ] == 0 ) nLenCode -= 1;

// Now output dynamic Huffman block. For small blocks fixed coding might work better, not implemented.
PutBit( lastBlock );
PutBits( 2, 2 );
PutBits( 5, nLitCode - 257 ); PutBits( 5, nDistCode - 1 ); PutBits( 4, nLenCode - 4 );
for ( int i = 0; i < nLenCode; i += 1 ) PutBits( 3, LenLen[ ClenAlphabet[ i ] ] );

LenPass = 2; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );

got = 0; while ( got < n ) // Similar to loop above, but does output instead of computing symbol frequencies.
{
int x = IGet(); got += 1;
if ( x < 256 ) PutBits( LitLen[ x ], LitCode[ x ] );
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
PutBits( LitLen[ 257 + mc ], LitCode[ 257 + mc ] );
PutBits( MatchExtra[ mc ], x-MatchOff[ mc ] );
PutBits( DistLen[ dc ], DistCode[ dc ] );
PutBits( DistExtra[ dc ], dist-DistOff[ dc ] );
}
}
PutBits( LitLen[ 256 ], LitCode[ 256 ] ); // End of block code.
return true;
} // end DoBlock

// Run length encoding of code lengths - RFC 1951, page 13.

int LenPass, Plen, ZeroRun, Repeat;

void PutLenCode( int code ) { if ( LenPass == 1 ) LenFreq[ code ] += 1; else PutBits( LenLen[ code ], LenCode[ code ] ); }

void DoLengths( int n, byte a, bool isLit )
{
if ( isLit ) { Plen = 0; ZeroRun = 0; Repeat = 0; }
for ( int i = 0; i < n; i += 1 )
{
int len = a[ i ];
if ( len == 0 ){ EncRepeat(); ZeroRun += 1; Plen = 0; }
else if ( len == Plen ) { Repeat += 1; }
else { EncZeroRun(); EncRepeat(); PutLenCode( len ); Plen = len; }
}
if ( !isLit ) { EncZeroRun(); EncRepeat(); }
}

void EncRepeat()
{
while ( Repeat > 0 )
{
if ( Repeat < 3 ) { PutLenCode( Plen ); Repeat -= 1; }
else { int x = Repeat; if ( x > 6 ) x = 6; PutLenCode( 16 ); if ( LenPass == 2 ) PutBits( 2, x-3 ); Repeat -= x; }
}
}

void EncZeroRun()
{
while ( ZeroRun > 0 )
{
if ( ZeroRun < 3 ) { PutLenCode( 0 ); ZeroRun -= 1; }
else if ( ZeroRun < 11 ) { PutLenCode( 17 ); if ( LenPass == 2 ) PutBits( 3, ZeroRun-3 ); ZeroRun = 0; }
else { int x = ZeroRun; if ( x > 138 ) x = 138; PutLenCode( 18 ); if ( LenPass == 2 ) PutBits( 7, x - 11 ); ZeroRun -= x; }
}
}

public static uint Adler32( byte b ) // Checksum function per RFC 1950.
{
uint s1 = 1, s2 = 0;
for ( int i = 0; i < b.Length; i += 1 )
{
s1 = ( s1 + b[ i ] ) % 65521;
s2 = ( s2 + s1 ) % 65521;
}
return s2*65536 + s1;
}

static void Clear( int f ){ System.Array.Clear( f, 0, f.Length ); }

// Output stream.
byte Buf = 0, M = 1;
void PutBit( int b ) { if ( b != 0 ) Buf |= M; M <<= 1; if ( M == 0 ) { Add(Buf); Buf = 0; M = 1; } }
void PutBits( int n, int x ) { for ( int i = 0; i < n; i += 1 ) { PutBit( x & 1 ); x >>= 1; } }
void FlushBitBuf(){ while ( M != 1 ) PutBit( 0 ); }
void Put32( uint x ) { Add( (byte)( x >> 24 ) ); Add( (byte)( x >> 16 ) ); Add( (byte)( x >> 8 ) ); Add( (byte) x ); }

} // end class Encoder

//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

class Huff // Given a list of frequencies (freq), compute Huffman code lengths (nbits) and codes (tree_code).
{
public static int ComputeCode( int bitLimit, int freq, byte nbits, ushort tree_code )
{
int ncode = freq.Length;
Node heap = new Node[ ncode ];
int hn = 0;
for ( int i = 0; i < ncode; i += 1 )
{
int f = freq[ i ];
if ( f > 0 )
{
Node n = new Node();
n.Freq = f;
n.Code = (ushort)i;
HeapInsert( heap, hn, n );
hn += 1;
}
}

for ( int i = 0; i < nbits.Length; i += 1 ) nbits[ i ] = 0;
if ( hn <= 1 ) // Special case.
{ if ( hn == 1 ) nbits[ heap[ 0 ].Code ] = 1; }
else
{
while ( hn > 1 ) // Keep pairing the lowest frequency nodes.
{
Node n = new Node();
hn -= 1; n.Left = HeapRemove( heap, hn );
hn -= 1; n.Right = HeapRemove( heap, hn );
n.Freq = n.Left.Freq + n.Right.Freq;
n.Depth = (byte) ( 1 + ( n.Left.Depth > n.Right.Depth ? n.Left.Depth : n.Right.Depth ) );
HeapInsert( heap, hn, n );
hn += 1;
}
Walk( nbits, heap[ 0 ], 0 ); // Walk the tree to find the code lengths (nbits).
}

for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > bitLimit ) return -1;

// Now compute codes, code below is from RFC 1951 page 7.

int maxBits = 0;
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > maxBits ) maxBits = nbits[ i ];

int bl_count = new int[ maxBits+1 ];
for ( int i = 0; i < ncode; i += 1 ) bl_count[ nbits[ i ] ] += 1;

int next_code = new int[ maxBits+1 ];
int code = 0; bl_count[ 0 ] = 0;
for ( int i = 0; i < maxBits; i += 1 )
{
code = ( code + bl_count[ i ] ) << 1;
next_code[ i+1 ] = code;
}

for ( int i = 0; i < ncode; i += 1 )
{
int len = nbits[ i ];
if ( len != 0 )
{
tree_code[ i ] = (ushort)Reverse( next_code[ len ], len );
next_code[ len ] += 1;
}
}
while ( ncode > 0 && nbits[ ncode - 1 ] == 0 ) ncode -= 1;

//System.Console.WriteLine( "Huff.ComputeCode" );
//for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > 0 )
// System.Console.WriteLine( "i=" + i + " len=" + nbits[ i ] + " tc=" + tree_code[ i ].ToString("X") + " freq=" + freq[ i ] );

return ncode;
}

class Node{ public Node Left, Right; public int Freq; public ushort Code; public byte Depth; }

static int Reverse( int x, int bits )
{ int result = 0; for ( int i = 0; i < bits; i += 1 ) { result <<= 1; result |= x & 1; x >>= 1; } return result; }

static void Walk( byte a, Node n, int len )
{ if ( n.Left == null ) a[ n.Code ] = (byte)len; else { Walk( a, n.Left, len+1 ); Walk( a, n.Right, len+1 ); } }

static bool LessThan( Node a, Node b )
{ return a.Freq < b.Freq || a.Freq == b.Freq && a.Depth < b.Depth; }

// See e.g. https://en.wikipedia.org/wiki/Heap_(data_structure) for information on heap data structure.

static void HeapInsert( Node heap, int h, Node n ) // h is size of heap before insertion.
{
int j = h;
while ( j > 0 )
{
int p = ( j - 1 ) / 2; // Index of parent.
Node pn = heap[ p ];
if ( !LessThan( n, pn ) ) break;
heap[ j ] = pn; // Demote parent.
j = p;
}
heap[ j ] = n;
}

static Node HeapRemove( Node heap, int h ) // h is size of heap after removal.
{
Node result = heap[ 0 ];
Node n = heap[ h ];
int j = 0;
while ( true )
{
int c = j * 2 + 1; if ( c >= h ) break;
Node cn = heap[ c ];
if ( c + 1 < h )
{
Node cn2 = heap[ c + 1 ];
if ( LessThan( cn2, cn ) ) { c += 1; cn = cn2; }
}
if ( !LessThan( cn, n ) ) break;
heap[ j ] = cn; j = c;
}
heap[ j ] = n;
return result;
}

} // end class Huff









share|improve this question







New contributor




George Barwood is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
















  • 4




    I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
    – t3chb0t
    Jan 2 at 19:21
















3














This is a little project I did over Christmas, I wanted to understand a bit more about how RFC 1951 (https://www.ietf.org/rfc/rfc1951.txt) compression worked, mostly out of curiosity really. RFC 1951 is used to compress PNG images, zip files and portions of PDF files ( and probably other things as well ). I have tried to keep the C# code fairly simple, concise and hopefully comprehensible, rather than being too concerned with efficiency. Here's the code:



using Generic = System.Collections.Generic; 

class Encoder : Generic.List<byte> // Data compression per RFC 1950, RFC 1951.
{
public Generic.List<byte> Deflate( byte inp )
{
Clear();
Add( 0x78); Add( 0x9C ); // RFC 1950 bytes.
ReadInp( inp );
while ( DoOutput( 1 ) == 0 );
FlushBitBuf();
Put32( Adler32( inp ) ); // RFC 1950 checksum.
// System.Console.WriteLine( "Encoder.Deflate, input size=" + inp.Length + " output size=" + this.Count );
return this;
}

class PosList{ public int Pos; public PosList Next; } // List of 3-byte match positions.

void ReadInp( byte input ) // LZ77 compression, per RFC 1951.
{
Generic.Dictionary<int,PosList> dict = new Generic.Dictionary<int,PosList>();
int n = input.Length;
SetIBufSize( n );

int w = 0; // Holds last 3 bytes of input.
int todo = 0; // Number of bytes in w that have not yet been output to IBuf, can be negative when a match is found.
int pendingMatchLen = 0, pendingDist = 0;

for ( int i = 0; i < 2 && i < n; i += 1 ) { w = ( w << 8 ) | input[ i ]; todo += 1; }

for ( int i = 2; i < n; i += 1 )
{
w = ( ( w << 8 ) | input[ i ] ) & 0xffffff; todo += 1;
PosList e, x = new PosList(); x.Pos = i;
int bestMatchLen = 0, bestDist = 0;
if ( dict.TryGetValue( w, out e ) )
{
x.Next = e;
PosList p = x;
if ( todo >= 3 ) while ( e != null )
{
int dist = i - e.Pos; if ( dist > 32768 ) { p.Next = null; break; }
int matchLen = MatchLen( input, dist, i );
if ( matchLen > bestMatchLen ) { bestMatchLen = matchLen; bestDist = dist; }
p = e; e = e.Next;
}
}
dict[ w ] = x; ISpace();

// "Lazy matching" RFC 1951 p.15 : if there are overlapping matches, there is a choice over which of the match to use.
// Example: "abc012bc345.... abc345". Here abc345 can be encoded as either [abc][345] or as a[bc345].
// Since a range typically needs more bits to encode than a literal, choose the latter.

if ( pendingMatchLen > 0 )
{
if ( bestMatchLen > pendingMatchLen || bestMatchLen == pendingMatchLen && bestDist < pendingDist )
{ IPut( input[ i-3 ] ); todo -= 1; }
else // Save the pending match, suppress bestMatch if any.
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
bestMatchLen = 0;
}
pendingMatchLen = 0;
}
if ( bestMatchLen > 0 ) { pendingMatchLen = bestMatchLen; pendingDist = bestDist; }
else if ( todo == 3 ) { IPut( w >> 16 ); todo = 2; }
} // End of main input loop.
if ( pendingMatchLen > 0 )
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
}
while ( todo > 0 ){ todo -= 1; IPut( (byte)( w >> (todo*8) ) ); }
} // end ReadInp

int MatchLen( byte input, int dist, int i )
{
// We have a match of 3 bytes, this function computes total match.
int end = input.Length; if ( end - i > 256 ) end = i + 256; // Maximum match is 258.
int x = i + 1; while ( x < end && input[ x ] == input[ x-dist ] ) x += 1;
return x - i + 2;
}

ushort IBuf; // Intermediate circular buffer, holds output from LZ77 algorithm.
const int IBufSizeMax = 0x40000;
int IBufSize, IBufI, IBufJ;
void IPut( int x ) { IBuf[ IBufI++ ] = (ushort)x; if ( IBufI == IBufSize ) IBufI = 0; }
int IGet(){ int result = IBuf[ IBufJ++ ]; if ( IBufJ == IBufSize ) IBufJ = 0; return result; }
int ICount(){ if ( IBufI >= IBufJ ) return IBufI - IBufJ; else return IBufI + IBufSize - IBufJ; } // Number of values in IBuf.
void ISpace(){ while ( ICount() > IBufSize - 10 ) DoOutput( 0 ); } // Ensure IBuf has space for at least 10 values.
void SetIBufSize( int x ) { x += 20; if ( x > IBufSizeMax ) x = IBufSizeMax; if ( IBufSize < x ) { IBufSize = x; IBuf = new ushort[ x ]; } }

byte DoOutput( byte lastBlock ) // While DoBlock fails, retry with a smaller amount of input.
{
int n = ICount();
while ( !DoBlock( n, lastBlock ) ) { lastBlock = 0; n -= n / 20; }
return lastBlock;
}

// RFC 1951 encoding constants.
static byte ClenAlphabet = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
static byte MatchExtra = { 0,0,0,0, 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5, 0 };
static ushort MatchOff = { 3,4,5,6, 7,8,9,10, 11,13,15,17, 19,23,27,31, 35,43,51,59, 67,83,99,115, 131,163,195,227, 258, 0xffff };
static byte DistExtra = { 0,0,0,0, 1,1,2,2, 3,3,4,4, 5,5,6,6, 7,7,8,8, 9,9,10,10, 11,11,12,12, 13,13 };
static ushort DistOff = { 1,2,3,4, 5,7,9,13, 17,25,33,49, 65,97,129,193, 257,385,513,769, 1025,1537,2049,3073,
4097,6145,8193,12289, 16385,24577, 0xffff };

int LitFreq = new int [ 288 ], DistFreq = new int [ 32 ], LenFreq = new int [ 19 ];
byte LitLen = new byte [ 288 ], DistLen = new byte [ 32 ], LenLen = new byte [ 19 ];
ushort LitCode = new ushort[ 288 ], DistCode = new ushort[ 32 ], LenCode = new ushort[ 19 ];

bool DoBlock( int n, byte lastBlock )
{
// First compute symbol frequencies.
Clear( LitFreq ); Clear( DistFreq );
int saveIBufJ = IBufJ;
int got = 0; while ( got < n )
{
int x = IGet(); got += 1;
if ( x < 256 ) LitFreq[ x ] += 1;
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
LitFreq[ 257 + mc ] += 1;
DistFreq[ dc ] += 1;
}
}
LitFreq[ 256 ] += 1; // End of block code.
IBufJ = saveIBufJ;

// Now compute Huffman codes.
int nLitCode = Huff.ComputeCode( 15, LitFreq, LitLen, LitCode ); if ( nLitCode < 0 ) return false;
int nDistCode = Huff.ComputeCode( 15, DistFreq, DistLen, DistCode ); if ( nDistCode < 0 ) return false;
if ( nDistCode == 0 ) nDistCode = 1;

Clear( LenFreq );
LenPass = 1; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );

if ( Huff.ComputeCode( 7, LenFreq, LenLen, LenCode ) < 0 ) return false;

int nLenCode = 19; while ( nLenCode > 4 && LenLen[ ClenAlphabet[ nLenCode - 1 ] ] == 0 ) nLenCode -= 1;

// Now output dynamic Huffman block. For small blocks fixed coding might work better, not implemented.
PutBit( lastBlock );
PutBits( 2, 2 );
PutBits( 5, nLitCode - 257 ); PutBits( 5, nDistCode - 1 ); PutBits( 4, nLenCode - 4 );
for ( int i = 0; i < nLenCode; i += 1 ) PutBits( 3, LenLen[ ClenAlphabet[ i ] ] );

LenPass = 2; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );

got = 0; while ( got < n ) // Similar to loop above, but does output instead of computing symbol frequencies.
{
int x = IGet(); got += 1;
if ( x < 256 ) PutBits( LitLen[ x ], LitCode[ x ] );
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
PutBits( LitLen[ 257 + mc ], LitCode[ 257 + mc ] );
PutBits( MatchExtra[ mc ], x-MatchOff[ mc ] );
PutBits( DistLen[ dc ], DistCode[ dc ] );
PutBits( DistExtra[ dc ], dist-DistOff[ dc ] );
}
}
PutBits( LitLen[ 256 ], LitCode[ 256 ] ); // End of block code.
return true;
} // end DoBlock

// Run length encoding of code lengths - RFC 1951, page 13.

int LenPass, Plen, ZeroRun, Repeat;

void PutLenCode( int code ) { if ( LenPass == 1 ) LenFreq[ code ] += 1; else PutBits( LenLen[ code ], LenCode[ code ] ); }

void DoLengths( int n, byte a, bool isLit )
{
if ( isLit ) { Plen = 0; ZeroRun = 0; Repeat = 0; }
for ( int i = 0; i < n; i += 1 )
{
int len = a[ i ];
if ( len == 0 ){ EncRepeat(); ZeroRun += 1; Plen = 0; }
else if ( len == Plen ) { Repeat += 1; }
else { EncZeroRun(); EncRepeat(); PutLenCode( len ); Plen = len; }
}
if ( !isLit ) { EncZeroRun(); EncRepeat(); }
}

void EncRepeat()
{
while ( Repeat > 0 )
{
if ( Repeat < 3 ) { PutLenCode( Plen ); Repeat -= 1; }
else { int x = Repeat; if ( x > 6 ) x = 6; PutLenCode( 16 ); if ( LenPass == 2 ) PutBits( 2, x-3 ); Repeat -= x; }
}
}

void EncZeroRun()
{
while ( ZeroRun > 0 )
{
if ( ZeroRun < 3 ) { PutLenCode( 0 ); ZeroRun -= 1; }
else if ( ZeroRun < 11 ) { PutLenCode( 17 ); if ( LenPass == 2 ) PutBits( 3, ZeroRun-3 ); ZeroRun = 0; }
else { int x = ZeroRun; if ( x > 138 ) x = 138; PutLenCode( 18 ); if ( LenPass == 2 ) PutBits( 7, x - 11 ); ZeroRun -= x; }
}
}

public static uint Adler32( byte b ) // Checksum function per RFC 1950.
{
uint s1 = 1, s2 = 0;
for ( int i = 0; i < b.Length; i += 1 )
{
s1 = ( s1 + b[ i ] ) % 65521;
s2 = ( s2 + s1 ) % 65521;
}
return s2*65536 + s1;
}

static void Clear( int f ){ System.Array.Clear( f, 0, f.Length ); }

// Output stream.
byte Buf = 0, M = 1;
void PutBit( int b ) { if ( b != 0 ) Buf |= M; M <<= 1; if ( M == 0 ) { Add(Buf); Buf = 0; M = 1; } }
void PutBits( int n, int x ) { for ( int i = 0; i < n; i += 1 ) { PutBit( x & 1 ); x >>= 1; } }
void FlushBitBuf(){ while ( M != 1 ) PutBit( 0 ); }
void Put32( uint x ) { Add( (byte)( x >> 24 ) ); Add( (byte)( x >> 16 ) ); Add( (byte)( x >> 8 ) ); Add( (byte) x ); }

} // end class Encoder

//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

class Huff // Given a list of frequencies (freq), compute Huffman code lengths (nbits) and codes (tree_code).
{
public static int ComputeCode( int bitLimit, int freq, byte nbits, ushort tree_code )
{
int ncode = freq.Length;
Node heap = new Node[ ncode ];
int hn = 0;
for ( int i = 0; i < ncode; i += 1 )
{
int f = freq[ i ];
if ( f > 0 )
{
Node n = new Node();
n.Freq = f;
n.Code = (ushort)i;
HeapInsert( heap, hn, n );
hn += 1;
}
}

for ( int i = 0; i < nbits.Length; i += 1 ) nbits[ i ] = 0;
if ( hn <= 1 ) // Special case.
{ if ( hn == 1 ) nbits[ heap[ 0 ].Code ] = 1; }
else
{
while ( hn > 1 ) // Keep pairing the lowest frequency nodes.
{
Node n = new Node();
hn -= 1; n.Left = HeapRemove( heap, hn );
hn -= 1; n.Right = HeapRemove( heap, hn );
n.Freq = n.Left.Freq + n.Right.Freq;
n.Depth = (byte) ( 1 + ( n.Left.Depth > n.Right.Depth ? n.Left.Depth : n.Right.Depth ) );
HeapInsert( heap, hn, n );
hn += 1;
}
Walk( nbits, heap[ 0 ], 0 ); // Walk the tree to find the code lengths (nbits).
}

for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > bitLimit ) return -1;

// Now compute codes, code below is from RFC 1951 page 7.

int maxBits = 0;
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > maxBits ) maxBits = nbits[ i ];

int bl_count = new int[ maxBits+1 ];
for ( int i = 0; i < ncode; i += 1 ) bl_count[ nbits[ i ] ] += 1;

int next_code = new int[ maxBits+1 ];
int code = 0; bl_count[ 0 ] = 0;
for ( int i = 0; i < maxBits; i += 1 )
{
code = ( code + bl_count[ i ] ) << 1;
next_code[ i+1 ] = code;
}

for ( int i = 0; i < ncode; i += 1 )
{
int len = nbits[ i ];
if ( len != 0 )
{
tree_code[ i ] = (ushort)Reverse( next_code[ len ], len );
next_code[ len ] += 1;
}
}
while ( ncode > 0 && nbits[ ncode - 1 ] == 0 ) ncode -= 1;

//System.Console.WriteLine( "Huff.ComputeCode" );
//for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > 0 )
// System.Console.WriteLine( "i=" + i + " len=" + nbits[ i ] + " tc=" + tree_code[ i ].ToString("X") + " freq=" + freq[ i ] );

return ncode;
}

class Node{ public Node Left, Right; public int Freq; public ushort Code; public byte Depth; }

static int Reverse( int x, int bits )
{ int result = 0; for ( int i = 0; i < bits; i += 1 ) { result <<= 1; result |= x & 1; x >>= 1; } return result; }

static void Walk( byte a, Node n, int len )
{ if ( n.Left == null ) a[ n.Code ] = (byte)len; else { Walk( a, n.Left, len+1 ); Walk( a, n.Right, len+1 ); } }

static bool LessThan( Node a, Node b )
{ return a.Freq < b.Freq || a.Freq == b.Freq && a.Depth < b.Depth; }

// See e.g. https://en.wikipedia.org/wiki/Heap_(data_structure) for information on heap data structure.

static void HeapInsert( Node heap, int h, Node n ) // h is size of heap before insertion.
{
int j = h;
while ( j > 0 )
{
int p = ( j - 1 ) / 2; // Index of parent.
Node pn = heap[ p ];
if ( !LessThan( n, pn ) ) break;
heap[ j ] = pn; // Demote parent.
j = p;
}
heap[ j ] = n;
}

static Node HeapRemove( Node heap, int h ) // h is size of heap after removal.
{
Node result = heap[ 0 ];
Node n = heap[ h ];
int j = 0;
while ( true )
{
int c = j * 2 + 1; if ( c >= h ) break;
Node cn = heap[ c ];
if ( c + 1 < h )
{
Node cn2 = heap[ c + 1 ];
if ( LessThan( cn2, cn ) ) { c += 1; cn = cn2; }
}
if ( !LessThan( cn, n ) ) break;
heap[ j ] = cn; j = c;
}
heap[ j ] = n;
return result;
}

} // end class Huff









share|improve this question







New contributor




George Barwood is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
















  • 4




    I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
    – t3chb0t
    Jan 2 at 19:21














3












3








3







This is a little project I did over Christmas, I wanted to understand a bit more about how RFC 1951 (https://www.ietf.org/rfc/rfc1951.txt) compression worked, mostly out of curiosity really. RFC 1951 is used to compress PNG images, zip files and portions of PDF files ( and probably other things as well ). I have tried to keep the C# code fairly simple, concise and hopefully comprehensible, rather than being too concerned with efficiency. Here's the code:



using Generic = System.Collections.Generic; 

class Encoder : Generic.List<byte> // Data compression per RFC 1950, RFC 1951.
{
public Generic.List<byte> Deflate( byte inp )
{
Clear();
Add( 0x78); Add( 0x9C ); // RFC 1950 bytes.
ReadInp( inp );
while ( DoOutput( 1 ) == 0 );
FlushBitBuf();
Put32( Adler32( inp ) ); // RFC 1950 checksum.
// System.Console.WriteLine( "Encoder.Deflate, input size=" + inp.Length + " output size=" + this.Count );
return this;
}

class PosList{ public int Pos; public PosList Next; } // List of 3-byte match positions.

void ReadInp( byte input ) // LZ77 compression, per RFC 1951.
{
Generic.Dictionary<int,PosList> dict = new Generic.Dictionary<int,PosList>();
int n = input.Length;
SetIBufSize( n );

int w = 0; // Holds last 3 bytes of input.
int todo = 0; // Number of bytes in w that have not yet been output to IBuf, can be negative when a match is found.
int pendingMatchLen = 0, pendingDist = 0;

for ( int i = 0; i < 2 && i < n; i += 1 ) { w = ( w << 8 ) | input[ i ]; todo += 1; }

for ( int i = 2; i < n; i += 1 )
{
w = ( ( w << 8 ) | input[ i ] ) & 0xffffff; todo += 1;
PosList e, x = new PosList(); x.Pos = i;
int bestMatchLen = 0, bestDist = 0;
if ( dict.TryGetValue( w, out e ) )
{
x.Next = e;
PosList p = x;
if ( todo >= 3 ) while ( e != null )
{
int dist = i - e.Pos; if ( dist > 32768 ) { p.Next = null; break; }
int matchLen = MatchLen( input, dist, i );
if ( matchLen > bestMatchLen ) { bestMatchLen = matchLen; bestDist = dist; }
p = e; e = e.Next;
}
}
dict[ w ] = x; ISpace();

// "Lazy matching" RFC 1951 p.15 : if there are overlapping matches, there is a choice over which of the match to use.
// Example: "abc012bc345.... abc345". Here abc345 can be encoded as either [abc][345] or as a[bc345].
// Since a range typically needs more bits to encode than a literal, choose the latter.

if ( pendingMatchLen > 0 )
{
if ( bestMatchLen > pendingMatchLen || bestMatchLen == pendingMatchLen && bestDist < pendingDist )
{ IPut( input[ i-3 ] ); todo -= 1; }
else // Save the pending match, suppress bestMatch if any.
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
bestMatchLen = 0;
}
pendingMatchLen = 0;
}
if ( bestMatchLen > 0 ) { pendingMatchLen = bestMatchLen; pendingDist = bestDist; }
else if ( todo == 3 ) { IPut( w >> 16 ); todo = 2; }
} // End of main input loop.
if ( pendingMatchLen > 0 )
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
}
while ( todo > 0 ){ todo -= 1; IPut( (byte)( w >> (todo*8) ) ); }
} // end ReadInp

int MatchLen( byte input, int dist, int i )
{
// We have a match of 3 bytes, this function computes total match.
int end = input.Length; if ( end - i > 256 ) end = i + 256; // Maximum match is 258.
int x = i + 1; while ( x < end && input[ x ] == input[ x-dist ] ) x += 1;
return x - i + 2;
}

ushort IBuf; // Intermediate circular buffer, holds output from LZ77 algorithm.
const int IBufSizeMax = 0x40000;
int IBufSize, IBufI, IBufJ;
void IPut( int x ) { IBuf[ IBufI++ ] = (ushort)x; if ( IBufI == IBufSize ) IBufI = 0; }
int IGet(){ int result = IBuf[ IBufJ++ ]; if ( IBufJ == IBufSize ) IBufJ = 0; return result; }
int ICount(){ if ( IBufI >= IBufJ ) return IBufI - IBufJ; else return IBufI + IBufSize - IBufJ; } // Number of values in IBuf.
void ISpace(){ while ( ICount() > IBufSize - 10 ) DoOutput( 0 ); } // Ensure IBuf has space for at least 10 values.
void SetIBufSize( int x ) { x += 20; if ( x > IBufSizeMax ) x = IBufSizeMax; if ( IBufSize < x ) { IBufSize = x; IBuf = new ushort[ x ]; } }

byte DoOutput( byte lastBlock ) // While DoBlock fails, retry with a smaller amount of input.
{
int n = ICount();
while ( !DoBlock( n, lastBlock ) ) { lastBlock = 0; n -= n / 20; }
return lastBlock;
}

// RFC 1951 encoding constants.
static byte ClenAlphabet = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
static byte MatchExtra = { 0,0,0,0, 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5, 0 };
static ushort MatchOff = { 3,4,5,6, 7,8,9,10, 11,13,15,17, 19,23,27,31, 35,43,51,59, 67,83,99,115, 131,163,195,227, 258, 0xffff };
static byte DistExtra = { 0,0,0,0, 1,1,2,2, 3,3,4,4, 5,5,6,6, 7,7,8,8, 9,9,10,10, 11,11,12,12, 13,13 };
static ushort DistOff = { 1,2,3,4, 5,7,9,13, 17,25,33,49, 65,97,129,193, 257,385,513,769, 1025,1537,2049,3073,
4097,6145,8193,12289, 16385,24577, 0xffff };

int LitFreq = new int [ 288 ], DistFreq = new int [ 32 ], LenFreq = new int [ 19 ];
byte LitLen = new byte [ 288 ], DistLen = new byte [ 32 ], LenLen = new byte [ 19 ];
ushort LitCode = new ushort[ 288 ], DistCode = new ushort[ 32 ], LenCode = new ushort[ 19 ];

bool DoBlock( int n, byte lastBlock )
{
// First compute symbol frequencies.
Clear( LitFreq ); Clear( DistFreq );
int saveIBufJ = IBufJ;
int got = 0; while ( got < n )
{
int x = IGet(); got += 1;
if ( x < 256 ) LitFreq[ x ] += 1;
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
LitFreq[ 257 + mc ] += 1;
DistFreq[ dc ] += 1;
}
}
LitFreq[ 256 ] += 1; // End of block code.
IBufJ = saveIBufJ;

// Now compute Huffman codes.
int nLitCode = Huff.ComputeCode( 15, LitFreq, LitLen, LitCode ); if ( nLitCode < 0 ) return false;
int nDistCode = Huff.ComputeCode( 15, DistFreq, DistLen, DistCode ); if ( nDistCode < 0 ) return false;
if ( nDistCode == 0 ) nDistCode = 1;

Clear( LenFreq );
LenPass = 1; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );

if ( Huff.ComputeCode( 7, LenFreq, LenLen, LenCode ) < 0 ) return false;

int nLenCode = 19; while ( nLenCode > 4 && LenLen[ ClenAlphabet[ nLenCode - 1 ] ] == 0 ) nLenCode -= 1;

// Now output dynamic Huffman block. For small blocks fixed coding might work better, not implemented.
PutBit( lastBlock );
PutBits( 2, 2 );
PutBits( 5, nLitCode - 257 ); PutBits( 5, nDistCode - 1 ); PutBits( 4, nLenCode - 4 );
for ( int i = 0; i < nLenCode; i += 1 ) PutBits( 3, LenLen[ ClenAlphabet[ i ] ] );

LenPass = 2; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );

got = 0; while ( got < n ) // Similar to loop above, but does output instead of computing symbol frequencies.
{
int x = IGet(); got += 1;
if ( x < 256 ) PutBits( LitLen[ x ], LitCode[ x ] );
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
PutBits( LitLen[ 257 + mc ], LitCode[ 257 + mc ] );
PutBits( MatchExtra[ mc ], x-MatchOff[ mc ] );
PutBits( DistLen[ dc ], DistCode[ dc ] );
PutBits( DistExtra[ dc ], dist-DistOff[ dc ] );
}
}
PutBits( LitLen[ 256 ], LitCode[ 256 ] ); // End of block code.
return true;
} // end DoBlock

// Run length encoding of code lengths - RFC 1951, page 13.

int LenPass, Plen, ZeroRun, Repeat;

void PutLenCode( int code ) { if ( LenPass == 1 ) LenFreq[ code ] += 1; else PutBits( LenLen[ code ], LenCode[ code ] ); }

void DoLengths( int n, byte a, bool isLit )
{
if ( isLit ) { Plen = 0; ZeroRun = 0; Repeat = 0; }
for ( int i = 0; i < n; i += 1 )
{
int len = a[ i ];
if ( len == 0 ){ EncRepeat(); ZeroRun += 1; Plen = 0; }
else if ( len == Plen ) { Repeat += 1; }
else { EncZeroRun(); EncRepeat(); PutLenCode( len ); Plen = len; }
}
if ( !isLit ) { EncZeroRun(); EncRepeat(); }
}

void EncRepeat()
{
while ( Repeat > 0 )
{
if ( Repeat < 3 ) { PutLenCode( Plen ); Repeat -= 1; }
else { int x = Repeat; if ( x > 6 ) x = 6; PutLenCode( 16 ); if ( LenPass == 2 ) PutBits( 2, x-3 ); Repeat -= x; }
}
}

void EncZeroRun()
{
while ( ZeroRun > 0 )
{
if ( ZeroRun < 3 ) { PutLenCode( 0 ); ZeroRun -= 1; }
else if ( ZeroRun < 11 ) { PutLenCode( 17 ); if ( LenPass == 2 ) PutBits( 3, ZeroRun-3 ); ZeroRun = 0; }
else { int x = ZeroRun; if ( x > 138 ) x = 138; PutLenCode( 18 ); if ( LenPass == 2 ) PutBits( 7, x - 11 ); ZeroRun -= x; }
}
}

public static uint Adler32( byte b ) // Checksum function per RFC 1950.
{
uint s1 = 1, s2 = 0;
for ( int i = 0; i < b.Length; i += 1 )
{
s1 = ( s1 + b[ i ] ) % 65521;
s2 = ( s2 + s1 ) % 65521;
}
return s2*65536 + s1;
}

static void Clear( int f ){ System.Array.Clear( f, 0, f.Length ); }

// Output stream.
byte Buf = 0, M = 1;
void PutBit( int b ) { if ( b != 0 ) Buf |= M; M <<= 1; if ( M == 0 ) { Add(Buf); Buf = 0; M = 1; } }
void PutBits( int n, int x ) { for ( int i = 0; i < n; i += 1 ) { PutBit( x & 1 ); x >>= 1; } }
void FlushBitBuf(){ while ( M != 1 ) PutBit( 0 ); }
void Put32( uint x ) { Add( (byte)( x >> 24 ) ); Add( (byte)( x >> 16 ) ); Add( (byte)( x >> 8 ) ); Add( (byte) x ); }

} // end class Encoder

//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

class Huff // Given a list of frequencies (freq), compute Huffman code lengths (nbits) and codes (tree_code).
{
public static int ComputeCode( int bitLimit, int freq, byte nbits, ushort tree_code )
{
int ncode = freq.Length;
Node heap = new Node[ ncode ];
int hn = 0;
for ( int i = 0; i < ncode; i += 1 )
{
int f = freq[ i ];
if ( f > 0 )
{
Node n = new Node();
n.Freq = f;
n.Code = (ushort)i;
HeapInsert( heap, hn, n );
hn += 1;
}
}

for ( int i = 0; i < nbits.Length; i += 1 ) nbits[ i ] = 0;
if ( hn <= 1 ) // Special case.
{ if ( hn == 1 ) nbits[ heap[ 0 ].Code ] = 1; }
else
{
while ( hn > 1 ) // Keep pairing the lowest frequency nodes.
{
Node n = new Node();
hn -= 1; n.Left = HeapRemove( heap, hn );
hn -= 1; n.Right = HeapRemove( heap, hn );
n.Freq = n.Left.Freq + n.Right.Freq;
n.Depth = (byte) ( 1 + ( n.Left.Depth > n.Right.Depth ? n.Left.Depth : n.Right.Depth ) );
HeapInsert( heap, hn, n );
hn += 1;
}
Walk( nbits, heap[ 0 ], 0 ); // Walk the tree to find the code lengths (nbits).
}

for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > bitLimit ) return -1;

// Now compute codes, code below is from RFC 1951 page 7.

int maxBits = 0;
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > maxBits ) maxBits = nbits[ i ];

int bl_count = new int[ maxBits+1 ];
for ( int i = 0; i < ncode; i += 1 ) bl_count[ nbits[ i ] ] += 1;

int next_code = new int[ maxBits+1 ];
int code = 0; bl_count[ 0 ] = 0;
for ( int i = 0; i < maxBits; i += 1 )
{
code = ( code + bl_count[ i ] ) << 1;
next_code[ i+1 ] = code;
}

for ( int i = 0; i < ncode; i += 1 )
{
int len = nbits[ i ];
if ( len != 0 )
{
tree_code[ i ] = (ushort)Reverse( next_code[ len ], len );
next_code[ len ] += 1;
}
}
while ( ncode > 0 && nbits[ ncode - 1 ] == 0 ) ncode -= 1;

//System.Console.WriteLine( "Huff.ComputeCode" );
//for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > 0 )
// System.Console.WriteLine( "i=" + i + " len=" + nbits[ i ] + " tc=" + tree_code[ i ].ToString("X") + " freq=" + freq[ i ] );

return ncode;
}

class Node{ public Node Left, Right; public int Freq; public ushort Code; public byte Depth; }

static int Reverse( int x, int bits )
{ int result = 0; for ( int i = 0; i < bits; i += 1 ) { result <<= 1; result |= x & 1; x >>= 1; } return result; }

static void Walk( byte a, Node n, int len )
{ if ( n.Left == null ) a[ n.Code ] = (byte)len; else { Walk( a, n.Left, len+1 ); Walk( a, n.Right, len+1 ); } }

static bool LessThan( Node a, Node b )
{ return a.Freq < b.Freq || a.Freq == b.Freq && a.Depth < b.Depth; }

// See e.g. https://en.wikipedia.org/wiki/Heap_(data_structure) for information on heap data structure.

static void HeapInsert( Node heap, int h, Node n ) // h is size of heap before insertion.
{
int j = h;
while ( j > 0 )
{
int p = ( j - 1 ) / 2; // Index of parent.
Node pn = heap[ p ];
if ( !LessThan( n, pn ) ) break;
heap[ j ] = pn; // Demote parent.
j = p;
}
heap[ j ] = n;
}

static Node HeapRemove( Node heap, int h ) // h is size of heap after removal.
{
Node result = heap[ 0 ];
Node n = heap[ h ];
int j = 0;
while ( true )
{
int c = j * 2 + 1; if ( c >= h ) break;
Node cn = heap[ c ];
if ( c + 1 < h )
{
Node cn2 = heap[ c + 1 ];
if ( LessThan( cn2, cn ) ) { c += 1; cn = cn2; }
}
if ( !LessThan( cn, n ) ) break;
heap[ j ] = cn; j = c;
}
heap[ j ] = n;
return result;
}

} // end class Huff









share|improve this question







New contributor




George Barwood is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.











This is a little project I did over Christmas, I wanted to understand a bit more about how RFC 1951 (https://www.ietf.org/rfc/rfc1951.txt) compression worked, mostly out of curiosity really. RFC 1951 is used to compress PNG images, zip files and portions of PDF files ( and probably other things as well ). I have tried to keep the C# code fairly simple, concise and hopefully comprehensible, rather than being too concerned with efficiency. Here's the code:



using Generic = System.Collections.Generic; 

class Encoder : Generic.List<byte> // Data compression per RFC 1950, RFC 1951.
{
public Generic.List<byte> Deflate( byte inp )
{
Clear();
Add( 0x78); Add( 0x9C ); // RFC 1950 bytes.
ReadInp( inp );
while ( DoOutput( 1 ) == 0 );
FlushBitBuf();
Put32( Adler32( inp ) ); // RFC 1950 checksum.
// System.Console.WriteLine( "Encoder.Deflate, input size=" + inp.Length + " output size=" + this.Count );
return this;
}

class PosList{ public int Pos; public PosList Next; } // List of 3-byte match positions.

void ReadInp( byte input ) // LZ77 compression, per RFC 1951.
{
Generic.Dictionary<int,PosList> dict = new Generic.Dictionary<int,PosList>();
int n = input.Length;
SetIBufSize( n );

int w = 0; // Holds last 3 bytes of input.
int todo = 0; // Number of bytes in w that have not yet been output to IBuf, can be negative when a match is found.
int pendingMatchLen = 0, pendingDist = 0;

for ( int i = 0; i < 2 && i < n; i += 1 ) { w = ( w << 8 ) | input[ i ]; todo += 1; }

for ( int i = 2; i < n; i += 1 )
{
w = ( ( w << 8 ) | input[ i ] ) & 0xffffff; todo += 1;
PosList e, x = new PosList(); x.Pos = i;
int bestMatchLen = 0, bestDist = 0;
if ( dict.TryGetValue( w, out e ) )
{
x.Next = e;
PosList p = x;
if ( todo >= 3 ) while ( e != null )
{
int dist = i - e.Pos; if ( dist > 32768 ) { p.Next = null; break; }
int matchLen = MatchLen( input, dist, i );
if ( matchLen > bestMatchLen ) { bestMatchLen = matchLen; bestDist = dist; }
p = e; e = e.Next;
}
}
dict[ w ] = x; ISpace();

// "Lazy matching" RFC 1951 p.15 : if there are overlapping matches, there is a choice over which of the match to use.
// Example: "abc012bc345.... abc345". Here abc345 can be encoded as either [abc][345] or as a[bc345].
// Since a range typically needs more bits to encode than a literal, choose the latter.

if ( pendingMatchLen > 0 )
{
if ( bestMatchLen > pendingMatchLen || bestMatchLen == pendingMatchLen && bestDist < pendingDist )
{ IPut( input[ i-3 ] ); todo -= 1; }
else // Save the pending match, suppress bestMatch if any.
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
bestMatchLen = 0;
}
pendingMatchLen = 0;
}
if ( bestMatchLen > 0 ) { pendingMatchLen = bestMatchLen; pendingDist = bestDist; }
else if ( todo == 3 ) { IPut( w >> 16 ); todo = 2; }
} // End of main input loop.
if ( pendingMatchLen > 0 )
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
}
while ( todo > 0 ){ todo -= 1; IPut( (byte)( w >> (todo*8) ) ); }
} // end ReadInp

int MatchLen( byte input, int dist, int i )
{
// We have a match of 3 bytes, this function computes total match.
int end = input.Length; if ( end - i > 256 ) end = i + 256; // Maximum match is 258.
int x = i + 1; while ( x < end && input[ x ] == input[ x-dist ] ) x += 1;
return x - i + 2;
}

ushort IBuf; // Intermediate circular buffer, holds output from LZ77 algorithm.
const int IBufSizeMax = 0x40000;
int IBufSize, IBufI, IBufJ;
void IPut( int x ) { IBuf[ IBufI++ ] = (ushort)x; if ( IBufI == IBufSize ) IBufI = 0; }
int IGet(){ int result = IBuf[ IBufJ++ ]; if ( IBufJ == IBufSize ) IBufJ = 0; return result; }
int ICount(){ if ( IBufI >= IBufJ ) return IBufI - IBufJ; else return IBufI + IBufSize - IBufJ; } // Number of values in IBuf.
void ISpace(){ while ( ICount() > IBufSize - 10 ) DoOutput( 0 ); } // Ensure IBuf has space for at least 10 values.
void SetIBufSize( int x ) { x += 20; if ( x > IBufSizeMax ) x = IBufSizeMax; if ( IBufSize < x ) { IBufSize = x; IBuf = new ushort[ x ]; } }

byte DoOutput( byte lastBlock ) // While DoBlock fails, retry with a smaller amount of input.
{
int n = ICount();
while ( !DoBlock( n, lastBlock ) ) { lastBlock = 0; n -= n / 20; }
return lastBlock;
}

// RFC 1951 encoding constants.
static byte ClenAlphabet = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
static byte MatchExtra = { 0,0,0,0, 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5, 0 };
static ushort MatchOff = { 3,4,5,6, 7,8,9,10, 11,13,15,17, 19,23,27,31, 35,43,51,59, 67,83,99,115, 131,163,195,227, 258, 0xffff };
static byte DistExtra = { 0,0,0,0, 1,1,2,2, 3,3,4,4, 5,5,6,6, 7,7,8,8, 9,9,10,10, 11,11,12,12, 13,13 };
static ushort DistOff = { 1,2,3,4, 5,7,9,13, 17,25,33,49, 65,97,129,193, 257,385,513,769, 1025,1537,2049,3073,
4097,6145,8193,12289, 16385,24577, 0xffff };

int LitFreq = new int [ 288 ], DistFreq = new int [ 32 ], LenFreq = new int [ 19 ];
byte LitLen = new byte [ 288 ], DistLen = new byte [ 32 ], LenLen = new byte [ 19 ];
ushort LitCode = new ushort[ 288 ], DistCode = new ushort[ 32 ], LenCode = new ushort[ 19 ];

bool DoBlock( int n, byte lastBlock )
{
// First compute symbol frequencies.
Clear( LitFreq ); Clear( DistFreq );
int saveIBufJ = IBufJ;
int got = 0; while ( got < n )
{
int x = IGet(); got += 1;
if ( x < 256 ) LitFreq[ x ] += 1;
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
LitFreq[ 257 + mc ] += 1;
DistFreq[ dc ] += 1;
}
}
LitFreq[ 256 ] += 1; // End of block code.
IBufJ = saveIBufJ;

// Now compute Huffman codes.
int nLitCode = Huff.ComputeCode( 15, LitFreq, LitLen, LitCode ); if ( nLitCode < 0 ) return false;
int nDistCode = Huff.ComputeCode( 15, DistFreq, DistLen, DistCode ); if ( nDistCode < 0 ) return false;
if ( nDistCode == 0 ) nDistCode = 1;

Clear( LenFreq );
LenPass = 1; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );

if ( Huff.ComputeCode( 7, LenFreq, LenLen, LenCode ) < 0 ) return false;

int nLenCode = 19; while ( nLenCode > 4 && LenLen[ ClenAlphabet[ nLenCode - 1 ] ] == 0 ) nLenCode -= 1;

// Now output dynamic Huffman block. For small blocks fixed coding might work better, not implemented.
PutBit( lastBlock );
PutBits( 2, 2 );
PutBits( 5, nLitCode - 257 ); PutBits( 5, nDistCode - 1 ); PutBits( 4, nLenCode - 4 );
for ( int i = 0; i < nLenCode; i += 1 ) PutBits( 3, LenLen[ ClenAlphabet[ i ] ] );

LenPass = 2; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );

got = 0; while ( got < n ) // Similar to loop above, but does output instead of computing symbol frequencies.
{
int x = IGet(); got += 1;
if ( x < 256 ) PutBits( LitLen[ x ], LitCode[ x ] );
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
PutBits( LitLen[ 257 + mc ], LitCode[ 257 + mc ] );
PutBits( MatchExtra[ mc ], x-MatchOff[ mc ] );
PutBits( DistLen[ dc ], DistCode[ dc ] );
PutBits( DistExtra[ dc ], dist-DistOff[ dc ] );
}
}
PutBits( LitLen[ 256 ], LitCode[ 256 ] ); // End of block code.
return true;
} // end DoBlock

// Run length encoding of code lengths - RFC 1951, page 13.

int LenPass, Plen, ZeroRun, Repeat;

void PutLenCode( int code ) { if ( LenPass == 1 ) LenFreq[ code ] += 1; else PutBits( LenLen[ code ], LenCode[ code ] ); }

void DoLengths( int n, byte a, bool isLit )
{
if ( isLit ) { Plen = 0; ZeroRun = 0; Repeat = 0; }
for ( int i = 0; i < n; i += 1 )
{
int len = a[ i ];
if ( len == 0 ){ EncRepeat(); ZeroRun += 1; Plen = 0; }
else if ( len == Plen ) { Repeat += 1; }
else { EncZeroRun(); EncRepeat(); PutLenCode( len ); Plen = len; }
}
if ( !isLit ) { EncZeroRun(); EncRepeat(); }
}

void EncRepeat()
{
while ( Repeat > 0 )
{
if ( Repeat < 3 ) { PutLenCode( Plen ); Repeat -= 1; }
else { int x = Repeat; if ( x > 6 ) x = 6; PutLenCode( 16 ); if ( LenPass == 2 ) PutBits( 2, x-3 ); Repeat -= x; }
}
}

void EncZeroRun()
{
while ( ZeroRun > 0 )
{
if ( ZeroRun < 3 ) { PutLenCode( 0 ); ZeroRun -= 1; }
else if ( ZeroRun < 11 ) { PutLenCode( 17 ); if ( LenPass == 2 ) PutBits( 3, ZeroRun-3 ); ZeroRun = 0; }
else { int x = ZeroRun; if ( x > 138 ) x = 138; PutLenCode( 18 ); if ( LenPass == 2 ) PutBits( 7, x - 11 ); ZeroRun -= x; }
}
}

public static uint Adler32( byte b ) // Checksum function per RFC 1950.
{
uint s1 = 1, s2 = 0;
for ( int i = 0; i < b.Length; i += 1 )
{
s1 = ( s1 + b[ i ] ) % 65521;
s2 = ( s2 + s1 ) % 65521;
}
return s2*65536 + s1;
}

static void Clear( int f ){ System.Array.Clear( f, 0, f.Length ); }

// Output stream.
byte Buf = 0, M = 1;
void PutBit( int b ) { if ( b != 0 ) Buf |= M; M <<= 1; if ( M == 0 ) { Add(Buf); Buf = 0; M = 1; } }
void PutBits( int n, int x ) { for ( int i = 0; i < n; i += 1 ) { PutBit( x & 1 ); x >>= 1; } }
void FlushBitBuf(){ while ( M != 1 ) PutBit( 0 ); }
void Put32( uint x ) { Add( (byte)( x >> 24 ) ); Add( (byte)( x >> 16 ) ); Add( (byte)( x >> 8 ) ); Add( (byte) x ); }

} // end class Encoder

//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

class Huff // Given a list of frequencies (freq), compute Huffman code lengths (nbits) and codes (tree_code).
{
public static int ComputeCode( int bitLimit, int freq, byte nbits, ushort tree_code )
{
int ncode = freq.Length;
Node heap = new Node[ ncode ];
int hn = 0;
for ( int i = 0; i < ncode; i += 1 )
{
int f = freq[ i ];
if ( f > 0 )
{
Node n = new Node();
n.Freq = f;
n.Code = (ushort)i;
HeapInsert( heap, hn, n );
hn += 1;
}
}

for ( int i = 0; i < nbits.Length; i += 1 ) nbits[ i ] = 0;
if ( hn <= 1 ) // Special case.
{ if ( hn == 1 ) nbits[ heap[ 0 ].Code ] = 1; }
else
{
while ( hn > 1 ) // Keep pairing the lowest frequency nodes.
{
Node n = new Node();
hn -= 1; n.Left = HeapRemove( heap, hn );
hn -= 1; n.Right = HeapRemove( heap, hn );
n.Freq = n.Left.Freq + n.Right.Freq;
n.Depth = (byte) ( 1 + ( n.Left.Depth > n.Right.Depth ? n.Left.Depth : n.Right.Depth ) );
HeapInsert( heap, hn, n );
hn += 1;
}
Walk( nbits, heap[ 0 ], 0 ); // Walk the tree to find the code lengths (nbits).
}

for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > bitLimit ) return -1;

// Now compute codes, code below is from RFC 1951 page 7.

int maxBits = 0;
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > maxBits ) maxBits = nbits[ i ];

int bl_count = new int[ maxBits+1 ];
for ( int i = 0; i < ncode; i += 1 ) bl_count[ nbits[ i ] ] += 1;

int next_code = new int[ maxBits+1 ];
int code = 0; bl_count[ 0 ] = 0;
for ( int i = 0; i < maxBits; i += 1 )
{
code = ( code + bl_count[ i ] ) << 1;
next_code[ i+1 ] = code;
}

for ( int i = 0; i < ncode; i += 1 )
{
int len = nbits[ i ];
if ( len != 0 )
{
tree_code[ i ] = (ushort)Reverse( next_code[ len ], len );
next_code[ len ] += 1;
}
}
while ( ncode > 0 && nbits[ ncode - 1 ] == 0 ) ncode -= 1;

//System.Console.WriteLine( "Huff.ComputeCode" );
//for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > 0 )
// System.Console.WriteLine( "i=" + i + " len=" + nbits[ i ] + " tc=" + tree_code[ i ].ToString("X") + " freq=" + freq[ i ] );

return ncode;
}

class Node{ public Node Left, Right; public int Freq; public ushort Code; public byte Depth; }

static int Reverse( int x, int bits )
{ int result = 0; for ( int i = 0; i < bits; i += 1 ) { result <<= 1; result |= x & 1; x >>= 1; } return result; }

static void Walk( byte a, Node n, int len )
{ if ( n.Left == null ) a[ n.Code ] = (byte)len; else { Walk( a, n.Left, len+1 ); Walk( a, n.Right, len+1 ); } }

static bool LessThan( Node a, Node b )
{ return a.Freq < b.Freq || a.Freq == b.Freq && a.Depth < b.Depth; }

// See e.g. https://en.wikipedia.org/wiki/Heap_(data_structure) for information on heap data structure.

static void HeapInsert( Node heap, int h, Node n ) // h is size of heap before insertion.
{
int j = h;
while ( j > 0 )
{
int p = ( j - 1 ) / 2; // Index of parent.
Node pn = heap[ p ];
if ( !LessThan( n, pn ) ) break;
heap[ j ] = pn; // Demote parent.
j = p;
}
heap[ j ] = n;
}

static Node HeapRemove( Node heap, int h ) // h is size of heap after removal.
{
Node result = heap[ 0 ];
Node n = heap[ h ];
int j = 0;
while ( true )
{
int c = j * 2 + 1; if ( c >= h ) break;
Node cn = heap[ c ];
if ( c + 1 < h )
{
Node cn2 = heap[ c + 1 ];
if ( LessThan( cn2, cn ) ) { c += 1; cn = cn2; }
}
if ( !LessThan( cn, n ) ) break;
heap[ j ] = cn; j = c;
}
heap[ j ] = n;
return result;
}

} // end class Huff






c# compression heap






share|improve this question







New contributor




George Barwood is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.











share|improve this question







New contributor




George Barwood is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.









share|improve this question




share|improve this question






New contributor




George Barwood is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.









asked Jan 2 at 18:00









George Barwood

396




396




New contributor




George Barwood is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.





New contributor





George Barwood is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.






George Barwood is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.








  • 4




    I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
    – t3chb0t
    Jan 2 at 19:21














  • 4




    I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
    – t3chb0t
    Jan 2 at 19:21








4




4




I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
– t3chb0t
Jan 2 at 19:21




I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
– t3chb0t
Jan 2 at 19:21










2 Answers
2






active

oldest

votes


















5














Welcome to Code Review. I am sorry but I do not find your code as comprehensible as you were hoping. I find myself more aligned to the comment by @t3chb0t.



I really must ask have you ever read any parts of the C# Naming Guidelines? I am not saying that to be mean. Given that you are a new poster, we do try to be nice but unfortunately sometimes a certain bluntness cannot be avoided.



Something to also consider is that you should favor composition over inheritance (Wikipedia link). Here is a good StackOverflow link on it too. Rather than class Encoder implementing List<byte>, it would be better to have a private field of List<byte>. This also prevents someone from inadvertently performing any method that modifies the encoded value, such as an Add, Clear, or Remove. You could then expose that field publicly as a read only value.



You have a fair amount in a section related to the intermediate circular buffer. This bugs me for several reasons. For one, I think the buffer, variables, and methods could be put into its own class, even if its a private internal class. And having variable names being with "I" conflicts with the naming guideline on many fronts: (1) variables should begin with lowercase letters, and (2) a capital "I" makes me think its an Interface.



I applaud you for emphasizing simple and concise over efficient. Next I would urge you think strongly about readability. Ask yourself how well a mediocre C# developer would understand your code without having to ask you questions about it.






share|improve this answer





















  • Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
    – George Barwood
    Jan 2 at 22:52










  • Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
    – George Barwood
    Jan 2 at 23:01








  • 2




    @GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
    – t3chb0t
    2 days ago










  • @GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
    – t3chb0t
    2 days ago





















5














Readability



To expand a bit on what Rick and t3chb0t already said, here are some specific things that hurt the readability of your code:




  • Unnecessary abbreviations and undescriptive names. Some names convey virtually no meaning at all (n, w, x, p), while others are very unclear or do not accurately describe their meaning: ReadInp apparently performs deflation, not just input reading, and what does the 'do' in DoBlock mean?

  • Stuffing multiple statements on a single line. This makes code much more difficult to 'scan', and code that's difficult to read tends to be difficult to maintain and debug.


    • Especially jarring are lines like
      int mc = 0; while (x >= MatchOff[mc]) mc += 1; mc -= 1;
      There's no clear distinction between what's part of the while loop body and what's not. This can lead to subtle problems. Putting each statement on a separate line, and indenting the loop body makes it easier to identify the control flow at a glance.

    • Declaring multiple variables of the same type at once, separated by commas, also falls under this: it requires more care to see whether each has been properly initialized, and if not, whether that was intentional or not.



  • Related to the above point, the occasional omission of braces and sometimes inconsistent indentation doesn't help either.

  • Lack of whitespace:


    • Adding an empty line between subsequent if statements makes it more clear that they're not related, which makes it easier to quickly scan control flow.

    • Leave an empty lines between methods to provide some 'breathing space'. A wall of text is more difficult to read than an article that's split up into chapters and paragraphs.



  • Magic values - specific (numeric) values whose meaning is unclear, such as 19, 256, 257 and 288. Try using properly named constants instead.


C# specific




  • It's strange to see using Generic = System.Collections.Generic;. Namespace aliases are useful to prevent name clashes, but in my experience that's rarely a problem. In this case, System.Collections.Generic is so ubiquitously used that doing anything else than using System.Collections.Generic; will only cause confusion.

  • C# supports type inference, so Dictionary<int, PosList> dict = new Dictionary<int, PosList>(); can be simplified to var dict = new Dictionary<int, PosList>();.


  • out variables can be declared in-line, so if (dict.TryGetValue(w, out e)) can be simplified to if (dict.TryGetValue(w, out PosList e)) or if (dict.TryGetValue(w, out var e)).


Design




  • Inheritance is the wrong tool here, as Rick already pointed out. Just because an encoder uses byte-sequences as input and output does not make it a list of bytes itself. Note how calling Deflate again will overwrite the result of any previous calls - that's very surprising behavior (in a negative way). Also note that a lot of state that is only useful during the actual deflation is kept around - wasting memory, and making your job more difficult because you have to remember to properly reset things.

  • I see two designs that could work well here:


    • A Stream-based design, where you provide a CompressionStream class that inherits from Stream (it's a stream that you can read decompressed data from) and that wraps another Stream (which contains the still-compressed data). This integrates well with file and network stream reading, among other things. Be sure to read up on how disposal (IDisposable) works.

    • An IEnumerable<byte>-based design, with an IEnumerable<byte> Deflate(IEnumerable<byte> compressedData) method. This lets you pass in a variety of collections, including arrays, lists and lazily evaluated sequences such as the results of Linq extension methods. Returning an IEnumerable<byte> allows you to return any of these as well, and lets you use yield, which can be useful in certain cases. Deflate(byteArray.Skip(4)), for example, deflates all but the first 4 bytes from byteArray, without any modifications to your code. And if Deflate uses yield, then Deflate(byteArray).Take(4) only deflates the first 4 bytes, while Deflate(byteArray).ToArray() gives you an array that contains all deflated data.



  • Rick also mentioned those buffer-related methods, they're better moved to a separate buffer class. The same can be said for those 'output stream' methods. Take a look at how StreamWriter lets you write text to an underlying Stream, while taking care of things like encoding and buffering. You could create a similar construction for writing bits.






share|improve this answer























  • The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
    – George Barwood
    2 days ago













Your Answer





StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});


}
});






George Barwood is a new contributor. Be nice, and check out our Code of Conduct.










draft saved

draft discarded


















StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f210757%2frfc-1951-deflate-compression%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown

























2 Answers
2






active

oldest

votes








2 Answers
2






active

oldest

votes









active

oldest

votes






active

oldest

votes









5














Welcome to Code Review. I am sorry but I do not find your code as comprehensible as you were hoping. I find myself more aligned to the comment by @t3chb0t.



I really must ask have you ever read any parts of the C# Naming Guidelines? I am not saying that to be mean. Given that you are a new poster, we do try to be nice but unfortunately sometimes a certain bluntness cannot be avoided.



Something to also consider is that you should favor composition over inheritance (Wikipedia link). Here is a good StackOverflow link on it too. Rather than class Encoder implementing List<byte>, it would be better to have a private field of List<byte>. This also prevents someone from inadvertently performing any method that modifies the encoded value, such as an Add, Clear, or Remove. You could then expose that field publicly as a read only value.



You have a fair amount in a section related to the intermediate circular buffer. This bugs me for several reasons. For one, I think the buffer, variables, and methods could be put into its own class, even if its a private internal class. And having variable names being with "I" conflicts with the naming guideline on many fronts: (1) variables should begin with lowercase letters, and (2) a capital "I" makes me think its an Interface.



I applaud you for emphasizing simple and concise over efficient. Next I would urge you think strongly about readability. Ask yourself how well a mediocre C# developer would understand your code without having to ask you questions about it.






share|improve this answer





















  • Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
    – George Barwood
    Jan 2 at 22:52










  • Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
    – George Barwood
    Jan 2 at 23:01








  • 2




    @GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
    – t3chb0t
    2 days ago










  • @GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
    – t3chb0t
    2 days ago


















5














Welcome to Code Review. I am sorry but I do not find your code as comprehensible as you were hoping. I find myself more aligned to the comment by @t3chb0t.



I really must ask have you ever read any parts of the C# Naming Guidelines? I am not saying that to be mean. Given that you are a new poster, we do try to be nice but unfortunately sometimes a certain bluntness cannot be avoided.



Something to also consider is that you should favor composition over inheritance (Wikipedia link). Here is a good StackOverflow link on it too. Rather than class Encoder implementing List<byte>, it would be better to have a private field of List<byte>. This also prevents someone from inadvertently performing any method that modifies the encoded value, such as an Add, Clear, or Remove. You could then expose that field publicly as a read only value.



You have a fair amount in a section related to the intermediate circular buffer. This bugs me for several reasons. For one, I think the buffer, variables, and methods could be put into its own class, even if its a private internal class. And having variable names being with "I" conflicts with the naming guideline on many fronts: (1) variables should begin with lowercase letters, and (2) a capital "I" makes me think its an Interface.



I applaud you for emphasizing simple and concise over efficient. Next I would urge you think strongly about readability. Ask yourself how well a mediocre C# developer would understand your code without having to ask you questions about it.






share|improve this answer





















  • Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
    – George Barwood
    Jan 2 at 22:52










  • Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
    – George Barwood
    Jan 2 at 23:01








  • 2




    @GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
    – t3chb0t
    2 days ago










  • @GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
    – t3chb0t
    2 days ago
















5












5








5






Welcome to Code Review. I am sorry but I do not find your code as comprehensible as you were hoping. I find myself more aligned to the comment by @t3chb0t.



I really must ask have you ever read any parts of the C# Naming Guidelines? I am not saying that to be mean. Given that you are a new poster, we do try to be nice but unfortunately sometimes a certain bluntness cannot be avoided.



Something to also consider is that you should favor composition over inheritance (Wikipedia link). Here is a good StackOverflow link on it too. Rather than class Encoder implementing List<byte>, it would be better to have a private field of List<byte>. This also prevents someone from inadvertently performing any method that modifies the encoded value, such as an Add, Clear, or Remove. You could then expose that field publicly as a read only value.



You have a fair amount in a section related to the intermediate circular buffer. This bugs me for several reasons. For one, I think the buffer, variables, and methods could be put into its own class, even if its a private internal class. And having variable names being with "I" conflicts with the naming guideline on many fronts: (1) variables should begin with lowercase letters, and (2) a capital "I" makes me think its an Interface.



I applaud you for emphasizing simple and concise over efficient. Next I would urge you think strongly about readability. Ask yourself how well a mediocre C# developer would understand your code without having to ask you questions about it.






share|improve this answer












Welcome to Code Review. I am sorry but I do not find your code as comprehensible as you were hoping. I find myself more aligned to the comment by @t3chb0t.



I really must ask have you ever read any parts of the C# Naming Guidelines? I am not saying that to be mean. Given that you are a new poster, we do try to be nice but unfortunately sometimes a certain bluntness cannot be avoided.



Something to also consider is that you should favor composition over inheritance (Wikipedia link). Here is a good StackOverflow link on it too. Rather than class Encoder implementing List<byte>, it would be better to have a private field of List<byte>. This also prevents someone from inadvertently performing any method that modifies the encoded value, such as an Add, Clear, or Remove. You could then expose that field publicly as a read only value.



You have a fair amount in a section related to the intermediate circular buffer. This bugs me for several reasons. For one, I think the buffer, variables, and methods could be put into its own class, even if its a private internal class. And having variable names being with "I" conflicts with the naming guideline on many fronts: (1) variables should begin with lowercase letters, and (2) a capital "I" makes me think its an Interface.



I applaud you for emphasizing simple and concise over efficient. Next I would urge you think strongly about readability. Ask yourself how well a mediocre C# developer would understand your code without having to ask you questions about it.







share|improve this answer












share|improve this answer



share|improve this answer










answered Jan 2 at 21:13









Rick Davin

3,157718




3,157718












  • Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
    – George Barwood
    Jan 2 at 22:52










  • Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
    – George Barwood
    Jan 2 at 23:01








  • 2




    @GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
    – t3chb0t
    2 days ago










  • @GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
    – t3chb0t
    2 days ago




















  • Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
    – George Barwood
    Jan 2 at 22:52










  • Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
    – George Barwood
    Jan 2 at 23:01








  • 2




    @GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
    – t3chb0t
    2 days ago










  • @GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
    – t3chb0t
    2 days ago


















Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
– George Barwood
Jan 2 at 22:52




Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
– George Barwood
Jan 2 at 22:52












Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
– George Barwood
Jan 2 at 23:01






Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
– George Barwood
Jan 2 at 23:01






2




2




@GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
– t3chb0t
2 days ago




@GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
– t3chb0t
2 days ago












@GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
– t3chb0t
2 days ago






@GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
– t3chb0t
2 days ago















5














Readability



To expand a bit on what Rick and t3chb0t already said, here are some specific things that hurt the readability of your code:




  • Unnecessary abbreviations and undescriptive names. Some names convey virtually no meaning at all (n, w, x, p), while others are very unclear or do not accurately describe their meaning: ReadInp apparently performs deflation, not just input reading, and what does the 'do' in DoBlock mean?

  • Stuffing multiple statements on a single line. This makes code much more difficult to 'scan', and code that's difficult to read tends to be difficult to maintain and debug.


    • Especially jarring are lines like
      int mc = 0; while (x >= MatchOff[mc]) mc += 1; mc -= 1;
      There's no clear distinction between what's part of the while loop body and what's not. This can lead to subtle problems. Putting each statement on a separate line, and indenting the loop body makes it easier to identify the control flow at a glance.

    • Declaring multiple variables of the same type at once, separated by commas, also falls under this: it requires more care to see whether each has been properly initialized, and if not, whether that was intentional or not.



  • Related to the above point, the occasional omission of braces and sometimes inconsistent indentation doesn't help either.

  • Lack of whitespace:


    • Adding an empty line between subsequent if statements makes it more clear that they're not related, which makes it easier to quickly scan control flow.

    • Leave an empty lines between methods to provide some 'breathing space'. A wall of text is more difficult to read than an article that's split up into chapters and paragraphs.



  • Magic values - specific (numeric) values whose meaning is unclear, such as 19, 256, 257 and 288. Try using properly named constants instead.


C# specific




  • It's strange to see using Generic = System.Collections.Generic;. Namespace aliases are useful to prevent name clashes, but in my experience that's rarely a problem. In this case, System.Collections.Generic is so ubiquitously used that doing anything else than using System.Collections.Generic; will only cause confusion.

  • C# supports type inference, so Dictionary<int, PosList> dict = new Dictionary<int, PosList>(); can be simplified to var dict = new Dictionary<int, PosList>();.


  • out variables can be declared in-line, so if (dict.TryGetValue(w, out e)) can be simplified to if (dict.TryGetValue(w, out PosList e)) or if (dict.TryGetValue(w, out var e)).


Design




  • Inheritance is the wrong tool here, as Rick already pointed out. Just because an encoder uses byte-sequences as input and output does not make it a list of bytes itself. Note how calling Deflate again will overwrite the result of any previous calls - that's very surprising behavior (in a negative way). Also note that a lot of state that is only useful during the actual deflation is kept around - wasting memory, and making your job more difficult because you have to remember to properly reset things.

  • I see two designs that could work well here:


    • A Stream-based design, where you provide a CompressionStream class that inherits from Stream (it's a stream that you can read decompressed data from) and that wraps another Stream (which contains the still-compressed data). This integrates well with file and network stream reading, among other things. Be sure to read up on how disposal (IDisposable) works.

    • An IEnumerable<byte>-based design, with an IEnumerable<byte> Deflate(IEnumerable<byte> compressedData) method. This lets you pass in a variety of collections, including arrays, lists and lazily evaluated sequences such as the results of Linq extension methods. Returning an IEnumerable<byte> allows you to return any of these as well, and lets you use yield, which can be useful in certain cases. Deflate(byteArray.Skip(4)), for example, deflates all but the first 4 bytes from byteArray, without any modifications to your code. And if Deflate uses yield, then Deflate(byteArray).Take(4) only deflates the first 4 bytes, while Deflate(byteArray).ToArray() gives you an array that contains all deflated data.



  • Rick also mentioned those buffer-related methods, they're better moved to a separate buffer class. The same can be said for those 'output stream' methods. Take a look at how StreamWriter lets you write text to an underlying Stream, while taking care of things like encoding and buffering. You could create a similar construction for writing bits.






share|improve this answer























  • The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
    – George Barwood
    2 days ago


















5














Readability



To expand a bit on what Rick and t3chb0t already said, here are some specific things that hurt the readability of your code:




  • Unnecessary abbreviations and undescriptive names. Some names convey virtually no meaning at all (n, w, x, p), while others are very unclear or do not accurately describe their meaning: ReadInp apparently performs deflation, not just input reading, and what does the 'do' in DoBlock mean?

  • Stuffing multiple statements on a single line. This makes code much more difficult to 'scan', and code that's difficult to read tends to be difficult to maintain and debug.


    • Especially jarring are lines like
      int mc = 0; while (x >= MatchOff[mc]) mc += 1; mc -= 1;
      There's no clear distinction between what's part of the while loop body and what's not. This can lead to subtle problems. Putting each statement on a separate line, and indenting the loop body makes it easier to identify the control flow at a glance.

    • Declaring multiple variables of the same type at once, separated by commas, also falls under this: it requires more care to see whether each has been properly initialized, and if not, whether that was intentional or not.



  • Related to the above point, the occasional omission of braces and sometimes inconsistent indentation doesn't help either.

  • Lack of whitespace:


    • Adding an empty line between subsequent if statements makes it more clear that they're not related, which makes it easier to quickly scan control flow.

    • Leave an empty lines between methods to provide some 'breathing space'. A wall of text is more difficult to read than an article that's split up into chapters and paragraphs.



  • Magic values - specific (numeric) values whose meaning is unclear, such as 19, 256, 257 and 288. Try using properly named constants instead.


C# specific




  • It's strange to see using Generic = System.Collections.Generic;. Namespace aliases are useful to prevent name clashes, but in my experience that's rarely a problem. In this case, System.Collections.Generic is so ubiquitously used that doing anything else than using System.Collections.Generic; will only cause confusion.

  • C# supports type inference, so Dictionary<int, PosList> dict = new Dictionary<int, PosList>(); can be simplified to var dict = new Dictionary<int, PosList>();.


  • out variables can be declared in-line, so if (dict.TryGetValue(w, out e)) can be simplified to if (dict.TryGetValue(w, out PosList e)) or if (dict.TryGetValue(w, out var e)).


Design




  • Inheritance is the wrong tool here, as Rick already pointed out. Just because an encoder uses byte-sequences as input and output does not make it a list of bytes itself. Note how calling Deflate again will overwrite the result of any previous calls - that's very surprising behavior (in a negative way). Also note that a lot of state that is only useful during the actual deflation is kept around - wasting memory, and making your job more difficult because you have to remember to properly reset things.

  • I see two designs that could work well here:


    • A Stream-based design, where you provide a CompressionStream class that inherits from Stream (it's a stream that you can read decompressed data from) and that wraps another Stream (which contains the still-compressed data). This integrates well with file and network stream reading, among other things. Be sure to read up on how disposal (IDisposable) works.

    • An IEnumerable<byte>-based design, with an IEnumerable<byte> Deflate(IEnumerable<byte> compressedData) method. This lets you pass in a variety of collections, including arrays, lists and lazily evaluated sequences such as the results of Linq extension methods. Returning an IEnumerable<byte> allows you to return any of these as well, and lets you use yield, which can be useful in certain cases. Deflate(byteArray.Skip(4)), for example, deflates all but the first 4 bytes from byteArray, without any modifications to your code. And if Deflate uses yield, then Deflate(byteArray).Take(4) only deflates the first 4 bytes, while Deflate(byteArray).ToArray() gives you an array that contains all deflated data.



  • Rick also mentioned those buffer-related methods, they're better moved to a separate buffer class. The same can be said for those 'output stream' methods. Take a look at how StreamWriter lets you write text to an underlying Stream, while taking care of things like encoding and buffering. You could create a similar construction for writing bits.






share|improve this answer























  • The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
    – George Barwood
    2 days ago
















5












5








5






Readability



To expand a bit on what Rick and t3chb0t already said, here are some specific things that hurt the readability of your code:




  • Unnecessary abbreviations and undescriptive names. Some names convey virtually no meaning at all (n, w, x, p), while others are very unclear or do not accurately describe their meaning: ReadInp apparently performs deflation, not just input reading, and what does the 'do' in DoBlock mean?

  • Stuffing multiple statements on a single line. This makes code much more difficult to 'scan', and code that's difficult to read tends to be difficult to maintain and debug.


    • Especially jarring are lines like
      int mc = 0; while (x >= MatchOff[mc]) mc += 1; mc -= 1;
      There's no clear distinction between what's part of the while loop body and what's not. This can lead to subtle problems. Putting each statement on a separate line, and indenting the loop body makes it easier to identify the control flow at a glance.

    • Declaring multiple variables of the same type at once, separated by commas, also falls under this: it requires more care to see whether each has been properly initialized, and if not, whether that was intentional or not.



  • Related to the above point, the occasional omission of braces and sometimes inconsistent indentation doesn't help either.

  • Lack of whitespace:


    • Adding an empty line between subsequent if statements makes it more clear that they're not related, which makes it easier to quickly scan control flow.

    • Leave an empty lines between methods to provide some 'breathing space'. A wall of text is more difficult to read than an article that's split up into chapters and paragraphs.



  • Magic values - specific (numeric) values whose meaning is unclear, such as 19, 256, 257 and 288. Try using properly named constants instead.


C# specific




  • It's strange to see using Generic = System.Collections.Generic;. Namespace aliases are useful to prevent name clashes, but in my experience that's rarely a problem. In this case, System.Collections.Generic is so ubiquitously used that doing anything else than using System.Collections.Generic; will only cause confusion.

  • C# supports type inference, so Dictionary<int, PosList> dict = new Dictionary<int, PosList>(); can be simplified to var dict = new Dictionary<int, PosList>();.


  • out variables can be declared in-line, so if (dict.TryGetValue(w, out e)) can be simplified to if (dict.TryGetValue(w, out PosList e)) or if (dict.TryGetValue(w, out var e)).


Design




  • Inheritance is the wrong tool here, as Rick already pointed out. Just because an encoder uses byte-sequences as input and output does not make it a list of bytes itself. Note how calling Deflate again will overwrite the result of any previous calls - that's very surprising behavior (in a negative way). Also note that a lot of state that is only useful during the actual deflation is kept around - wasting memory, and making your job more difficult because you have to remember to properly reset things.

  • I see two designs that could work well here:


    • A Stream-based design, where you provide a CompressionStream class that inherits from Stream (it's a stream that you can read decompressed data from) and that wraps another Stream (which contains the still-compressed data). This integrates well with file and network stream reading, among other things. Be sure to read up on how disposal (IDisposable) works.

    • An IEnumerable<byte>-based design, with an IEnumerable<byte> Deflate(IEnumerable<byte> compressedData) method. This lets you pass in a variety of collections, including arrays, lists and lazily evaluated sequences such as the results of Linq extension methods. Returning an IEnumerable<byte> allows you to return any of these as well, and lets you use yield, which can be useful in certain cases. Deflate(byteArray.Skip(4)), for example, deflates all but the first 4 bytes from byteArray, without any modifications to your code. And if Deflate uses yield, then Deflate(byteArray).Take(4) only deflates the first 4 bytes, while Deflate(byteArray).ToArray() gives you an array that contains all deflated data.



  • Rick also mentioned those buffer-related methods, they're better moved to a separate buffer class. The same can be said for those 'output stream' methods. Take a look at how StreamWriter lets you write text to an underlying Stream, while taking care of things like encoding and buffering. You could create a similar construction for writing bits.






share|improve this answer














Readability



To expand a bit on what Rick and t3chb0t already said, here are some specific things that hurt the readability of your code:




  • Unnecessary abbreviations and undescriptive names. Some names convey virtually no meaning at all (n, w, x, p), while others are very unclear or do not accurately describe their meaning: ReadInp apparently performs deflation, not just input reading, and what does the 'do' in DoBlock mean?

  • Stuffing multiple statements on a single line. This makes code much more difficult to 'scan', and code that's difficult to read tends to be difficult to maintain and debug.


    • Especially jarring are lines like
      int mc = 0; while (x >= MatchOff[mc]) mc += 1; mc -= 1;
      There's no clear distinction between what's part of the while loop body and what's not. This can lead to subtle problems. Putting each statement on a separate line, and indenting the loop body makes it easier to identify the control flow at a glance.

    • Declaring multiple variables of the same type at once, separated by commas, also falls under this: it requires more care to see whether each has been properly initialized, and if not, whether that was intentional or not.



  • Related to the above point, the occasional omission of braces and sometimes inconsistent indentation doesn't help either.

  • Lack of whitespace:


    • Adding an empty line between subsequent if statements makes it more clear that they're not related, which makes it easier to quickly scan control flow.

    • Leave an empty lines between methods to provide some 'breathing space'. A wall of text is more difficult to read than an article that's split up into chapters and paragraphs.



  • Magic values - specific (numeric) values whose meaning is unclear, such as 19, 256, 257 and 288. Try using properly named constants instead.


C# specific




  • It's strange to see using Generic = System.Collections.Generic;. Namespace aliases are useful to prevent name clashes, but in my experience that's rarely a problem. In this case, System.Collections.Generic is so ubiquitously used that doing anything else than using System.Collections.Generic; will only cause confusion.

  • C# supports type inference, so Dictionary<int, PosList> dict = new Dictionary<int, PosList>(); can be simplified to var dict = new Dictionary<int, PosList>();.


  • out variables can be declared in-line, so if (dict.TryGetValue(w, out e)) can be simplified to if (dict.TryGetValue(w, out PosList e)) or if (dict.TryGetValue(w, out var e)).


Design




  • Inheritance is the wrong tool here, as Rick already pointed out. Just because an encoder uses byte-sequences as input and output does not make it a list of bytes itself. Note how calling Deflate again will overwrite the result of any previous calls - that's very surprising behavior (in a negative way). Also note that a lot of state that is only useful during the actual deflation is kept around - wasting memory, and making your job more difficult because you have to remember to properly reset things.

  • I see two designs that could work well here:


    • A Stream-based design, where you provide a CompressionStream class that inherits from Stream (it's a stream that you can read decompressed data from) and that wraps another Stream (which contains the still-compressed data). This integrates well with file and network stream reading, among other things. Be sure to read up on how disposal (IDisposable) works.

    • An IEnumerable<byte>-based design, with an IEnumerable<byte> Deflate(IEnumerable<byte> compressedData) method. This lets you pass in a variety of collections, including arrays, lists and lazily evaluated sequences such as the results of Linq extension methods. Returning an IEnumerable<byte> allows you to return any of these as well, and lets you use yield, which can be useful in certain cases. Deflate(byteArray.Skip(4)), for example, deflates all but the first 4 bytes from byteArray, without any modifications to your code. And if Deflate uses yield, then Deflate(byteArray).Take(4) only deflates the first 4 bytes, while Deflate(byteArray).ToArray() gives you an array that contains all deflated data.



  • Rick also mentioned those buffer-related methods, they're better moved to a separate buffer class. The same can be said for those 'output stream' methods. Take a look at how StreamWriter lets you write text to an underlying Stream, while taking care of things like encoding and buffering. You could create a similar construction for writing bits.







share|improve this answer














share|improve this answer



share|improve this answer








edited 2 days ago









Blackwood

1358




1358










answered 2 days ago









Pieter Witvoet

5,801725




5,801725












  • The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
    – George Barwood
    2 days ago




















  • The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
    – George Barwood
    2 days ago


















The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
– George Barwood
2 days ago






The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
– George Barwood
2 days ago












George Barwood is a new contributor. Be nice, and check out our Code of Conduct.










draft saved

draft discarded


















George Barwood is a new contributor. Be nice, and check out our Code of Conduct.













George Barwood is a new contributor. Be nice, and check out our Code of Conduct.












George Barwood is a new contributor. Be nice, and check out our Code of Conduct.
















Thanks for contributing an answer to Code Review Stack Exchange!


  • Please be sure to answer the question. Provide details and share your research!

But avoid



  • Asking for help, clarification, or responding to other answers.

  • Making statements based on opinion; back them up with references or personal experience.


Use MathJax to format equations. MathJax reference.


To learn more, see our tips on writing great answers.





Some of your past answers have not been well-received, and you're in danger of being blocked from answering.


Please pay close attention to the following guidance:


  • Please be sure to answer the question. Provide details and share your research!

But avoid



  • Asking for help, clarification, or responding to other answers.

  • Making statements based on opinion; back them up with references or personal experience.


To learn more, see our tips on writing great answers.




draft saved


draft discarded














StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f210757%2frfc-1951-deflate-compression%23new-answer', 'question_page');
}
);

Post as a guest















Required, but never shown





















































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown

































Required, but never shown














Required, but never shown












Required, but never shown







Required, but never shown







Popular posts from this blog

Список кардиналов, возведённых папой римским Каликстом III

Deduzione

Mysql.sock missing - “Can't connect to local MySQL server through socket”