RFC 1951 “Deflate” compression
This is a little project I did over Christmas, I wanted to understand a bit more about how RFC 1951 (https://www.ietf.org/rfc/rfc1951.txt) compression worked, mostly out of curiosity really. RFC 1951 is used to compress PNG images, zip files and portions of PDF files ( and probably other things as well ). I have tried to keep the C# code fairly simple, concise and hopefully comprehensible, rather than being too concerned with efficiency. Here's the code:
using Generic = System.Collections.Generic;
class Encoder : Generic.List<byte> // Data compression per RFC 1950, RFC 1951.
{
public Generic.List<byte> Deflate( byte inp )
{
Clear();
Add( 0x78); Add( 0x9C ); // RFC 1950 bytes.
ReadInp( inp );
while ( DoOutput( 1 ) == 0 );
FlushBitBuf();
Put32( Adler32( inp ) ); // RFC 1950 checksum.
// System.Console.WriteLine( "Encoder.Deflate, input size=" + inp.Length + " output size=" + this.Count );
return this;
}
class PosList{ public int Pos; public PosList Next; } // List of 3-byte match positions.
void ReadInp( byte input ) // LZ77 compression, per RFC 1951.
{
Generic.Dictionary<int,PosList> dict = new Generic.Dictionary<int,PosList>();
int n = input.Length;
SetIBufSize( n );
int w = 0; // Holds last 3 bytes of input.
int todo = 0; // Number of bytes in w that have not yet been output to IBuf, can be negative when a match is found.
int pendingMatchLen = 0, pendingDist = 0;
for ( int i = 0; i < 2 && i < n; i += 1 ) { w = ( w << 8 ) | input[ i ]; todo += 1; }
for ( int i = 2; i < n; i += 1 )
{
w = ( ( w << 8 ) | input[ i ] ) & 0xffffff; todo += 1;
PosList e, x = new PosList(); x.Pos = i;
int bestMatchLen = 0, bestDist = 0;
if ( dict.TryGetValue( w, out e ) )
{
x.Next = e;
PosList p = x;
if ( todo >= 3 ) while ( e != null )
{
int dist = i - e.Pos; if ( dist > 32768 ) { p.Next = null; break; }
int matchLen = MatchLen( input, dist, i );
if ( matchLen > bestMatchLen ) { bestMatchLen = matchLen; bestDist = dist; }
p = e; e = e.Next;
}
}
dict[ w ] = x; ISpace();
// "Lazy matching" RFC 1951 p.15 : if there are overlapping matches, there is a choice over which of the match to use.
// Example: "abc012bc345.... abc345". Here abc345 can be encoded as either [abc][345] or as a[bc345].
// Since a range typically needs more bits to encode than a literal, choose the latter.
if ( pendingMatchLen > 0 )
{
if ( bestMatchLen > pendingMatchLen || bestMatchLen == pendingMatchLen && bestDist < pendingDist )
{ IPut( input[ i-3 ] ); todo -= 1; }
else // Save the pending match, suppress bestMatch if any.
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
bestMatchLen = 0;
}
pendingMatchLen = 0;
}
if ( bestMatchLen > 0 ) { pendingMatchLen = bestMatchLen; pendingDist = bestDist; }
else if ( todo == 3 ) { IPut( w >> 16 ); todo = 2; }
} // End of main input loop.
if ( pendingMatchLen > 0 )
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
}
while ( todo > 0 ){ todo -= 1; IPut( (byte)( w >> (todo*8) ) ); }
} // end ReadInp
int MatchLen( byte input, int dist, int i )
{
// We have a match of 3 bytes, this function computes total match.
int end = input.Length; if ( end - i > 256 ) end = i + 256; // Maximum match is 258.
int x = i + 1; while ( x < end && input[ x ] == input[ x-dist ] ) x += 1;
return x - i + 2;
}
ushort IBuf; // Intermediate circular buffer, holds output from LZ77 algorithm.
const int IBufSizeMax = 0x40000;
int IBufSize, IBufI, IBufJ;
void IPut( int x ) { IBuf[ IBufI++ ] = (ushort)x; if ( IBufI == IBufSize ) IBufI = 0; }
int IGet(){ int result = IBuf[ IBufJ++ ]; if ( IBufJ == IBufSize ) IBufJ = 0; return result; }
int ICount(){ if ( IBufI >= IBufJ ) return IBufI - IBufJ; else return IBufI + IBufSize - IBufJ; } // Number of values in IBuf.
void ISpace(){ while ( ICount() > IBufSize - 10 ) DoOutput( 0 ); } // Ensure IBuf has space for at least 10 values.
void SetIBufSize( int x ) { x += 20; if ( x > IBufSizeMax ) x = IBufSizeMax; if ( IBufSize < x ) { IBufSize = x; IBuf = new ushort[ x ]; } }
byte DoOutput( byte lastBlock ) // While DoBlock fails, retry with a smaller amount of input.
{
int n = ICount();
while ( !DoBlock( n, lastBlock ) ) { lastBlock = 0; n -= n / 20; }
return lastBlock;
}
// RFC 1951 encoding constants.
static byte ClenAlphabet = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
static byte MatchExtra = { 0,0,0,0, 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5, 0 };
static ushort MatchOff = { 3,4,5,6, 7,8,9,10, 11,13,15,17, 19,23,27,31, 35,43,51,59, 67,83,99,115, 131,163,195,227, 258, 0xffff };
static byte DistExtra = { 0,0,0,0, 1,1,2,2, 3,3,4,4, 5,5,6,6, 7,7,8,8, 9,9,10,10, 11,11,12,12, 13,13 };
static ushort DistOff = { 1,2,3,4, 5,7,9,13, 17,25,33,49, 65,97,129,193, 257,385,513,769, 1025,1537,2049,3073,
4097,6145,8193,12289, 16385,24577, 0xffff };
int LitFreq = new int [ 288 ], DistFreq = new int [ 32 ], LenFreq = new int [ 19 ];
byte LitLen = new byte [ 288 ], DistLen = new byte [ 32 ], LenLen = new byte [ 19 ];
ushort LitCode = new ushort[ 288 ], DistCode = new ushort[ 32 ], LenCode = new ushort[ 19 ];
bool DoBlock( int n, byte lastBlock )
{
// First compute symbol frequencies.
Clear( LitFreq ); Clear( DistFreq );
int saveIBufJ = IBufJ;
int got = 0; while ( got < n )
{
int x = IGet(); got += 1;
if ( x < 256 ) LitFreq[ x ] += 1;
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
LitFreq[ 257 + mc ] += 1;
DistFreq[ dc ] += 1;
}
}
LitFreq[ 256 ] += 1; // End of block code.
IBufJ = saveIBufJ;
// Now compute Huffman codes.
int nLitCode = Huff.ComputeCode( 15, LitFreq, LitLen, LitCode ); if ( nLitCode < 0 ) return false;
int nDistCode = Huff.ComputeCode( 15, DistFreq, DistLen, DistCode ); if ( nDistCode < 0 ) return false;
if ( nDistCode == 0 ) nDistCode = 1;
Clear( LenFreq );
LenPass = 1; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );
if ( Huff.ComputeCode( 7, LenFreq, LenLen, LenCode ) < 0 ) return false;
int nLenCode = 19; while ( nLenCode > 4 && LenLen[ ClenAlphabet[ nLenCode - 1 ] ] == 0 ) nLenCode -= 1;
// Now output dynamic Huffman block. For small blocks fixed coding might work better, not implemented.
PutBit( lastBlock );
PutBits( 2, 2 );
PutBits( 5, nLitCode - 257 ); PutBits( 5, nDistCode - 1 ); PutBits( 4, nLenCode - 4 );
for ( int i = 0; i < nLenCode; i += 1 ) PutBits( 3, LenLen[ ClenAlphabet[ i ] ] );
LenPass = 2; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );
got = 0; while ( got < n ) // Similar to loop above, but does output instead of computing symbol frequencies.
{
int x = IGet(); got += 1;
if ( x < 256 ) PutBits( LitLen[ x ], LitCode[ x ] );
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
PutBits( LitLen[ 257 + mc ], LitCode[ 257 + mc ] );
PutBits( MatchExtra[ mc ], x-MatchOff[ mc ] );
PutBits( DistLen[ dc ], DistCode[ dc ] );
PutBits( DistExtra[ dc ], dist-DistOff[ dc ] );
}
}
PutBits( LitLen[ 256 ], LitCode[ 256 ] ); // End of block code.
return true;
} // end DoBlock
// Run length encoding of code lengths - RFC 1951, page 13.
int LenPass, Plen, ZeroRun, Repeat;
void PutLenCode( int code ) { if ( LenPass == 1 ) LenFreq[ code ] += 1; else PutBits( LenLen[ code ], LenCode[ code ] ); }
void DoLengths( int n, byte a, bool isLit )
{
if ( isLit ) { Plen = 0; ZeroRun = 0; Repeat = 0; }
for ( int i = 0; i < n; i += 1 )
{
int len = a[ i ];
if ( len == 0 ){ EncRepeat(); ZeroRun += 1; Plen = 0; }
else if ( len == Plen ) { Repeat += 1; }
else { EncZeroRun(); EncRepeat(); PutLenCode( len ); Plen = len; }
}
if ( !isLit ) { EncZeroRun(); EncRepeat(); }
}
void EncRepeat()
{
while ( Repeat > 0 )
{
if ( Repeat < 3 ) { PutLenCode( Plen ); Repeat -= 1; }
else { int x = Repeat; if ( x > 6 ) x = 6; PutLenCode( 16 ); if ( LenPass == 2 ) PutBits( 2, x-3 ); Repeat -= x; }
}
}
void EncZeroRun()
{
while ( ZeroRun > 0 )
{
if ( ZeroRun < 3 ) { PutLenCode( 0 ); ZeroRun -= 1; }
else if ( ZeroRun < 11 ) { PutLenCode( 17 ); if ( LenPass == 2 ) PutBits( 3, ZeroRun-3 ); ZeroRun = 0; }
else { int x = ZeroRun; if ( x > 138 ) x = 138; PutLenCode( 18 ); if ( LenPass == 2 ) PutBits( 7, x - 11 ); ZeroRun -= x; }
}
}
public static uint Adler32( byte b ) // Checksum function per RFC 1950.
{
uint s1 = 1, s2 = 0;
for ( int i = 0; i < b.Length; i += 1 )
{
s1 = ( s1 + b[ i ] ) % 65521;
s2 = ( s2 + s1 ) % 65521;
}
return s2*65536 + s1;
}
static void Clear( int f ){ System.Array.Clear( f, 0, f.Length ); }
// Output stream.
byte Buf = 0, M = 1;
void PutBit( int b ) { if ( b != 0 ) Buf |= M; M <<= 1; if ( M == 0 ) { Add(Buf); Buf = 0; M = 1; } }
void PutBits( int n, int x ) { for ( int i = 0; i < n; i += 1 ) { PutBit( x & 1 ); x >>= 1; } }
void FlushBitBuf(){ while ( M != 1 ) PutBit( 0 ); }
void Put32( uint x ) { Add( (byte)( x >> 24 ) ); Add( (byte)( x >> 16 ) ); Add( (byte)( x >> 8 ) ); Add( (byte) x ); }
} // end class Encoder
//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class Huff // Given a list of frequencies (freq), compute Huffman code lengths (nbits) and codes (tree_code).
{
public static int ComputeCode( int bitLimit, int freq, byte nbits, ushort tree_code )
{
int ncode = freq.Length;
Node heap = new Node[ ncode ];
int hn = 0;
for ( int i = 0; i < ncode; i += 1 )
{
int f = freq[ i ];
if ( f > 0 )
{
Node n = new Node();
n.Freq = f;
n.Code = (ushort)i;
HeapInsert( heap, hn, n );
hn += 1;
}
}
for ( int i = 0; i < nbits.Length; i += 1 ) nbits[ i ] = 0;
if ( hn <= 1 ) // Special case.
{ if ( hn == 1 ) nbits[ heap[ 0 ].Code ] = 1; }
else
{
while ( hn > 1 ) // Keep pairing the lowest frequency nodes.
{
Node n = new Node();
hn -= 1; n.Left = HeapRemove( heap, hn );
hn -= 1; n.Right = HeapRemove( heap, hn );
n.Freq = n.Left.Freq + n.Right.Freq;
n.Depth = (byte) ( 1 + ( n.Left.Depth > n.Right.Depth ? n.Left.Depth : n.Right.Depth ) );
HeapInsert( heap, hn, n );
hn += 1;
}
Walk( nbits, heap[ 0 ], 0 ); // Walk the tree to find the code lengths (nbits).
}
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > bitLimit ) return -1;
// Now compute codes, code below is from RFC 1951 page 7.
int maxBits = 0;
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > maxBits ) maxBits = nbits[ i ];
int bl_count = new int[ maxBits+1 ];
for ( int i = 0; i < ncode; i += 1 ) bl_count[ nbits[ i ] ] += 1;
int next_code = new int[ maxBits+1 ];
int code = 0; bl_count[ 0 ] = 0;
for ( int i = 0; i < maxBits; i += 1 )
{
code = ( code + bl_count[ i ] ) << 1;
next_code[ i+1 ] = code;
}
for ( int i = 0; i < ncode; i += 1 )
{
int len = nbits[ i ];
if ( len != 0 )
{
tree_code[ i ] = (ushort)Reverse( next_code[ len ], len );
next_code[ len ] += 1;
}
}
while ( ncode > 0 && nbits[ ncode - 1 ] == 0 ) ncode -= 1;
//System.Console.WriteLine( "Huff.ComputeCode" );
//for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > 0 )
// System.Console.WriteLine( "i=" + i + " len=" + nbits[ i ] + " tc=" + tree_code[ i ].ToString("X") + " freq=" + freq[ i ] );
return ncode;
}
class Node{ public Node Left, Right; public int Freq; public ushort Code; public byte Depth; }
static int Reverse( int x, int bits )
{ int result = 0; for ( int i = 0; i < bits; i += 1 ) { result <<= 1; result |= x & 1; x >>= 1; } return result; }
static void Walk( byte a, Node n, int len )
{ if ( n.Left == null ) a[ n.Code ] = (byte)len; else { Walk( a, n.Left, len+1 ); Walk( a, n.Right, len+1 ); } }
static bool LessThan( Node a, Node b )
{ return a.Freq < b.Freq || a.Freq == b.Freq && a.Depth < b.Depth; }
// See e.g. https://en.wikipedia.org/wiki/Heap_(data_structure) for information on heap data structure.
static void HeapInsert( Node heap, int h, Node n ) // h is size of heap before insertion.
{
int j = h;
while ( j > 0 )
{
int p = ( j - 1 ) / 2; // Index of parent.
Node pn = heap[ p ];
if ( !LessThan( n, pn ) ) break;
heap[ j ] = pn; // Demote parent.
j = p;
}
heap[ j ] = n;
}
static Node HeapRemove( Node heap, int h ) // h is size of heap after removal.
{
Node result = heap[ 0 ];
Node n = heap[ h ];
int j = 0;
while ( true )
{
int c = j * 2 + 1; if ( c >= h ) break;
Node cn = heap[ c ];
if ( c + 1 < h )
{
Node cn2 = heap[ c + 1 ];
if ( LessThan( cn2, cn ) ) { c += 1; cn = cn2; }
}
if ( !LessThan( cn, n ) ) break;
heap[ j ] = cn; j = c;
}
heap[ j ] = n;
return result;
}
} // end class Huff
c# compression heap
New contributor
add a comment |
This is a little project I did over Christmas, I wanted to understand a bit more about how RFC 1951 (https://www.ietf.org/rfc/rfc1951.txt) compression worked, mostly out of curiosity really. RFC 1951 is used to compress PNG images, zip files and portions of PDF files ( and probably other things as well ). I have tried to keep the C# code fairly simple, concise and hopefully comprehensible, rather than being too concerned with efficiency. Here's the code:
using Generic = System.Collections.Generic;
class Encoder : Generic.List<byte> // Data compression per RFC 1950, RFC 1951.
{
public Generic.List<byte> Deflate( byte inp )
{
Clear();
Add( 0x78); Add( 0x9C ); // RFC 1950 bytes.
ReadInp( inp );
while ( DoOutput( 1 ) == 0 );
FlushBitBuf();
Put32( Adler32( inp ) ); // RFC 1950 checksum.
// System.Console.WriteLine( "Encoder.Deflate, input size=" + inp.Length + " output size=" + this.Count );
return this;
}
class PosList{ public int Pos; public PosList Next; } // List of 3-byte match positions.
void ReadInp( byte input ) // LZ77 compression, per RFC 1951.
{
Generic.Dictionary<int,PosList> dict = new Generic.Dictionary<int,PosList>();
int n = input.Length;
SetIBufSize( n );
int w = 0; // Holds last 3 bytes of input.
int todo = 0; // Number of bytes in w that have not yet been output to IBuf, can be negative when a match is found.
int pendingMatchLen = 0, pendingDist = 0;
for ( int i = 0; i < 2 && i < n; i += 1 ) { w = ( w << 8 ) | input[ i ]; todo += 1; }
for ( int i = 2; i < n; i += 1 )
{
w = ( ( w << 8 ) | input[ i ] ) & 0xffffff; todo += 1;
PosList e, x = new PosList(); x.Pos = i;
int bestMatchLen = 0, bestDist = 0;
if ( dict.TryGetValue( w, out e ) )
{
x.Next = e;
PosList p = x;
if ( todo >= 3 ) while ( e != null )
{
int dist = i - e.Pos; if ( dist > 32768 ) { p.Next = null; break; }
int matchLen = MatchLen( input, dist, i );
if ( matchLen > bestMatchLen ) { bestMatchLen = matchLen; bestDist = dist; }
p = e; e = e.Next;
}
}
dict[ w ] = x; ISpace();
// "Lazy matching" RFC 1951 p.15 : if there are overlapping matches, there is a choice over which of the match to use.
// Example: "abc012bc345.... abc345". Here abc345 can be encoded as either [abc][345] or as a[bc345].
// Since a range typically needs more bits to encode than a literal, choose the latter.
if ( pendingMatchLen > 0 )
{
if ( bestMatchLen > pendingMatchLen || bestMatchLen == pendingMatchLen && bestDist < pendingDist )
{ IPut( input[ i-3 ] ); todo -= 1; }
else // Save the pending match, suppress bestMatch if any.
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
bestMatchLen = 0;
}
pendingMatchLen = 0;
}
if ( bestMatchLen > 0 ) { pendingMatchLen = bestMatchLen; pendingDist = bestDist; }
else if ( todo == 3 ) { IPut( w >> 16 ); todo = 2; }
} // End of main input loop.
if ( pendingMatchLen > 0 )
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
}
while ( todo > 0 ){ todo -= 1; IPut( (byte)( w >> (todo*8) ) ); }
} // end ReadInp
int MatchLen( byte input, int dist, int i )
{
// We have a match of 3 bytes, this function computes total match.
int end = input.Length; if ( end - i > 256 ) end = i + 256; // Maximum match is 258.
int x = i + 1; while ( x < end && input[ x ] == input[ x-dist ] ) x += 1;
return x - i + 2;
}
ushort IBuf; // Intermediate circular buffer, holds output from LZ77 algorithm.
const int IBufSizeMax = 0x40000;
int IBufSize, IBufI, IBufJ;
void IPut( int x ) { IBuf[ IBufI++ ] = (ushort)x; if ( IBufI == IBufSize ) IBufI = 0; }
int IGet(){ int result = IBuf[ IBufJ++ ]; if ( IBufJ == IBufSize ) IBufJ = 0; return result; }
int ICount(){ if ( IBufI >= IBufJ ) return IBufI - IBufJ; else return IBufI + IBufSize - IBufJ; } // Number of values in IBuf.
void ISpace(){ while ( ICount() > IBufSize - 10 ) DoOutput( 0 ); } // Ensure IBuf has space for at least 10 values.
void SetIBufSize( int x ) { x += 20; if ( x > IBufSizeMax ) x = IBufSizeMax; if ( IBufSize < x ) { IBufSize = x; IBuf = new ushort[ x ]; } }
byte DoOutput( byte lastBlock ) // While DoBlock fails, retry with a smaller amount of input.
{
int n = ICount();
while ( !DoBlock( n, lastBlock ) ) { lastBlock = 0; n -= n / 20; }
return lastBlock;
}
// RFC 1951 encoding constants.
static byte ClenAlphabet = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
static byte MatchExtra = { 0,0,0,0, 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5, 0 };
static ushort MatchOff = { 3,4,5,6, 7,8,9,10, 11,13,15,17, 19,23,27,31, 35,43,51,59, 67,83,99,115, 131,163,195,227, 258, 0xffff };
static byte DistExtra = { 0,0,0,0, 1,1,2,2, 3,3,4,4, 5,5,6,6, 7,7,8,8, 9,9,10,10, 11,11,12,12, 13,13 };
static ushort DistOff = { 1,2,3,4, 5,7,9,13, 17,25,33,49, 65,97,129,193, 257,385,513,769, 1025,1537,2049,3073,
4097,6145,8193,12289, 16385,24577, 0xffff };
int LitFreq = new int [ 288 ], DistFreq = new int [ 32 ], LenFreq = new int [ 19 ];
byte LitLen = new byte [ 288 ], DistLen = new byte [ 32 ], LenLen = new byte [ 19 ];
ushort LitCode = new ushort[ 288 ], DistCode = new ushort[ 32 ], LenCode = new ushort[ 19 ];
bool DoBlock( int n, byte lastBlock )
{
// First compute symbol frequencies.
Clear( LitFreq ); Clear( DistFreq );
int saveIBufJ = IBufJ;
int got = 0; while ( got < n )
{
int x = IGet(); got += 1;
if ( x < 256 ) LitFreq[ x ] += 1;
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
LitFreq[ 257 + mc ] += 1;
DistFreq[ dc ] += 1;
}
}
LitFreq[ 256 ] += 1; // End of block code.
IBufJ = saveIBufJ;
// Now compute Huffman codes.
int nLitCode = Huff.ComputeCode( 15, LitFreq, LitLen, LitCode ); if ( nLitCode < 0 ) return false;
int nDistCode = Huff.ComputeCode( 15, DistFreq, DistLen, DistCode ); if ( nDistCode < 0 ) return false;
if ( nDistCode == 0 ) nDistCode = 1;
Clear( LenFreq );
LenPass = 1; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );
if ( Huff.ComputeCode( 7, LenFreq, LenLen, LenCode ) < 0 ) return false;
int nLenCode = 19; while ( nLenCode > 4 && LenLen[ ClenAlphabet[ nLenCode - 1 ] ] == 0 ) nLenCode -= 1;
// Now output dynamic Huffman block. For small blocks fixed coding might work better, not implemented.
PutBit( lastBlock );
PutBits( 2, 2 );
PutBits( 5, nLitCode - 257 ); PutBits( 5, nDistCode - 1 ); PutBits( 4, nLenCode - 4 );
for ( int i = 0; i < nLenCode; i += 1 ) PutBits( 3, LenLen[ ClenAlphabet[ i ] ] );
LenPass = 2; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );
got = 0; while ( got < n ) // Similar to loop above, but does output instead of computing symbol frequencies.
{
int x = IGet(); got += 1;
if ( x < 256 ) PutBits( LitLen[ x ], LitCode[ x ] );
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
PutBits( LitLen[ 257 + mc ], LitCode[ 257 + mc ] );
PutBits( MatchExtra[ mc ], x-MatchOff[ mc ] );
PutBits( DistLen[ dc ], DistCode[ dc ] );
PutBits( DistExtra[ dc ], dist-DistOff[ dc ] );
}
}
PutBits( LitLen[ 256 ], LitCode[ 256 ] ); // End of block code.
return true;
} // end DoBlock
// Run length encoding of code lengths - RFC 1951, page 13.
int LenPass, Plen, ZeroRun, Repeat;
void PutLenCode( int code ) { if ( LenPass == 1 ) LenFreq[ code ] += 1; else PutBits( LenLen[ code ], LenCode[ code ] ); }
void DoLengths( int n, byte a, bool isLit )
{
if ( isLit ) { Plen = 0; ZeroRun = 0; Repeat = 0; }
for ( int i = 0; i < n; i += 1 )
{
int len = a[ i ];
if ( len == 0 ){ EncRepeat(); ZeroRun += 1; Plen = 0; }
else if ( len == Plen ) { Repeat += 1; }
else { EncZeroRun(); EncRepeat(); PutLenCode( len ); Plen = len; }
}
if ( !isLit ) { EncZeroRun(); EncRepeat(); }
}
void EncRepeat()
{
while ( Repeat > 0 )
{
if ( Repeat < 3 ) { PutLenCode( Plen ); Repeat -= 1; }
else { int x = Repeat; if ( x > 6 ) x = 6; PutLenCode( 16 ); if ( LenPass == 2 ) PutBits( 2, x-3 ); Repeat -= x; }
}
}
void EncZeroRun()
{
while ( ZeroRun > 0 )
{
if ( ZeroRun < 3 ) { PutLenCode( 0 ); ZeroRun -= 1; }
else if ( ZeroRun < 11 ) { PutLenCode( 17 ); if ( LenPass == 2 ) PutBits( 3, ZeroRun-3 ); ZeroRun = 0; }
else { int x = ZeroRun; if ( x > 138 ) x = 138; PutLenCode( 18 ); if ( LenPass == 2 ) PutBits( 7, x - 11 ); ZeroRun -= x; }
}
}
public static uint Adler32( byte b ) // Checksum function per RFC 1950.
{
uint s1 = 1, s2 = 0;
for ( int i = 0; i < b.Length; i += 1 )
{
s1 = ( s1 + b[ i ] ) % 65521;
s2 = ( s2 + s1 ) % 65521;
}
return s2*65536 + s1;
}
static void Clear( int f ){ System.Array.Clear( f, 0, f.Length ); }
// Output stream.
byte Buf = 0, M = 1;
void PutBit( int b ) { if ( b != 0 ) Buf |= M; M <<= 1; if ( M == 0 ) { Add(Buf); Buf = 0; M = 1; } }
void PutBits( int n, int x ) { for ( int i = 0; i < n; i += 1 ) { PutBit( x & 1 ); x >>= 1; } }
void FlushBitBuf(){ while ( M != 1 ) PutBit( 0 ); }
void Put32( uint x ) { Add( (byte)( x >> 24 ) ); Add( (byte)( x >> 16 ) ); Add( (byte)( x >> 8 ) ); Add( (byte) x ); }
} // end class Encoder
//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class Huff // Given a list of frequencies (freq), compute Huffman code lengths (nbits) and codes (tree_code).
{
public static int ComputeCode( int bitLimit, int freq, byte nbits, ushort tree_code )
{
int ncode = freq.Length;
Node heap = new Node[ ncode ];
int hn = 0;
for ( int i = 0; i < ncode; i += 1 )
{
int f = freq[ i ];
if ( f > 0 )
{
Node n = new Node();
n.Freq = f;
n.Code = (ushort)i;
HeapInsert( heap, hn, n );
hn += 1;
}
}
for ( int i = 0; i < nbits.Length; i += 1 ) nbits[ i ] = 0;
if ( hn <= 1 ) // Special case.
{ if ( hn == 1 ) nbits[ heap[ 0 ].Code ] = 1; }
else
{
while ( hn > 1 ) // Keep pairing the lowest frequency nodes.
{
Node n = new Node();
hn -= 1; n.Left = HeapRemove( heap, hn );
hn -= 1; n.Right = HeapRemove( heap, hn );
n.Freq = n.Left.Freq + n.Right.Freq;
n.Depth = (byte) ( 1 + ( n.Left.Depth > n.Right.Depth ? n.Left.Depth : n.Right.Depth ) );
HeapInsert( heap, hn, n );
hn += 1;
}
Walk( nbits, heap[ 0 ], 0 ); // Walk the tree to find the code lengths (nbits).
}
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > bitLimit ) return -1;
// Now compute codes, code below is from RFC 1951 page 7.
int maxBits = 0;
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > maxBits ) maxBits = nbits[ i ];
int bl_count = new int[ maxBits+1 ];
for ( int i = 0; i < ncode; i += 1 ) bl_count[ nbits[ i ] ] += 1;
int next_code = new int[ maxBits+1 ];
int code = 0; bl_count[ 0 ] = 0;
for ( int i = 0; i < maxBits; i += 1 )
{
code = ( code + bl_count[ i ] ) << 1;
next_code[ i+1 ] = code;
}
for ( int i = 0; i < ncode; i += 1 )
{
int len = nbits[ i ];
if ( len != 0 )
{
tree_code[ i ] = (ushort)Reverse( next_code[ len ], len );
next_code[ len ] += 1;
}
}
while ( ncode > 0 && nbits[ ncode - 1 ] == 0 ) ncode -= 1;
//System.Console.WriteLine( "Huff.ComputeCode" );
//for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > 0 )
// System.Console.WriteLine( "i=" + i + " len=" + nbits[ i ] + " tc=" + tree_code[ i ].ToString("X") + " freq=" + freq[ i ] );
return ncode;
}
class Node{ public Node Left, Right; public int Freq; public ushort Code; public byte Depth; }
static int Reverse( int x, int bits )
{ int result = 0; for ( int i = 0; i < bits; i += 1 ) { result <<= 1; result |= x & 1; x >>= 1; } return result; }
static void Walk( byte a, Node n, int len )
{ if ( n.Left == null ) a[ n.Code ] = (byte)len; else { Walk( a, n.Left, len+1 ); Walk( a, n.Right, len+1 ); } }
static bool LessThan( Node a, Node b )
{ return a.Freq < b.Freq || a.Freq == b.Freq && a.Depth < b.Depth; }
// See e.g. https://en.wikipedia.org/wiki/Heap_(data_structure) for information on heap data structure.
static void HeapInsert( Node heap, int h, Node n ) // h is size of heap before insertion.
{
int j = h;
while ( j > 0 )
{
int p = ( j - 1 ) / 2; // Index of parent.
Node pn = heap[ p ];
if ( !LessThan( n, pn ) ) break;
heap[ j ] = pn; // Demote parent.
j = p;
}
heap[ j ] = n;
}
static Node HeapRemove( Node heap, int h ) // h is size of heap after removal.
{
Node result = heap[ 0 ];
Node n = heap[ h ];
int j = 0;
while ( true )
{
int c = j * 2 + 1; if ( c >= h ) break;
Node cn = heap[ c ];
if ( c + 1 < h )
{
Node cn2 = heap[ c + 1 ];
if ( LessThan( cn2, cn ) ) { c += 1; cn = cn2; }
}
if ( !LessThan( cn, n ) ) break;
heap[ j ] = cn; j = c;
}
heap[ j ] = n;
return result;
}
} // end class Huff
c# compression heap
New contributor
4
I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
– t3chb0t
Jan 2 at 19:21
add a comment |
This is a little project I did over Christmas, I wanted to understand a bit more about how RFC 1951 (https://www.ietf.org/rfc/rfc1951.txt) compression worked, mostly out of curiosity really. RFC 1951 is used to compress PNG images, zip files and portions of PDF files ( and probably other things as well ). I have tried to keep the C# code fairly simple, concise and hopefully comprehensible, rather than being too concerned with efficiency. Here's the code:
using Generic = System.Collections.Generic;
class Encoder : Generic.List<byte> // Data compression per RFC 1950, RFC 1951.
{
public Generic.List<byte> Deflate( byte inp )
{
Clear();
Add( 0x78); Add( 0x9C ); // RFC 1950 bytes.
ReadInp( inp );
while ( DoOutput( 1 ) == 0 );
FlushBitBuf();
Put32( Adler32( inp ) ); // RFC 1950 checksum.
// System.Console.WriteLine( "Encoder.Deflate, input size=" + inp.Length + " output size=" + this.Count );
return this;
}
class PosList{ public int Pos; public PosList Next; } // List of 3-byte match positions.
void ReadInp( byte input ) // LZ77 compression, per RFC 1951.
{
Generic.Dictionary<int,PosList> dict = new Generic.Dictionary<int,PosList>();
int n = input.Length;
SetIBufSize( n );
int w = 0; // Holds last 3 bytes of input.
int todo = 0; // Number of bytes in w that have not yet been output to IBuf, can be negative when a match is found.
int pendingMatchLen = 0, pendingDist = 0;
for ( int i = 0; i < 2 && i < n; i += 1 ) { w = ( w << 8 ) | input[ i ]; todo += 1; }
for ( int i = 2; i < n; i += 1 )
{
w = ( ( w << 8 ) | input[ i ] ) & 0xffffff; todo += 1;
PosList e, x = new PosList(); x.Pos = i;
int bestMatchLen = 0, bestDist = 0;
if ( dict.TryGetValue( w, out e ) )
{
x.Next = e;
PosList p = x;
if ( todo >= 3 ) while ( e != null )
{
int dist = i - e.Pos; if ( dist > 32768 ) { p.Next = null; break; }
int matchLen = MatchLen( input, dist, i );
if ( matchLen > bestMatchLen ) { bestMatchLen = matchLen; bestDist = dist; }
p = e; e = e.Next;
}
}
dict[ w ] = x; ISpace();
// "Lazy matching" RFC 1951 p.15 : if there are overlapping matches, there is a choice over which of the match to use.
// Example: "abc012bc345.... abc345". Here abc345 can be encoded as either [abc][345] or as a[bc345].
// Since a range typically needs more bits to encode than a literal, choose the latter.
if ( pendingMatchLen > 0 )
{
if ( bestMatchLen > pendingMatchLen || bestMatchLen == pendingMatchLen && bestDist < pendingDist )
{ IPut( input[ i-3 ] ); todo -= 1; }
else // Save the pending match, suppress bestMatch if any.
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
bestMatchLen = 0;
}
pendingMatchLen = 0;
}
if ( bestMatchLen > 0 ) { pendingMatchLen = bestMatchLen; pendingDist = bestDist; }
else if ( todo == 3 ) { IPut( w >> 16 ); todo = 2; }
} // End of main input loop.
if ( pendingMatchLen > 0 )
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
}
while ( todo > 0 ){ todo -= 1; IPut( (byte)( w >> (todo*8) ) ); }
} // end ReadInp
int MatchLen( byte input, int dist, int i )
{
// We have a match of 3 bytes, this function computes total match.
int end = input.Length; if ( end - i > 256 ) end = i + 256; // Maximum match is 258.
int x = i + 1; while ( x < end && input[ x ] == input[ x-dist ] ) x += 1;
return x - i + 2;
}
ushort IBuf; // Intermediate circular buffer, holds output from LZ77 algorithm.
const int IBufSizeMax = 0x40000;
int IBufSize, IBufI, IBufJ;
void IPut( int x ) { IBuf[ IBufI++ ] = (ushort)x; if ( IBufI == IBufSize ) IBufI = 0; }
int IGet(){ int result = IBuf[ IBufJ++ ]; if ( IBufJ == IBufSize ) IBufJ = 0; return result; }
int ICount(){ if ( IBufI >= IBufJ ) return IBufI - IBufJ; else return IBufI + IBufSize - IBufJ; } // Number of values in IBuf.
void ISpace(){ while ( ICount() > IBufSize - 10 ) DoOutput( 0 ); } // Ensure IBuf has space for at least 10 values.
void SetIBufSize( int x ) { x += 20; if ( x > IBufSizeMax ) x = IBufSizeMax; if ( IBufSize < x ) { IBufSize = x; IBuf = new ushort[ x ]; } }
byte DoOutput( byte lastBlock ) // While DoBlock fails, retry with a smaller amount of input.
{
int n = ICount();
while ( !DoBlock( n, lastBlock ) ) { lastBlock = 0; n -= n / 20; }
return lastBlock;
}
// RFC 1951 encoding constants.
static byte ClenAlphabet = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
static byte MatchExtra = { 0,0,0,0, 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5, 0 };
static ushort MatchOff = { 3,4,5,6, 7,8,9,10, 11,13,15,17, 19,23,27,31, 35,43,51,59, 67,83,99,115, 131,163,195,227, 258, 0xffff };
static byte DistExtra = { 0,0,0,0, 1,1,2,2, 3,3,4,4, 5,5,6,6, 7,7,8,8, 9,9,10,10, 11,11,12,12, 13,13 };
static ushort DistOff = { 1,2,3,4, 5,7,9,13, 17,25,33,49, 65,97,129,193, 257,385,513,769, 1025,1537,2049,3073,
4097,6145,8193,12289, 16385,24577, 0xffff };
int LitFreq = new int [ 288 ], DistFreq = new int [ 32 ], LenFreq = new int [ 19 ];
byte LitLen = new byte [ 288 ], DistLen = new byte [ 32 ], LenLen = new byte [ 19 ];
ushort LitCode = new ushort[ 288 ], DistCode = new ushort[ 32 ], LenCode = new ushort[ 19 ];
bool DoBlock( int n, byte lastBlock )
{
// First compute symbol frequencies.
Clear( LitFreq ); Clear( DistFreq );
int saveIBufJ = IBufJ;
int got = 0; while ( got < n )
{
int x = IGet(); got += 1;
if ( x < 256 ) LitFreq[ x ] += 1;
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
LitFreq[ 257 + mc ] += 1;
DistFreq[ dc ] += 1;
}
}
LitFreq[ 256 ] += 1; // End of block code.
IBufJ = saveIBufJ;
// Now compute Huffman codes.
int nLitCode = Huff.ComputeCode( 15, LitFreq, LitLen, LitCode ); if ( nLitCode < 0 ) return false;
int nDistCode = Huff.ComputeCode( 15, DistFreq, DistLen, DistCode ); if ( nDistCode < 0 ) return false;
if ( nDistCode == 0 ) nDistCode = 1;
Clear( LenFreq );
LenPass = 1; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );
if ( Huff.ComputeCode( 7, LenFreq, LenLen, LenCode ) < 0 ) return false;
int nLenCode = 19; while ( nLenCode > 4 && LenLen[ ClenAlphabet[ nLenCode - 1 ] ] == 0 ) nLenCode -= 1;
// Now output dynamic Huffman block. For small blocks fixed coding might work better, not implemented.
PutBit( lastBlock );
PutBits( 2, 2 );
PutBits( 5, nLitCode - 257 ); PutBits( 5, nDistCode - 1 ); PutBits( 4, nLenCode - 4 );
for ( int i = 0; i < nLenCode; i += 1 ) PutBits( 3, LenLen[ ClenAlphabet[ i ] ] );
LenPass = 2; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );
got = 0; while ( got < n ) // Similar to loop above, but does output instead of computing symbol frequencies.
{
int x = IGet(); got += 1;
if ( x < 256 ) PutBits( LitLen[ x ], LitCode[ x ] );
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
PutBits( LitLen[ 257 + mc ], LitCode[ 257 + mc ] );
PutBits( MatchExtra[ mc ], x-MatchOff[ mc ] );
PutBits( DistLen[ dc ], DistCode[ dc ] );
PutBits( DistExtra[ dc ], dist-DistOff[ dc ] );
}
}
PutBits( LitLen[ 256 ], LitCode[ 256 ] ); // End of block code.
return true;
} // end DoBlock
// Run length encoding of code lengths - RFC 1951, page 13.
int LenPass, Plen, ZeroRun, Repeat;
void PutLenCode( int code ) { if ( LenPass == 1 ) LenFreq[ code ] += 1; else PutBits( LenLen[ code ], LenCode[ code ] ); }
void DoLengths( int n, byte a, bool isLit )
{
if ( isLit ) { Plen = 0; ZeroRun = 0; Repeat = 0; }
for ( int i = 0; i < n; i += 1 )
{
int len = a[ i ];
if ( len == 0 ){ EncRepeat(); ZeroRun += 1; Plen = 0; }
else if ( len == Plen ) { Repeat += 1; }
else { EncZeroRun(); EncRepeat(); PutLenCode( len ); Plen = len; }
}
if ( !isLit ) { EncZeroRun(); EncRepeat(); }
}
void EncRepeat()
{
while ( Repeat > 0 )
{
if ( Repeat < 3 ) { PutLenCode( Plen ); Repeat -= 1; }
else { int x = Repeat; if ( x > 6 ) x = 6; PutLenCode( 16 ); if ( LenPass == 2 ) PutBits( 2, x-3 ); Repeat -= x; }
}
}
void EncZeroRun()
{
while ( ZeroRun > 0 )
{
if ( ZeroRun < 3 ) { PutLenCode( 0 ); ZeroRun -= 1; }
else if ( ZeroRun < 11 ) { PutLenCode( 17 ); if ( LenPass == 2 ) PutBits( 3, ZeroRun-3 ); ZeroRun = 0; }
else { int x = ZeroRun; if ( x > 138 ) x = 138; PutLenCode( 18 ); if ( LenPass == 2 ) PutBits( 7, x - 11 ); ZeroRun -= x; }
}
}
public static uint Adler32( byte b ) // Checksum function per RFC 1950.
{
uint s1 = 1, s2 = 0;
for ( int i = 0; i < b.Length; i += 1 )
{
s1 = ( s1 + b[ i ] ) % 65521;
s2 = ( s2 + s1 ) % 65521;
}
return s2*65536 + s1;
}
static void Clear( int f ){ System.Array.Clear( f, 0, f.Length ); }
// Output stream.
byte Buf = 0, M = 1;
void PutBit( int b ) { if ( b != 0 ) Buf |= M; M <<= 1; if ( M == 0 ) { Add(Buf); Buf = 0; M = 1; } }
void PutBits( int n, int x ) { for ( int i = 0; i < n; i += 1 ) { PutBit( x & 1 ); x >>= 1; } }
void FlushBitBuf(){ while ( M != 1 ) PutBit( 0 ); }
void Put32( uint x ) { Add( (byte)( x >> 24 ) ); Add( (byte)( x >> 16 ) ); Add( (byte)( x >> 8 ) ); Add( (byte) x ); }
} // end class Encoder
//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class Huff // Given a list of frequencies (freq), compute Huffman code lengths (nbits) and codes (tree_code).
{
public static int ComputeCode( int bitLimit, int freq, byte nbits, ushort tree_code )
{
int ncode = freq.Length;
Node heap = new Node[ ncode ];
int hn = 0;
for ( int i = 0; i < ncode; i += 1 )
{
int f = freq[ i ];
if ( f > 0 )
{
Node n = new Node();
n.Freq = f;
n.Code = (ushort)i;
HeapInsert( heap, hn, n );
hn += 1;
}
}
for ( int i = 0; i < nbits.Length; i += 1 ) nbits[ i ] = 0;
if ( hn <= 1 ) // Special case.
{ if ( hn == 1 ) nbits[ heap[ 0 ].Code ] = 1; }
else
{
while ( hn > 1 ) // Keep pairing the lowest frequency nodes.
{
Node n = new Node();
hn -= 1; n.Left = HeapRemove( heap, hn );
hn -= 1; n.Right = HeapRemove( heap, hn );
n.Freq = n.Left.Freq + n.Right.Freq;
n.Depth = (byte) ( 1 + ( n.Left.Depth > n.Right.Depth ? n.Left.Depth : n.Right.Depth ) );
HeapInsert( heap, hn, n );
hn += 1;
}
Walk( nbits, heap[ 0 ], 0 ); // Walk the tree to find the code lengths (nbits).
}
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > bitLimit ) return -1;
// Now compute codes, code below is from RFC 1951 page 7.
int maxBits = 0;
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > maxBits ) maxBits = nbits[ i ];
int bl_count = new int[ maxBits+1 ];
for ( int i = 0; i < ncode; i += 1 ) bl_count[ nbits[ i ] ] += 1;
int next_code = new int[ maxBits+1 ];
int code = 0; bl_count[ 0 ] = 0;
for ( int i = 0; i < maxBits; i += 1 )
{
code = ( code + bl_count[ i ] ) << 1;
next_code[ i+1 ] = code;
}
for ( int i = 0; i < ncode; i += 1 )
{
int len = nbits[ i ];
if ( len != 0 )
{
tree_code[ i ] = (ushort)Reverse( next_code[ len ], len );
next_code[ len ] += 1;
}
}
while ( ncode > 0 && nbits[ ncode - 1 ] == 0 ) ncode -= 1;
//System.Console.WriteLine( "Huff.ComputeCode" );
//for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > 0 )
// System.Console.WriteLine( "i=" + i + " len=" + nbits[ i ] + " tc=" + tree_code[ i ].ToString("X") + " freq=" + freq[ i ] );
return ncode;
}
class Node{ public Node Left, Right; public int Freq; public ushort Code; public byte Depth; }
static int Reverse( int x, int bits )
{ int result = 0; for ( int i = 0; i < bits; i += 1 ) { result <<= 1; result |= x & 1; x >>= 1; } return result; }
static void Walk( byte a, Node n, int len )
{ if ( n.Left == null ) a[ n.Code ] = (byte)len; else { Walk( a, n.Left, len+1 ); Walk( a, n.Right, len+1 ); } }
static bool LessThan( Node a, Node b )
{ return a.Freq < b.Freq || a.Freq == b.Freq && a.Depth < b.Depth; }
// See e.g. https://en.wikipedia.org/wiki/Heap_(data_structure) for information on heap data structure.
static void HeapInsert( Node heap, int h, Node n ) // h is size of heap before insertion.
{
int j = h;
while ( j > 0 )
{
int p = ( j - 1 ) / 2; // Index of parent.
Node pn = heap[ p ];
if ( !LessThan( n, pn ) ) break;
heap[ j ] = pn; // Demote parent.
j = p;
}
heap[ j ] = n;
}
static Node HeapRemove( Node heap, int h ) // h is size of heap after removal.
{
Node result = heap[ 0 ];
Node n = heap[ h ];
int j = 0;
while ( true )
{
int c = j * 2 + 1; if ( c >= h ) break;
Node cn = heap[ c ];
if ( c + 1 < h )
{
Node cn2 = heap[ c + 1 ];
if ( LessThan( cn2, cn ) ) { c += 1; cn = cn2; }
}
if ( !LessThan( cn, n ) ) break;
heap[ j ] = cn; j = c;
}
heap[ j ] = n;
return result;
}
} // end class Huff
c# compression heap
New contributor
This is a little project I did over Christmas, I wanted to understand a bit more about how RFC 1951 (https://www.ietf.org/rfc/rfc1951.txt) compression worked, mostly out of curiosity really. RFC 1951 is used to compress PNG images, zip files and portions of PDF files ( and probably other things as well ). I have tried to keep the C# code fairly simple, concise and hopefully comprehensible, rather than being too concerned with efficiency. Here's the code:
using Generic = System.Collections.Generic;
class Encoder : Generic.List<byte> // Data compression per RFC 1950, RFC 1951.
{
public Generic.List<byte> Deflate( byte inp )
{
Clear();
Add( 0x78); Add( 0x9C ); // RFC 1950 bytes.
ReadInp( inp );
while ( DoOutput( 1 ) == 0 );
FlushBitBuf();
Put32( Adler32( inp ) ); // RFC 1950 checksum.
// System.Console.WriteLine( "Encoder.Deflate, input size=" + inp.Length + " output size=" + this.Count );
return this;
}
class PosList{ public int Pos; public PosList Next; } // List of 3-byte match positions.
void ReadInp( byte input ) // LZ77 compression, per RFC 1951.
{
Generic.Dictionary<int,PosList> dict = new Generic.Dictionary<int,PosList>();
int n = input.Length;
SetIBufSize( n );
int w = 0; // Holds last 3 bytes of input.
int todo = 0; // Number of bytes in w that have not yet been output to IBuf, can be negative when a match is found.
int pendingMatchLen = 0, pendingDist = 0;
for ( int i = 0; i < 2 && i < n; i += 1 ) { w = ( w << 8 ) | input[ i ]; todo += 1; }
for ( int i = 2; i < n; i += 1 )
{
w = ( ( w << 8 ) | input[ i ] ) & 0xffffff; todo += 1;
PosList e, x = new PosList(); x.Pos = i;
int bestMatchLen = 0, bestDist = 0;
if ( dict.TryGetValue( w, out e ) )
{
x.Next = e;
PosList p = x;
if ( todo >= 3 ) while ( e != null )
{
int dist = i - e.Pos; if ( dist > 32768 ) { p.Next = null; break; }
int matchLen = MatchLen( input, dist, i );
if ( matchLen > bestMatchLen ) { bestMatchLen = matchLen; bestDist = dist; }
p = e; e = e.Next;
}
}
dict[ w ] = x; ISpace();
// "Lazy matching" RFC 1951 p.15 : if there are overlapping matches, there is a choice over which of the match to use.
// Example: "abc012bc345.... abc345". Here abc345 can be encoded as either [abc][345] or as a[bc345].
// Since a range typically needs more bits to encode than a literal, choose the latter.
if ( pendingMatchLen > 0 )
{
if ( bestMatchLen > pendingMatchLen || bestMatchLen == pendingMatchLen && bestDist < pendingDist )
{ IPut( input[ i-3 ] ); todo -= 1; }
else // Save the pending match, suppress bestMatch if any.
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
bestMatchLen = 0;
}
pendingMatchLen = 0;
}
if ( bestMatchLen > 0 ) { pendingMatchLen = bestMatchLen; pendingDist = bestDist; }
else if ( todo == 3 ) { IPut( w >> 16 ); todo = 2; }
} // End of main input loop.
if ( pendingMatchLen > 0 )
{
IPut( 257 + pendingMatchLen );
IPut( pendingDist );
todo -= pendingMatchLen;
}
while ( todo > 0 ){ todo -= 1; IPut( (byte)( w >> (todo*8) ) ); }
} // end ReadInp
int MatchLen( byte input, int dist, int i )
{
// We have a match of 3 bytes, this function computes total match.
int end = input.Length; if ( end - i > 256 ) end = i + 256; // Maximum match is 258.
int x = i + 1; while ( x < end && input[ x ] == input[ x-dist ] ) x += 1;
return x - i + 2;
}
ushort IBuf; // Intermediate circular buffer, holds output from LZ77 algorithm.
const int IBufSizeMax = 0x40000;
int IBufSize, IBufI, IBufJ;
void IPut( int x ) { IBuf[ IBufI++ ] = (ushort)x; if ( IBufI == IBufSize ) IBufI = 0; }
int IGet(){ int result = IBuf[ IBufJ++ ]; if ( IBufJ == IBufSize ) IBufJ = 0; return result; }
int ICount(){ if ( IBufI >= IBufJ ) return IBufI - IBufJ; else return IBufI + IBufSize - IBufJ; } // Number of values in IBuf.
void ISpace(){ while ( ICount() > IBufSize - 10 ) DoOutput( 0 ); } // Ensure IBuf has space for at least 10 values.
void SetIBufSize( int x ) { x += 20; if ( x > IBufSizeMax ) x = IBufSizeMax; if ( IBufSize < x ) { IBufSize = x; IBuf = new ushort[ x ]; } }
byte DoOutput( byte lastBlock ) // While DoBlock fails, retry with a smaller amount of input.
{
int n = ICount();
while ( !DoBlock( n, lastBlock ) ) { lastBlock = 0; n -= n / 20; }
return lastBlock;
}
// RFC 1951 encoding constants.
static byte ClenAlphabet = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
static byte MatchExtra = { 0,0,0,0, 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5, 0 };
static ushort MatchOff = { 3,4,5,6, 7,8,9,10, 11,13,15,17, 19,23,27,31, 35,43,51,59, 67,83,99,115, 131,163,195,227, 258, 0xffff };
static byte DistExtra = { 0,0,0,0, 1,1,2,2, 3,3,4,4, 5,5,6,6, 7,7,8,8, 9,9,10,10, 11,11,12,12, 13,13 };
static ushort DistOff = { 1,2,3,4, 5,7,9,13, 17,25,33,49, 65,97,129,193, 257,385,513,769, 1025,1537,2049,3073,
4097,6145,8193,12289, 16385,24577, 0xffff };
int LitFreq = new int [ 288 ], DistFreq = new int [ 32 ], LenFreq = new int [ 19 ];
byte LitLen = new byte [ 288 ], DistLen = new byte [ 32 ], LenLen = new byte [ 19 ];
ushort LitCode = new ushort[ 288 ], DistCode = new ushort[ 32 ], LenCode = new ushort[ 19 ];
bool DoBlock( int n, byte lastBlock )
{
// First compute symbol frequencies.
Clear( LitFreq ); Clear( DistFreq );
int saveIBufJ = IBufJ;
int got = 0; while ( got < n )
{
int x = IGet(); got += 1;
if ( x < 256 ) LitFreq[ x ] += 1;
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
LitFreq[ 257 + mc ] += 1;
DistFreq[ dc ] += 1;
}
}
LitFreq[ 256 ] += 1; // End of block code.
IBufJ = saveIBufJ;
// Now compute Huffman codes.
int nLitCode = Huff.ComputeCode( 15, LitFreq, LitLen, LitCode ); if ( nLitCode < 0 ) return false;
int nDistCode = Huff.ComputeCode( 15, DistFreq, DistLen, DistCode ); if ( nDistCode < 0 ) return false;
if ( nDistCode == 0 ) nDistCode = 1;
Clear( LenFreq );
LenPass = 1; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );
if ( Huff.ComputeCode( 7, LenFreq, LenLen, LenCode ) < 0 ) return false;
int nLenCode = 19; while ( nLenCode > 4 && LenLen[ ClenAlphabet[ nLenCode - 1 ] ] == 0 ) nLenCode -= 1;
// Now output dynamic Huffman block. For small blocks fixed coding might work better, not implemented.
PutBit( lastBlock );
PutBits( 2, 2 );
PutBits( 5, nLitCode - 257 ); PutBits( 5, nDistCode - 1 ); PutBits( 4, nLenCode - 4 );
for ( int i = 0; i < nLenCode; i += 1 ) PutBits( 3, LenLen[ ClenAlphabet[ i ] ] );
LenPass = 2; DoLengths( nLitCode, LitLen, true ); DoLengths( nDistCode, DistLen, false );
got = 0; while ( got < n ) // Similar to loop above, but does output instead of computing symbol frequencies.
{
int x = IGet(); got += 1;
if ( x < 256 ) PutBits( LitLen[ x ], LitCode[ x ] );
else
{
x -= 257;
int dist = IGet(); got += 1;
int mc = 0; while ( x >= MatchOff[ mc ] ) mc += 1; mc -= 1;
int dc = 0; while ( dist >= DistOff[ dc ] ) dc += 1; dc -= 1;
PutBits( LitLen[ 257 + mc ], LitCode[ 257 + mc ] );
PutBits( MatchExtra[ mc ], x-MatchOff[ mc ] );
PutBits( DistLen[ dc ], DistCode[ dc ] );
PutBits( DistExtra[ dc ], dist-DistOff[ dc ] );
}
}
PutBits( LitLen[ 256 ], LitCode[ 256 ] ); // End of block code.
return true;
} // end DoBlock
// Run length encoding of code lengths - RFC 1951, page 13.
int LenPass, Plen, ZeroRun, Repeat;
void PutLenCode( int code ) { if ( LenPass == 1 ) LenFreq[ code ] += 1; else PutBits( LenLen[ code ], LenCode[ code ] ); }
void DoLengths( int n, byte a, bool isLit )
{
if ( isLit ) { Plen = 0; ZeroRun = 0; Repeat = 0; }
for ( int i = 0; i < n; i += 1 )
{
int len = a[ i ];
if ( len == 0 ){ EncRepeat(); ZeroRun += 1; Plen = 0; }
else if ( len == Plen ) { Repeat += 1; }
else { EncZeroRun(); EncRepeat(); PutLenCode( len ); Plen = len; }
}
if ( !isLit ) { EncZeroRun(); EncRepeat(); }
}
void EncRepeat()
{
while ( Repeat > 0 )
{
if ( Repeat < 3 ) { PutLenCode( Plen ); Repeat -= 1; }
else { int x = Repeat; if ( x > 6 ) x = 6; PutLenCode( 16 ); if ( LenPass == 2 ) PutBits( 2, x-3 ); Repeat -= x; }
}
}
void EncZeroRun()
{
while ( ZeroRun > 0 )
{
if ( ZeroRun < 3 ) { PutLenCode( 0 ); ZeroRun -= 1; }
else if ( ZeroRun < 11 ) { PutLenCode( 17 ); if ( LenPass == 2 ) PutBits( 3, ZeroRun-3 ); ZeroRun = 0; }
else { int x = ZeroRun; if ( x > 138 ) x = 138; PutLenCode( 18 ); if ( LenPass == 2 ) PutBits( 7, x - 11 ); ZeroRun -= x; }
}
}
public static uint Adler32( byte b ) // Checksum function per RFC 1950.
{
uint s1 = 1, s2 = 0;
for ( int i = 0; i < b.Length; i += 1 )
{
s1 = ( s1 + b[ i ] ) % 65521;
s2 = ( s2 + s1 ) % 65521;
}
return s2*65536 + s1;
}
static void Clear( int f ){ System.Array.Clear( f, 0, f.Length ); }
// Output stream.
byte Buf = 0, M = 1;
void PutBit( int b ) { if ( b != 0 ) Buf |= M; M <<= 1; if ( M == 0 ) { Add(Buf); Buf = 0; M = 1; } }
void PutBits( int n, int x ) { for ( int i = 0; i < n; i += 1 ) { PutBit( x & 1 ); x >>= 1; } }
void FlushBitBuf(){ while ( M != 1 ) PutBit( 0 ); }
void Put32( uint x ) { Add( (byte)( x >> 24 ) ); Add( (byte)( x >> 16 ) ); Add( (byte)( x >> 8 ) ); Add( (byte) x ); }
} // end class Encoder
//+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
class Huff // Given a list of frequencies (freq), compute Huffman code lengths (nbits) and codes (tree_code).
{
public static int ComputeCode( int bitLimit, int freq, byte nbits, ushort tree_code )
{
int ncode = freq.Length;
Node heap = new Node[ ncode ];
int hn = 0;
for ( int i = 0; i < ncode; i += 1 )
{
int f = freq[ i ];
if ( f > 0 )
{
Node n = new Node();
n.Freq = f;
n.Code = (ushort)i;
HeapInsert( heap, hn, n );
hn += 1;
}
}
for ( int i = 0; i < nbits.Length; i += 1 ) nbits[ i ] = 0;
if ( hn <= 1 ) // Special case.
{ if ( hn == 1 ) nbits[ heap[ 0 ].Code ] = 1; }
else
{
while ( hn > 1 ) // Keep pairing the lowest frequency nodes.
{
Node n = new Node();
hn -= 1; n.Left = HeapRemove( heap, hn );
hn -= 1; n.Right = HeapRemove( heap, hn );
n.Freq = n.Left.Freq + n.Right.Freq;
n.Depth = (byte) ( 1 + ( n.Left.Depth > n.Right.Depth ? n.Left.Depth : n.Right.Depth ) );
HeapInsert( heap, hn, n );
hn += 1;
}
Walk( nbits, heap[ 0 ], 0 ); // Walk the tree to find the code lengths (nbits).
}
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > bitLimit ) return -1;
// Now compute codes, code below is from RFC 1951 page 7.
int maxBits = 0;
for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > maxBits ) maxBits = nbits[ i ];
int bl_count = new int[ maxBits+1 ];
for ( int i = 0; i < ncode; i += 1 ) bl_count[ nbits[ i ] ] += 1;
int next_code = new int[ maxBits+1 ];
int code = 0; bl_count[ 0 ] = 0;
for ( int i = 0; i < maxBits; i += 1 )
{
code = ( code + bl_count[ i ] ) << 1;
next_code[ i+1 ] = code;
}
for ( int i = 0; i < ncode; i += 1 )
{
int len = nbits[ i ];
if ( len != 0 )
{
tree_code[ i ] = (ushort)Reverse( next_code[ len ], len );
next_code[ len ] += 1;
}
}
while ( ncode > 0 && nbits[ ncode - 1 ] == 0 ) ncode -= 1;
//System.Console.WriteLine( "Huff.ComputeCode" );
//for ( int i = 0; i < ncode; i += 1 ) if ( nbits[ i ] > 0 )
// System.Console.WriteLine( "i=" + i + " len=" + nbits[ i ] + " tc=" + tree_code[ i ].ToString("X") + " freq=" + freq[ i ] );
return ncode;
}
class Node{ public Node Left, Right; public int Freq; public ushort Code; public byte Depth; }
static int Reverse( int x, int bits )
{ int result = 0; for ( int i = 0; i < bits; i += 1 ) { result <<= 1; result |= x & 1; x >>= 1; } return result; }
static void Walk( byte a, Node n, int len )
{ if ( n.Left == null ) a[ n.Code ] = (byte)len; else { Walk( a, n.Left, len+1 ); Walk( a, n.Right, len+1 ); } }
static bool LessThan( Node a, Node b )
{ return a.Freq < b.Freq || a.Freq == b.Freq && a.Depth < b.Depth; }
// See e.g. https://en.wikipedia.org/wiki/Heap_(data_structure) for information on heap data structure.
static void HeapInsert( Node heap, int h, Node n ) // h is size of heap before insertion.
{
int j = h;
while ( j > 0 )
{
int p = ( j - 1 ) / 2; // Index of parent.
Node pn = heap[ p ];
if ( !LessThan( n, pn ) ) break;
heap[ j ] = pn; // Demote parent.
j = p;
}
heap[ j ] = n;
}
static Node HeapRemove( Node heap, int h ) // h is size of heap after removal.
{
Node result = heap[ 0 ];
Node n = heap[ h ];
int j = 0;
while ( true )
{
int c = j * 2 + 1; if ( c >= h ) break;
Node cn = heap[ c ];
if ( c + 1 < h )
{
Node cn2 = heap[ c + 1 ];
if ( LessThan( cn2, cn ) ) { c += 1; cn = cn2; }
}
if ( !LessThan( cn, n ) ) break;
heap[ j ] = cn; j = c;
}
heap[ j ] = n;
return result;
}
} // end class Huff
c# compression heap
c# compression heap
New contributor
New contributor
New contributor
asked Jan 2 at 18:00
George Barwood
396
396
New contributor
New contributor
4
I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
– t3chb0t
Jan 2 at 19:21
add a comment |
4
I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
– t3chb0t
Jan 2 at 19:21
4
4
I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
– t3chb0t
Jan 2 at 19:21
I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
– t3chb0t
Jan 2 at 19:21
add a comment |
2 Answers
2
active
oldest
votes
Welcome to Code Review. I am sorry but I do not find your code as comprehensible as you were hoping. I find myself more aligned to the comment by @t3chb0t.
I really must ask have you ever read any parts of the C# Naming Guidelines? I am not saying that to be mean. Given that you are a new poster, we do try to be nice but unfortunately sometimes a certain bluntness cannot be avoided.
Something to also consider is that you should favor composition over inheritance (Wikipedia link). Here is a good StackOverflow link on it too. Rather than class Encoder
implementing List<byte>
, it would be better to have a private field of List<byte>
. This also prevents someone from inadvertently performing any method that modifies the encoded value, such as an Add, Clear, or Remove. You could then expose that field publicly as a read only value.
You have a fair amount in a section related to the intermediate circular buffer. This bugs me for several reasons. For one, I think the buffer, variables, and methods could be put into its own class, even if its a private internal class. And having variable names being with "I" conflicts with the naming guideline on many fronts: (1) variables should begin with lowercase letters, and (2) a capital "I" makes me think its an Interface.
I applaud you for emphasizing simple and concise over efficient. Next I would urge you think strongly about readability. Ask yourself how well a mediocre C# developer would understand your code without having to ask you questions about it.
Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
– George Barwood
Jan 2 at 22:52
Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
– George Barwood
Jan 2 at 23:01
2
@GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
– t3chb0t
2 days ago
@GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
– t3chb0t
2 days ago
add a comment |
Readability
To expand a bit on what Rick and t3chb0t already said, here are some specific things that hurt the readability of your code:
- Unnecessary abbreviations and undescriptive names. Some names convey virtually no meaning at all (
n
,w
,x
,p
), while others are very unclear or do not accurately describe their meaning:ReadInp
apparently performs deflation, not just input reading, and what does the 'do' inDoBlock
mean? - Stuffing multiple statements on a single line. This makes code much more difficult to 'scan', and code that's difficult to read tends to be difficult to maintain and debug.
- Especially jarring are lines like
int mc = 0; while (x >= MatchOff[mc]) mc += 1; mc -= 1;
There's no clear distinction between what's part of thewhile
loop body and what's not. This can lead to subtle problems. Putting each statement on a separate line, and indenting the loop body makes it easier to identify the control flow at a glance. - Declaring multiple variables of the same type at once, separated by commas, also falls under this: it requires more care to see whether each has been properly initialized, and if not, whether that was intentional or not.
- Especially jarring are lines like
- Related to the above point, the occasional omission of braces and sometimes inconsistent indentation doesn't help either.
- Lack of whitespace:
- Adding an empty line between subsequent
if
statements makes it more clear that they're not related, which makes it easier to quickly scan control flow. - Leave an empty lines between methods to provide some 'breathing space'. A wall of text is more difficult to read than an article that's split up into chapters and paragraphs.
- Adding an empty line between subsequent
- Magic values - specific (numeric) values whose meaning is unclear, such as
19
,256
,257
and288
. Try using properly named constants instead.
C# specific
- It's strange to see
using Generic = System.Collections.Generic;
. Namespace aliases are useful to prevent name clashes, but in my experience that's rarely a problem. In this case,System.Collections.Generic
is so ubiquitously used that doing anything else thanusing System.Collections.Generic;
will only cause confusion. - C# supports type inference, so
Dictionary<int, PosList> dict = new Dictionary<int, PosList>();
can be simplified tovar dict = new Dictionary<int, PosList>();
.
out
variables can be declared in-line, soif (dict.TryGetValue(w, out e))
can be simplified toif (dict.TryGetValue(w, out PosList e))
orif (dict.TryGetValue(w, out var e))
.
Design
- Inheritance is the wrong tool here, as Rick already pointed out. Just because an encoder uses byte-sequences as input and output does not make it a list of bytes itself. Note how calling
Deflate
again will overwrite the result of any previous calls - that's very surprising behavior (in a negative way). Also note that a lot of state that is only useful during the actual deflation is kept around - wasting memory, and making your job more difficult because you have to remember to properly reset things. - I see two designs that could work well here:
- A
Stream
-based design, where you provide aCompressionStream
class that inherits fromStream
(it's a stream that you can read decompressed data from) and that wraps anotherStream
(which contains the still-compressed data). This integrates well with file and network stream reading, among other things. Be sure to read up on how disposal (IDisposable
) works. - An
IEnumerable<byte>
-based design, with anIEnumerable<byte> Deflate(IEnumerable<byte> compressedData)
method. This lets you pass in a variety of collections, including arrays, lists and lazily evaluated sequences such as the results of Linq extension methods. Returning anIEnumerable<byte>
allows you to return any of these as well, and lets you useyield
, which can be useful in certain cases.Deflate(byteArray.Skip(4))
, for example, deflates all but the first 4 bytes frombyteArray
, without any modifications to your code. And ifDeflate
usesyield
, thenDeflate(byteArray).Take(4)
only deflates the first 4 bytes, whileDeflate(byteArray).ToArray()
gives you an array that contains all deflated data.
- A
- Rick also mentioned those buffer-related methods, they're better moved to a separate buffer class. The same can be said for those 'output stream' methods. Take a look at how
StreamWriter
lets you write text to an underlyingStream
, while taking care of things like encoding and buffering. You could create a similar construction for writing bits.
The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
– George Barwood
2 days ago
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
George Barwood is a new contributor. Be nice, and check out our Code of Conduct.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f210757%2frfc-1951-deflate-compression%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
2 Answers
2
active
oldest
votes
2 Answers
2
active
oldest
votes
active
oldest
votes
active
oldest
votes
Welcome to Code Review. I am sorry but I do not find your code as comprehensible as you were hoping. I find myself more aligned to the comment by @t3chb0t.
I really must ask have you ever read any parts of the C# Naming Guidelines? I am not saying that to be mean. Given that you are a new poster, we do try to be nice but unfortunately sometimes a certain bluntness cannot be avoided.
Something to also consider is that you should favor composition over inheritance (Wikipedia link). Here is a good StackOverflow link on it too. Rather than class Encoder
implementing List<byte>
, it would be better to have a private field of List<byte>
. This also prevents someone from inadvertently performing any method that modifies the encoded value, such as an Add, Clear, or Remove. You could then expose that field publicly as a read only value.
You have a fair amount in a section related to the intermediate circular buffer. This bugs me for several reasons. For one, I think the buffer, variables, and methods could be put into its own class, even if its a private internal class. And having variable names being with "I" conflicts with the naming guideline on many fronts: (1) variables should begin with lowercase letters, and (2) a capital "I" makes me think its an Interface.
I applaud you for emphasizing simple and concise over efficient. Next I would urge you think strongly about readability. Ask yourself how well a mediocre C# developer would understand your code without having to ask you questions about it.
Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
– George Barwood
Jan 2 at 22:52
Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
– George Barwood
Jan 2 at 23:01
2
@GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
– t3chb0t
2 days ago
@GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
– t3chb0t
2 days ago
add a comment |
Welcome to Code Review. I am sorry but I do not find your code as comprehensible as you were hoping. I find myself more aligned to the comment by @t3chb0t.
I really must ask have you ever read any parts of the C# Naming Guidelines? I am not saying that to be mean. Given that you are a new poster, we do try to be nice but unfortunately sometimes a certain bluntness cannot be avoided.
Something to also consider is that you should favor composition over inheritance (Wikipedia link). Here is a good StackOverflow link on it too. Rather than class Encoder
implementing List<byte>
, it would be better to have a private field of List<byte>
. This also prevents someone from inadvertently performing any method that modifies the encoded value, such as an Add, Clear, or Remove. You could then expose that field publicly as a read only value.
You have a fair amount in a section related to the intermediate circular buffer. This bugs me for several reasons. For one, I think the buffer, variables, and methods could be put into its own class, even if its a private internal class. And having variable names being with "I" conflicts with the naming guideline on many fronts: (1) variables should begin with lowercase letters, and (2) a capital "I" makes me think its an Interface.
I applaud you for emphasizing simple and concise over efficient. Next I would urge you think strongly about readability. Ask yourself how well a mediocre C# developer would understand your code without having to ask you questions about it.
Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
– George Barwood
Jan 2 at 22:52
Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
– George Barwood
Jan 2 at 23:01
2
@GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
– t3chb0t
2 days ago
@GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
– t3chb0t
2 days ago
add a comment |
Welcome to Code Review. I am sorry but I do not find your code as comprehensible as you were hoping. I find myself more aligned to the comment by @t3chb0t.
I really must ask have you ever read any parts of the C# Naming Guidelines? I am not saying that to be mean. Given that you are a new poster, we do try to be nice but unfortunately sometimes a certain bluntness cannot be avoided.
Something to also consider is that you should favor composition over inheritance (Wikipedia link). Here is a good StackOverflow link on it too. Rather than class Encoder
implementing List<byte>
, it would be better to have a private field of List<byte>
. This also prevents someone from inadvertently performing any method that modifies the encoded value, such as an Add, Clear, or Remove. You could then expose that field publicly as a read only value.
You have a fair amount in a section related to the intermediate circular buffer. This bugs me for several reasons. For one, I think the buffer, variables, and methods could be put into its own class, even if its a private internal class. And having variable names being with "I" conflicts with the naming guideline on many fronts: (1) variables should begin with lowercase letters, and (2) a capital "I" makes me think its an Interface.
I applaud you for emphasizing simple and concise over efficient. Next I would urge you think strongly about readability. Ask yourself how well a mediocre C# developer would understand your code without having to ask you questions about it.
Welcome to Code Review. I am sorry but I do not find your code as comprehensible as you were hoping. I find myself more aligned to the comment by @t3chb0t.
I really must ask have you ever read any parts of the C# Naming Guidelines? I am not saying that to be mean. Given that you are a new poster, we do try to be nice but unfortunately sometimes a certain bluntness cannot be avoided.
Something to also consider is that you should favor composition over inheritance (Wikipedia link). Here is a good StackOverflow link on it too. Rather than class Encoder
implementing List<byte>
, it would be better to have a private field of List<byte>
. This also prevents someone from inadvertently performing any method that modifies the encoded value, such as an Add, Clear, or Remove. You could then expose that field publicly as a read only value.
You have a fair amount in a section related to the intermediate circular buffer. This bugs me for several reasons. For one, I think the buffer, variables, and methods could be put into its own class, even if its a private internal class. And having variable names being with "I" conflicts with the naming guideline on many fronts: (1) variables should begin with lowercase letters, and (2) a capital "I" makes me think its an Interface.
I applaud you for emphasizing simple and concise over efficient. Next I would urge you think strongly about readability. Ask yourself how well a mediocre C# developer would understand your code without having to ask you questions about it.
answered Jan 2 at 21:13
Rick Davin
3,157718
3,157718
Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
– George Barwood
Jan 2 at 22:52
Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
– George Barwood
Jan 2 at 23:01
2
@GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
– t3chb0t
2 days ago
@GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
– t3chb0t
2 days ago
add a comment |
Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
– George Barwood
Jan 2 at 22:52
Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
– George Barwood
Jan 2 at 23:01
2
@GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
– t3chb0t
2 days ago
@GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
– t3chb0t
2 days ago
Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
– George Barwood
Jan 2 at 22:52
Rick, strictly speaking, according the the guidelines, "The field-naming guidelines apply to static public and protected fields. Internal and private fields are not covered by guidelines", but leaving that aside, I use the convention that local variables and parameters begin with lower-case, and other names begin with upper-case.
– George Barwood
Jan 2 at 22:52
Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
– George Barwood
Jan 2 at 23:01
Oops, pressed enter before I meant to... Still, the "IBuf" and associated fields could be better named, I didn't give any real thought to it. Agreed about composition and inheritance. I also plead guilty to abbreviation, I hope poor t3chb0t isn't scarred for life! :)
– George Barwood
Jan 2 at 23:01
2
2
@GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
– t3chb0t
2 days ago
@GeorgeBarwood haha, I'm not ;-] I just find that carelessness about non-public code leads to many unnecessary hours of debugging and re-trying to understand the old code. The compiler doesn't care about the formatting or naming etc but people do and if I cannot relatively quickly understand how the code works I do everything to avoid it (and not only me)... this is why you most likely should not expect a thorough review becasue your code is extremely difficult to read and to understand.
– t3chb0t
2 days ago
@GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
– t3chb0t
2 days ago
@GeorgeBarwood If you expect other people to read your non-public code you should treat it as public and take great care of it too and try to make it as clean as possible.
– t3chb0t
2 days ago
add a comment |
Readability
To expand a bit on what Rick and t3chb0t already said, here are some specific things that hurt the readability of your code:
- Unnecessary abbreviations and undescriptive names. Some names convey virtually no meaning at all (
n
,w
,x
,p
), while others are very unclear or do not accurately describe their meaning:ReadInp
apparently performs deflation, not just input reading, and what does the 'do' inDoBlock
mean? - Stuffing multiple statements on a single line. This makes code much more difficult to 'scan', and code that's difficult to read tends to be difficult to maintain and debug.
- Especially jarring are lines like
int mc = 0; while (x >= MatchOff[mc]) mc += 1; mc -= 1;
There's no clear distinction between what's part of thewhile
loop body and what's not. This can lead to subtle problems. Putting each statement on a separate line, and indenting the loop body makes it easier to identify the control flow at a glance. - Declaring multiple variables of the same type at once, separated by commas, also falls under this: it requires more care to see whether each has been properly initialized, and if not, whether that was intentional or not.
- Especially jarring are lines like
- Related to the above point, the occasional omission of braces and sometimes inconsistent indentation doesn't help either.
- Lack of whitespace:
- Adding an empty line between subsequent
if
statements makes it more clear that they're not related, which makes it easier to quickly scan control flow. - Leave an empty lines between methods to provide some 'breathing space'. A wall of text is more difficult to read than an article that's split up into chapters and paragraphs.
- Adding an empty line between subsequent
- Magic values - specific (numeric) values whose meaning is unclear, such as
19
,256
,257
and288
. Try using properly named constants instead.
C# specific
- It's strange to see
using Generic = System.Collections.Generic;
. Namespace aliases are useful to prevent name clashes, but in my experience that's rarely a problem. In this case,System.Collections.Generic
is so ubiquitously used that doing anything else thanusing System.Collections.Generic;
will only cause confusion. - C# supports type inference, so
Dictionary<int, PosList> dict = new Dictionary<int, PosList>();
can be simplified tovar dict = new Dictionary<int, PosList>();
.
out
variables can be declared in-line, soif (dict.TryGetValue(w, out e))
can be simplified toif (dict.TryGetValue(w, out PosList e))
orif (dict.TryGetValue(w, out var e))
.
Design
- Inheritance is the wrong tool here, as Rick already pointed out. Just because an encoder uses byte-sequences as input and output does not make it a list of bytes itself. Note how calling
Deflate
again will overwrite the result of any previous calls - that's very surprising behavior (in a negative way). Also note that a lot of state that is only useful during the actual deflation is kept around - wasting memory, and making your job more difficult because you have to remember to properly reset things. - I see two designs that could work well here:
- A
Stream
-based design, where you provide aCompressionStream
class that inherits fromStream
(it's a stream that you can read decompressed data from) and that wraps anotherStream
(which contains the still-compressed data). This integrates well with file and network stream reading, among other things. Be sure to read up on how disposal (IDisposable
) works. - An
IEnumerable<byte>
-based design, with anIEnumerable<byte> Deflate(IEnumerable<byte> compressedData)
method. This lets you pass in a variety of collections, including arrays, lists and lazily evaluated sequences such as the results of Linq extension methods. Returning anIEnumerable<byte>
allows you to return any of these as well, and lets you useyield
, which can be useful in certain cases.Deflate(byteArray.Skip(4))
, for example, deflates all but the first 4 bytes frombyteArray
, without any modifications to your code. And ifDeflate
usesyield
, thenDeflate(byteArray).Take(4)
only deflates the first 4 bytes, whileDeflate(byteArray).ToArray()
gives you an array that contains all deflated data.
- A
- Rick also mentioned those buffer-related methods, they're better moved to a separate buffer class. The same can be said for those 'output stream' methods. Take a look at how
StreamWriter
lets you write text to an underlyingStream
, while taking care of things like encoding and buffering. You could create a similar construction for writing bits.
The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
– George Barwood
2 days ago
add a comment |
Readability
To expand a bit on what Rick and t3chb0t already said, here are some specific things that hurt the readability of your code:
- Unnecessary abbreviations and undescriptive names. Some names convey virtually no meaning at all (
n
,w
,x
,p
), while others are very unclear or do not accurately describe their meaning:ReadInp
apparently performs deflation, not just input reading, and what does the 'do' inDoBlock
mean? - Stuffing multiple statements on a single line. This makes code much more difficult to 'scan', and code that's difficult to read tends to be difficult to maintain and debug.
- Especially jarring are lines like
int mc = 0; while (x >= MatchOff[mc]) mc += 1; mc -= 1;
There's no clear distinction between what's part of thewhile
loop body and what's not. This can lead to subtle problems. Putting each statement on a separate line, and indenting the loop body makes it easier to identify the control flow at a glance. - Declaring multiple variables of the same type at once, separated by commas, also falls under this: it requires more care to see whether each has been properly initialized, and if not, whether that was intentional or not.
- Especially jarring are lines like
- Related to the above point, the occasional omission of braces and sometimes inconsistent indentation doesn't help either.
- Lack of whitespace:
- Adding an empty line between subsequent
if
statements makes it more clear that they're not related, which makes it easier to quickly scan control flow. - Leave an empty lines between methods to provide some 'breathing space'. A wall of text is more difficult to read than an article that's split up into chapters and paragraphs.
- Adding an empty line between subsequent
- Magic values - specific (numeric) values whose meaning is unclear, such as
19
,256
,257
and288
. Try using properly named constants instead.
C# specific
- It's strange to see
using Generic = System.Collections.Generic;
. Namespace aliases are useful to prevent name clashes, but in my experience that's rarely a problem. In this case,System.Collections.Generic
is so ubiquitously used that doing anything else thanusing System.Collections.Generic;
will only cause confusion. - C# supports type inference, so
Dictionary<int, PosList> dict = new Dictionary<int, PosList>();
can be simplified tovar dict = new Dictionary<int, PosList>();
.
out
variables can be declared in-line, soif (dict.TryGetValue(w, out e))
can be simplified toif (dict.TryGetValue(w, out PosList e))
orif (dict.TryGetValue(w, out var e))
.
Design
- Inheritance is the wrong tool here, as Rick already pointed out. Just because an encoder uses byte-sequences as input and output does not make it a list of bytes itself. Note how calling
Deflate
again will overwrite the result of any previous calls - that's very surprising behavior (in a negative way). Also note that a lot of state that is only useful during the actual deflation is kept around - wasting memory, and making your job more difficult because you have to remember to properly reset things. - I see two designs that could work well here:
- A
Stream
-based design, where you provide aCompressionStream
class that inherits fromStream
(it's a stream that you can read decompressed data from) and that wraps anotherStream
(which contains the still-compressed data). This integrates well with file and network stream reading, among other things. Be sure to read up on how disposal (IDisposable
) works. - An
IEnumerable<byte>
-based design, with anIEnumerable<byte> Deflate(IEnumerable<byte> compressedData)
method. This lets you pass in a variety of collections, including arrays, lists and lazily evaluated sequences such as the results of Linq extension methods. Returning anIEnumerable<byte>
allows you to return any of these as well, and lets you useyield
, which can be useful in certain cases.Deflate(byteArray.Skip(4))
, for example, deflates all but the first 4 bytes frombyteArray
, without any modifications to your code. And ifDeflate
usesyield
, thenDeflate(byteArray).Take(4)
only deflates the first 4 bytes, whileDeflate(byteArray).ToArray()
gives you an array that contains all deflated data.
- A
- Rick also mentioned those buffer-related methods, they're better moved to a separate buffer class. The same can be said for those 'output stream' methods. Take a look at how
StreamWriter
lets you write text to an underlyingStream
, while taking care of things like encoding and buffering. You could create a similar construction for writing bits.
The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
– George Barwood
2 days ago
add a comment |
Readability
To expand a bit on what Rick and t3chb0t already said, here are some specific things that hurt the readability of your code:
- Unnecessary abbreviations and undescriptive names. Some names convey virtually no meaning at all (
n
,w
,x
,p
), while others are very unclear or do not accurately describe their meaning:ReadInp
apparently performs deflation, not just input reading, and what does the 'do' inDoBlock
mean? - Stuffing multiple statements on a single line. This makes code much more difficult to 'scan', and code that's difficult to read tends to be difficult to maintain and debug.
- Especially jarring are lines like
int mc = 0; while (x >= MatchOff[mc]) mc += 1; mc -= 1;
There's no clear distinction between what's part of thewhile
loop body and what's not. This can lead to subtle problems. Putting each statement on a separate line, and indenting the loop body makes it easier to identify the control flow at a glance. - Declaring multiple variables of the same type at once, separated by commas, also falls under this: it requires more care to see whether each has been properly initialized, and if not, whether that was intentional or not.
- Especially jarring are lines like
- Related to the above point, the occasional omission of braces and sometimes inconsistent indentation doesn't help either.
- Lack of whitespace:
- Adding an empty line between subsequent
if
statements makes it more clear that they're not related, which makes it easier to quickly scan control flow. - Leave an empty lines between methods to provide some 'breathing space'. A wall of text is more difficult to read than an article that's split up into chapters and paragraphs.
- Adding an empty line between subsequent
- Magic values - specific (numeric) values whose meaning is unclear, such as
19
,256
,257
and288
. Try using properly named constants instead.
C# specific
- It's strange to see
using Generic = System.Collections.Generic;
. Namespace aliases are useful to prevent name clashes, but in my experience that's rarely a problem. In this case,System.Collections.Generic
is so ubiquitously used that doing anything else thanusing System.Collections.Generic;
will only cause confusion. - C# supports type inference, so
Dictionary<int, PosList> dict = new Dictionary<int, PosList>();
can be simplified tovar dict = new Dictionary<int, PosList>();
.
out
variables can be declared in-line, soif (dict.TryGetValue(w, out e))
can be simplified toif (dict.TryGetValue(w, out PosList e))
orif (dict.TryGetValue(w, out var e))
.
Design
- Inheritance is the wrong tool here, as Rick already pointed out. Just because an encoder uses byte-sequences as input and output does not make it a list of bytes itself. Note how calling
Deflate
again will overwrite the result of any previous calls - that's very surprising behavior (in a negative way). Also note that a lot of state that is only useful during the actual deflation is kept around - wasting memory, and making your job more difficult because you have to remember to properly reset things. - I see two designs that could work well here:
- A
Stream
-based design, where you provide aCompressionStream
class that inherits fromStream
(it's a stream that you can read decompressed data from) and that wraps anotherStream
(which contains the still-compressed data). This integrates well with file and network stream reading, among other things. Be sure to read up on how disposal (IDisposable
) works. - An
IEnumerable<byte>
-based design, with anIEnumerable<byte> Deflate(IEnumerable<byte> compressedData)
method. This lets you pass in a variety of collections, including arrays, lists and lazily evaluated sequences such as the results of Linq extension methods. Returning anIEnumerable<byte>
allows you to return any of these as well, and lets you useyield
, which can be useful in certain cases.Deflate(byteArray.Skip(4))
, for example, deflates all but the first 4 bytes frombyteArray
, without any modifications to your code. And ifDeflate
usesyield
, thenDeflate(byteArray).Take(4)
only deflates the first 4 bytes, whileDeflate(byteArray).ToArray()
gives you an array that contains all deflated data.
- A
- Rick also mentioned those buffer-related methods, they're better moved to a separate buffer class. The same can be said for those 'output stream' methods. Take a look at how
StreamWriter
lets you write text to an underlyingStream
, while taking care of things like encoding and buffering. You could create a similar construction for writing bits.
Readability
To expand a bit on what Rick and t3chb0t already said, here are some specific things that hurt the readability of your code:
- Unnecessary abbreviations and undescriptive names. Some names convey virtually no meaning at all (
n
,w
,x
,p
), while others are very unclear or do not accurately describe their meaning:ReadInp
apparently performs deflation, not just input reading, and what does the 'do' inDoBlock
mean? - Stuffing multiple statements on a single line. This makes code much more difficult to 'scan', and code that's difficult to read tends to be difficult to maintain and debug.
- Especially jarring are lines like
int mc = 0; while (x >= MatchOff[mc]) mc += 1; mc -= 1;
There's no clear distinction between what's part of thewhile
loop body and what's not. This can lead to subtle problems. Putting each statement on a separate line, and indenting the loop body makes it easier to identify the control flow at a glance. - Declaring multiple variables of the same type at once, separated by commas, also falls under this: it requires more care to see whether each has been properly initialized, and if not, whether that was intentional or not.
- Especially jarring are lines like
- Related to the above point, the occasional omission of braces and sometimes inconsistent indentation doesn't help either.
- Lack of whitespace:
- Adding an empty line between subsequent
if
statements makes it more clear that they're not related, which makes it easier to quickly scan control flow. - Leave an empty lines between methods to provide some 'breathing space'. A wall of text is more difficult to read than an article that's split up into chapters and paragraphs.
- Adding an empty line between subsequent
- Magic values - specific (numeric) values whose meaning is unclear, such as
19
,256
,257
and288
. Try using properly named constants instead.
C# specific
- It's strange to see
using Generic = System.Collections.Generic;
. Namespace aliases are useful to prevent name clashes, but in my experience that's rarely a problem. In this case,System.Collections.Generic
is so ubiquitously used that doing anything else thanusing System.Collections.Generic;
will only cause confusion. - C# supports type inference, so
Dictionary<int, PosList> dict = new Dictionary<int, PosList>();
can be simplified tovar dict = new Dictionary<int, PosList>();
.
out
variables can be declared in-line, soif (dict.TryGetValue(w, out e))
can be simplified toif (dict.TryGetValue(w, out PosList e))
orif (dict.TryGetValue(w, out var e))
.
Design
- Inheritance is the wrong tool here, as Rick already pointed out. Just because an encoder uses byte-sequences as input and output does not make it a list of bytes itself. Note how calling
Deflate
again will overwrite the result of any previous calls - that's very surprising behavior (in a negative way). Also note that a lot of state that is only useful during the actual deflation is kept around - wasting memory, and making your job more difficult because you have to remember to properly reset things. - I see two designs that could work well here:
- A
Stream
-based design, where you provide aCompressionStream
class that inherits fromStream
(it's a stream that you can read decompressed data from) and that wraps anotherStream
(which contains the still-compressed data). This integrates well with file and network stream reading, among other things. Be sure to read up on how disposal (IDisposable
) works. - An
IEnumerable<byte>
-based design, with anIEnumerable<byte> Deflate(IEnumerable<byte> compressedData)
method. This lets you pass in a variety of collections, including arrays, lists and lazily evaluated sequences such as the results of Linq extension methods. Returning anIEnumerable<byte>
allows you to return any of these as well, and lets you useyield
, which can be useful in certain cases.Deflate(byteArray.Skip(4))
, for example, deflates all but the first 4 bytes frombyteArray
, without any modifications to your code. And ifDeflate
usesyield
, thenDeflate(byteArray).Take(4)
only deflates the first 4 bytes, whileDeflate(byteArray).ToArray()
gives you an array that contains all deflated data.
- A
- Rick also mentioned those buffer-related methods, they're better moved to a separate buffer class. The same can be said for those 'output stream' methods. Take a look at how
StreamWriter
lets you write text to an underlyingStream
, while taking care of things like encoding and buffering. You could create a similar construction for writing bits.
edited 2 days ago
Blackwood
1358
1358
answered 2 days ago
Pieter Witvoet
5,801725
5,801725
The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
– George Barwood
2 days ago
add a comment |
The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
– George Barwood
2 days ago
The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
– George Barwood
2 days ago
The idea was for the class to compress many parts of a PDF ( each page of a PDF has a "content stream" which can be "deflated" ), so the plan was to avoid re-allocating arrays/lists each time a page is compressed, reducing garbage collection overhead. But I didn't entirely follow-through on this, and whether it's worth worrying about I don't know - I haven't done any performance testing at all, or compared it to the standard Zlib implementation. Thanks for the C# tips, I had no idea about "var" or the other language features you mention.
– George Barwood
2 days ago
add a comment |
George Barwood is a new contributor. Be nice, and check out our Code of Conduct.
George Barwood is a new contributor. Be nice, and check out our Code of Conduct.
George Barwood is a new contributor. Be nice, and check out our Code of Conduct.
George Barwood is a new contributor. Be nice, and check out our Code of Conduct.
Thanks for contributing an answer to Code Review Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
Use MathJax to format equations. MathJax reference.
To learn more, see our tips on writing great answers.
Some of your past answers have not been well-received, and you're in danger of being blocked from answering.
Please pay close attention to the following guidance:
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f210757%2frfc-1951-deflate-compression%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
4
I find your one-liners, abbreviations and lots of strange names and formatting scarry, ugly, incomprihesible and generally terrible - do you always write code like this? :-
– t3chb0t
Jan 2 at 19:21