//////////////////////////////////////////////////////////////////////
//
// lzw_main.cpp
//
// Jeff Ondich, 4/12/04
//
// Developed to help me experiment with compression of portions
// of my dictionary databases. Alphabetized dictionary data
// has lots of localized repetitions of fairly long strings,
// so I had hopes of getting good compression even if I was only
// compressing small chunks (e.g. 500 bytes) at a time.
//
// Uses lzw.h and lzw.cpp, adapted from code published
// by Mark R. Nelson in Dr. Dobb's Journal, April 1989.
//
// Usage of this program is:
//
//     lzw segmentsize termsfile
//
// Here, the file is assumed to contain one "term" (a word or phrase)
// per line. The program reads the data from the file into a buffer,
// substituting nulls for newlines, so the resulting buffer
// contains a sequence of null-terminated strings. Once the buffer
// fills to segmentsize bytes, I compress the buffer, add the
// compressed size to a running total, and continue with an empty
// buffer. By toying with segmentsize and the BITS constant,
// I can balance compression rates against the speed of decompression
// of chunks. (I'll be doing lots of decompressions at runtime
// on a Palm, so I want the decompression to be fast.)
//
// Note that I am testing the compression correctness by comparing
// my original data to the results of compressing and then
// decompressing. I have two reasons for doing this. First,
// I want your profiles to examine both the compression and the
// decompression. And second, I am a little paranoid, and I want to
// make sure the compression and decompression are working properly.
// ////////////////////////////////////////////////////////////////////// #include #include #include #include "lzw.h" using namespace std; void checkDecompression( int originalSize, UCHAR *original, int compressedSize, UCHAR *compressed, UCHAR *tmp ); int gSegmentSize = 0; int gNSegments = 0; int main( int argc, char *argv[] ) { // Parse the command line if( argc != 3 || !isdigit( argv[1][0] ) ) { cerr << "Usage: " << argv[0] << " segmentsize termsfile" << endl; exit( 1 ); } gSegmentSize = atoi( argv[1] ); ifstream in( argv[2] ); if( !in.is_open() ) { cerr << "Can't open " << argv[2] << endl; exit( 1 ); } char ch; UCHAR *originalData = new UCHAR[2*gSegmentSize]; UCHAR *compressedData = new UCHAR[2*gSegmentSize]; UCHAR *tmp = new UCHAR[2*gSegmentSize]; int compressedSize; int k = 0; int totalCompressedSize = 0; int totalUncompressedSize = 0; while( in.get( ch ) ) { if( ch == '\n' ) ch = '\0'; if( k == gSegmentSize ) { totalUncompressedSize += k; compressedSize = compress( originalData, compressedData, k ); totalCompressedSize += compressedSize; checkDecompression( k, originalData, compressedSize, compressedData, tmp ); gNSegments++; k = 0; } originalData[k] = (UCHAR)ch; k++; } if( k > 0 ) { totalUncompressedSize += k; totalCompressedSize += compress( originalData, compressedData, k ); checkDecompression( k, originalData, compressedSize, compressedData, tmp ); } cout << "Total uncompressed size: " << totalUncompressedSize << endl; cout << "Total compressed size: " << totalCompressedSize << endl; delete [] compressedData; delete [] originalData; delete [] tmp; in.close(); return 0; } void checkDecompression( int originalSize, UCHAR *original, int compressedSize, UCHAR *compressed, UCHAR *tmp ) { int expandedSize = expand( compressed, tmp ); if( expandedSize == originalSize ) { int j; for( j=0; j < originalSize; j++ ) { if( original[j] != tmp[j] ) break; } if( j == originalSize ) return; } cerr << "Decompressed data does not match original data at segment " << 
gNSegments << endl; cerr << "Original size: " << originalSize << endl; int i; for( i=0; i < originalSize; i++ ) cerr << "|" << int(original[i]); cerr << endl; cerr << "Expanded size: " << expandedSize << endl; for( i=0; i < originalSize; i++ ) cerr << "|" << int(original[i]); cerr << endl; cerr << "Compressed size: " << compressedSize << endl; for( i=0; i < compressedSize; i++ ) cerr << "|" << int(compressed[i]); cerr << endl; exit( 1 ); }