-
Notifications
You must be signed in to change notification settings - Fork 16
/
lzwlib.c
513 lines (432 loc) · 26.6 KB
/
lzwlib.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
////////////////////////////////////////////////////////////////////////////
// **** LZW-AB **** //
// Adjusted Binary LZW Compressor/Decompressor //
// Copyright (c) 2016-2020 David Bryant //
// All Rights Reserved //
// Distributed under the BSD Software License (see license.txt) //
////////////////////////////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "lzwlib.h"
/* This library implements the LZW general-purpose data compression algorithm.
* The algorithm was originally described as a hardware implementation by
* Terry Welsh here:
*
* Welch, T.A. “A Technique for High-Performance Data Compression.”
* IEEE Computer 17,6 (June 1984), pp. 8-19.
*
* Since then there have been enumerable refinements and variations on the
* basic technique, and this implementation is no different. The target of
* the present implementation is embedded systems, and so emphasis was placed
* on simplicity, fast execution, and minimal RAM usage.
*
* This is a streaming compressor in that the data is not divided into blocks
* and no context information like dictionaries or Huffman tables are sent
* ahead of the compressed data (except for one byte to signal the maximum
* bit depth). This limits the maximum possible compression ratio compared to
* algorithms that significantly preprocess the data, but with the help of
* some enhancements to the LZW algorithm (described below) it is able to
* compress better than the UNIX "compress" utility (which is also LZW) and
* is in fact closer to and sometimes beats the compression level of "gzip".
*
* The symbols are stored in "adjusted binary" which provides somewhat better
* compression, with virtually no speed penalty, compared to the fixed word
* sizes normally used. These are sometimes called "phased-in" binary codes
* and their use in LZW is described here:
*
* R. N. Horspool, "Improving LZW (data compression algorithm)", Data
* Compression Conference, pp. 332-341, 1991.
*
* Earlier versions of this compressor would reset as soon as the dictionary
* became full to ensure good performance on heterogenous data (such as tar
* files or executable images). While trivial to implement, this is not
* particularly efficient with homogeneous data (or in general) because we
* spend a lot of time sending short symbols where the compression is poor.
*
* This newer version utilizes a technique such that once the dictionary is
* full, we restart at the beginning and recycle only those codes that were
* seen only once. We know this because they are not referenced by longer
* strings, and are easy to replace in the dictionary for the same reason.
* Since they have only been seen once it's also more likely that we will
* be replacing them with a more common string, and this is especially
* true if the data characteristics are changing.
*
* Replacing string codes in this manner has the interesting side effect that
* some older shorter strings that the removed strings were based on will
* possibly become unreferenced themselves and be recycled on the next pass.
* In this way, the entire dictionary constantly "churns" based on the
* incoming stream, thereby improving and adapting to optimal compression.
*
* Even with this technique there is still a possibility that a sudden change
* in the data characteristics will appear, resulting in significant negative
* compression (up to 100% for 16-bit codes). To detect this case we generate
* an exponentially decaying average of the current compression ratio and reset
* when this hits about 1.06, which limits worst case inflation to about 8%.
*
* The maximum symbol size is configurable on the encode side (from 9 bits to
* 16 bits) and determines the RAM footprint required by both sides and, to a
* large extent, the compression performance. This information is communicated
* to the decoder in the first stream byte so that it can allocate accordingly.
* The RAM requirements are as follows:
*
* maximum encoder RAM decoder RAM
* symbol size requirement requirement
* -----------------------------------------
* 9-bit 4096 bytes 2368 bytes
* 10-bit 8192 bytes 4992 bytes
* 11-bit 16384 bytes 10240 bytes
* 12-bit 32768 bytes 20736 bytes
* 13-bit 65536 bytes 41728 bytes
* 14-bit 131072 bytes 83712 bytes
* 15-bit 262144 bytes 167680 bytes
* 16-bit 524288 bytes 335616 bytes
*
* This implementation uses malloc(), but obviously an embedded version could
* use static arrays instead if desired (assuming that the maxbits was
* controlled outside).
*/
#define NULL_CODE 65535 // indicates a NULL prefix (must be unsigned short)
#define CLEAR_CODE 256 // code to flush dictionary and restart decoder
#define FIRST_STRING 257 // code of first dictionary string
/* This macro determines the number of bits required to represent the given value,
* not counting the implied MSB. For GNU C it will use the provided built-in,
* otherwise a comparison tree is employed. Note that in the non-GNU case, only
* values up to 65535 (15 bits) are supported.
*/
#ifdef __GNUC__
#define CODE_BITS(n) (31 - __builtin_clz(n))
#else
#define CODE_BITS(n) ((n) < 4096 ? \
((n) < 1024 ? 8 + ((n) >= 512) : 10 + ((n) >= 2048)) : \
((n) < 16384 ? 12 + ((n) >= 8192) : 14 + ((n) >= 32768)))
#endif
/* This macro writes the adjusted-binary symbol "code" given the maximum
* symbol "maxcode". A macro is used here just to avoid the duplication in
* the lzw_compress() function. The idea is that if "maxcode" is not one
* less than a power of two (which it rarely will be) then this code can
* often send fewer bits that would be required with a fixed-sized code.
*
* For example, the first code we send will have a "maxcode" of 257, so
* every "code" would normally consume 9 bits. But with adjusted binary we
* can actually represent any code from 0 to 253 with just 8 bits -- only
* the 4 codes from 254 to 257 take 9 bits.
*/
#define WRITE_CODE(code,maxcode) do { \
unsigned int code_bits = CODE_BITS (maxcode); \
unsigned int extras = (2 << code_bits) - (maxcode) - 1; \
if ((code) < extras) { \
shifter |= ((code) << bits); \
bits += code_bits; \
} \
else { \
shifter |= ((((code) + extras) >> 1) << bits); \
bits += code_bits; \
shifter |= ((((code) + extras) & 1) << bits++); \
} \
do { (*dst)(shifter,dstctx); shifter >>= 8; \
output_bytes += 256; \
} while ((bits -= 8) >= 8); \
} while (0)
/* LZW compression function. Bytes (8-bit) are read and written through callbacks and the
* "maxbits" parameter specifies the maximum symbol size (9-16), which in turn determines
* the RAM requirement and, to a large extent, the level of compression achievable. A return
* value of EOF from the "src" callback terminates the compression process. A non-zero return
* value indicates one of the two possible errors -- bad "maxbits" param or failed malloc().
* There are contexts (void pointers) that are passed to the callbacks to easily facilitate
* multiple instances of the compression operation (but simple applications can ignore these).
*/
typedef struct {
unsigned short first_reference, next_reference, back_reference;
unsigned char terminator;
} encoder_entry_t;
int lzw_compress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx, int maxbits)
{
unsigned int maxcode = FIRST_STRING, next_string = FIRST_STRING, prefix = NULL_CODE, total_codes;
unsigned int dictionary_full = 0, available_entries, max_available_entries, max_available_code;
unsigned int input_bytes = 65536, output_bytes = 65536;
unsigned int shifter = 0, bits = 0;
encoder_entry_t *dictionary;
int c;
if (maxbits < 9 || maxbits > 16) // check for valid "maxbits" setting
return 1;
// based on the "maxbits" parameter, compute total codes and allocate dictionary storage
total_codes = 1 << maxbits;
dictionary = malloc (total_codes * sizeof (encoder_entry_t));
max_available_entries = total_codes - FIRST_STRING - 1;
max_available_code = total_codes - 2;
if (!dictionary)
return 1; // failed malloc()
// clear the dictionary
available_entries = max_available_entries;
memset (dictionary, 0, 256 * sizeof (encoder_entry_t));
(*dst)(maxbits - 9, dstctx); // first byte in output stream indicates the maximum symbol bits
// This is the main loop where we read input bytes and compress them. We always keep track of the
// "prefix", which represents a pending byte (if < 256) or string entry (if >= FIRST_STRING) that
// has not been sent to the decoder yet. The output symbols are kept in the "shifter" and "bits"
// variables and are sent to the output every time 8 bits are available (done in the macro).
while ((c = (*src)(srcctx)) != EOF) {
unsigned int cti; // coding table index
input_bytes += 256;
if (prefix == NULL_CODE) { // this only happens the very first byte when we don't yet have a prefix
prefix = c;
continue;
}
memset (dictionary + next_string, 0, sizeof (encoder_entry_t));
if ((cti = dictionary [prefix].first_reference)) { // if any longer strings are built on the current prefix...
while (1)
if (dictionary [cti].terminator == c) { // we found a matching string, so we just update the prefix
prefix = cti; // to that string and continue without sending anything
break;
}
else if (!dictionary [cti].next_reference) { // this string did not match the new character and
dictionary [cti].next_reference = next_string; // there aren't any more, so we'll add a new string,
// point to it with "next_reference", and also make the
dictionary [next_string].back_reference = cti; // "back_reference" which is used for recycling entries
cti = 0;
break;
}
else
cti = dictionary [cti].next_reference; // there are more possible matches to check, so loop back
}
else { // no longer strings are based on the current prefix, so now
dictionary [prefix].first_reference = next_string; // the current prefix plus the new byte will be the next string
dictionary [next_string].back_reference = prefix; // also make the back_reference used for recycling
if (prefix >= FIRST_STRING) available_entries--; // the codes 0-255 are never available for recycling
}
// If "cti" is zero, we could not simply extend our "prefix" to a longer string because we did not find a
// dictionary match, so we send the symbol representing the current "prefix" and add the new string to the
// dictionary. Since the current byte "c" was not included in the prefix, that now becomes our new prefix.
if (!cti) {
WRITE_CODE (prefix, maxcode); // send symbol for current prefix (0 to maxcode-1)
dictionary [next_string].terminator = c; // newly created string has current byte as the terminator
prefix = c; // current byte also becomes new prefix for next string
// If the dictionary is not full yet, we bump the maxcode and next_string and check to see if the
// dictionary is now full. If it is we set the dictionary_full flag and leave maxcode set to two
// less than total_codes because every string entry is now available for matching, but the actual
// maximum code is reserved for EOF.
if (!dictionary_full) {
dictionary_full = (++next_string > max_available_code);
maxcode++;
}
// If the dictionary is full we look for an entry to recycle starting at next_string (the one we
// just created or recycled) plus one (with check for wrap check). We know there is one because at
// a minimum the string we just added. This also takes care of removing the entry to be recycled
// (which is possible/easy because no longer strings have been based on it).
if (dictionary_full) {
for (next_string++; next_string <= max_available_code || (next_string = FIRST_STRING); next_string++)
if (!dictionary [next_string].first_reference)
break;
cti = dictionary [next_string].back_reference; // dictionary [cti] references the entry we're
// trying to recycle (either as a first or a next)
if (dictionary [cti].first_reference == next_string) {
dictionary [cti].first_reference = dictionary [next_string].next_reference;
// if we just cleared a first reference, and that string is not 0-255,
// then that's a newly available entry
if (!dictionary [cti].first_reference && cti >= FIRST_STRING)
available_entries++;
}
else if (dictionary [cti].next_reference == next_string) // fixup a "next_reference"
dictionary [cti].next_reference = dictionary [next_string].next_reference;
// If the entry we're recycling had a next reference, then update the back reference
// so it's completely out of the chain. Of course we know it didn't have a first
// reference because then we wouldn't be recycling it.
if (dictionary [next_string].next_reference)
dictionary [dictionary [next_string].next_reference].back_reference = cti;
// This check is technically not needed because there will always be an available entry
// (the last string we added at a minimum) but we don't want to get in a situation where
// we only have a few entries that we're cycling though. I pulled the limits (16 entries
// or 1% of total) out of a hat.
if (available_entries < 16 || available_entries * 100 < max_available_entries) {
// clear the dictionary and reset the byte counters -- basically everything starts over
// except that we keep the last pending "prefix" (which, of course, was never sent)
WRITE_CODE (CLEAR_CODE, maxcode);
memset (dictionary, 0, 256 * sizeof (encoder_entry_t));
available_entries = max_available_entries;
next_string = maxcode = FIRST_STRING;
input_bytes = output_bytes = 65536;
dictionary_full = 0;
}
}
// This is similar to the above check, except that it's used whether the dictionary is full or not.
// It uses an exponentially decaying average of the current compression ratio, so it can terminate
// very early if the incoming data is uncompressible or it can terminate any later time that the
// dictionary no longer compresses the incoming stream.
if (output_bytes > input_bytes + (input_bytes >> 4)) {
WRITE_CODE (CLEAR_CODE, maxcode);
memset (dictionary, 0, 256 * sizeof (encoder_entry_t));
available_entries = max_available_entries;
next_string = maxcode = FIRST_STRING;
input_bytes = output_bytes = 65536;
dictionary_full = 0;
}
else {
output_bytes -= output_bytes >> 8;
input_bytes -= input_bytes >> 8;
}
}
}
// we're done with input, so if we've received anything we still need to send that pesky pending prefix...
if (prefix != NULL_CODE) {
WRITE_CODE (prefix, maxcode);
if (!dictionary_full)
maxcode++;
}
WRITE_CODE (maxcode, maxcode); // the maximum possible code is always reserved for our END_CODE
if (bits) // finally, flush any pending bits from the shifter
(*dst)(shifter, dstctx);
free (dictionary);
return 0;
}
/* LZW decompression function. Bytes (8-bit) are read and written through callbacks. The
* "maxbits" parameter is read as the first byte in the stream and controls how much memory
* is allocated for decoding. A return value of EOF from the "src" callback terminates the
* decompression process (although this should not normally occur). A non-zero return value
* indicates an error, which in this case can be a bad "maxbits" read from the stream, a
* failed malloc(), or if an EOF is read from the input stream before the decompression
* terminates naturally with END_CODE. There are contexts (void pointers) that are passed
* to the callbacks to easily facilitate multiple instances of the decompression operation
* (but simple applications can ignore these).
*/
typedef struct {
unsigned char terminator, extra_references;
unsigned short prefix;
} decoder_entry_t;
int lzw_decompress (void (*dst)(int,void*), void *dstctx, int (*src)(void*), void *srcctx)
{
unsigned int maxcode = FIRST_STRING, next_string = FIRST_STRING - 1, prefix = CLEAR_CODE;
unsigned int dictionary_full = 0, max_available_code, total_codes;
unsigned int shifter = 0, bits = 0, read_byte, i;
unsigned char *reverse_buffer, *referenced;
decoder_entry_t *dictionary;
if ((read_byte = ((*src)(srcctx))) == EOF || (read_byte & 0xf8)) //sanitize first byte
return 1;
// based on the "maxbits" parameter, compute total codes and allocate dictionary storage
total_codes = 512 << (read_byte & 0x7);
max_available_code = total_codes - 2;
dictionary = malloc (total_codes * sizeof (decoder_entry_t));
reverse_buffer = malloc (total_codes - 256);
referenced = malloc (total_codes / 8); // bitfield indicating code is referenced at least once
// Note that to implement the dictionary entry recycling we have to keep track of how many
// longer strings are based on each string in the dictionary. This can be between 0 (no
// references) to 256 (every possible next byte), but unfortunately that's one more value
// than what can be stored in a byte. The solution is to have a single bit for each entry
// indicating any references (i.e., the code cannot be recycled) and an additional byte
// in the dictionary entry struct counting the "extra" references (beyond one).
if (!reverse_buffer || !dictionary) // check for malloc() failure
return 1;
for (i = 0; i < 256; ++i) { // these never change
dictionary [i].prefix = NULL_CODE;
dictionary [i].terminator = i;
}
// This is the main loop where we read input symbols. The values range from 0 to the code value
// of the "next" string in the dictionary (although the actual "next" code cannot be used yet,
// and so we reserve that code for the END_CODE). Note that receiving an EOF from the input
// stream is actually an error because we should have gotten the END_CODE first.
while (1) {
unsigned int code_bits = CODE_BITS (maxcode), code;
unsigned int extras = (2 << code_bits) - maxcode - 1;
do {
if ((read_byte = ((*src)(srcctx))) == EOF) {
free (dictionary); free (reverse_buffer); free (referenced);
return 1;
}
shifter |= read_byte << bits;
} while ((bits += 8) < code_bits);
// first we assume the code will fit in the minimum number of required bits
code = shifter & ((1 << code_bits) - 1);
shifter >>= code_bits;
bits -= code_bits;
// but if code >= extras, then we need to read another bit to calculate the real code
// (this is the "adjusted binary" part)
if (code >= extras) {
if (!bits) {
if ((read_byte = ((*src)(srcctx))) == EOF) {
free (dictionary); free (reverse_buffer); free (referenced);
return 1;
}
shifter = read_byte;
bits = 8;
}
code = (code << 1) - extras + (shifter & 1);
shifter >>= 1;
bits--;
}
if (code == maxcode) // sending the maximum code is reserved for the end of the file
break;
else if (code == CLEAR_CODE) { // otherwise check for a CLEAR_CODE to start over early
next_string = FIRST_STRING - 1;
maxcode = FIRST_STRING;
dictionary_full = 0;
}
else if (prefix == CLEAR_CODE) { // this only happens at the first symbol which is always sent
(*dst)(code, dstctx); // literally and becomes our initial prefix
next_string++;
maxcode++;
}
// Otherwise we have a valid prefix so we step through the string from end to beginning storing the
// bytes in the "reverse_buffer", and then we send them out in the proper order. One corner-case
// we have to handle here is that the string might be the same one that is actually being defined
// now (code == next_string).
else {
unsigned int cti = (code == next_string) ? prefix : code;
unsigned char *rbp = reverse_buffer, c;
do {
*rbp++ = dictionary [cti].terminator;
if (rbp == reverse_buffer + total_codes - 256) {
free (dictionary); free (reverse_buffer); free (referenced);
return 1;
}
} while ((cti = dictionary [cti].prefix) != NULL_CODE);
c = *--rbp; // the first byte in this string is the terminator for the last string, which is
// the one that we'll create a new dictionary entry for this time
do // send string in corrected order (except for the terminator which we don't know yet)
(*dst)(*rbp, dstctx);
while (rbp-- != reverse_buffer);
if (code == next_string) {
(*dst)(c,dstctx);
}
// This should always execute (the conditional is to catch corruptions) and is where we add a new string to
// the dictionary, either at the end or elsewhere when we are "recycling" entries that were never referenced
if (next_string >= FIRST_STRING && next_string < total_codes) {
if (referenced [prefix >> 3] & (1 << (prefix & 7))) // increment reference count on prefix
dictionary [prefix].extra_references++;
else
referenced [prefix >> 3] |= 1 << (prefix & 7);
dictionary [next_string].prefix = prefix; // now update the next dictionary entry with the new string
dictionary [next_string].terminator = c; // (but we're always one behind, so it's not the string just sent)
dictionary [next_string].extra_references = 0; // newly created string has not been referenced
referenced [next_string >> 3] &= ~(1 << (next_string & 7));
}
// If the dictionary is not full yet, we bump the maxcode and next_string and check to see if the
// dictionary is now full. If it is we set the dictionary_full flag and set next_string back to the
// beginning of the dictionary strings to start recycling them. Note that then maxcode will remain
// two less than total_codes because every string entry is available for matching, and the actual
// maximum code is reserved for EOF.
if (!dictionary_full) {
maxcode++;
if (++next_string > max_available_code) {
dictionary_full = 1;
maxcode--;
}
}
// If the dictionary is full we look for an entry to recycle starting at next_string (the one we
// created or recycled) plus one. We know there is one because at a minimum the string we just added
// has not been referenced). This also takes care of removing the entry to be recycled (which is
// possible/easy because no longer strings have been based on it).
if (dictionary_full) {
for (next_string++; next_string <= max_available_code || (next_string = FIRST_STRING); next_string++)
if (!(referenced [next_string >> 3] & (1 << (next_string & 7))))
break;
if (dictionary [dictionary [next_string].prefix].extra_references)
dictionary [dictionary [next_string].prefix].extra_references--;
else
referenced [dictionary [next_string].prefix >> 3] &= ~(1 << (dictionary [next_string].prefix & 7));
}
}
prefix = code; // the code we just received becomes the prefix for the next dictionary string entry
// (which we'll create once we find out the terminator)
}
free (dictionary); free (reverse_buffer); free (referenced);
return 0;
}