-
Notifications
You must be signed in to change notification settings - Fork 3
/
lzw.h
469 lines (385 loc) · 14.3 KB
/
lzw.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
/*
Variable-length code LZW compressor and decompressor for fixed-memory decoding.
Copyright (c) 2020-2022, Eddy L O Jansson. Licensed under The MIT License.
See https://github.com/eloj/lzw-eddy
*/
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include <stdbool.h>
#if defined(_MSC_VER)
#include <BaseTsd.h>
typedef SSIZE_T ssize_t;
#else
#include <sys/types.h> // for ssize_t
#endif
#define LZW_EDDY_MAJOR_VERSION 1
#define LZW_EDDY_MINOR_VERSION 1
#define LZW_EDDY_PATCH_VERSION 0
#define LZW_EDDY_VERSION "1.1.0-dev"
#define LZW_MIN_CODE_WIDTH 9
// 9 to 16-bit codes should all work, but 12 is the default for a reason.
// Going beyond 16-bit codes would require code changes. More isn't better either.
#ifndef LZW_MAX_CODE_WIDTH
#define LZW_MAX_CODE_WIDTH 12
#endif
#define LZW_MAX_CODES (1UL << LZW_MAX_CODE_WIDTH)
enum lzw_errors {
LZW_NOERROR = 0,
LZW_DESTINATION_TOO_SMALL = -1,
LZW_INVALID_CODE_STREAM = -2,
LZW_STRING_TABLE_FULL = -3,
};
// This type must be large enough for SYMBOL_BITS + LZW_MAX_CODE_WIDTH*2 bits.
#if LZW_MAX_CODE_WIDTH > 12
typedef uint64_t lzw_node;
#else
typedef uint32_t lzw_node;
#endif
typedef uint32_t bitres_t;
typedef uint16_t code_t;
typedef uint8_t sym_t;
struct lzw_string_table {
uint32_t code_width;
code_t next_code;
code_t prev_code;
lzw_node node[LZW_MAX_CODES]; // 16K at 12-bit codes.
};
struct lzw_state {
struct lzw_string_table tree;
// If we ever need more of these, change to a flag-word.
bool was_init;
bool must_reset;
size_t rptr;
size_t wptr;
// Bit reservoir, need room for LZW_MAX_CODE_WIDTH*2-1 bits.
bitres_t bitres;
uint32_t bitres_len;
// Tracks the longest prefix used, which is equal to the minimum output buffer required for decompression.
size_t longest_prefix;
// Restrict the longest_prefix to this -- optimize for decode buffer size.
size_t longest_prefix_allowed;
};
// Translate error code to message.
const char *lzw_strerror(enum lzw_errors errnum);
/*
Decompress `slen` bytes from `src` into `dest` of size `dlen`.
Returns the number of bytes decompressed into `dest`.
Once all input has been consumed, 0 is returned.
On error, a negative integer is returned.
Neither `src` nor `dest` may be NULL.
`state`should be zero-initialized.
`dlen` should be at least 4096 bytes, unless the input is known to
require less.
`LZWD_DESTINATION_TOO_SMALL` will be returned if the output buffer is too small, in which case
you'd have to restart from the beginning with a larger `dest`.
All that said, even a file consisting of 80K zeros requires only 400 bytes,
so we're being very conservative here. A 'normal' file may need only 128 bytes or so.
*/
ssize_t lzw_decompress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen);
/*
Compress `slen` bytes from `src` into `dest` of size `dlen`.
Returns the number of bytes compressed into `dest`.
Once all input has been consumed, 0 is returned.
On error, a negative integer is returned.
Neither `src` nor `dest` may be NULL.
`state`should be zero-initialized.
*/
ssize_t lzw_compress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen);
#ifdef LZW_EDDY_IMPLEMENTATION
/*
Variable-length code LZW compressor and decompressor for fixed-memory decoding.
Copyright (c) 2020-2022, Eddy L O Jansson. Licensed under The MIT License.
See https://github.com/eloj/lzw-eddy
*/
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <assert.h>
#include <stdbool.h>
#define SYMBOL_BITS 8
#define SYMBOL_MASK ((1UL << SYMBOL_BITS)-1)
#define PARENT_BITS LZW_MAX_CODE_WIDTH
#define PARENT_SHIFT SYMBOL_BITS
#define PARENT_MASK ((1UL << PARENT_BITS)-1)
#define PREFIXLEN_BITS LZW_MAX_CODE_WIDTH
#define PREFIXLEN_SHIFT (PARENT_BITS+SYMBOL_BITS)
#define PREFIXLEN_MASK ((1UL << PREFIXLEN_BITS)-1)
#define CODE_CLEAR (1UL << SYMBOL_BITS)
#define CODE_EOF (CODE_CLEAR+1)
#define CODE_FIRST (CODE_CLEAR+2)
static_assert((LZW_MAX_CODE_WIDTH >= LZW_MIN_CODE_WIDTH), "");
static_assert(SYMBOL_BITS <= sizeof(sym_t)*8, "sym_t type too small");
static_assert((SYMBOL_BITS + PARENT_BITS + PREFIXLEN_BITS) <= sizeof(lzw_node)*8, "lzw_node type too small");
static_assert((LZW_MAX_CODE_WIDTH*2 - 1) < sizeof(bitres_t)*8, "bitres_t type too small");
static inline sym_t lzw_node_symbol(lzw_node node) {
return node & SYMBOL_MASK;
}
static inline code_t lzw_node_parent(lzw_node node) {
return (node >> PARENT_SHIFT) & PARENT_MASK;
}
static inline code_t lzw_node_prefix_len(lzw_node node) {
return (node >> PREFIXLEN_SHIFT) & PREFIXLEN_MASK;
}
static inline lzw_node lzw_make_node(sym_t symbol, code_t parent, code_t len) {
lzw_node node = (len << PREFIXLEN_SHIFT) | (parent << PARENT_SHIFT) | symbol;
return node;
}
static inline uint32_t mask_from_width(uint32_t width) {
return (1UL << width)-1;
}
static void lzw_reset(struct lzw_state *state) {
state->tree.prev_code = CODE_EOF;
state->tree.next_code = CODE_FIRST;
state->tree.code_width = LZW_MIN_CODE_WIDTH;
state->must_reset = false;
}
static void lzw_init(struct lzw_state *state) {
for (size_t i=0 ; i < (1UL << SYMBOL_BITS) ; ++i) {
state->tree.node[i] = lzw_make_node((sym_t)i, 0, 0);
}
state->rptr = 0;
state->bitres = 0;
state->bitres_len = 0;
state->was_init = true;
lzw_reset(state);
}
const char *lzw_strerror(enum lzw_errors errnum) {
const char *errstr = "Unknown error";
switch (errnum) {
case LZW_NOERROR:
errstr = "No error";
break;
case LZW_DESTINATION_TOO_SMALL:
errstr = "Destination buffer too small";
break;
case LZW_INVALID_CODE_STREAM:
errstr = "Invalid code stream";
break;
case LZW_STRING_TABLE_FULL:
errstr = "String table full";
break;
}
return errstr;
}
ssize_t lzw_decompress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen) {
if (state->was_init == false)
lzw_init(state);
// Keep local copies so that we can exit and continue without losing bits.
uint32_t bitres = state->bitres;
uint32_t bitres_len = state->bitres_len;
uint32_t code = 0;
size_t wptr = 0;
while (state->rptr < slen) {
// Fill bit-reservoir.
while ((bitres_len < state->tree.code_width) && (state->rptr < slen)) {
bitres |= src[state->rptr++] << bitres_len;
bitres_len += 8;
}
state->bitres = bitres;
state->bitres_len = bitres_len;
if (state->bitres_len < state->tree.code_width) {
return LZW_INVALID_CODE_STREAM;
}
code = bitres & mask_from_width(state->tree.code_width);
bitres >>= state->tree.code_width;
bitres_len -= state->tree.code_width;
if (code == CODE_CLEAR) {
if (state->tree.next_code != CODE_FIRST)
lzw_reset(state);
continue;
} else if (code == CODE_EOF) {
break;
} else if (state->must_reset) {
// ERROR: Ran out of space in string table
return LZW_STRING_TABLE_FULL;
}
if (code <= state->tree.next_code) {
bool known_code = code < state->tree.next_code;
code_t tcode = known_code ? code : state->tree.prev_code;
size_t prefix_len = 1 + lzw_node_prefix_len(state->tree.node[tcode]);
uint8_t symbol = 0;
assert(prefix_len > 0);
// Invalid state, invalid input.
if (!known_code && state->tree.prev_code == CODE_EOF) {
return LZW_INVALID_CODE_STREAM;
}
// Track longest prefix seen.
if (prefix_len > state->longest_prefix) {
state->longest_prefix = prefix_len;
}
// Check if prefix alone too large for output buffer. User could start over with a larger buffer.
if (prefix_len + (known_code ? 0 : 1) > dlen) {
return LZW_DESTINATION_TOO_SMALL;
}
// Check if room in output buffer, else return early.
if (wptr + prefix_len + (known_code ? 0 : 1) > dlen) {
return wptr;
}
// Write out prefix to destination
for (size_t i=0 ; i < prefix_len ; ++i) {
symbol = lzw_node_symbol(state->tree.node[tcode]);
dest[wptr + prefix_len - 1 - i] = symbol;
tcode = lzw_node_parent(state->tree.node[tcode]);
}
wptr += prefix_len;
// Add the first character of the prefix as a new code with prev_code as the parent.
if (state->tree.prev_code != CODE_EOF) {
if (!known_code) {
assert(code == state->tree.next_code);
assert(wptr < dlen);
dest[wptr++] = symbol; // Special case for new codes.
}
state->tree.node[state->tree.next_code] = lzw_make_node(symbol, state->tree.prev_code, 1 + lzw_node_prefix_len(state->tree.node[state->tree.prev_code]));
// TODO: Change to ==
if (state->tree.next_code >= mask_from_width(state->tree.code_width)) {
if (state->tree.code_width == LZW_MAX_CODE_WIDTH) {
// Out of bits in code, next code MUST be a reset!
state->must_reset = true;
state->tree.prev_code = code;
continue;
}
++state->tree.code_width;
}
state->tree.next_code++;
}
state->tree.prev_code = code;
} else {
// Desynchronized, probably corrupt/invalid input.
return LZW_INVALID_CODE_STREAM;
}
}
return wptr;
}
static bool lzw_string_table_lookup(struct lzw_state *state, uint8_t *prefix, size_t len, code_t *code) {
// printf("Looking up prefix '%.*s' from %p to %p (len=%zu)\n", (int)(len), prefix, prefix, prefix+len, len);
assert (len > 0);
if (len == 1) {
*code = state->tree.node[prefix[0]];
return true;
}
// PERF: This is slow, we should store an array of hashes to use as an initial comparison before walking the tree.
// NOTE: It's imperative that we search newest to oldest. When limiting the prefix length, we'll
// end up with duplicate prefixes, and only the newest code is valid for the decoder to stay in sync.
for (size_t i=state->tree.next_code - 1 ; i >= CODE_FIRST ; --i) {
assert(i < LZW_MAX_CODES);
lzw_node node = state->tree.node[i];
if (len - 1 == lzw_node_prefix_len(node)) {
for (size_t j=0 ; j < len ; ++j) {
if (prefix[len-j-1] != lzw_node_symbol(node)) {
break;
}
if (lzw_node_prefix_len(node) == 0) {
*code = (code_t)i;
assert(j == len - 1);
return true;
}
node = state->tree.node[lzw_node_parent(node)];
}
}
}
return false;
}
inline static void lzw_output_code(struct lzw_state *state, code_t code) {
assert(state->bitres_len + state->tree.code_width <= sizeof(bitres_t)*8); // maybe increase size of bitres_t?
state->bitres |= code << state->bitres_len;
state->bitres_len += state->tree.code_width;
state->tree.prev_code = code;
// printf("<CODE:%d width=%d reservoir:%02d/%zu:%02x>\n", code, state->tree.code_width, state->bitres_len, sizeof(bitres_t)*8, state->bitres);
}
static void lzw_flush_reservoir(struct lzw_state *state, uint8_t *dest, bool final) {
// SECURITY: We assume we have enough space left in dest!
// Write codes to output.
while (state->bitres_len >= 8) {
dest[state->wptr++] = state->bitres & 0xFF;
state->bitres >>= 8;
state->bitres_len -= 8;
// printf("DEBUG: Flushed: %02x, reservoir:%02d/%zu:%02x\n", dest[state->wptr-1], state->bitres_len, sizeof(bitres_t)*8, state->bitres);
}
if (final && state->bitres_len > 0) {
// printf("DEBUG: Flushing last %d bits.\n", state->bitres_len);
dest[state->wptr++] = state->bitres;
state->bitres = 0;
state->bitres_len = 0;
// printf("DEBUG: Flushed: %02x, reservoir:%02d/%zu:%02x\n", dest[state->wptr-1], state->bitres_len, sizeof(bitres_t)*8, state->bitres);
}
}
ssize_t lzw_compress(struct lzw_state *state, uint8_t *src, size_t slen, uint8_t *dest, size_t dlen) {
if (state->was_init == false) {
lzw_init(state);
lzw_output_code(state, CODE_CLEAR);
}
code_t code = CODE_EOF;
size_t prefix_end = 0;
state->wptr = 0;
while (state->rptr + prefix_end < slen) {
// Ensure we have enough space for flushing codes.
if (state->wptr + (state->tree.code_width >> 3) + 1 + 2 + 2 > dlen) { // Also reserve bits for worst-case 16-bit CLEAR + EOF code
return state->wptr;
}
++prefix_end;
// lookup prefix in string table
bool overlong = ((state->longest_prefix_allowed > 0) && (prefix_end >= state->longest_prefix_allowed));
bool existing_code = lzw_string_table_lookup(state, src + state->rptr, prefix_end, &code);
if (!existing_code || overlong) {
assert(code != CODE_CLEAR);
assert(code != CODE_EOF);
uint8_t symbol = src[state->rptr + prefix_end - 1];
code_t parent = code;
code_t parent_len = 1 + lzw_node_prefix_len(state->tree.node[parent]);
// Output code _before_ we potentially change the bit-width.
lzw_output_code(state, parent);
// Handle code width expansion.
if (state->tree.next_code == (1UL << state->tree.code_width)
#if LZW_MAX_CODE_WIDTH == 16
|| (state->tree.next_code == LZW_MAX_CODES - 1) /* special case for wrapping on 16-bit code_t */
#endif
) {
if (state->tree.code_width < LZW_MAX_CODE_WIDTH) {
// printf("DEBUG: Expanding bitwidth to %d\n", state->tree.code_width + 1);
++state->tree.code_width;
} else {
// printf("DEBUG: Max code-width reached -- Issuing clear/reset\n");
lzw_flush_reservoir(state, dest, false);
lzw_output_code(state, CODE_CLEAR);
lzw_reset(state);
lzw_flush_reservoir(state, dest, false);
state->tree.next_code = CODE_EOF; // XXX: Required for compatibility with puzznic.
}
}
assert(state->tree.next_code < LZW_MAX_CODES);
// printf("New prefix from src[%zu], adding symbol '%c' (%02x) as code %d /w parent %d\n", state->rptr + prefix_end, symbol, symbol, state->tree.next_code, parent);
state->tree.node[state->tree.next_code++] = lzw_make_node(symbol, parent, parent_len);
if (parent_len > state->longest_prefix) {
state->longest_prefix = parent_len;
}
state->rptr += parent_len;
prefix_end = 0;
lzw_flush_reservoir(state, dest, false);
}
}
if (prefix_end != 0) {
// printf("DEBUG: Last prefix existed, writing existing code %d to stream\n", code);
lzw_output_code(state, code);
lzw_flush_reservoir(state, dest, false);
state->rptr += prefix_end;
prefix_end = 0;
}
// WARN: Problem with this is that we can't chain encodes, add 'final' flag to compression call?
// NIGHTMARE: Handle zero-input
if ((state->rptr + prefix_end == slen && state->tree.prev_code != CODE_EOF)
// This happens to be true if we're called with slen=0, but only the first time as we now flush the bits.
|| (state->wptr == 0 && state->bitres_len > 0)) {
lzw_output_code(state, CODE_EOF);
lzw_flush_reservoir(state, dest, true);
}
// if we didn't write anything, there shouldn't be any bits left in reservoir.
assert(!(state->wptr == 0 && state->bitres_len > 0));
// printf("DEBUG: Returning %zu bytes written to caller.\n", state->wptr);
return state->wptr;
}
#endif // LZW_EDDY_IMPLEMENTATION
#ifdef __cplusplus
}
#endif