Skip to content

Commit

Permalink
Implement huff0 entropy encoder (#89)
Browse files Browse the repository at this point in the history
* Implement huff0 entropy encoder.
  • Loading branch information
klauspost committed Apr 2, 2018
1 parent 2c62357 commit 30b4e9b
Show file tree
Hide file tree
Showing 266 changed files with 2,688 additions and 8 deletions.
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ go:
- 1.7.x
- 1.8.x
- 1.9.x
- 1.10.x
- master

install:
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ It offers slightly better compression at lower compression settings, and up to 3
[![Sourcegraph Badge](https://sourcegraph.com/github.com/klauspost/compress/-/badge.svg)](https://sourcegraph.com/github.com/klauspost/compress?badge)

# changelog

* Apr 2, 2018: Added [huff0](https://godoc.org/github.com/klauspost/compress/huff0) en/decoder. Experimental for now, API may change.
* Mar 4, 2018: Added [FSE Entropy](https://godoc.org/github.com/klauspost/compress/fse) en/decoder. Experimental for now, API may change.
* Nov 3, 2017: Add compression [Estimate](https://godoc.org/github.com/klauspost/compress#Estimate) function.
* May 28, 2017: Reduce allocations when resetting decoder.
* Apr 02, 2017: Change back to official crc32, since changes were merged in Go 1.7.
Expand Down
21 changes: 13 additions & 8 deletions fse/fse.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,17 +82,22 @@ type Scratch struct {

// Histogram allows to populate the histogram and skip that step in the compression,
// It otherwise allows to inspect the histogram when compression is done.
// To indicate that you have populated the histogram call the returned function
// To indicate that you have populated the histogram call HistogramFinished
// with the value of the highest populated symbol, as well as the number of entries
// in the most populated entry. These are accepted at face value.
// The returned slice will always be length 256.
// If you have *not* populated the histogram, send 0 as maxCount before next compression cycle.
func (s *Scratch) Histogram() ([]uint32, func(maxSymbol uint8, maxCount int)) {
return s.count[:], func(maxSymbol uint8, maxCount int) {
s.maxCount = maxCount
s.symbolLen = uint16(maxSymbol) + 1
s.clearCount = maxCount != 0
}
func (s *Scratch) Histogram() []uint32 {
return s.count[:]
}

// HistogramFinished can be called to indicate that the histogram has been populated.
// maxSymbol is the index of the highest set symbol of the next data segment.
// maxCount is the number of entries in the most populated entry.
// These are accepted at face value.
func (s *Scratch) HistogramFinished(maxSymbol uint8, maxCount int) {
s.maxCount = maxCount
s.symbolLen = uint16(maxSymbol) + 1
s.clearCount = maxCount != 0
}

// prepare will prepare and allocate scratch tables used for both compression and decompression.
Expand Down
1 change: 1 addition & 0 deletions huff0/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/huff0-fuzz.zip
115 changes: 115 additions & 0 deletions huff0/bitreader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// Copyright 2018 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.

package huff0

import (
"errors"
"io"
)

// bitReader reads a bitstream in reverse.
// The last set bit indicates the start of the stream and is used
// for aligning the input.
type bitReader struct {
in []byte
off uint // next byte to read is at in[off - 1]
value uint64
bitsRead uint8
}

// init initializes and resets the bit reader.
func (b *bitReader) init(in []byte) error {
if len(in) < 1 {
return errors.New("corrupt stream: too short")
}
b.in = in
b.off = uint(len(in))
// The highest bit of the last byte indicates where to start
v := in[len(in)-1]
if v == 0 {
return errors.New("corrupt stream, did not find end of stream")
}
b.bitsRead = 64
b.value = 0
b.fill()
b.fill()
b.bitsRead += 8 - uint8(highBit32(uint32(v)))
return nil
}

// getBits will return n bits. n can be 0.
func (b *bitReader) getBits(n uint8) uint16 {
if n == 0 || b.bitsRead >= 64 {
return 0
}
return b.getBitsFast(n)
}

// getBitsFast requires that at least one bit is requested every time.
// There are no checks if the buffer is filled.
func (b *bitReader) getBitsFast(n uint8) uint16 {
const regMask = 64 - 1
v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
b.bitsRead += n
return v
}

// peekBitsFast requires that at least one bit is requested every time.
// There are no checks if the buffer is filled.
func (b *bitReader) peekBitsFast(n uint8) uint16 {
const regMask = 64 - 1
v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
return v
}

// fillFast() will make sure at least 32 bits are available.
// There must be at least 4 bytes available.
func (b *bitReader) fillFast() {
if b.bitsRead < 32 {
return
}
// Do single re-slice to avoid bounds checks.
v := b.in[b.off-4 : b.off]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
b.off -= 4
}

// fill() will make sure at least 32 bits are available.
func (b *bitReader) fill() {
if b.bitsRead < 32 {
return
}
if b.off > 4 {
v := b.in[b.off-4 : b.off]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
b.off -= 4
return
}
for b.off > 0 {
b.value = (b.value << 8) | uint64(b.in[b.off-1])
b.bitsRead -= 8
b.off--
}
}

// finished returns true if all bits have been read from the bit stream.
func (b *bitReader) finished() bool {
return b.off == 0 && b.bitsRead >= 64
}

// close the bitstream and returns an error if out-of-buffer reads occurred.
func (b *bitReader) close() error {
// Release reference.
b.in = nil
if b.bitsRead > 64 {
return io.ErrUnexpectedEOF
}
return nil
}
176 changes: 176 additions & 0 deletions huff0/bitwriter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
// Copyright 2018 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.

package huff0

import "fmt"

// bitWriter will write bits.
// First bit will be LSB of the first byte of output.
type bitWriter struct {
bitContainer uint64
nBits uint8
out []byte
}

// bitMask16 is bitmasks. Has extra to avoid bounds check.
var bitMask16 = [32]uint16{
0, 1, 3, 7, 0xF, 0x1F,
0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF,
0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF} /* up to 16 bits */

// addBits16NC will add up to 16 bits.
// It will not check if there is space for them,
// so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
b.nBits += bits
}

// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
b.bitContainer |= uint64(value) << (b.nBits & 63)
b.nBits += bits
}

// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
func (b *bitWriter) encSymbol(ct cTable, symbol byte) {
enc := ct[symbol]
b.bitContainer |= uint64(enc.val) << (b.nBits & 63)
b.nBits += enc.nBits
}

// addBits16ZeroNC will add up to 16 bits.
// It will not check if there is space for them,
// so the caller must ensure that it has flushed recently.
// This is fastest if bits can be zero.
func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
if bits == 0 {
return
}
value <<= (16 - bits) & 15
value >>= (16 - bits) & 15
b.bitContainer |= uint64(value) << (b.nBits & 63)
b.nBits += bits
}

// flush will flush all pending full bytes.
// There will be at least 56 bits available for writing when this has been called.
// Using flush32 is faster, but leaves less space for writing.
func (b *bitWriter) flush() {
v := b.nBits >> 3
switch v {
case 0:
case 1:
b.out = append(b.out,
byte(b.bitContainer),
)
case 2:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
)
case 3:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
)
case 4:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
)
case 5:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
)
case 6:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
byte(b.bitContainer>>40),
)
case 7:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
byte(b.bitContainer>>40),
byte(b.bitContainer>>48),
)
case 8:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
byte(b.bitContainer>>40),
byte(b.bitContainer>>48),
byte(b.bitContainer>>56),
)
default:
panic(fmt.Errorf("bits (%d) > 64", b.nBits))
}
b.bitContainer >>= v << 3
b.nBits &= 7
}

// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {
return
}
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24))
b.nBits -= 32
b.bitContainer >>= 32
}

// flushAlign will flush remaining full bytes and align to next byte boundary.
func (b *bitWriter) flushAlign() {
nbBytes := (b.nBits + 7) >> 3
for i := uint8(0); i < nbBytes; i++ {
b.out = append(b.out, byte(b.bitContainer>>(i*8)))
}
b.nBits = 0
b.bitContainer = 0
}

// close will write the alignment bit and write the final byte(s)
// to the output.
func (b *bitWriter) close() error {
// End mark
b.addBits16Clean(1, 1)
// flush until next byte.
b.flushAlign()
return nil
}

// reset and continue writing by appending to out.
func (b *bitWriter) reset(out []byte) {
b.bitContainer = 0
b.nBits = 0
b.out = out
}
54 changes: 54 additions & 0 deletions huff0/bytereader.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright 2018 Klaus Post. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Based on work Copyright (c) 2013, Yann Collet, released under BSD License.

package huff0

// byteReader provides a byte reader that reads
// little endian values from a byte stream.
// The input stream is manually advanced.
// The reader performs no bounds checks.
type byteReader struct {
b []byte
off int
}

// init will initialize the reader and set the input.
func (b *byteReader) init(in []byte) {
b.b = in
b.off = 0
}

// advance the stream b n bytes.
func (b *byteReader) advance(n uint) {
b.off += int(n)
}

// Int32 returns a little endian int32 starting at current offset.
func (b byteReader) Int32() int32 {
v3 := int32(b.b[b.off+3])
v2 := int32(b.b[b.off+2])
v1 := int32(b.b[b.off+1])
v0 := int32(b.b[b.off])
return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
}

// Uint32 returns a little endian uint32 starting at current offset.
func (b byteReader) Uint32() uint32 {
v3 := uint32(b.b[b.off+3])
v2 := uint32(b.b[b.off+2])
v1 := uint32(b.b[b.off+1])
v0 := uint32(b.b[b.off])
return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
}

// unread returns the unread portion of the input.
func (b byteReader) unread() []byte {
return b.b[b.off:]
}

// remain will return the number of bytes remaining.
func (b byteReader) remain() int {
return len(b.b) - b.off
}
Loading

0 comments on commit 30b4e9b

Please sign in to comment.