From 4ae086caf9681be3c9eb005e6a6a7061810bad27 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 27 Dec 2019 04:50:49 -0800 Subject: [PATCH 1/5] Optimize single block encodes --- zstd/blockenc.go | 25 +++- zstd/enc_dfast.go | 323 +++++++++++++++++++++++++++++++++++++++++++ zstd/enc_fast.go | 254 ++++++++++++++++++++++++++++++++++ zstd/encoder.go | 68 ++++++--- zstd/encoder_test.go | 63 +++++++-- 5 files changed, 699 insertions(+), 34 deletions(-) diff --git a/zstd/blockenc.go b/zstd/blockenc.go index 99eccda11b..04c6b638ea 100644 --- a/zstd/blockenc.go +++ b/zstd/blockenc.go @@ -299,6 +299,20 @@ func (b *blockEnc) encodeRaw(a []byte) { } } +// encodeRaw can be used to set the output to a raw representation of supplied bytes. +func (b *blockEnc) encodeRawTo(dst, src []byte) []byte { + var bh blockHeader + bh.setLast(b.last) + bh.setSize(uint32(len(src))) + bh.setType(blockTypeRaw) + dst = bh.appendTo(dst) + dst = append(dst, src...) + if debug { + println("Adding RAW block, length", len(src)) + } + return dst +} + // encodeLits can be used if the block is only litLen. func (b *blockEnc) encodeLits(raw bool) error { var bh blockHeader @@ -437,7 +451,7 @@ func fuzzFseEncoder(data []byte) int { return 1 } -// encode will encode the block and put the output in b.output. +// encode will encode the block and append the output in b.output. func (b *blockEnc) encode(raw bool) error { if len(b.sequences) == 0 { return b.encodeLits(raw) @@ -451,6 +465,8 @@ func (b *blockEnc) encode(raw bool) error { var lh literalsHeader bh.setLast(b.last) bh.setType(blockTypeCompressed) + // Store offset of the block header. Needed when we know the size. + bhOffset := len(b.output) b.output = bh.appendTo(b.output) var ( @@ -468,6 +484,7 @@ func (b *blockEnc) encode(raw bool) error { } else { err = huff0.ErrIncompressible } + switch err { case huff0.ErrIncompressible: lh.setType(literalsBlockRaw) @@ -735,18 +752,18 @@ func (b *blockEnc) encode(raw bool) error { } b.output = wr.out - if len(b.output)-3 >= b.size { + if len(b.output)-3-bhOffset >= b.size { // Maybe even add a bigger margin. b.litEnc.Reuse = huff0.ReusePolicyNone return errIncompressible } // Size is output minus block header. - bh.setSize(uint32(len(b.output)) - 3) + bh.setSize(uint32(len(b.output)-bhOffset) - 3) if debug { println("Rewriting block header", bh) } - _ = bh.appendTo(b.output[:0]) + _ = bh.appendTo(b.output[bhOffset:bhOffset]) b.coders.setPrev(llEnc, mlEnc, ofEnc) return nil } diff --git a/zstd/enc_dfast.go b/zstd/enc_dfast.go index 2f41bcd0d5..148d6e5006 100644 --- a/zstd/enc_dfast.go +++ b/zstd/enc_dfast.go @@ -411,3 +411,326 @@ encodeLoop: println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) } } + +// EncodeNoHist will encode a block with no history and no following blocks. +// Most notable difference is that src will not be copied for history and +// we do not need to check for max match length. +func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { + const ( + // Input margin is the number of bytes we read (8) + // and the maximum we will read ahead (2) + inputMargin = 8 + 2 + minNonLiteralBlockSize = 16 + ) + + // Protect against e.cur wraparound. 
+ if e.cur > (1<<30)+e.maxMatchOff { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + for i := range e.longTable[:] { + e.longTable[i] = tableEntry{} + } + e.cur = e.maxMatchOff + } + + s := int32(0) + blk.size = len(src) + if len(src) < minNonLiteralBlockSize { + blk.extraLits = len(src) + blk.literals = blk.literals[:len(src)] + copy(blk.literals, src) + return + } + + // Override src + sLimit := int32(len(src)) - inputMargin + // stepSize is the number of bytes to skip on every main loop iteration. + // It should be >= 1. + stepSize := int32(e.o.targetLength) + if stepSize == 0 { + stepSize++ + } + + const kSearchStrength = 8 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := s + cv := load6432(src, s) + + // Relative offsets + offset1 := int32(blk.recentOffsets[0]) + offset2 := int32(blk.recentOffsets[1]) + + addLiterals := func(s *seq, until int32) { + if until == nextEmit { + return + } + blk.literals = append(blk.literals, src[nextEmit:until]...) + s.litLen = uint32(until - nextEmit) + } + if debug { + println("recent offsets:", blk.recentOffsets) + } + +encodeLoop: + for { + var t int32 + // We allow the encoder to optionally turn off repeat offsets across blocks + canRepeat := len(blk.sequences) > 2 + + for { + if debug && canRepeat && offset1 == 0 { + panic("offset0 was 0") + } + + nextHashS := hash5(cv, dFastShortTableBits) + nextHashL := hash8(cv, dFastLongTableBits) + candidateL := e.longTable[nextHashL] + candidateS := e.table[nextHashS] + + const repOff = 1 + repIndex := s - offset1 + repOff + entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + e.longTable[nextHashL] = entry + e.table[nextHashS] = entry + + if canRepeat { + if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) { + // Consider history as well. + var seq seq + //length := 4 + e.matchlen(s+4+repOff, repIndex+4, src) + length := 4 + int32(matchLen(src[s+4+repOff:], src[repIndex+4:])) + + seq.matchLen = uint32(length - zstdMinMatch) + + // We might be able to match backwards. + // Extend as long as we can. + start := s + repOff + // We end the search early, so we don't risk 0 literals + // and have to do special offset treatment. + startLimit := nextEmit + 1 + + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] { + repIndex-- + start-- + seq.matchLen++ + } + addLiterals(&seq, start) + + // rep 0 + seq.offset = 1 + if debugSequences { + println("repeat sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + s += length + repOff + nextEmit = s + if s >= sLimit { + if debug { + println("repeat ended", s, length) + + } + break encodeLoop + } + cv = load6432(src, s) + continue + } + } + // Find the offsets of our two matches. + coffsetL := s - (candidateL.offset - e.cur) + coffsetS := s - (candidateS.offset - e.cur) + + // Check if we have a long match. + if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val { + // Found a long match, likely at least 8 bytes. + // Reference encoder checks all 8 bytes, we only check 4, + // but the likelihood of both the first 4 bytes and the hash matching should be enough. + t = candidateL.offset - e.cur + if debug && s <= t { + panic("s <= t") + } + if debug && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debugMatches { + println("long match") + } + break + } + + // Check if we have a short match. 
+ if coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val { + // found a regular match + // See if we can find a long match at s+1 + const checkAt = 1 + cv := load6432(src, s+checkAt) + nextHashL = hash8(cv, dFastLongTableBits) + candidateL = e.longTable[nextHashL] + coffsetL = s - (candidateL.offset - e.cur) + checkAt + + // We can store it, since we have at least a 4 byte match. + e.longTable[nextHashL] = tableEntry{offset: s + checkAt + e.cur, val: uint32(cv)} + if coffsetL < e.maxMatchOff && uint32(cv) == candidateL.val { + // Found a long match, likely at least 8 bytes. + // Reference encoder checks all 8 bytes, we only check 4, + // but the likelihood of both the first 4 bytes and the hash matching should be enough. + t = candidateL.offset - e.cur + s += checkAt + if debugMatches { + println("long match (after short)") + } + break + } + + t = candidateS.offset - e.cur + if debug && s <= t { + panic("s <= t") + } + if debug && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debug && t < 0 { + panic("t<0") + } + if debugMatches { + println("short match") + } + break + } + + // No match found, move forward in input. + s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1)) + if s >= sLimit { + break encodeLoop + } + cv = load6432(src, s) + } + + // A 4-byte match has been found. Update recent offsets. + // We'll later see if more than 4 bytes. + offset2 = offset1 + offset1 = s - t + + if debug && s <= t { + panic("s <= t") + } + + if debug && canRepeat && int(offset1) > len(src) { + panic("invalid offset") + } + + // Extend the 4-byte match as long as possible. + //l := e.matchlen(s+4, t+4, src) + 4 + l := int32(matchLen(src[s+4:], src[t+4:])) + 4 + + // Extend backwards + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for t > tMin && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + + // Write our sequence + var seq seq + seq.litLen = uint32(s - nextEmit) + seq.matchLen = uint32(l - zstdMinMatch) + if seq.litLen > 0 { + blk.literals = append(blk.literals, src[nextEmit:s]...) + } + seq.offset = uint32(s-t) + 3 + s += l + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + nextEmit = s + if s >= sLimit { + break encodeLoop + } + + // Index match start+1 (long) and start+2 (short) + index0 := s - l + 1 + // Index match end-2 (long) and end-1 (short) + index1 := s - 2 + + cv0 := load6432(src, index0) + cv1 := load6432(src, index1) + te0 := tableEntry{offset: index0 + e.cur, val: uint32(cv0)} + te1 := tableEntry{offset: index1 + e.cur, val: uint32(cv1)} + e.longTable[hash8(cv0, dFastLongTableBits)] = te0 + e.longTable[hash8(cv1, dFastLongTableBits)] = te1 + cv0 >>= 8 + cv1 >>= 8 + te0.offset++ + te1.offset++ + te0.val = uint32(cv0) + te1.val = uint32(cv1) + e.table[hash5(cv0, dFastShortTableBits)] = te0 + e.table[hash5(cv1, dFastShortTableBits)] = te1 + + cv = load6432(src, s) + + if !canRepeat { + continue + } + + // Check offset 2 + for { + o2 := s - offset2 + if load3232(src, o2) != uint32(cv) { + // Do regular search + break + } + + // Store this, since we have it. + nextHashS := hash5(cv1>>8, dFastShortTableBits) + nextHashL := hash8(cv, dFastLongTableBits) + + // We have at least 4 byte match. + // No need to check backwards. 
We come straight from a match + //l := 4 + e.matchlen(s+4, o2+4, src) + l := 4 + int32(matchLen(src[s+4:], src[o2+4:])) + + entry := tableEntry{offset: s + e.cur, val: uint32(cv)} + e.longTable[nextHashL] = entry + e.table[nextHashS] = entry + seq.matchLen = uint32(l) - zstdMinMatch + seq.litLen = 0 + + // Since litlen is always 0, this is offset 1. + seq.offset = 1 + s += l + nextEmit = s + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + + // Swap offset 1 and 2. + offset1, offset2 = offset2, offset1 + if s >= sLimit { + // Finished + break encodeLoop + } + cv = load6432(src, s) + } + } + + if int(nextEmit) < len(src) { + blk.literals = append(blk.literals, src[nextEmit:]...) + blk.extraLits = len(src) - int(nextEmit) + } + if debug { + println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) + } + +} diff --git a/zstd/enc_fast.go b/zstd/enc_fast.go index 6f388de041..5fe3edca12 100644 --- a/zstd/enc_fast.go +++ b/zstd/enc_fast.go @@ -329,6 +329,255 @@ encodeLoop: } } +// EncodeNoHist will encode a block with no history and no following blocks. +// Most notable difference is that src will not be copied for history and +// we do not need to check for max match length. +func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { + const ( + inputMargin = 8 + minNonLiteralBlockSize = 1 + 1 + inputMargin + ) + if debug { + if len(src) > maxBlockSize { + panic("src too big") + } + } + // Protect against e.cur wraparound. + if e.cur > (1<<30)+e.maxMatchOff { + for i := range e.table[:] { + e.table[i] = tableEntry{} + } + e.cur = e.maxMatchOff + } + + s := int32(0) + blk.size = len(src) + if len(src) < minNonLiteralBlockSize { + blk.extraLits = len(src) + blk.literals = blk.literals[:len(src)] + copy(blk.literals, src) + return + } + + sLimit := int32(len(src)) - inputMargin + // stepSize is the number of bytes to skip on every main loop iteration. + // It should be >= 2. + const stepSize = 2 + + // TEMPLATE + const hashLog = tableBits + // seems global, but would be nice to tweak. + const kSearchStrength = 8 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := s + cv := load6432(src, s) + + // Relative offsets + offset1 := int32(blk.recentOffsets[0]) + offset2 := int32(blk.recentOffsets[1]) + + addLiterals := func(s *seq, until int32) { + if until == nextEmit { + return + } + blk.literals = append(blk.literals, src[nextEmit:until]...) + s.litLen = uint32(until - nextEmit) + } + if debug { + println("recent offsets:", blk.recentOffsets) + } + +encodeLoop: + for { + // t will contain the match offset when we find one. + // When existing the search loop, we have already checked 4 bytes. + var t int32 + + // We will not use repeat offsets across blocks. + // By not using them for the first 3 matches + canRepeat := len(blk.sequences) > 2 + + for { + if debug && canRepeat && offset1 == 0 { + panic("offset0 was 0") + } + + nextHash := hash6(cv, hashLog) + nextHash2 := hash6(cv>>8, hashLog) + candidate := e.table[nextHash] + candidate2 := e.table[nextHash2] + repIndex := s - offset1 + 2 + + e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)} + e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)} + + if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) { + // Consider history as well. 
+ var seq seq + // lenght := 4 + e.matchlen(s+6, repIndex+4, src) + lenght := 4 + int32(matchLen(src[s+6:], src[repIndex+4:])) + + seq.matchLen = uint32(lenght - zstdMinMatch) + + // We might be able to match backwards. + // Extend as long as we can. + start := s + 2 + // We end the search early, so we don't risk 0 literals + // and have to do special offset treatment. + startLimit := nextEmit + 1 + + sMin := s - e.maxMatchOff + if sMin < 0 { + sMin = 0 + } + for repIndex > sMin && start > startLimit && src[repIndex-1] == src[start-1] { + repIndex-- + start-- + seq.matchLen++ + } + addLiterals(&seq, start) + + // rep 0 + seq.offset = 1 + if debugSequences { + println("repeat sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + s += lenght + 2 + nextEmit = s + if s >= sLimit { + if debug { + println("repeat ended", s, lenght) + + } + break encodeLoop + } + cv = load6432(src, s) + continue + } + coffset0 := s - (candidate.offset - e.cur) + coffset1 := s - (candidate2.offset - e.cur) + 1 + if coffset0 < e.maxMatchOff && uint32(cv) == candidate.val { + // found a regular match + t = candidate.offset - e.cur + if debug && s <= t { + panic("s <= t") + } + if debug && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + break + } + + if coffset1 < e.maxMatchOff && uint32(cv>>8) == candidate2.val { + // found a regular match + t = candidate2.offset - e.cur + s++ + if debug && s <= t { + panic("s <= t") + } + if debug && s-t > e.maxMatchOff { + panic("s - t >e.maxMatchOff") + } + if debug && t < 0 { + panic("t<0") + } + break + } + s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1)) + if s >= sLimit { + break encodeLoop + } + cv = load6432(src, s) + } + // A 4-byte match has been found. We'll later see if more than 4 bytes. + offset2 = offset1 + offset1 = s - t + + if debug && s <= t { + panic("s <= t") + } + + if debug && canRepeat && int(offset1) > len(src) { + panic("invalid offset") + } + + // Extend the 4-byte match as long as possible. + //l := e.matchlenNoHist(s+4, t+4, src) + 4 + l := int32(matchLen(src[s+4:], src[t+4:])) + 4 + + // Extend backwards + tMin := s - e.maxMatchOff + if tMin < 0 { + tMin = 0 + } + for t > tMin && s > nextEmit && src[t-1] == src[s-1] { + s-- + t-- + l++ + } + + // Write our sequence. + var seq seq + seq.litLen = uint32(s - nextEmit) + seq.matchLen = uint32(l - zstdMinMatch) + if seq.litLen > 0 { + blk.literals = append(blk.literals, src[nextEmit:s]...) + } + // Don't use repeat offsets + seq.offset = uint32(s-t) + 3 + s += l + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + nextEmit = s + if s >= sLimit { + break encodeLoop + } + cv = load6432(src, s) + + // Check offset 2 + if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) { + // We have at least 4 byte match. + // No need to check backwards. We come straight from a match + //l := 4 + e.matchlenNoHist(s+4, o2+4, src) + l := 4 + int32(matchLen(src[s+4:], src[o2+4:])) + + // Store this, since we have it. + nextHash := hash6(cv, hashLog) + e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)} + seq.matchLen = uint32(l) - zstdMinMatch + seq.litLen = 0 + // Since litlen is always 0, this is offset 1. + seq.offset = 1 + s += l + nextEmit = s + if debugSequences { + println("sequence", seq, "next s:", s) + } + blk.sequences = append(blk.sequences, seq) + + // Swap offset 1 and 2. + offset1, offset2 = offset2, offset1 + if s >= sLimit { + break encodeLoop + } + // Prepare next loop. 
+ cv = load6432(src, s) + } + } + + if int(nextEmit) < len(src) { + blk.literals = append(blk.literals, src[nextEmit:]...) + blk.extraLits = len(src) - int(nextEmit) + } + if debug { + println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits) + } +} + func (e *fastEncoder) addBlock(src []byte) int32 { // check if we have space already if len(e.hist)+len(src) > cap(e.hist) { @@ -362,6 +611,11 @@ func (e *fastEncoder) UseBlock(enc *blockEnc) { e.blk = enc } +func (e *fastEncoder) matchlenNoHist(s, t int32, src []byte) int32 { + // Extend the match to be as long as possible. + return int32(matchLen(src[s:], src[t:])) +} + func (e *fastEncoder) matchlen(s, t int32, src []byte) int32 { if debug { if s < 0 { diff --git a/zstd/encoder.go b/zstd/encoder.go index f413042f4e..51c5cc8fca 100644 --- a/zstd/encoder.go +++ b/zstd/encoder.go @@ -29,6 +29,7 @@ type Encoder struct { type encoder interface { Encode(blk *blockEnc, src []byte) + EncodeNoHist(blk *blockEnc, src []byte) Block() *blockEnc CRC() *xxhash.Digest AppendCRC([]byte) []byte @@ -454,25 +455,22 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { panic(err) } - for len(src) > 0 { - todo := src - if len(todo) > e.o.blockSize { - todo = todo[:e.o.blockSize] - } - src = src[len(todo):] + if len(src) <= e.o.blockSize && len(src) <= maxBlockSize { + // Slightly faster with no history and everything in one block. if e.o.crc { - _, _ = enc.CRC().Write(todo) + _, _ = enc.CRC().Write(src) } blk.reset(nil) - blk.pushOffsets() - enc.Encode(blk, todo) - if len(src) == 0 { - blk.last = true - } - err := errIncompressible + blk.last = true + enc.EncodeNoHist(blk, src) + // If we got the exact same number of literals as input, // assume the literals cannot be compressed. - if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize { + err := errIncompressible + oldout := blk.output + if len(blk.literals) != len(src) || len(src) != e.o.blockSize { + // Output directly to dst + blk.output = dst err = blk.encode(e.o.noEntropy) } @@ -481,13 +479,49 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte { if debug { println("Storing incompressible block as raw") } - blk.encodeRaw(todo) - blk.popOffsets() + dst = blk.encodeRawTo(dst, src) case nil: + dst = blk.output default: panic(err) } - dst = append(dst, blk.output...) + blk.output = oldout + } else { + for len(src) > 0 { + todo := src + if len(todo) > e.o.blockSize { + todo = todo[:e.o.blockSize] + } + src = src[len(todo):] + if e.o.crc { + _, _ = enc.CRC().Write(todo) + } + blk.reset(nil) + blk.pushOffsets() + enc.Encode(blk, todo) + if len(src) == 0 { + blk.last = true + } + err := errIncompressible + // If we got the exact same number of literals as input, + // assume the literals cannot be compressed. + if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize { + err = blk.encode(e.o.noEntropy) + } + + switch err { + case errIncompressible: + if debug { + println("Storing incompressible block as raw") + } + dst = blk.encodeRawTo(dst, todo) + blk.popOffsets() + case nil: + dst = append(dst, blk.output...) 
+ default: + panic(err) + } + } } if e.o.crc { dst = enc.AppendCRC(dst) diff --git a/zstd/encoder_test.go b/zstd/encoder_test.go index 4a9ad598df..b2188ca567 100644 --- a/zstd/encoder_test.go +++ b/zstd/encoder_test.go @@ -534,7 +534,7 @@ func testEncoderRoundtrip(t *testing.T, file string, wantCRC []byte) { } else { t.Logf("CRC Verified: %#v", gotCRC) } - t.Log("Fast Encoder len", wantSize) + t.Log("Encoder len", wantSize) mbpersec := (float64(wantSize) / (1024 * 1024)) / (float64(time.Since(start)) / (float64(time.Second))) t.Logf("Encoded+Decoded %d bytes with %.2f MB/s", wantSize, mbpersec) }) @@ -800,21 +800,58 @@ func BenchmarkEncoder_EncodeAllSimple(b *testing.B) { b.Fatal(err) } - enc, err := NewWriter(nil, WithEncoderConcurrency(1)) + for level := EncoderLevel(speedNotSet + 1); level < speedLast; level++ { + b.Run(level.String(), func(b *testing.B) { + enc, err := NewWriter(nil, WithEncoderConcurrency(1), WithEncoderLevel(level)) + if err != nil { + b.Fatal(err) + } + defer enc.Close() + dst := enc.EncodeAll(in, nil) + wantSize := len(dst) + b.ResetTimer() + b.ReportAllocs() + b.SetBytes(int64(len(in))) + for i := 0; i < b.N; i++ { + dst := enc.EncodeAll(in, dst[:0]) + if len(dst) != wantSize { + b.Fatal(len(dst), "!=", wantSize) + } + } + }) + } +} + +func BenchmarkEncoder_EncodeAllSimple4K(b *testing.B) { + f, err := os.Open("testdata/z000028") if err != nil { b.Fatal(err) } - defer enc.Close() - dst := enc.EncodeAll(in, nil) - wantSize := len(dst) - b.ResetTimer() - b.ReportAllocs() - b.SetBytes(int64(len(in))) - for i := 0; i < b.N; i++ { - dst := enc.EncodeAll(in, dst[:0]) - if len(dst) != wantSize { - b.Fatal(len(dst), "!=", wantSize) - } + in, err := ioutil.ReadAll(f) + if err != nil { + b.Fatal(err) + } + in = in[:4096] + + for level := EncoderLevel(speedNotSet + 1); level < speedLast; level++ { + b.Run(level.String(), func(b *testing.B) { + enc, err := NewWriter(nil, WithEncoderConcurrency(1), WithEncoderLevel(level)) + if err != nil { + b.Fatal(err) + } + defer enc.Close() + dst := enc.EncodeAll(in, nil) + wantSize := len(dst) + b.ResetTimer() + b.ReportAllocs() + b.SetBytes(int64(len(in))) + for i := 0; i < b.N; i++ { + dst := enc.EncodeAll(in, dst[:0]) + if len(dst) != wantSize { + b.Fatal(len(dst), "!=", wantSize) + } + } + }) } } From 7731dd255b657d793312c58501fc625e8e854a47 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Fri, 27 Dec 2019 05:05:01 -0800 Subject: [PATCH 2/5] Blocks without history cannot have invalid repeat codes. --- zstd/enc_dfast.go | 16 +++------------- zstd/enc_fast.go | 13 ++----------- 2 files changed, 5 insertions(+), 24 deletions(-) diff --git a/zstd/enc_dfast.go b/zstd/enc_dfast.go index 148d6e5006..ee3b09b02a 100644 --- a/zstd/enc_dfast.go +++ b/zstd/enc_dfast.go @@ -476,13 +476,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) { encodeLoop: for { var t int32 - // We allow the encoder to optionally turn off repeat offsets across blocks - canRepeat := len(blk.sequences) > 2 - for { - if debug && canRepeat && offset1 == 0 { - panic("offset0 was 0") - } nextHashS := hash5(cv, dFastShortTableBits) nextHashL := hash8(cv, dFastLongTableBits) @@ -495,8 +489,8 @@ encodeLoop: e.longTable[nextHashL] = entry e.table[nextHashS] = entry - if canRepeat { - if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) { + if len(blk.sequences) > 2 { + if load3232(src, repIndex) == uint32(cv>>(repOff*8)) { // Consider history as well. 
var seq seq //length := 4 + e.matchlen(s+4+repOff, repIndex+4, src) @@ -620,10 +614,6 @@ encodeLoop: panic("s <= t") } - if debug && canRepeat && int(offset1) > len(src) { - panic("invalid offset") - } - // Extend the 4-byte match as long as possible. //l := e.matchlen(s+4, t+4, src) + 4 l := int32(matchLen(src[s+4:], src[t+4:])) + 4 @@ -679,7 +669,7 @@ encodeLoop: cv = load6432(src, s) - if !canRepeat { + if len(blk.sequences) <= 2 { continue } diff --git a/zstd/enc_fast.go b/zstd/enc_fast.go index 5fe3edca12..0bdddac5b4 100644 --- a/zstd/enc_fast.go +++ b/zstd/enc_fast.go @@ -396,13 +396,8 @@ encodeLoop: // We will not use repeat offsets across blocks. // By not using them for the first 3 matches - canRepeat := len(blk.sequences) > 2 for { - if debug && canRepeat && offset1 == 0 { - panic("offset0 was 0") - } - nextHash := hash6(cv, hashLog) nextHash2 := hash6(cv>>8, hashLog) candidate := e.table[nextHash] @@ -412,7 +407,7 @@ encodeLoop: e.table[nextHash] = tableEntry{offset: s + e.cur, val: uint32(cv)} e.table[nextHash2] = tableEntry{offset: s + e.cur + 1, val: uint32(cv >> 8)} - if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) { + if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) { // Consider history as well. var seq seq // lenght := 4 + e.matchlen(s+6, repIndex+4, src) @@ -499,10 +494,6 @@ encodeLoop: panic("s <= t") } - if debug && canRepeat && int(offset1) > len(src) { - panic("invalid offset") - } - // Extend the 4-byte match as long as possible. //l := e.matchlenNoHist(s+4, t+4, src) + 4 l := int32(matchLen(src[s+4:], src[t+4:])) + 4 @@ -539,7 +530,7 @@ encodeLoop: cv = load6432(src, s) // Check offset 2 - if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) { + if o2 := s - offset2; len(blk.sequences) > 2 && load3232(src, o2) == uint32(cv) { // We have at least 4 byte match. // No need to check backwards. We come straight from a match //l := 4 + e.matchlenNoHist(s+4, o2+4, src) From 27df7edbe04e5357a3e5fea8c9036b48cf9dc854 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sat, 28 Dec 2019 10:41:25 +0100 Subject: [PATCH 3/5] Remove bounds check. --- huff0/compress.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/huff0/compress.go b/huff0/compress.go index 51e00aaeb2..b376350df4 100644 --- a/huff0/compress.go +++ b/huff0/compress.go @@ -439,7 +439,7 @@ func (s *Scratch) buildCTable() error { return fmt.Errorf("internal error: maxNbBits (%d) > tableLogMax (%d)", maxNbBits, tableLogMax) } var nbPerRank [tableLogMax + 1]uint16 - var valPerRank [tableLogMax + 1]uint16 + var valPerRank [16]uint16 for _, v := range huffNode[:nonNullRank+1] { nbPerRank[v.nbBits]++ } @@ -455,16 +455,17 @@ func (s *Scratch) buildCTable() error { } // push nbBits per symbol, symbol order - // TODO: changed `s.symbolLen` -> `nonNullRank+1` (micro-opt) for _, v := range huffNode[:nonNullRank+1] { s.cTable[v.symbol].nBits = v.nbBits } // assign value within rank, symbol order - for n, val := range s.cTable[:s.symbolLen] { - v := valPerRank[val.nBits] - s.cTable[n].val = v - valPerRank[val.nBits] = v + 1 + t := s.cTable[:s.symbolLen] + for n, val := range t { + nbits := val.nBits & 15 + v := valPerRank[nbits] + t[n].val = v + valPerRank[nbits] = v + 1 } return nil From ae2ada91c1ef243155b12f2d7c676f08c1003741 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sat, 28 Dec 2019 11:03:09 +0100 Subject: [PATCH 4/5] Don't rank empty part. 
--- huff0/compress.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/huff0/compress.go b/huff0/compress.go index b376350df4..18b874318d 100644 --- a/huff0/compress.go +++ b/huff0/compress.go @@ -489,10 +489,12 @@ func (s *Scratch) huffSort() { r := highBit32(v+1) & 31 rank[r].base++ } - for n := 30; n > 0; n-- { + // maxBitLength is log2(BlockSizeMax) + 1 + const maxBitLength = 18 + 1 + for n := maxBitLength; n > 0; n-- { rank[n-1].base += rank[n].base } - for n := range rank[:] { + for n := range rank[:maxBitLength] { rank[n].current = rank[n].base } for n, c := range s.count[:s.symbolLen] { From f6791a53a57c1ec7bf5589bc4f4da2f5ebb14b2d Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Sat, 28 Dec 2019 11:47:02 +0100 Subject: [PATCH 5/5] Big speedup on huff0 encoding. --- huff0/bitwriter.go | 13 ++++++++++++- huff0/compress.go | 17 ++++++----------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/huff0/bitwriter.go b/huff0/bitwriter.go index ec0c3fc53b..bda4021efd 100644 --- a/huff0/bitwriter.go +++ b/huff0/bitwriter.go @@ -38,7 +38,7 @@ func (b *bitWriter) addBits16Clean(value uint16, bits uint8) { b.nBits += bits } -// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated. +// encSymbol will add up to 16 bits. value may not contain more set bits than indicated. // It will not check if there is space for them, so the caller must ensure that it has flushed recently. func (b *bitWriter) encSymbol(ct cTable, symbol byte) { enc := ct[symbol] @@ -46,6 +46,17 @@ func (b *bitWriter) encSymbol(ct cTable, symbol byte) { b.nBits += enc.nBits } +// encTwoSymbols will add up to 32 bits. value may not contain more set bits than indicated. +// It will not check if there is space for them, so the caller must ensure that it has flushed recently. +func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) { + encA := ct[av] + encB := ct[bv] + sh := b.nBits & 63 + combined := uint64(encA.val) | (uint64(encB.val) << (encA.nBits & 63)) + b.bitContainer |= combined << sh + b.nBits += encA.nBits + encB.nBits +} + // addBits16ZeroNC will add up to 16 bits. // It will not check if there is space for them, // so the caller must ensure that it has flushed recently. diff --git a/huff0/compress.go b/huff0/compress.go index 18b874318d..125429d74d 100644 --- a/huff0/compress.go +++ b/huff0/compress.go @@ -163,28 +163,23 @@ func (s *Scratch) compress1xDo(dst, src []byte) ([]byte, error) { for i := len(src) & 3; i > 0; i-- { bw.encSymbol(cTable, src[n+i-1]) } + n -= 4 if s.actualTableLog <= 8 { - n -= 4 for ; n >= 0; n -= 4 { tmp := src[n : n+4] // tmp should be len 4 bw.flush32() - bw.encSymbol(cTable, tmp[3]) - bw.encSymbol(cTable, tmp[2]) - bw.encSymbol(cTable, tmp[1]) - bw.encSymbol(cTable, tmp[0]) + bw.encTwoSymbols(cTable, tmp[3], tmp[2]) + bw.encTwoSymbols(cTable, tmp[1], tmp[0]) } } else { - n -= 4 for ; n >= 0; n -= 4 { tmp := src[n : n+4] // tmp should be len 4 bw.flush32() - bw.encSymbol(cTable, tmp[3]) - bw.encSymbol(cTable, tmp[2]) + bw.encTwoSymbols(cTable, tmp[3], tmp[2]) bw.flush32() - bw.encSymbol(cTable, tmp[1]) - bw.encSymbol(cTable, tmp[0]) + bw.encTwoSymbols(cTable, tmp[1], tmp[0]) } } err := bw.close() @@ -513,7 +508,7 @@ func (s *Scratch) huffSort() { } func (s *Scratch) setMaxHeight(lastNonNull int) uint8 { - maxNbBits := s.TableLog + maxNbBits := s.actualTableLog huffNode := s.nodes[1 : huffNodesLen+1] //huffNode = huffNode[: huffNodesLen]
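
For reference, a self-contained sketch of the bit-packing idea behind the new encTwoSymbols above: two Huffman codes are merged into one word and OR'ed into the bit container with a single shift, replacing two separate encSymbol calls. The code values and bit lengths below are made up for illustration; only the combine step mirrors the patch.

package main

import "fmt"

func main() {
	var bitContainer uint64
	var nBits uint8

	// Hypothetical canonical codes: symbol A = 0b101 (3 bits), symbol B = 0b01 (2 bits).
	aVal, aBits := uint16(0b101), uint8(3)
	bVal, bBits := uint16(0b01), uint8(2)

	// Same combine step as bitWriter.encTwoSymbols: place B's code behind A's,
	// then OR the pair into the container at the current bit position.
	combined := uint64(aVal) | (uint64(bVal) << (aBits & 63))
	bitContainer |= combined << (nBits & 63)
	nBits += aBits + bBits

	fmt.Printf("container=%05b nBits=%d\n", bitContainer, nBits) // container=01101 nBits=5
}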
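
And a minimal sketch of the caller pattern PATCH 1 targets: a reused Encoder compressing small, independent payloads through EncodeAll with a recycled destination buffer, as in BenchmarkEncoder_EncodeAllSimple4K above. Only APIs shown in the diff are used (NewWriter, WithEncoderConcurrency, EncodeAll, Close); the payloads are placeholders.

package main

import (
	"fmt"

	"github.com/klauspost/compress/zstd"
)

func main() {
	// Single goroutine; a nil writer is fine when only EncodeAll is used.
	enc, err := zstd.NewWriter(nil, zstd.WithEncoderConcurrency(1))
	if err != nil {
		panic(err)
	}
	defer enc.Close()

	payloads := [][]byte{
		[]byte("small payload one"),
		[]byte("small payload two"),
	}
	var dst []byte
	for _, p := range payloads {
		// Each payload fits in a single block, so the new no-history path
		// is taken and the block is written directly into dst.
		dst = enc.EncodeAll(p, dst[:0])
		fmt.Println(len(p), "->", len(dst), "bytes")
	}
}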