diff --git a/s2/index.go b/s2/index.go index 18a4f7acd..4229957b9 100644 --- a/s2/index.go +++ b/s2/index.go @@ -17,6 +17,8 @@ const ( S2IndexHeader = "s2idx\x00" S2IndexTrailer = "\x00xdi2s" maxIndexEntries = 1 << 16 + // If distance is less than this, we do not add the entry. + minIndexDist = 1 << 20 ) // Index represents an S2/Snappy index. @@ -72,6 +74,10 @@ func (i *Index) add(compressedOffset, uncompressedOffset int64) error { if latest.compressedOffset > compressedOffset { return fmt.Errorf("internal error: Earlier compressed received (%d > %d)", latest.uncompressedOffset, uncompressedOffset) } + if latest.uncompressedOffset+minIndexDist > uncompressedOffset { + // Only add entry if distance is large enough. + return nil + } } i.info = append(i.info, struct { compressedOffset int64 @@ -122,7 +128,7 @@ func (i *Index) Find(offset int64) (compressedOff, uncompressedOff int64, err er // reduce to stay below maxIndexEntries func (i *Index) reduce() { - if len(i.info) < maxIndexEntries && i.estBlockUncomp >= 1<<20 { + if len(i.info) < maxIndexEntries && i.estBlockUncomp >= minIndexDist { return } @@ -132,7 +138,7 @@ func (i *Index) reduce() { j := 0 // Each block should be at least 1MB, but don't reduce below 1000 entries. - for i.estBlockUncomp*(int64(removeN)+1) < 1<<20 && len(i.info)/(removeN+1) > 1000 { + for i.estBlockUncomp*(int64(removeN)+1) < minIndexDist && len(i.info)/(removeN+1) > 1000 { removeN++ } for idx := 0; idx < len(src); idx++ { diff --git a/s2/s2.go b/s2/s2.go index 72bcb4945..cbd1ed64d 100644 --- a/s2/s2.go +++ b/s2/s2.go @@ -109,7 +109,11 @@ const ( chunkTypeStreamIdentifier = 0xff ) -var crcTable = crc32.MakeTable(crc32.Castagnoli) +var ( + crcTable = crc32.MakeTable(crc32.Castagnoli) + magicChunkSnappyBytes = []byte(magicChunkSnappy) // Can be passed to functions where it escapes. + magicChunkBytes = []byte(magicChunk) // Can be passed to functions where it escapes. +) // crc implements the checksum specified in section 3 of // https://github.com/google/snappy/blob/master/framing_format.txt diff --git a/s2/writer.go b/s2/writer.go index 637c93147..0a46f2b98 100644 --- a/s2/writer.go +++ b/s2/writer.go @@ -239,6 +239,9 @@ func (w *Writer) ReadFrom(r io.Reader) (n int64, err error) { } } if n2 == 0 { + if cap(inbuf) >= w.obufLen { + w.buffers.Put(inbuf) + } break } n += int64(n2) @@ -314,9 +317,9 @@ func (w *Writer) AddSkippableBlock(id uint8, data []byte) (err error) { hWriter := make(chan result) w.output <- hWriter if w.snappy { - hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)} + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes} } else { - hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)} + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes} } } @@ -370,9 +373,9 @@ func (w *Writer) EncodeBuffer(buf []byte) (err error) { hWriter := make(chan result) w.output <- hWriter if w.snappy { - hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)} + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes} } else { - hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)} + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes} } } @@ -478,9 +481,9 @@ func (w *Writer) write(p []byte) (nRet int, errRet error) { hWriter := make(chan result) w.output <- hWriter if w.snappy { - hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)} + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes} } else { - hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)} + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes} } } @@ -560,6 +563,9 @@ func (w *Writer) writeFull(inbuf []byte) (errRet error) { if w.concurrency == 1 { _, err := w.writeSync(inbuf[obufHeaderLen:]) + if cap(inbuf) >= w.obufLen { + w.buffers.Put(inbuf) + } return err } @@ -569,9 +575,9 @@ func (w *Writer) writeFull(inbuf []byte) (errRet error) { hWriter := make(chan result) w.output <- hWriter if w.snappy { - hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunkSnappy)} + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkSnappyBytes} } else { - hWriter <- result{startOffset: w.uncompWritten, b: []byte(magicChunk)} + hWriter <- result{startOffset: w.uncompWritten, b: magicChunkBytes} } } @@ -637,9 +643,9 @@ func (w *Writer) writeSync(p []byte) (nRet int, errRet error) { var n int var err error if w.snappy { - n, err = w.writer.Write([]byte(magicChunkSnappy)) + n, err = w.writer.Write(magicChunkSnappyBytes) } else { - n, err = w.writer.Write([]byte(magicChunk)) + n, err = w.writer.Write(magicChunkBytes) } if err != nil { return 0, w.err(err) diff --git a/s2/writer_test.go b/s2/writer_test.go index 92f5066f8..470abbb80 100644 --- a/s2/writer_test.go +++ b/s2/writer_test.go @@ -603,6 +603,36 @@ func BenchmarkWriterRandom(b *testing.B) { } } +func BenchmarkReadFromRandom(b *testing.B) { + rng := rand.New(rand.NewSource(1)) + // Make max window so we never get matches. + data := make([]byte, 8<<20) + for i := range data { + data[i] = uint8(rng.Intn(256)) + } + + for name, opts := range testOptions(b) { + w := NewWriter(io.Discard, opts...) + in := bytes.NewReader(data) + w.ReadFrom(in) + b.Run(name, func(b *testing.B) { + b.ResetTimer() + b.ReportAllocs() + b.SetBytes(int64(len(data))) + for i := 0; i < b.N; i++ { + in.Reset(data) + _, err := w.ReadFrom(in) + if err != nil { + b.Fatal(err) + } + } + // Flush output + w.Flush() + }) + w.Close() + } +} + func BenchmarkIndexFind(b *testing.B) { fatalErr := func(t testing.TB, err error) { if err != nil {