Skip to content

Commit

Permalink
Zstd optimize small blocks (#265)
Browse files Browse the repository at this point in the history
Single threaded:
```
benchmark                                                    old MB/s     new MB/s     speedup
BenchmarkDecoder_DecoderSmall/kppkn.gtb.zst-32               299.66       348.55       1.16x
BenchmarkDecoder_DecoderSmall/geo.protodata.zst-32           824.68       971.14       1.18x
BenchmarkDecoder_DecoderSmall/plrabn12.txt.zst-32            211.31       233.53       1.11x
BenchmarkDecoder_DecoderSmall/lcet10.txt.zst-32              248.88       274.21       1.10x
BenchmarkDecoder_DecoderSmall/asyoulik.txt.zst-32            248.88       287.44       1.15x
BenchmarkDecoder_DecoderSmall/alice29.txt.zst-32             240.27       274.48       1.14x
BenchmarkDecoder_DecoderSmall/html_x_4.zst-32                1481.90      1442.57      0.97x
BenchmarkDecoder_DecoderSmall/paper-100k.pdf.zst-32          3848.34      4570.00      1.19x
BenchmarkDecoder_DecoderSmall/fireworks.jpeg.zst-32          12196.13     12295.82     1.01x
BenchmarkDecoder_DecoderSmall/urls.10K.zst-32                374.76       422.87       1.13x
BenchmarkDecoder_DecoderSmall/html.zst-32                    641.72       767.35       1.20x
BenchmarkDecoder_DecoderSmall/comp-data.bin.zst-32           394.59       425.95       1.08x
BenchmarkDecoder_DecodeAll/kppkn.gtb.zst-32                  302.35       351.40       1.16x
BenchmarkDecoder_DecodeAll/geo.protodata.zst-32              823.43       970.26       1.18x
BenchmarkDecoder_DecodeAll/plrabn12.txt.zst-32               253.10       291.87       1.15x
BenchmarkDecoder_DecodeAll/lcet10.txt.zst-32                 305.60       342.69       1.12x
BenchmarkDecoder_DecodeAll/asyoulik.txt.zst-32               250.99       292.41       1.17x
BenchmarkDecoder_DecodeAll/alice29.txt.zst-32                242.98       273.30       1.12x
BenchmarkDecoder_DecodeAll/html_x_4.zst-32                   1492.46      1448.13      0.97x
BenchmarkDecoder_DecodeAll/paper-100k.pdf.zst-32             3953.49      4726.96      1.20x
BenchmarkDecoder_DecodeAll/fireworks.jpeg.zst-32             13011.31     13076.62     1.01x
BenchmarkDecoder_DecodeAll/urls.10K.zst-32                   410.19       493.80       1.20x
BenchmarkDecoder_DecodeAll/html.zst-32                       641.87       765.77       1.19x
BenchmarkDecoder_DecodeAll/comp-data.bin.zst-32              379.34       425.51       1.12x
BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32          5786.45      6353.87      1.10x
BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32      15827.85     17395.66     1.10x
BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32       4726.87      5203.13      1.10x
BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32         5660.74      6190.10      1.09x
BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32       4781.65      5233.33      1.09x
BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32        4465.43      4834.69      1.08x
BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32           28007.52     23775.05     0.85x
BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32     70726.30     75137.45     1.06x
BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32     58807.06     67592.15     1.15x
BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32           8025.76      9043.30      1.13x
BenchmarkDecoder_DecodeAllParallel/html.zst-32               12243.78     13733.45     1.12x
BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32      2925.99      4023.69      1.38x
```

huff0 alone:
```
benchmark                                            old MB/s     new MB/s     speedup
BenchmarkDecompress1XTable/digits-32                 230.69       351.20       1.52x
BenchmarkDecompress1XTable/gettysburg-32             202.22       302.48       1.50x
BenchmarkDecompress1XTable/twain-32                  212.84       320.18       1.50x
BenchmarkDecompress1XTable/low-ent.10k-32            250.10       384.25       1.54x
BenchmarkDecompress1XTable/superlow-ent-10k-32       245.01       381.06       1.56x
BenchmarkDecompress1XTable/crash2-32                 21.42        24.67        1.15x
BenchmarkDecompress1XTable/endzerobits-32            69.13        74.07        1.07x
BenchmarkDecompress1XTable/endnonzero-32             14.81        15.55        1.05x
BenchmarkDecompress1XTable/case1-32                  27.26        30.61        1.12x
BenchmarkDecompress1XTable/case2-32                  22.63        25.42        1.12x
BenchmarkDecompress1XTable/case3-32                  24.02        26.89        1.12x
BenchmarkDecompress1XTable/pngdata.001-32            242.79       407.23       1.68x
BenchmarkDecompress1XTable/normcount2-32             62.93        70.96        1.13x
BenchmarkDecompress1XNoTable/digits-32               229.28       350.49       1.53x
BenchmarkDecompress1XNoTable/gettysburg-32           235.56       383.63       1.63x
BenchmarkDecompress1XNoTable/twain-32                211.48       322.60       1.53x
BenchmarkDecompress1XNoTable/low-ent.10k-32          248.74       387.60       1.56x
BenchmarkDecompress1XNoTable/superlow-ent-10k-32     248.19       388.28       1.56x
BenchmarkDecompress1XNoTable/crash2-32               166.94       220.59       1.32x
BenchmarkDecompress1XNoTable/endzerobits-32          112.91       124.23       1.10x
BenchmarkDecompress1XNoTable/endnonzero-32           132.30       153.07       1.16x
BenchmarkDecompress1XNoTable/case1-32                214.54       314.86       1.47x
BenchmarkDecompress1XNoTable/case2-32                208.43       317.56       1.52x
BenchmarkDecompress1XNoTable/case3-32                208.96       303.29       1.45x
BenchmarkDecompress1XNoTable/pngdata.001-32          246.06       415.27       1.69x
BenchmarkDecompress1XNoTable/normcount2-32           222.14       322.08       1.45x
BenchmarkDecompress4XNoTable/digits-32               454.06       589.41       1.30x
BenchmarkDecompress4XNoTable/gettysburg-32           519.04       549.23       1.06x
BenchmarkDecompress4XNoTable/twain-32                377.67       455.67       1.21x
BenchmarkDecompress4XNoTable/low-ent.10k-32          606.85       692.22       1.14x
BenchmarkDecompress4XNoTable/superlow-ent-10k-32     587.74       677.59       1.15x
BenchmarkDecompress4XNoTable/case1-32                170.95       229.36       1.34x
BenchmarkDecompress4XNoTable/case2-32                165.58       229.85       1.39x
BenchmarkDecompress4XNoTable/case3-32                174.01       238.52       1.37x
BenchmarkDecompress4XNoTable/pngdata.001-32          585.15       655.99       1.12x
BenchmarkDecompress4XNoTable/normcount2-32           193.93       289.04       1.49x
BenchmarkDecompress4XTable/digits-32                 452.40       587.78       1.30x
BenchmarkDecompress4XTable/gettysburg-32             370.66       395.90       1.07x
BenchmarkDecompress4XTable/twain-32                  379.40       449.42       1.18x
BenchmarkDecompress4XTable/low-ent.10k-32            609.16       687.14       1.13x
BenchmarkDecompress4XTable/superlow-ent-10k-32       572.14       656.00       1.15x
BenchmarkDecompress4XTable/case1-32                  26.26        28.90        1.10x
BenchmarkDecompress4XTable/case2-32                  21.77        24.31        1.12x
BenchmarkDecompress4XTable/case3-32                  23.13        25.91        1.12x
BenchmarkDecompress4XTable/pngdata.001-32            564.01       635.71       1.13x
BenchmarkDecompress4XTable/normcount2-32             59.49        68.19        1.15x  
```
  • Loading branch information
klauspost authored Jun 5, 2020
1 parent 90824b4 commit 31108c0
Show file tree
Hide file tree
Showing 12 changed files with 1,084 additions and 148 deletions.
27 changes: 21 additions & 6 deletions fse/bitreader.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package fse

import (
"encoding/binary"
"errors"
"io"
)
Expand Down Expand Up @@ -34,8 +35,12 @@ func (b *bitReader) init(in []byte) error {
}
b.bitsRead = 64
b.value = 0
b.fill()
b.fill()
if len(in) >= 8 {
b.fillFastStart()
} else {
b.fill()
b.fill()
}
b.bitsRead += 8 - uint8(highBits(uint32(v)))
return nil
}
Expand Down Expand Up @@ -63,8 +68,9 @@ func (b *bitReader) fillFast() {
if b.bitsRead < 32 {
return
}
// Do single re-slice to avoid bounds checks.
v := b.in[b.off-4 : b.off]
// 2 bounds checks.
v := b.in[b.off-4:]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
Expand All @@ -77,7 +83,8 @@ func (b *bitReader) fill() {
return
}
if b.off > 4 {
v := b.in[b.off-4 : b.off]
v := b.in[b.off-4:]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
Expand All @@ -91,9 +98,17 @@ func (b *bitReader) fill() {
}
}

// fillFastStart loads the first 64 bits of the stream with a single read.
// The caller must guarantee that nothing is buffered yet and that at least
// 8 unread bytes sit in front of b.off.
func (b *bitReader) fillFastStart() {
	// A single open-ended re-slice is all the indexing this needs.
	start := b.off - 8
	b.value = binary.LittleEndian.Uint64(b.in[start:])
	b.off = start
	b.bitsRead = 0
}

// finished reports whether every bit of the stream has been consumed:
// the 64-bit buffer is used up (bitsRead >= 64) and no input bytes are
// left to refill it (off == 0).
func (b *bitReader) finished() bool {
	// Exactly one return statement; a second one after it could never run.
	return b.bitsRead >= 64 && b.off == 0
}

// close the bitstream and returns an error if out-of-buffer reads occurred.
Expand Down
13 changes: 2 additions & 11 deletions fse/bytereader.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,19 +25,10 @@ func (b *byteReader) advance(n uint) {
b.off += int(n)
}

// Int32 decodes the four bytes at the current offset as a little endian
// int32. The reader's offset is left untouched.
func (b byteReader) Int32() int32 {
	end := b.off + 4
	// Capping the capacity at end keeps the window to exactly four bytes.
	src := b.b[b.off:end:end]
	var out int32
	// Fold from the most significant byte down to the least significant one.
	for i := 3; i >= 0; i-- {
		out = out<<8 | int32(src[i])
	}
	return out
}

// Uint32 returns a little endian uint32 starting at current offset.
func (b byteReader) Uint32() uint32 {
b2 := b.b[b.off : b.off+4 : b.off+4]
b2 := b.b[b.off:]
b2 = b2[:4]
v3 := uint32(b2[3])
v2 := uint32(b2[2])
v1 := uint32(b2[1])
Expand Down
256 changes: 235 additions & 21 deletions huff0/bitreader.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package huff0

import (
"encoding/binary"
"errors"
"io"
)
Expand Down Expand Up @@ -34,29 +35,16 @@ func (b *bitReader) init(in []byte) error {
}
b.bitsRead = 64
b.value = 0
b.fill()
b.fill()
if len(in) >= 8 {
b.fillFastStart()
} else {
b.fill()
b.fill()
}
b.bitsRead += 8 - uint8(highBit32(uint32(v)))
return nil
}

// getBits returns the next n bits of the stream. It yields 0 without
// consuming anything when n is 0 or when the 64-bit buffer is already
// exhausted.
func (b *bitReader) getBits(n uint8) uint16 {
	if n != 0 && b.bitsRead < 64 {
		return b.getBitsFast(n)
	}
	return 0
}

// getBitsFast extracts the next n bits and marks them as consumed.
// The caller must request at least one bit; nothing here verifies that
// the buffer actually holds n unread bits.
func (b *bitReader) getBitsFast(n uint8) uint16 {
	// Masking both shift counts with 63 keeps them inside the register width.
	aligned := b.value << (b.bitsRead & 63)
	out := uint16(aligned >> ((64 - n) & 63))
	b.bitsRead += n
	return out
}

// peekBitsFast requires that at least one bit is requested every time.
// There are no checks if the buffer is filled.
func (b *bitReader) peekBitsFast(n uint8) uint16 {
Expand All @@ -71,21 +59,36 @@ func (b *bitReader) fillFast() {
if b.bitsRead < 32 {
return
}
// Do single re-slice to avoid bounds checks.

// 2 bounds checks.
v := b.in[b.off-4 : b.off]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
b.off -= 4
}

// advance marks n further bits as consumed. Only the bit counter moves;
// the buffered value is left as is.
func (b *bitReader) advance(n uint8) {
	b.bitsRead = b.bitsRead + n
}

// fillFastStart primes the reader by loading a full 64-bit word at once.
// It assumes the bit buffer is empty and that at least 8 bytes of input
// remain before b.off.
func (b *bitReader) fillFastStart() {
	// Slice once from the new offset; Uint64 reads the 8 bytes from there.
	next := b.off - 8
	b.value = binary.LittleEndian.Uint64(b.in[next:])
	b.off = next
	b.bitsRead = 0
}

// fill() will make sure at least 32 bits are available.
func (b *bitReader) fill() {
if b.bitsRead < 32 {
return
}
if b.off > 4 {
v := b.in[b.off-4 : b.off]
v := b.in[b.off-4:]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
b.value = (b.value << 32) | uint64(low)
b.bitsRead -= 32
Expand Down Expand Up @@ -113,3 +116,214 @@ func (b *bitReader) close() error {
}
return nil
}

// bitReaderBytes reads a bitstream in reverse.
// The last set bit indicates the start of the stream and is used
// for aligning the input.
type bitReaderBytes struct {
in []byte // input being decoded; cleared by close
off uint // next byte to read is at in[off - 1]
value uint64 // buffered bits; advance shifts consumed bits out at the top
bitsRead uint8 // count of consumed bits in value; fill lowers it as bytes are loaded
}

// init resets the reader so that it decodes in, starting from its last byte.
// It returns an error when in is empty, or when the final byte is zero and
// therefore carries no end-of-stream marker bit.
func (b *bitReaderBytes) init(in []byte) error {
	if len(in) == 0 {
		return errors.New("corrupt stream: too short")
	}
	b.in, b.off = in, uint(len(in))
	// The highest set bit of the final byte marks where the payload begins.
	last := in[len(in)-1]
	if last == 0 {
		return errors.New("corrupt stream, did not find end of stream")
	}
	b.value, b.bitsRead = 0, 64
	if len(in) < 8 {
		// Short input: two fill calls load everything that is there.
		b.fill()
		b.fill()
	} else {
		b.fillFastStart()
	}
	// Skip the marker bit and the unused bits above it.
	// NOTE(review): assumes highBit32 returns the index of the highest set bit — confirm.
	b.advance(8 - uint8(highBit32(uint32(last))))
	return nil
}

// peekByteFast returns the top 8 bits of the buffered value without
// consuming them. There are no checks if the buffer is filled.
func (b *bitReaderBytes) peekByteFast() uint8 {
	return uint8(b.value >> 56)
}

// advance consumes n bits: they are shifted out at the top of the buffered
// value and added to the consumed-bit counter. The shift count is masked
// with 63, so n == 64 leaves the value unchanged while still counting 64 bits.
func (b *bitReaderBytes) advance(n uint8) {
	shift := n & 63
	b.value <<= shift
	b.bitsRead += n
}

// fillFast tops the buffer up with 32 more bits once at least 32 bits have
// been consumed; otherwise it does nothing.
// There must be at least 4 bytes available.
func (b *bitReaderBytes) fillFast() {
	if b.bitsRead >= 32 {
		end := b.off
		// 2 bounds checks.
		chunk := b.in[end-4 : end]
		chunk = chunk[:4]
		word := uint32(chunk[0]) | uint32(chunk[1])<<8 | uint32(chunk[2])<<16 | uint32(chunk[3])<<24
		// Place the new bits directly below the ones still unread.
		b.value |= uint64(word) << (b.bitsRead - 32)
		b.bitsRead -= 32
		b.off = end - 4
	}
}

// fillFastStart loads the initial 64 bits in one read. It assumes the
// bitReaderBytes is empty and that at least 8 bytes are left to read.
func (b *bitReaderBytes) fillFastStart() {
	// Re-slice once from the new offset; Uint64 consumes the 8 bytes there.
	from := b.off - 8
	b.value = binary.LittleEndian.Uint64(b.in[from:])
	b.off = from
	b.bitsRead = 0
}

// fill refills the buffer once at least 32 bits have been consumed.
// With more than 4 bytes of input left it loads 4 bytes in one step;
// otherwise it drains whatever remains, one byte at a time.
func (b *bitReaderBytes) fill() {
	if b.bitsRead < 32 {
		return
	}
	if b.off <= 4 {
		// Tail of the input: pull in the leftover bytes individually.
		for b.off > 0 {
			idx := b.off - 1
			shift := b.bitsRead - 8
			b.value |= uint64(b.in[idx]) << shift
			b.bitsRead = shift
			b.off = idx
		}
		return
	}
	start := b.off - 4
	chunk := b.in[start:]
	chunk = chunk[:4]
	word := uint32(chunk[0]) | uint32(chunk[1])<<8 | uint32(chunk[2])<<16 | uint32(chunk[3])<<24
	// Place the new bits directly below the ones still unread.
	b.value |= uint64(word) << (b.bitsRead - 32)
	b.bitsRead -= 32
	b.off = start
}

// finished reports whether the whole bit stream has been consumed: the
// 64-bit buffer is used up and no input bytes are left to load.
func (b *bitReaderBytes) finished() bool {
	if b.bitsRead < 64 {
		return false
	}
	return b.off == 0
}

// close drops the reference to the input and returns io.ErrUnexpectedEOF
// when more than 64 bits were consumed, meaning reads ran past the buffer.
func (b *bitReaderBytes) close() error {
	// Release reference.
	b.in = nil
	if b.bitsRead <= 64 {
		return nil
	}
	return io.ErrUnexpectedEOF
}

// bitReaderShifted reads a bitstream in reverse.
// The last set bit indicates the start of the stream and is used
// for aligning the input.
type bitReaderShifted struct {
in []byte // input being decoded; cleared by close
off uint // next byte to read is at in[off - 1]
value uint64 // buffered bits; advance shifts consumed bits out at the top
bitsRead uint8 // count of consumed bits in value; fill lowers it as bytes are loaded
}

// init resets the reader so that it decodes in, starting from its last byte.
// It returns an error when in is empty, or when the final byte is zero and
// therefore carries no end-of-stream marker bit.
func (b *bitReaderShifted) init(in []byte) error {
	if len(in) == 0 {
		return errors.New("corrupt stream: too short")
	}
	b.in, b.off = in, uint(len(in))
	// The highest set bit of the final byte marks where the payload begins.
	tail := in[len(in)-1]
	if tail == 0 {
		return errors.New("corrupt stream, did not find end of stream")
	}
	b.value, b.bitsRead = 0, 64
	if len(in) < 8 {
		// Short input: two fill calls load everything that is there.
		b.fill()
		b.fill()
	} else {
		b.fillFastStart()
	}
	// Skip the marker bit and the unused bits above it.
	// NOTE(review): assumes highBit32 returns the index of the highest set bit — confirm.
	b.advance(8 - uint8(highBit32(uint32(tail))))
	return nil
}

// peekBitsFast returns the top n bits of the buffered value without
// consuming them. At least one bit must be requested, and there are no
// checks if the buffer is filled.
func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
	// Masked with 63 so the shift count stays below the register width.
	shift := (64 - n) & 63
	return uint16(b.value >> shift)
}

// advance consumes n bits: they are shifted out at the top of the buffered
// value and added to the consumed-bit counter. The shift count is masked
// with 63, so n == 64 leaves the value unchanged while still counting 64 bits.
func (b *bitReaderShifted) advance(n uint8) {
	shift := n & 63
	b.value <<= shift
	b.bitsRead += n
}

// fillFast tops the buffer up with 32 more bits once at least 32 bits have
// been consumed; otherwise it does nothing.
// There must be at least 4 bytes available.
func (b *bitReaderShifted) fillFast() {
	if b.bitsRead >= 32 {
		end := b.off
		// 2 bounds checks.
		chunk := b.in[end-4 : end]
		chunk = chunk[:4]
		word := uint32(chunk[0]) | uint32(chunk[1])<<8 | uint32(chunk[2])<<16 | uint32(chunk[3])<<24
		// Place the new bits directly below the ones still unread; the
		// shift count is masked with 63.
		b.value |= uint64(word) << ((b.bitsRead - 32) & 63)
		b.bitsRead -= 32
		b.off = end - 4
	}
}

// fillFastStart loads the initial 64 bits in one read. It assumes the
// bitReaderShifted is empty and that at least 8 bytes are left to read.
func (b *bitReaderShifted) fillFastStart() {
	// Re-slice once from the new offset; Uint64 consumes the 8 bytes there.
	from := b.off - 8
	b.value = binary.LittleEndian.Uint64(b.in[from:])
	b.off = from
	b.bitsRead = 0
}

// fill refills the buffer once at least 32 bits have been consumed.
// With more than 4 bytes of input left it loads 4 bytes in one step;
// otherwise it drains whatever remains, one byte at a time.
func (b *bitReaderShifted) fill() {
	if b.bitsRead < 32 {
		return
	}
	if b.off <= 4 {
		// Tail of the input: pull in the leftover bytes individually.
		for b.off > 0 {
			idx := b.off - 1
			remaining := b.bitsRead - 8
			b.value |= uint64(b.in[idx]) << (remaining & 63)
			b.bitsRead = remaining
			b.off = idx
		}
		return
	}
	start := b.off - 4
	chunk := b.in[start:]
	chunk = chunk[:4]
	word := uint32(chunk[0]) | uint32(chunk[1])<<8 | uint32(chunk[2])<<16 | uint32(chunk[3])<<24
	// Place the new bits directly below the ones still unread; the shift
	// count is masked with 63.
	b.value |= uint64(word) << ((b.bitsRead - 32) & 63)
	b.bitsRead -= 32
	b.off = start
}

// finished reports whether the whole bit stream has been consumed: the
// 64-bit buffer is used up and no input bytes are left to load.
func (b *bitReaderShifted) finished() bool {
	if b.bitsRead < 64 {
		return false
	}
	return b.off == 0
}

// close drops the reference to the input and returns io.ErrUnexpectedEOF
// when more than 64 bits were consumed, meaning reads ran past the buffer.
func (b *bitReaderShifted) close() error {
	// Release reference.
	b.in = nil
	if b.bitsRead <= 64 {
		return nil
	}
	return io.ErrUnexpectedEOF
}
Loading

0 comments on commit 31108c0

Please sign in to comment.