diff --git a/huff0/decompress.go b/huff0/decompress.go index 42a237eac4..3c0b398c72 100644 --- a/huff0/decompress.go +++ b/huff0/decompress.go @@ -61,7 +61,7 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) { b, err := fse.Decompress(in[:iSize], s.fse) s.fse.Out = nil if err != nil { - return s, nil, err + return s, nil, fmt.Errorf("fse decompress returned: %w", err) } if len(b) > 255 { return s, nil, errors.New("corrupt input: output table too large") diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index c8b8012384..3959561e74 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -8,6 +8,7 @@ import ( "flag" "fmt" "math" + "math/rand" "runtime" . "github.com/mmcloughlin/avo/build" @@ -93,6 +94,8 @@ func main() { o.genEmitCopyNoRepeat() o.snappy = false o.genMatchLen() + o.cvtLZ4BlockAsm() + Generate() } @@ -1679,7 +1682,7 @@ func (o options) genEmitLiteral() { // stack must have at least 32 bytes. // retval will contain emitted bytes, but can be nil if this is not interesting. // dstBase and litBase are updated. -// Uses 2 GP registers. With AVX 4 registers. +// Uses 2 GP registers. // If updateDst is true dstBase will have the updated end pointer and an additional register will be used. func (o options) emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, updateDst bool) { n := GP32() @@ -2168,8 +2171,9 @@ func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVir // Inline call to emitRepeat. Will jump to end if !o.snappy { o.emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end, false) + } else { + JMP(LabelRef("two_byte_offset_" + name)) } - JMP(LabelRef("two_byte_offset_" + name)) Label("two_byte_offset_short_" + name) @@ -2771,3 +2775,205 @@ func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef) JMP(end) return matched } + +func (o options) cvtLZ4BlockAsm() { + TEXT("cvtLZ4BlockAsm", NOSPLIT, "func(dst, src []byte) (uncompressed int, dstUsed int)") + Doc("cvtLZ4BlockAsm converts an LZ4 block to S2", "") + Pragma("noescape") + o.outputMargin = 8 + + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + dst, dstLen, src, srcLen, retval := GP64(), GP64(), GP64(), GP64(), GP64() + + // retval = 0 + XORQ(retval, retval) + + Load(Param("dst").Base(), dst) + Load(Param("dst").Len(), dstLen) + Load(Param("src").Base(), src) + Load(Param("src").Len(), srcLen) + srcEnd, dstEnd := GP64(), GP64() + LEAQ(Mem{Base: src, Index: srcLen, Scale: 1, Disp: 0}, srcEnd) + LEAQ(Mem{Base: dst, Index: dstLen, Scale: 1, Disp: -o.outputMargin}, dstEnd) + lastOffset := GP64() + XORQ(lastOffset, lastOffset) + + checkSrc := func(reg reg.GPVirtual) { + if debug { + name := fmt.Sprintf("lz4_s2_ok_%d", rand.Int31()) + + CMPQ(reg, srcEnd) + JB(LabelRef(name)) + JMP(LabelRef("lz4_s2_corrupt")) + Label(name) + } else { + CMPQ(reg, srcEnd) + JAE(LabelRef("lz4_s2_corrupt")) + } + } + checkDst := func(reg reg.GPVirtual) { + CMPQ(reg, dstEnd) + JAE(LabelRef("lz4_s2_dstfull")) + } + + const lz4MinMatch = 4 + + Label("lz4_s2_loop") + checkSrc(src) + checkDst(dst) + token := GP64() + MOVBQZX(Mem{Base: src}, token) + ll, ml := GP64(), GP64() + MOVQ(token, ll) + MOVQ(token, ml) + SHRQ(U8(4), ll) + ANDQ(U8(0xf), ml) + + // If upper nibble is 15, literal length is extended + { + CMPQ(token, U8(0xf0)) + JB(LabelRef("lz4_s2_ll_end")) + Label("lz4_s2_ll_loop") + INCQ(src) // s++ + checkSrc(src) + val := GP64() + MOVBQZX(Mem{Base: src}, val) + ADDQ(val, ll) + CMPQ(val, U8(255)) + 
JEQ(LabelRef("lz4_s2_ll_loop")) + Label("lz4_s2_ll_end") + } + + // if s+ll >= len(src) + endLits := GP64() + LEAQ(Mem{Base: src, Index: ll, Scale: 1}, endLits) + ADDQ(U8(lz4MinMatch), ml) + checkSrc(endLits) + INCQ(src) // s++ + INCQ(endLits) + TESTQ(ll, ll) + JZ(LabelRef("lz4_s2_lits_done")) + { + dstEnd := GP64() + LEAQ(Mem{Base: dst, Index: ll, Scale: 1}, dstEnd) + checkDst(dstEnd) + o.outputMargin++ + ADDQ(ll, retval) + o.emitLiteral("lz4_s2", ll, nil, dst, src, LabelRef("lz4_s2_lits_emit_done"), true) + o.outputMargin-- + Label("lz4_s2_lits_emit_done") + MOVQ(endLits, src) + } + Label("lz4_s2_lits_done") + // if s == len(src) && ml == lz4MinMatch + CMPQ(src, srcEnd) + JNE(LabelRef("lz4_s2_match")) + CMPQ(ml, U8(lz4MinMatch)) + JEQ(LabelRef("lz4_s2_done")) + JMP(LabelRef("lz4_s2_corrupt")) + + Label("lz4_s2_match") + // if s >= len(src)-2 { + end := GP64() + LEAQ(Mem{Base: src, Disp: 2}, end) + checkSrc(end) + offset := GP64() + MOVWQZX(Mem{Base: src}, offset) + MOVQ(end, src) // s = s + 2 + + if debug { + // if offset == 0 { + TESTQ(offset, offset) + JNZ(LabelRef("lz4_s2_c1")) + JMP(LabelRef("lz4_s2_corrupt")) + + Label("lz4_s2_c1") + + // if int(offset) > uncompressed { + CMPQ(offset, retval) + JB(LabelRef("lz4_s2_c2")) + JMP(LabelRef("lz4_s2_corrupt")) + + Label("lz4_s2_c2") + + } else { + // if offset == 0 { + TESTQ(offset, offset) + JZ(LabelRef("lz4_s2_corrupt")) + + // if int(offset) > uncompressed { + CMPQ(offset, retval) + JA(LabelRef("lz4_s2_corrupt")) + } + + // if ml == lz4MinMatch+15 { + { + CMPQ(ml, U8(lz4MinMatch+15)) + JNE(LabelRef("lz4_s2_ml_done")) + + Label("lz4_s2_ml_loop") + val := GP64() + MOVBQZX(Mem{Base: src}, val) + INCQ(src) // s++ + ADDQ(val, ml) // ml += val + checkSrc(src) + CMPQ(val, U8(255)) + JEQ(LabelRef("lz4_s2_ml_loop")) + } + Label("lz4_s2_ml_done") + + // uncompressed += ml + ADDQ(ml, retval) + CMPQ(offset, lastOffset) + JNE(LabelRef("lz4_s2_docopy")) + // Offsets can only be 16 bits + maxLength := o.maxLen + o.maxLen = 65535 + { + // emitRepeat16(dst[d:], offset, ml) + o.emitRepeat("lz4_s2", ml, offset, nil, dst, LabelRef("lz4_s2_loop"), false) + } + Label("lz4_s2_docopy") + { + // emitCopy16(dst[d:], offset, ml) + MOVQ(offset, lastOffset) + o.emitCopy("lz4_s2", ml, offset, nil, dst, LabelRef("lz4_s2_loop")) + } + o.maxLen = maxLength + + Label("lz4_s2_done") + { + tmp := GP64() + Load(Param("dst").Base(), tmp) + SUBQ(tmp, dst) + Store(retval, ReturnIndex(0)) + Store(dst, ReturnIndex(1)) + RET() + } + Label("lz4_s2_corrupt") + { + tmp := GP64() + if debug { + tmp := GP64() + Load(Param("dst").Base(), tmp) + SUBQ(tmp, dst) + Store(dst, ReturnIndex(1)) + } + XORQ(tmp, tmp) + LEAQ(Mem{Base: tmp, Disp: errCorrupt}, retval) + Store(retval, ReturnIndex(0)) + RET() + } + + Label("lz4_s2_dstfull") + { + tmp := GP64() + XORQ(tmp, tmp) + LEAQ(Mem{Base: tmp, Disp: errDstTooSmall}, retval) + Store(retval, ReturnIndex(0)) + RET() + } +} diff --git a/s2/encode_amd64.go b/s2/encode_amd64.go index 6b93daa5ae..ebc332ad5f 100644 --- a/s2/encode_amd64.go +++ b/s2/encode_amd64.go @@ -3,6 +3,8 @@ package s2 +const hasAmd64Asm = true + // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It // assumes that the varint-encoded length of the decompressed bytes has already // been written. diff --git a/s2/encode_go.go b/s2/encode_go.go index db08fc355e..0a0c15e027 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -7,6 +7,8 @@ import ( "math/bits" ) +const hasAmd64Asm = false + // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. 
It // assumes that the varint-encoded length of the decompressed bytes has already // been written. diff --git a/s2/encodeblock_amd64.go b/s2/encodeblock_amd64.go index 7e00bac3ea..f6d8fa1018 100644 --- a/s2/encodeblock_amd64.go +++ b/s2/encodeblock_amd64.go @@ -192,3 +192,8 @@ func emitCopyNoRepeat(dst []byte, offset int, length int) int // //go:noescape func matchLen(a []byte, b []byte) int + +// cvtLZ4BlockAsm converts an LZ4 block to S2 +// +//go:noescape +func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index df9770f976..4d3154d153 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -628,7 +628,6 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm two_byte_offset_short_repeat_as_copy_encodeBlockAsm: MOVL BX, DI @@ -1169,7 +1168,6 @@ repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm - JMP two_byte_offset_match_nolit_encodeBlockAsm two_byte_offset_short_match_nolit_encodeBlockAsm: MOVL R9, SI @@ -1960,7 +1958,6 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: MOVL BX, DI @@ -2458,7 +2455,6 @@ repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - JMP two_byte_offset_match_nolit_encodeBlockAsm4MB two_byte_offset_short_match_nolit_encodeBlockAsm4MB: MOVL R9, SI @@ -3004,7 +3000,6 @@ repeat_two_offset_match_repeat_encodeBlockAsm12B: repeat_as_copy_encodeBlockAsm12B: // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm12B: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B CMPL SI, $0x00000800 @@ -3114,7 +3109,6 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: MOVL BX, DI @@ -3387,7 +3381,6 @@ match_nolit_end_encodeBlockAsm12B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm12B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B CMPL BX, $0x00000800 @@ -3497,7 +3490,6 @@ repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B - JMP two_byte_offset_match_nolit_encodeBlockAsm12B two_byte_offset_short_match_nolit_encodeBlockAsm12B: MOVL R9, SI @@ -4032,7 +4024,6 @@ repeat_two_offset_match_repeat_encodeBlockAsm10B: repeat_as_copy_encodeBlockAsm10B: // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm10B: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B CMPL SI, $0x00000800 @@ -4142,7 +4133,6 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: MOVL BX, DI @@ -4415,7 +4405,6 @@ match_nolit_end_encodeBlockAsm10B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm10B: CMPL R9, $0x40 JLE 
two_byte_offset_short_match_nolit_encodeBlockAsm10B CMPL BX, $0x00000800 @@ -4525,7 +4514,6 @@ repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B - JMP two_byte_offset_match_nolit_encodeBlockAsm10B two_byte_offset_short_match_nolit_encodeBlockAsm10B: MOVL R9, SI @@ -5056,7 +5044,6 @@ repeat_two_match_repeat_encodeBlockAsm8B: repeat_as_copy_encodeBlockAsm8B: // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm8B: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B CMPL SI, $0x00000800 @@ -5158,7 +5145,6 @@ repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: MOVL BX, DI @@ -5429,7 +5415,6 @@ match_nolit_end_encodeBlockAsm8B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm8B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B CMPL BX, $0x00000800 @@ -5531,7 +5516,6 @@ repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B - JMP two_byte_offset_match_nolit_encodeBlockAsm8B two_byte_offset_short_match_nolit_encodeBlockAsm8B: MOVL R9, SI @@ -6344,7 +6328,6 @@ repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm two_byte_offset_short_match_nolit_encodeBetterBlockAsm: MOVL R11, BX @@ -7380,7 +7363,6 @@ repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: MOVL R11, BX @@ -8147,7 +8129,6 @@ emit_literal_done_match_emit_encodeBetterBlockAsm12B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm12B: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B CMPL DI, $0x00000800 @@ -8257,7 +8238,6 @@ repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: MOVL R11, BX @@ -8990,7 +8970,6 @@ emit_literal_done_match_emit_encodeBetterBlockAsm10B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm10B: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B CMPL DI, $0x00000800 @@ -9100,7 +9079,6 @@ repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: MOVL R11, BX @@ -9833,7 +9811,6 @@ emit_literal_done_match_emit_encodeBetterBlockAsm8B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm8B: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B CMPL DI, $0x00000800 @@ -9935,7 +9912,6 @@ repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - JMP 
two_byte_offset_match_nolit_encodeBetterBlockAsm8B two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: MOVL R11, BX @@ -17679,7 +17655,6 @@ repeat_two_offset_standalone_emit_copy_short: ADDQ $0x02, BX ADDQ $0x02, AX JMP gen_emit_copy_end - JMP two_byte_offset_standalone two_byte_offset_short_standalone: MOVL DX, SI @@ -17846,3 +17821,418 @@ matchlen_match1_standalone: gen_match_len_end: MOVQ SI, ret+48(FP) RET + +// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: SSE2 +TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -8(AX)(CX*1), CX + XORQ DI, DI + +lz4_s2_loop: + CMPQ DX, BX + JAE lz4_s2_corrupt + CMPQ AX, CX + JAE lz4_s2_dstfull + MOVBQZX (DX), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x04, R9 + ANDQ $0x0f, R10 + CMPQ R8, $0xf0 + JB lz4_s2_ll_end + +lz4_s2_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4_s2_corrupt + MOVBQZX (DX), R8 + ADDQ R8, R9 + CMPQ R8, $0xff + JEQ lz4_s2_ll_loop + +lz4_s2_ll_end: + ADDQ $0x04, R10 + LEAQ (DX)(R9*1), R8 + CMPQ R8, BX + JAE lz4_s2_corrupt + INCQ DX + INCQ R8 + TESTQ R9, R9 + JZ lz4_s2_lits_done + LEAQ (AX)(R9*1), R11 + CMPQ R11, CX + JAE lz4_s2_dstfull + ADDQ R9, SI + LEAL -1(R9), R11 + CMPL R11, $0x3c + JLT one_byte_lz4_s2 + CMPL R11, $0x00000100 + JLT two_bytes_lz4_s2 + CMPL R11, $0x00010000 + JLT three_bytes_lz4_s2 + CMPL R11, $0x01000000 + JLT four_bytes_lz4_s2 + MOVB $0xfc, (AX) + MOVL R11, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4_s2 + +four_bytes_lz4_s2: + MOVL R11, R12 + SHRL $0x10, R12 + MOVB $0xf8, (AX) + MOVW R11, 1(AX) + MOVB R12, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4_s2 + +three_bytes_lz4_s2: + MOVB $0xf4, (AX) + MOVW R11, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4_s2 + +two_bytes_lz4_s2: + MOVB $0xf0, (AX) + MOVB R11, 1(AX) + ADDQ $0x02, AX + CMPL R11, $0x40 + JL memmove_lz4_s2 + JMP memmove_long_lz4_s2 + +one_byte_lz4_s2: + SHLB $0x02, R11 + MOVB R11, (AX) + ADDQ $0x01, AX + +memmove_lz4_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_lz4_s2_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_lz4_s2_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_lz4_s2_memmove_move_17through32 + JMP emit_lit_memmove_lz4_s2_memmove_move_33through64 + +emit_lit_memmove_lz4_s2_memmove_move_8: + MOVQ (DX), R12 + MOVQ R12, (AX) + JMP memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_8through16: + MOVQ (DX), R12 + MOVQ -8(DX)(R9*1), DX + MOVQ R12, (AX) + MOVQ DX, -8(AX)(R9*1) + JMP memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_lz4_s2: + MOVQ R11, AX + JMP lz4_s2_lits_emit_done + +memmove_long_lz4_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R12 + ANDL $0x0000001f, R12 + MOVQ $0x00000040, R14 + SUBQ R12, R14 + DECQ R13 + JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 + LEAQ -32(DX)(R14*1), R12 + LEAQ -32(AX)(R14*1), R15 + 
+emit_lit_memmove_long_lz4_s2large_big_loop_back: + MOVOU (R12), X4 + MOVOU 16(R12), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R12 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_lz4_s2large_big_loop_back + +emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32: + MOVOU -32(DX)(R14*1), X4 + MOVOU -16(DX)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R11, AX + +lz4_s2_lits_emit_done: + MOVQ R8, DX + +lz4_s2_lits_done: + CMPQ DX, BX + JNE lz4_s2_match + CMPQ R10, $0x04 + JEQ lz4_s2_done + JMP lz4_s2_corrupt + +lz4_s2_match: + LEAQ 2(DX), R8 + CMPQ R8, BX + JAE lz4_s2_corrupt + MOVWQZX (DX), R9 + MOVQ R8, DX + TESTQ R9, R9 + JZ lz4_s2_corrupt + CMPQ R9, SI + JA lz4_s2_corrupt + CMPQ R10, $0x13 + JNE lz4_s2_ml_done + +lz4_s2_ml_loop: + MOVBQZX (DX), R8 + INCQ DX + ADDQ R8, R10 + CMPQ DX, BX + JAE lz4_s2_corrupt + CMPQ R8, $0xff + JEQ lz4_s2_ml_loop + +lz4_s2_ml_done: + ADDQ R10, SI + CMPQ R9, DI + JNE lz4_s2_docopy + + // emitRepeat + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JLE repeat_two_lz4_s2 + CMPL R8, $0x0c + JGE cant_repeat_two_offset_lz4_s2 + CMPL R9, $0x00000800 + JLT repeat_two_offset_lz4_s2 + +cant_repeat_two_offset_lz4_s2: + CMPL R10, $0x00000104 + JLT repeat_three_lz4_s2 + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +repeat_two_offset_lz4_s2: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +lz4_s2_docopy: + MOVQ R9, DI + + // emitCopy + CMPL R10, $0x40 + JLE two_byte_offset_short_lz4_s2 + CMPL R9, $0x00000800 + JAE long_offset_short_lz4_s2 + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB R9, 1(AX) + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JLE repeat_two_lz4_s2_emit_copy_short_2b + CMPL R8, $0x0c + JGE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + CMPL R9, $0x00000800 + JLT repeat_two_offset_lz4_s2_emit_copy_short_2b + +cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_lz4_s2_emit_copy_short_2b + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +long_offset_short_lz4_s2: + MOVB $0xee, (AX) + MOVW R9, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JLE repeat_two_lz4_s2_emit_copy_short + CMPL R8, $0x0c + JGE 
cant_repeat_two_offset_lz4_s2_emit_copy_short + CMPL R9, $0x00000800 + JLT repeat_two_offset_lz4_s2_emit_copy_short + +cant_repeat_two_offset_lz4_s2_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_lz4_s2_emit_copy_short + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +two_byte_offset_short_lz4_s2: + MOVL R10, R8 + SHLL $0x02, R8 + CMPL R10, $0x0c + JGE emit_copy_three_lz4_s2 + CMPL R9, $0x00000800 + JGE emit_copy_three_lz4_s2 + LEAL -15(R8), R8 + MOVB R9, 1(AX) + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +emit_copy_three_lz4_s2: + LEAL -2(R8), R8 + MOVB R8, (AX) + MOVW R9, 1(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +lz4_s2_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4_s2_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4_s2_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET diff --git a/s2/lz4convert.go b/s2/lz4convert.go index d3649d1c2a..824b32241e 100644 --- a/s2/lz4convert.go +++ b/s2/lz4convert.go @@ -33,6 +33,28 @@ func (l *LZ4Converter) ConvertBlock(dst, src []byte) ([]byte, int, error) { s, d := 0, len(dst) dst = dst[:cap(dst)] + if !debug && hasAmd64Asm { + res, sz := cvtLZ4BlockAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return dst[:d], 0, ErrCorrupt + case errDstTooSmall: + return dst[:d], 0, ErrDstTooSmall + default: + return dst[:d], 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return dst[:0], 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + dLimit := len(dst) - 8 var lastOffset uint16 var uncompressed int @@ -84,10 +106,6 @@ func (l *LZ4Converter) ConvertBlock(dst, src []byte) ([]byte, int, error) { d += emitLiteralGo(dst[d:], src[s:s+ll]) s += ll uncompressed += ll - - if d > dLimit { - return dst[:d], 0, ErrDstTooSmall - } } // Check if we are done...
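
A minimal usage sketch of the conversion path added above, assuming the exported s2.LZ4Converter API from this package and that lz4Block holds a single raw LZ4 block (no frame header); the helper name and the buffer sizing are illustrative only, not part of this diff:

package example

import (
	"errors"
	"fmt"

	"github.com/klauspost/compress/s2"
)

// convertLZ4 converts one raw LZ4 block to an S2 block.
// maxUncompressed is the caller's upper bound on the decoded size and is
// only used to size the destination buffer (illustrative choice).
func convertLZ4(lz4Block []byte, maxUncompressed int) ([]byte, int, error) {
	var c s2.LZ4Converter
	// MaxEncodedLen of the uncompressed size is a safe capacity for the
	// converted block; an undersized dst surfaces as ErrDstTooSmall
	// (the -2 error code returned by cvtLZ4BlockAsm above).
	dst := make([]byte, 0, s2.MaxEncodedLen(maxUncompressed))
	out, uncompressed, err := c.ConvertBlock(dst, lz4Block)
	switch {
	case err == nil:
		// out is the converted S2 block appended to dst;
		// uncompressed is the number of decoded bytes it represents.
		return out, uncompressed, nil
	case errors.Is(err, s2.ErrCorrupt):
		// -1 from the assembly: the input is not a valid LZ4 block.
		return nil, 0, fmt.Errorf("invalid LZ4 block: %w", err)
	case errors.Is(err, s2.ErrDstTooSmall):
		return nil, 0, fmt.Errorf("destination too small: %w", err)
	default:
		return nil, 0, err
	}
}

On amd64 this call takes the cvtLZ4BlockAsm fast path introduced in this diff (gated by hasAmd64Asm and !debug); other platforms fall back to the pure Go converter in lz4convert.go.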