diff --git a/huff0/decompress.go b/huff0/decompress.go index 42a237eac4..3c0b398c72 100644 --- a/huff0/decompress.go +++ b/huff0/decompress.go @@ -61,7 +61,7 @@ func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) { b, err := fse.Decompress(in[:iSize], s.fse) s.fse.Out = nil if err != nil { - return s, nil, err + return s, nil, fmt.Errorf("fse decompress returned: %w", err) } if len(b) > 255 { return s, nil, errors.New("corrupt input: output table too large") diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index c8b8012384..3959561e74 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -8,6 +8,7 @@ import ( "flag" "fmt" "math" + "math/rand" "runtime" . "github.com/mmcloughlin/avo/build" @@ -93,6 +94,8 @@ func main() { o.genEmitCopyNoRepeat() o.snappy = false o.genMatchLen() + o.cvtLZ4BlockAsm() + Generate() } @@ -1679,7 +1682,7 @@ func (o options) genEmitLiteral() { // stack must have at least 32 bytes. // retval will contain emitted bytes, but can be nil if this is not interesting. // dstBase and litBase are updated. -// Uses 2 GP registers. With AVX 4 registers. +// Uses 2 GP registers. // If updateDst is true dstBase will have the updated end pointer and an additional register will be used. func (o options) emitLiteral(name string, litLen, retval, dstBase, litBase reg.GPVirtual, end LabelRef, updateDst bool) { n := GP32() @@ -2168,8 +2171,9 @@ func (o options) emitCopy(name string, length, offset, retval, dstBase reg.GPVir // Inline call to emitRepeat. Will jump to end if !o.snappy { o.emitRepeat(name+"_emit_copy_short", length, offset, retval, dstBase, end, false) + } else { + JMP(LabelRef("two_byte_offset_" + name)) } - JMP(LabelRef("two_byte_offset_" + name)) Label("two_byte_offset_short_" + name) @@ -2771,3 +2775,205 @@ func (o options) matchLenAlt(name string, a, b, len reg.GPVirtual, end LabelRef) JMP(end) return matched } + +func (o options) cvtLZ4BlockAsm() { + TEXT("cvtLZ4BlockAsm", NOSPLIT, "func(dst, src []byte) (uncompressed int, dstUsed int)") + Doc("cvtLZ4BlockAsm converts an LZ4 block to S2", "") + Pragma("noescape") + o.outputMargin = 8 + + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + dst, dstLen, src, srcLen, retval := GP64(), GP64(), GP64(), GP64(), GP64() + + // retval = 0 + XORQ(retval, retval) + + Load(Param("dst").Base(), dst) + Load(Param("dst").Len(), dstLen) + Load(Param("src").Base(), src) + Load(Param("src").Len(), srcLen) + srcEnd, dstEnd := GP64(), GP64() + LEAQ(Mem{Base: src, Index: srcLen, Scale: 1, Disp: 0}, srcEnd) + LEAQ(Mem{Base: dst, Index: dstLen, Scale: 1, Disp: -o.outputMargin}, dstEnd) + lastOffset := GP64() + XORQ(lastOffset, lastOffset) + + checkSrc := func(reg reg.GPVirtual) { + if debug { + name := fmt.Sprintf("lz4_s2_ok_%d", rand.Int31()) + + CMPQ(reg, srcEnd) + JB(LabelRef(name)) + JMP(LabelRef("lz4_s2_corrupt")) + Label(name) + } else { + CMPQ(reg, srcEnd) + JAE(LabelRef("lz4_s2_corrupt")) + } + } + checkDst := func(reg reg.GPVirtual) { + CMPQ(reg, dstEnd) + JAE(LabelRef("lz4_s2_dstfull")) + } + + const lz4MinMatch = 4 + + Label("lz4_s2_loop") + checkSrc(src) + checkDst(dst) + token := GP64() + MOVBQZX(Mem{Base: src}, token) + ll, ml := GP64(), GP64() + MOVQ(token, ll) + MOVQ(token, ml) + SHRQ(U8(4), ll) + ANDQ(U8(0xf), ml) + + // If upper nibble is 15, literal length is extended + { + CMPQ(token, U8(0xf0)) + JB(LabelRef("lz4_s2_ll_end")) + Label("lz4_s2_ll_loop") + INCQ(src) // s++ + checkSrc(src) + val := GP64() + MOVBQZX(Mem{Base: src}, val) + ADDQ(val, ll) + CMPQ(val, U8(255)) + 
JEQ(LabelRef("lz4_s2_ll_loop")) + Label("lz4_s2_ll_end") + } + + // if s+ll >= len(src) + endLits := GP64() + LEAQ(Mem{Base: src, Index: ll, Scale: 1}, endLits) + ADDQ(U8(lz4MinMatch), ml) + checkSrc(endLits) + INCQ(src) // s++ + INCQ(endLits) + TESTQ(ll, ll) + JZ(LabelRef("lz4_s2_lits_done")) + { + dstEnd := GP64() + LEAQ(Mem{Base: dst, Index: ll, Scale: 1}, dstEnd) + checkDst(dstEnd) + o.outputMargin++ + ADDQ(ll, retval) + o.emitLiteral("lz4_s2", ll, nil, dst, src, LabelRef("lz4_s2_lits_emit_done"), true) + o.outputMargin-- + Label("lz4_s2_lits_emit_done") + MOVQ(endLits, src) + } + Label("lz4_s2_lits_done") + // if s == len(src) && ml == lz4MinMatch + CMPQ(src, srcEnd) + JNE(LabelRef("lz4_s2_match")) + CMPQ(ml, U8(lz4MinMatch)) + JEQ(LabelRef("lz4_s2_done")) + JMP(LabelRef("lz4_s2_corrupt")) + + Label("lz4_s2_match") + // if s >= len(src)-2 { + end := GP64() + LEAQ(Mem{Base: src, Disp: 2}, end) + checkSrc(end) + offset := GP64() + MOVWQZX(Mem{Base: src}, offset) + MOVQ(end, src) // s = s + 2 + + if debug { + // if offset == 0 { + TESTQ(offset, offset) + JNZ(LabelRef("lz4_s2_c1")) + JMP(LabelRef("lz4_s2_corrupt")) + + Label("lz4_s2_c1") + + // if int(offset) > uncompressed { + CMPQ(offset, retval) + JB(LabelRef("lz4_s2_c2")) + JMP(LabelRef("lz4_s2_corrupt")) + + Label("lz4_s2_c2") + + } else { + // if offset == 0 { + TESTQ(offset, offset) + JZ(LabelRef("lz4_s2_corrupt")) + + // if int(offset) > uncompressed { + CMPQ(offset, retval) + JA(LabelRef("lz4_s2_corrupt")) + } + + // if ml == lz4MinMatch+15 { + { + CMPQ(ml, U8(lz4MinMatch+15)) + JNE(LabelRef("lz4_s2_ml_done")) + + Label("lz4_s2_ml_loop") + val := GP64() + MOVBQZX(Mem{Base: src}, val) + INCQ(src) // s++ + ADDQ(val, ml) // ml += val + checkSrc(src) + CMPQ(val, U8(255)) + JEQ(LabelRef("lz4_s2_ml_loop")) + } + Label("lz4_s2_ml_done") + + // uncompressed += ml + ADDQ(ml, retval) + CMPQ(offset, lastOffset) + JNE(LabelRef("lz4_s2_docopy")) + // Offsets can only be 16 bits + maxLength := o.maxLen + o.maxLen = 65535 + { + // emitRepeat16(dst[d:], offset, ml) + o.emitRepeat("lz4_s2", ml, offset, nil, dst, LabelRef("lz4_s2_loop"), false) + } + Label("lz4_s2_docopy") + { + // emitCopy16(dst[d:], offset, ml) + MOVQ(offset, lastOffset) + o.emitCopy("lz4_s2", ml, offset, nil, dst, LabelRef("lz4_s2_loop")) + } + o.maxLen = maxLength + + Label("lz4_s2_done") + { + tmp := GP64() + Load(Param("dst").Base(), tmp) + SUBQ(tmp, dst) + Store(retval, ReturnIndex(0)) + Store(dst, ReturnIndex(1)) + RET() + } + Label("lz4_s2_corrupt") + { + tmp := GP64() + if debug { + tmp := GP64() + Load(Param("dst").Base(), tmp) + SUBQ(tmp, dst) + Store(dst, ReturnIndex(1)) + } + XORQ(tmp, tmp) + LEAQ(Mem{Base: tmp, Disp: errCorrupt}, retval) + Store(retval, ReturnIndex(0)) + RET() + } + + Label("lz4_s2_dstfull") + { + tmp := GP64() + XORQ(tmp, tmp) + LEAQ(Mem{Base: tmp, Disp: errDstTooSmall}, retval) + Store(retval, ReturnIndex(0)) + RET() + } +} diff --git a/s2/encode_amd64.go b/s2/encode_amd64.go index 6b93daa5ae..ebc332ad5f 100644 --- a/s2/encode_amd64.go +++ b/s2/encode_amd64.go @@ -3,6 +3,8 @@ package s2 +const hasAmd64Asm = true + // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It // assumes that the varint-encoded length of the decompressed bytes has already // been written. diff --git a/s2/encode_go.go b/s2/encode_go.go index db08fc355e..0a0c15e027 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -7,6 +7,8 @@ import ( "math/bits" ) +const hasAmd64Asm = false + // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. 
It // assumes that the varint-encoded length of the decompressed bytes has already // been written. diff --git a/s2/encodeblock_amd64.go b/s2/encodeblock_amd64.go index 7e00bac3ea..f6d8fa1018 100644 --- a/s2/encodeblock_amd64.go +++ b/s2/encodeblock_amd64.go @@ -192,3 +192,8 @@ func emitCopyNoRepeat(dst []byte, offset int, length int) int // //go:noescape func matchLen(a []byte, b []byte) int + +// cvtLZ4BlockAsm converts an LZ4 block to S2 +// +//go:noescape +func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index df9770f976..4d3154d153 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -628,7 +628,6 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm two_byte_offset_short_repeat_as_copy_encodeBlockAsm: MOVL BX, DI @@ -1169,7 +1168,6 @@ repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm - JMP two_byte_offset_match_nolit_encodeBlockAsm two_byte_offset_short_match_nolit_encodeBlockAsm: MOVL R9, SI @@ -1960,7 +1958,6 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm4MB two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: MOVL BX, DI @@ -2458,7 +2455,6 @@ repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB - JMP two_byte_offset_match_nolit_encodeBlockAsm4MB two_byte_offset_short_match_nolit_encodeBlockAsm4MB: MOVL R9, SI @@ -3004,7 +3000,6 @@ repeat_two_offset_match_repeat_encodeBlockAsm12B: repeat_as_copy_encodeBlockAsm12B: // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm12B: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B CMPL SI, $0x00000800 @@ -3114,7 +3109,6 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: MOVL BX, DI @@ -3387,7 +3381,6 @@ match_nolit_end_encodeBlockAsm12B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm12B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B CMPL BX, $0x00000800 @@ -3497,7 +3490,6 @@ repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B - JMP two_byte_offset_match_nolit_encodeBlockAsm12B two_byte_offset_short_match_nolit_encodeBlockAsm12B: MOVL R9, SI @@ -4032,7 +4024,6 @@ repeat_two_offset_match_repeat_encodeBlockAsm10B: repeat_as_copy_encodeBlockAsm10B: // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm10B: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B CMPL SI, $0x00000800 @@ -4142,7 +4133,6 @@ repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: MOVL BX, DI @@ -4415,7 +4405,6 @@ match_nolit_end_encodeBlockAsm10B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm10B: CMPL R9, $0x40 JLE 
two_byte_offset_short_match_nolit_encodeBlockAsm10B CMPL BX, $0x00000800 @@ -4525,7 +4514,6 @@ repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B - JMP two_byte_offset_match_nolit_encodeBlockAsm10B two_byte_offset_short_match_nolit_encodeBlockAsm10B: MOVL R9, SI @@ -5056,7 +5044,6 @@ repeat_two_match_repeat_encodeBlockAsm8B: repeat_as_copy_encodeBlockAsm8B: // emitCopy -two_byte_offset_repeat_as_copy_encodeBlockAsm8B: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B CMPL SI, $0x00000800 @@ -5158,7 +5145,6 @@ repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: MOVL BX, DI @@ -5429,7 +5415,6 @@ match_nolit_end_encodeBlockAsm8B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBlockAsm8B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B CMPL BX, $0x00000800 @@ -5531,7 +5516,6 @@ repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B - JMP two_byte_offset_match_nolit_encodeBlockAsm8B two_byte_offset_short_match_nolit_encodeBlockAsm8B: MOVL R9, SI @@ -6344,7 +6328,6 @@ repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm two_byte_offset_short_match_nolit_encodeBetterBlockAsm: MOVL R11, BX @@ -7380,7 +7363,6 @@ repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm4MB two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: MOVL R11, BX @@ -8147,7 +8129,6 @@ emit_literal_done_match_emit_encodeBetterBlockAsm12B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm12B: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B CMPL DI, $0x00000800 @@ -8257,7 +8238,6 @@ repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm12B two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: MOVL R11, BX @@ -8990,7 +8970,6 @@ emit_literal_done_match_emit_encodeBetterBlockAsm10B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm10B: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B CMPL DI, $0x00000800 @@ -9100,7 +9079,6 @@ repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short: MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B - JMP two_byte_offset_match_nolit_encodeBetterBlockAsm10B two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B: MOVL R11, BX @@ -9833,7 +9811,6 @@ emit_literal_done_match_emit_encodeBetterBlockAsm8B: MOVL CX, 12(SP) // emitCopy -two_byte_offset_match_nolit_encodeBetterBlockAsm8B: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B CMPL DI, $0x00000800 @@ -9935,7 +9912,6 @@ repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B - JMP 
two_byte_offset_match_nolit_encodeBetterBlockAsm8B two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: MOVL R11, BX @@ -17679,7 +17655,6 @@ repeat_two_offset_standalone_emit_copy_short: ADDQ $0x02, BX ADDQ $0x02, AX JMP gen_emit_copy_end - JMP two_byte_offset_standalone two_byte_offset_short_standalone: MOVL DX, SI @@ -17846,3 +17821,418 @@ matchlen_match1_standalone: gen_match_len_end: MOVQ SI, ret+48(FP) RET + +// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int) +// Requires: SSE2 +TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64 + XORQ SI, SI + MOVQ dst_base+0(FP), AX + MOVQ dst_len+8(FP), CX + MOVQ src_base+24(FP), DX + MOVQ src_len+32(FP), BX + LEAQ (DX)(BX*1), BX + LEAQ -8(AX)(CX*1), CX + XORQ DI, DI + +lz4_s2_loop: + CMPQ DX, BX + JAE lz4_s2_corrupt + CMPQ AX, CX + JAE lz4_s2_dstfull + MOVBQZX (DX), R8 + MOVQ R8, R9 + MOVQ R8, R10 + SHRQ $0x04, R9 + ANDQ $0x0f, R10 + CMPQ R8, $0xf0 + JB lz4_s2_ll_end + +lz4_s2_ll_loop: + INCQ DX + CMPQ DX, BX + JAE lz4_s2_corrupt + MOVBQZX (DX), R8 + ADDQ R8, R9 + CMPQ R8, $0xff + JEQ lz4_s2_ll_loop + +lz4_s2_ll_end: + ADDQ $0x04, R10 + LEAQ (DX)(R9*1), R8 + CMPQ R8, BX + JAE lz4_s2_corrupt + INCQ DX + INCQ R8 + TESTQ R9, R9 + JZ lz4_s2_lits_done + LEAQ (AX)(R9*1), R11 + CMPQ R11, CX + JAE lz4_s2_dstfull + ADDQ R9, SI + LEAL -1(R9), R11 + CMPL R11, $0x3c + JLT one_byte_lz4_s2 + CMPL R11, $0x00000100 + JLT two_bytes_lz4_s2 + CMPL R11, $0x00010000 + JLT three_bytes_lz4_s2 + CMPL R11, $0x01000000 + JLT four_bytes_lz4_s2 + MOVB $0xfc, (AX) + MOVL R11, 1(AX) + ADDQ $0x05, AX + JMP memmove_long_lz4_s2 + +four_bytes_lz4_s2: + MOVL R11, R12 + SHRL $0x10, R12 + MOVB $0xf8, (AX) + MOVW R11, 1(AX) + MOVB R12, 3(AX) + ADDQ $0x04, AX + JMP memmove_long_lz4_s2 + +three_bytes_lz4_s2: + MOVB $0xf4, (AX) + MOVW R11, 1(AX) + ADDQ $0x03, AX + JMP memmove_long_lz4_s2 + +two_bytes_lz4_s2: + MOVB $0xf0, (AX) + MOVB R11, 1(AX) + ADDQ $0x02, AX + CMPL R11, $0x40 + JL memmove_lz4_s2 + JMP memmove_long_lz4_s2 + +one_byte_lz4_s2: + SHLB $0x02, R11 + MOVB R11, (AX) + ADDQ $0x01, AX + +memmove_lz4_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveShort + CMPQ R9, $0x08 + JLE emit_lit_memmove_lz4_s2_memmove_move_8 + CMPQ R9, $0x10 + JBE emit_lit_memmove_lz4_s2_memmove_move_8through16 + CMPQ R9, $0x20 + JBE emit_lit_memmove_lz4_s2_memmove_move_17through32 + JMP emit_lit_memmove_lz4_s2_memmove_move_33through64 + +emit_lit_memmove_lz4_s2_memmove_move_8: + MOVQ (DX), R12 + MOVQ R12, (AX) + JMP memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_8through16: + MOVQ (DX), R12 + MOVQ -8(DX)(R9*1), DX + MOVQ R12, (AX) + MOVQ DX, -8(AX)(R9*1) + JMP memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_17through32: + MOVOU (DX), X0 + MOVOU -16(DX)(R9*1), X1 + MOVOU X0, (AX) + MOVOU X1, -16(AX)(R9*1) + JMP memmove_end_copy_lz4_s2 + +emit_lit_memmove_lz4_s2_memmove_move_33through64: + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + +memmove_end_copy_lz4_s2: + MOVQ R11, AX + JMP lz4_s2_lits_emit_done + +memmove_long_lz4_s2: + LEAQ (AX)(R9*1), R11 + + // genMemMoveLong + MOVOU (DX), X0 + MOVOU 16(DX), X1 + MOVOU -32(DX)(R9*1), X2 + MOVOU -16(DX)(R9*1), X3 + MOVQ R9, R13 + SHRQ $0x05, R13 + MOVQ AX, R12 + ANDL $0x0000001f, R12 + MOVQ $0x00000040, R14 + SUBQ R12, R14 + DECQ R13 + JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 + LEAQ -32(DX)(R14*1), R12 + LEAQ -32(AX)(R14*1), R15 + 
+emit_lit_memmove_long_lz4_s2large_big_loop_back: + MOVOU (R12), X4 + MOVOU 16(R12), X5 + MOVOA X4, (R15) + MOVOA X5, 16(R15) + ADDQ $0x20, R15 + ADDQ $0x20, R12 + ADDQ $0x20, R14 + DECQ R13 + JNA emit_lit_memmove_long_lz4_s2large_big_loop_back + +emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32: + MOVOU -32(DX)(R14*1), X4 + MOVOU -16(DX)(R14*1), X5 + MOVOA X4, -32(AX)(R14*1) + MOVOA X5, -16(AX)(R14*1) + ADDQ $0x20, R14 + CMPQ R9, R14 + JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, -32(AX)(R9*1) + MOVOU X3, -16(AX)(R9*1) + MOVQ R11, AX + +lz4_s2_lits_emit_done: + MOVQ R8, DX + +lz4_s2_lits_done: + CMPQ DX, BX + JNE lz4_s2_match + CMPQ R10, $0x04 + JEQ lz4_s2_done + JMP lz4_s2_corrupt + +lz4_s2_match: + LEAQ 2(DX), R8 + CMPQ R8, BX + JAE lz4_s2_corrupt + MOVWQZX (DX), R9 + MOVQ R8, DX + TESTQ R9, R9 + JZ lz4_s2_corrupt + CMPQ R9, SI + JA lz4_s2_corrupt + CMPQ R10, $0x13 + JNE lz4_s2_ml_done + +lz4_s2_ml_loop: + MOVBQZX (DX), R8 + INCQ DX + ADDQ R8, R10 + CMPQ DX, BX + JAE lz4_s2_corrupt + CMPQ R8, $0xff + JEQ lz4_s2_ml_loop + +lz4_s2_ml_done: + ADDQ R10, SI + CMPQ R9, DI + JNE lz4_s2_docopy + + // emitRepeat + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JLE repeat_two_lz4_s2 + CMPL R8, $0x0c + JGE cant_repeat_two_offset_lz4_s2 + CMPL R9, $0x00000800 + JLT repeat_two_offset_lz4_s2 + +cant_repeat_two_offset_lz4_s2: + CMPL R10, $0x00000104 + JLT repeat_three_lz4_s2 + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +repeat_two_offset_lz4_s2: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +lz4_s2_docopy: + MOVQ R9, DI + + // emitCopy + CMPL R10, $0x40 + JLE two_byte_offset_short_lz4_s2 + CMPL R9, $0x00000800 + JAE long_offset_short_lz4_s2 + MOVL $0x00000001, R8 + LEAL 16(R8), R8 + MOVB R9, 1(AX) + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + SUBL $0x08, R10 + + // emitRepeat + LEAL -4(R10), R10 + JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JLE repeat_two_lz4_s2_emit_copy_short_2b + CMPL R8, $0x0c + JGE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b + CMPL R9, $0x00000800 + JLT repeat_two_offset_lz4_s2_emit_copy_short_2b + +cant_repeat_two_offset_lz4_s2_emit_copy_short_2b: + CMPL R10, $0x00000104 + JLT repeat_three_lz4_s2_emit_copy_short_2b + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2_emit_copy_short_2b: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2_emit_copy_short_2b: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short_2b: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +long_offset_short_lz4_s2: + MOVB $0xee, (AX) + MOVW R9, 1(AX) + LEAL -60(R10), R10 + ADDQ $0x03, AX + + // emitRepeat + MOVL R10, R8 + LEAL -4(R10), R10 + CMPL R8, $0x08 + JLE repeat_two_lz4_s2_emit_copy_short + CMPL R8, $0x0c + JGE 
cant_repeat_two_offset_lz4_s2_emit_copy_short + CMPL R9, $0x00000800 + JLT repeat_two_offset_lz4_s2_emit_copy_short + +cant_repeat_two_offset_lz4_s2_emit_copy_short: + CMPL R10, $0x00000104 + JLT repeat_three_lz4_s2_emit_copy_short + LEAL -256(R10), R10 + MOVW $0x0019, (AX) + MOVW R10, 2(AX) + ADDQ $0x04, AX + JMP lz4_s2_loop + +repeat_three_lz4_s2_emit_copy_short: + LEAL -4(R10), R10 + MOVW $0x0015, (AX) + MOVB R10, 2(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +repeat_two_lz4_s2_emit_copy_short: + SHLL $0x02, R10 + ORL $0x01, R10 + MOVW R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +repeat_two_offset_lz4_s2_emit_copy_short: + XORQ R8, R8 + LEAL 1(R8)(R10*4), R10 + MOVB R9, 1(AX) + SARL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R10 + MOVB R10, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +two_byte_offset_short_lz4_s2: + MOVL R10, R8 + SHLL $0x02, R8 + CMPL R10, $0x0c + JGE emit_copy_three_lz4_s2 + CMPL R9, $0x00000800 + JGE emit_copy_three_lz4_s2 + LEAL -15(R8), R8 + MOVB R9, 1(AX) + SHRL $0x08, R9 + SHLL $0x05, R9 + ORL R9, R8 + MOVB R8, (AX) + ADDQ $0x02, AX + JMP lz4_s2_loop + +emit_copy_three_lz4_s2: + LEAL -2(R8), R8 + MOVB R8, (AX) + MOVW R9, 1(AX) + ADDQ $0x03, AX + JMP lz4_s2_loop + +lz4_s2_done: + MOVQ dst_base+0(FP), CX + SUBQ CX, AX + MOVQ SI, uncompressed+48(FP) + MOVQ AX, dstUsed+56(FP) + RET + +lz4_s2_corrupt: + XORQ AX, AX + LEAQ -1(AX), SI + MOVQ SI, uncompressed+48(FP) + RET + +lz4_s2_dstfull: + XORQ AX, AX + LEAQ -2(AX), SI + MOVQ SI, uncompressed+48(FP) + RET diff --git a/s2/lz4convert.go b/s2/lz4convert.go index d3649d1c2a..824b32241e 100644 --- a/s2/lz4convert.go +++ b/s2/lz4convert.go @@ -33,6 +33,28 @@ func (l *LZ4Converter) ConvertBlock(dst, src []byte) ([]byte, int, error) { s, d := 0, len(dst) dst = dst[:cap(dst)] + if !debug && hasAmd64Asm { + res, sz := cvtLZ4BlockAsm(dst[d:], src) + if res < 0 { + const ( + errCorrupt = -1 + errDstTooSmall = -2 + ) + switch res { + case errCorrupt: + return dst[:d], 0, ErrCorrupt + case errDstTooSmall: + return dst[:d], 0, ErrDstTooSmall + default: + return dst[:d], 0, fmt.Errorf("unexpected result: %d", res) + } + } + if d+sz > len(dst) { + return dst[:0], 0, ErrDstTooSmall + } + return dst[:d+sz], res, nil + } + dLimit := len(dst) - 8 var lastOffset uint16 var uncompressed int @@ -84,10 +106,6 @@ func (l *LZ4Converter) ConvertBlock(dst, src []byte) ([]byte, int, error) { d += emitLiteralGo(dst[d:], src[s:s+ll]) s += ll uncompressed += ll - - if d > dLimit { - return dst[:d], 0, ErrDstTooSmall - } } // Check if we are done...
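
A minimal usage sketch of the conversion path added above, assuming the exported s2.LZ4Converter API from this package and that lz4Block holds a single raw LZ4 block (no frame header); the helper name and the buffer sizing are illustrative only, not part of this diff:

package example

import (
	"errors"
	"fmt"

	"github.com/klauspost/compress/s2"
)

// convertLZ4 converts one raw LZ4 block to an S2 block.
// maxUncompressed is the caller's upper bound on the decoded size and is
// only used to size the destination buffer (illustrative choice).
func convertLZ4(lz4Block []byte, maxUncompressed int) ([]byte, int, error) {
	var c s2.LZ4Converter
	// MaxEncodedLen of the uncompressed size is a safe capacity for the
	// converted block; an undersized dst surfaces as ErrDstTooSmall
	// (the -2 error code returned by cvtLZ4BlockAsm above).
	dst := make([]byte, 0, s2.MaxEncodedLen(maxUncompressed))
	out, uncompressed, err := c.ConvertBlock(dst, lz4Block)
	switch {
	case err == nil:
		// out is the converted S2 block appended to dst;
		// uncompressed is the number of decoded bytes it represents.
		return out, uncompressed, nil
	case errors.Is(err, s2.ErrCorrupt):
		// -1 from the assembly: the input is not a valid LZ4 block.
		return nil, 0, fmt.Errorf("invalid LZ4 block: %w", err)
	case errors.Is(err, s2.ErrDstTooSmall):
		return nil, 0, fmt.Errorf("destination too small: %w", err)
	default:
		return nil, 0, err
	}
}

On amd64 this call takes the cvtLZ4BlockAsm fast path introduced in this diff (gated by hasAmd64Asm and !debug); other platforms fall back to the pure Go converter in lz4convert.go.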