From f0f38ba8b102ff1643306b538cea1cce49fac5c4 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 10 Mar 2020 22:26:45 +0100 Subject: [PATCH 1/2] s2: Add pure Go snappy block compressor --- s2/encode_all.go | 44 + s2/encode_amd64.go | 48 +- s2/encode_go.go | 265 +- s2/encodeblock_amd64.go | 10 + s2/encodeblock_amd64.s | 14520 +++++++++++++++++++------------------- s2/gen.go | 32 + 6 files changed, 7610 insertions(+), 7309 deletions(-) diff --git a/s2/encode_all.go b/s2/encode_all.go index 237221b469..b2d79085cc 100644 --- a/s2/encode_all.go +++ b/s2/encode_all.go @@ -11,6 +11,50 @@ import ( "math/bits" ) +// EncodeSnappy returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. +// +// The blocks will require the same amount of memory to decode as to encode, +// and do not allow for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappy(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. 
+ d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + func load32(b []byte, i int) uint32 { b = b[i:] b = b[:4] diff --git a/s2/encode_amd64.go b/s2/encode_amd64.go index 2b4ceb1c43..640240d6d5 100644 --- a/s2/encode_amd64.go +++ b/s2/encode_amd64.go @@ -4,8 +4,6 @@ package s2 -import "encoding/binary" - func init() { avxAvailable = cpu.avx() } @@ -57,51 +55,7 @@ func encodeBlock(dst, src []byte) (d int) { return encodeBlockAsm8B(dst, src) } -// EncodeSnappy returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// -// The output is Snappy compatible and will likely decompress faster. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -// -// The blocks will require the same amount of memory to decode as encoding, -// and does not make for concurrent decoding. -// Also note that blocks do not contain CRC information, so corruption may be undetected. -// -// If you need to encode larger amounts of data, consider using -// the streaming interface which gives all of these features. -func EncodeSnappy(dst, src []byte) []byte { - if n := MaxEncodedLen(len(src)); n < 0 { - panic(ErrTooLarge) - } else if cap(dst) < n { - dst = make([]byte, n) - } else { - dst = dst[:n] - } - - // The block starts with the varint-encoded length of the decompressed bytes. 
- d := binary.PutUvarint(dst, uint64(len(src))) - - if len(src) == 0 { - return dst[:d] - } - if len(src) < minNonLiteralBlockSize { - d += emitLiteral(dst[d:], src) - return dst[:d] - } - - n := encodeBlockSnappy(dst[d:], src) - if n > 0 { - d += n - return dst[:d] - } - // Not compressible - d += emitLiteral(dst[d:], src) - return dst[:d] -} - -// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It +// encodeBlockSnappy encodes a non-empty src to a guaranteed-large-enough dst. It // assumes that the varint-encoded length of the decompressed bytes has already // been written. // diff --git a/s2/encode_go.go b/s2/encode_go.go index 6035a311a1..82f0047df6 100644 --- a/s2/encode_go.go +++ b/s2/encode_go.go @@ -3,29 +3,10 @@ package s2 import ( + "bytes" "math/bits" - - "github.com/klauspost/compress/snappy" ) -// EncodeSnappy returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// -// The output is Snappy compatible and will likely decompress faster. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -// -// The blocks will require the same amount of memory to decode as encoding, -// and does not make for concurrent decoding. -// Also note that blocks do not contain CRC information, so corruption may be undetected. -// -// If you need to encode larger amounts of data, consider using -// the streaming interface which gives all of these features. -func EncodeSnappy(dst, src []byte) []byte { - return snappy.Encode(dst, src) -} - // encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It // assumes that the varint-encoded length of the decompressed bytes has already // been written. @@ -188,6 +169,65 @@ func emitCopy(dst []byte, offset, length int) int { return 2 } +// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. 
+// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +func emitCopyNoRepeat(dst []byte, offset, length int) int { + if offset >= 65536 { + i := 0 + if length > 64 { + // Emit a length 64 copy, encoded as 5 bytes. + dst[4] = uint8(offset >> 24) + dst[3] = uint8(offset >> 16) + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 63<<2 | tagCopy4 + length -= 64 + if length >= 4 { + // Emit remaining as further copies + return 5 + emitCopyNoRepeat(dst[5:], offset, length) + } + i = 5 + } + if length == 0 { + return i + } + // Emit a copy, offset encoded as 4 bytes. + dst[i+0] = uint8(length-1)<<2 | tagCopy4 + dst[i+1] = uint8(offset) + dst[i+2] = uint8(offset >> 8) + dst[i+3] = uint8(offset >> 16) + dst[i+4] = uint8(offset >> 24) + return i + 5 + } + + // Offset no more than 2 bytes. + if length > 64 { + // Emit a length 60 copy, encoded as 3 bytes. + // Emit remaining as further copies (minimum 4 bytes). + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = 59<<2 | tagCopy2 + length -= 60 + // Emit remaining as copies, at least 4 bytes remain. + return 3 + emitCopyNoRepeat(dst[3:], offset, length) + } + if length >= 12 || offset >= 2048 { + // Emit the remaining copy, encoded as 3 bytes. + dst[2] = uint8(offset >> 8) + dst[1] = uint8(offset) + dst[0] = uint8(length-1)<<2 | tagCopy2 + return 3 + } + // Emit the remaining copy, encoded as 2 bytes. + dst[1] = uint8(offset) + dst[0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1 + return 2 +} + // matchLen returns how many bytes match in a and b // // It assumes that: @@ -223,3 +263,188 @@ func matchLen(a []byte, b []byte) int { } return len(a) + checked } + +func encodeBlockSnappy(dst, src []byte) (d int) { + // Initialize the hash table. + const ( + tableBits = 14 + maxTableSize = 1 << tableBits + ) + + var table [maxTableSize]uint32 + + // sLimit is when to stop looking for offset/length copies. 
The inputMargin + // lets us use a fast path for emitLiteral in the main loop, while we are + // looking for copies. + sLimit := len(src) - inputMargin + + // Bail if we can't compress to at least this. + dstLimit := len(src) - len(src)>>5 - 5 + + // nextEmit is where in src the next emitLiteral should start from. + nextEmit := 0 + + // The encoded form must start with a literal, as there are no previous + // bytes to copy, so we start looking for hash matches at s == 1. + s := 1 + cv := load64(src, s) + + // We search for a repeat at -1, but don't output repeats when nextEmit == 0 + repeat := 1 + + for { + candidate := 0 + for { + // Next src position to check + nextS := s + (s-nextEmit)>>6 + 4 + if nextS > sLimit { + goto emitRemainder + } + hash0 := hash6(cv, tableBits) + hash1 := hash6(cv>>8, tableBits) + candidate = int(table[hash0]) + candidate2 := int(table[hash1]) + table[hash0] = uint32(s) + table[hash1] = uint32(s + 1) + hash2 := hash6(cv>>16, tableBits) + + // Check repeat at offset checkRep. 
+ const checkRep = 1 + if uint32(cv>>(checkRep*8)) == load32(src, s-repeat+checkRep) { + base := s + checkRep + // Extend back + for i := base - repeat; base > nextEmit && i > 0 && src[i-1] == src[base-1]; { + i-- + base-- + } + d += emitLiteral(dst[d:], src[nextEmit:base]) + + // Extend forward + candidate := s - repeat + 4 + checkRep + s += 4 + checkRep + for s <= sLimit { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + cv = load64(src, s) + continue + } + + if uint32(cv) == load32(src, candidate) { + break + } + candidate = int(table[hash2]) + if uint32(cv>>8) == load32(src, candidate2) { + table[hash2] = uint32(s + 2) + candidate = candidate2 + s++ + break + } + table[hash2] = uint32(s + 2) + if uint32(cv>>16) == load32(src, candidate) { + s += 2 + break + } + + cv = load64(src, nextS) + s = nextS + } + + // Extend backwards + for candidate > 0 && s > nextEmit && src[candidate-1] == src[s-1] { + candidate-- + s-- + } + + // Bail if we exceed the maximum size. + if d+(s-nextEmit) > dstLimit { + return 0 + } + + // A 4-byte match has been found. We'll later see if more than 4 bytes + // match. But, prior to the match, src[nextEmit:s] are unmatched. Emit + // them as literal bytes. + + d += emitLiteral(dst[d:], src[nextEmit:s]) + + // Call emitCopy, and then see if another emitCopy could be our next + // move. Repeat until we find no match for the input immediately after + // what was consumed by the last emitCopy call. + // + // If we exit this loop normally then we need to call emitLiteral next, + // though we don't yet know how big the literal will be. We handle that + // by proceeding to the next iteration of the main loop. We also can + // exit this loop via goto if we get close to exhausting the input. 
+ for { + // Invariant: we have a 4-byte match at s, and no need to emit any + // literal bytes prior to s. + base := s + repeat = base - candidate + + // Extend the 4-byte match as long as possible. + s += 4 + candidate += 4 + for s <= len(src)-8 { + if diff := load64(src, s) ^ load64(src, candidate); diff != 0 { + s += bits.TrailingZeros64(diff) >> 3 + break + } + s += 8 + candidate += 8 + } + + d += emitCopyNoRepeat(dst[d:], repeat, s-base) + if false { + // Validate match. + a := src[base:s] + b := src[base-repeat : base-repeat+(s-base)] + if !bytes.Equal(a, b) { + panic("mismatch") + } + } + + nextEmit = s + if s >= sLimit { + goto emitRemainder + } + + if d > dstLimit { + // Do we have space for more, if not bail. + return 0 + } + // Check for an immediate match, otherwise start search at s+1 + x := load64(src, s-2) + m2Hash := hash6(x, tableBits) + currHash := hash6(x>>16, tableBits) + candidate = int(table[currHash]) + table[m2Hash] = uint32(s - 2) + table[currHash] = uint32(s) + if uint32(x>>16) != load32(src, candidate) { + cv = load64(src, s+1) + s++ + break + } + } + } + +emitRemainder: + if nextEmit < len(src) { + // Bail if we exceed the maximum size. + if d+len(src)-nextEmit > dstLimit { + return 0 + } + d += emitLiteral(dst[d:], src[nextEmit:]) + } + return d +} diff --git a/s2/encodeblock_amd64.go b/s2/encodeblock_amd64.go index d14afa20e7..4d4fbab508 100644 --- a/s2/encodeblock_amd64.go +++ b/s2/encodeblock_amd64.go @@ -136,6 +136,16 @@ func emitRepeat(dst []byte, offset int, length int) int //go:noescape func emitCopy(dst []byte, offset int, length int) int +// emitCopyNoRepeat writes a copy chunk and returns the number of bytes written. 
+// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 +// +//go:noescape +func emitCopyNoRepeat(dst []byte, offset int, length int) int + // matchLen returns how many bytes match in a and b // // It assumes that: diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 452f0f1e77..dd8dc45056 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -29,8 +29,8 @@ zero_loop_encodeBlockAsm: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -40,68 +40,68 @@ zero_loop_encodeBlockAsm: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x06, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x06, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeBlockAsm - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - SHLQ $0x10, R12 - IMULQ R10, R12 - SHRQ $0x32, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x10, R10 + IMULQ R8, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 
1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeBlockAsm - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, BX - SUBL 16(SP), BX + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP JZ repeat_extend_back_end_encodeBlockAsm repeat_extend_back_loop_encodeBlockAsm: - CMPL DI, R8 + CMPL SI, DI JLE repeat_extend_back_end_encodeBlockAsm - MOVB -1(DX)(BX*1), BP - MOVB -1(DX)(DI*1), SI - CMPB BP, SI + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeBlockAsm - LEAL -1(DI), DI - DECL BX + LEAL -1(SI), SI + DECL BP JNZ repeat_extend_back_loop_encodeBlockAsm repeat_extend_back_end_encodeBlockAsm: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeBlockAsm - MOVL DI, BX - MOVL DI, 12(SP) + MOVL SI, R8 + MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + SUBL BP, R8 + MOVL R8, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeBlockAsm CMPL BP, $0x3c @@ -144,87 +144,86 @@ one_byte_repeat_emit_encodeBlockAsm: ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(R8*1), BP NOP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_tail: - TESTQ BP, BP + TESTQ R8, R8 JEQ memmove_end_copy_repeat_emit_encodeBlockAsm - CMPQ BP, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_4 - CMPQ BP, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8 - CMPQ BP, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ R8, $0x40 JBE 
emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_256through2048 emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(BP*1) + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R9), R10 + MOVL R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_3: - MOVW (R9), R11 + MOVW (R9), R10 MOVB 2(R9), R9 - MOVW R11, (AX) + MOVW R10, (AX) MOVB R9, 2(AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(BP*1) + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R9), R10 + MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(BP*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32: MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(R8*1) JMP 
memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_65through128: @@ -232,18 +231,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_65through128: MOVOU 16(R9), X1 MOVOU 32(R9), X2 MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_129through256: @@ -255,14 +254,14 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_129through256: MOVOU 80(R9), X5 MOVOU 96(R9), X6 MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -128(R9)(R8*1), X8 + MOVOU -112(R9)(R8*1), X9 + MOVOU -96(R9)(R8*1), X10 + MOVOU -80(R9)(R8*1), X11 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -271,18 +270,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_129through256: MOVOU X5, 
80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(R8*1) + MOVOU X9, -112(AX)(R8*1) + MOVOU X10, -96(AX)(R8*1) + MOVOU X11, -80(AX)(R8*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_256through2048: - LEAQ -256(BP), BP + LEAQ -256(R8), R8 MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU 32(R9), X2 @@ -315,81 +314,81 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 LEAQ 256(R9), R9 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_tail memmove_end_copy_repeat_emit_encodeBlockAsm: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeBlockAsm: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL 
R8, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL R8, R8 JZ repeat_extend_forward_end_encodeBlockAsm matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm - LEAL 1(BX), BX - DECL R9 + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - TESTL R8, R8 + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm emit_repeat_again_match_repeat_encodeBlockAsm: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm CMPL DI, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm cant_repeat_two_offset_match_repeat_encodeBlockAsm: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_match_repeat_encodeBlockAsm - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_match_repeat_encodeBlockAsm - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -397,79 +396,79 @@ cant_repeat_two_offset_match_repeat_encodeBlockAsm: JMP emit_repeat_again_match_repeat_encodeBlockAsm repeat_five_match_repeat_encodeBlockAsm: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm repeat_four_match_repeat_encodeBlockAsm: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP 
repeat_end_emit_encodeBlockAsm repeat_three_match_repeat_encodeBlockAsm: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_match_repeat_encodeBlockAsm: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_match_repeat_encodeBlockAsm: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm repeat_as_copy_encodeBlockAsm: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm four_bytes_loop_back_repeat_as_copy_encodeBlockAsm: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW 
$0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -477,84 +476,84 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm four_bytes_remain_repeat_as_copy_encodeBlockAsm: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeBlockAsm - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm two_byte_offset_repeat_as_copy_encodeBlockAsm: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm MOVB 
$0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -562,68 +561,68 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + 
ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm two_byte_offset_short_repeat_as_copy_encodeBlockAsm: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm emit_copy_three_repeat_as_copy_encodeBlockAsm: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm: @@ -633,16 +632,16 @@ repeat_end_emit_encodeBlockAsm: JMP search_loop_encodeBlockAsm no_repeat_found_encodeBlockAsm: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeBlockAsm - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeBlockAsm MOVL 20(SP), CX JMP search_loop_encodeBlockAsm @@ -652,46 +651,46 @@ candidate3_match_encodeBlockAsm: JMP candidate_match_encodeBlockAsm 
candidate2_match_encodeBlockAsm: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeBlockAsm: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeBlockAsm match_extend_back_loop_encodeBlockAsm: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeBlockAsm - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeBlockAsm LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeBlockAsm JMP match_extend_back_loop_encodeBlockAsm match_extend_back_end_encodeBlockAsm: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeBlockAsm - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeBlockAsm CMPL DI, $0x3c @@ -708,11 +707,11 @@ match_dst_size_check_encodeBlockAsm: JMP memmove_match_emit_encodeBlockAsm four_bytes_match_emit_encodeBlockAsm: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeBlockAsm @@ -734,8 +733,7 @@ one_byte_match_emit_encodeBlockAsm: ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_tail: @@ -762,55 +760,55 @@ emit_lit_memmove_match_emit_encodeBlockAsm_memmove_tail: JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_256through2048 
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -818,14 +816,14 @@ 
emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -837,22 +835,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeBlockAsm emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -873,22 +871,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_129through256: emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 
176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -906,92 +904,92 @@ emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_tail memmove_end_copy_match_emit_encodeBlockAsm: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeBlockAsm: match_nolit_loop_encodeBlockAsm: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm matchlen_loopback_match_nolit_encodeBlockAsm: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeBlockAsm - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm matchlen_loop_match_nolit_encodeBlockAsm: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm matchlen_single_match_nolit_encodeBlockAsm: - TESTL R9, R9 + TESTL 
SI, SI JZ match_nolit_end_encodeBlockAsm matchlen_single_loopback_match_nolit_encodeBlockAsm: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeBlockAsm - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm match_nolit_end_encodeBlockAsm: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm four_bytes_loop_back_match_nolit_encodeBlockAsm: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -999,84 +997,84 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy repeat_five_match_nolit_encodeBlockAsm_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 
4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_four_match_nolit_encodeBlockAsm_emit_copy: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_three_match_nolit_encodeBlockAsm_emit_copy: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_match_nolit_encodeBlockAsm_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm - JMP four_bytes_loop_back_match_nolit_encodeBlockAsm + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm four_bytes_remain_match_nolit_encodeBlockAsm: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm two_byte_offset_match_nolit_encodeBlockAsm: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short CMPL BP, $0x00000800 JLT 
repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -1084,67 +1082,67 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short repeat_five_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_four_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_three_match_nolit_encodeBlockAsm_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_match_nolit_encodeBlockAsm_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm - JMP two_byte_offset_match_nolit_encodeBlockAsm + JMP two_byte_offset_match_nolit_encodeBlockAsm 
two_byte_offset_short_match_nolit_encodeBlockAsm: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm emit_copy_three_match_nolit_encodeBlockAsm: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -1158,22 +1156,22 @@ match_nolit_emitcopy_end_encodeBlockAsm: RET match_nolit_dst_ok_encodeBlockAsm: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x0000cf1bbcdcbf9b, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - SHLQ $0x10, R12 - IMULQ R10, R12 - SHRQ $0x32, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x0000cf1bbcdcbf9b, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x10, DI + IMULQ BP, DI + SHRQ $0x32, DI + SHLQ $0x10, R8 + IMULQ BP, R8 + SHRQ $0x32, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeBlockAsm INCL CX JMP search_loop_encodeBlockAsm @@ -1448,8 +1446,8 @@ zero_loop_encodeBlockAsm12B: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -1459,68 +1457,68 @@ zero_loop_encodeBlockAsm12B: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm12B: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x05, BP + LEAL 
4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeBlockAsm12B - MOVL BX, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - SHLQ $0x18, R12 - IMULQ R10, R12 - SHRQ $0x34, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + SHLQ $0x18, R10 + IMULQ R8, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeBlockAsm12B - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, BX - SUBL 16(SP), BX + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP JZ repeat_extend_back_end_encodeBlockAsm12B repeat_extend_back_loop_encodeBlockAsm12B: - CMPL DI, R8 + CMPL SI, DI JLE repeat_extend_back_end_encodeBlockAsm12B - MOVB -1(DX)(BX*1), BP - MOVB -1(DX)(DI*1), SI - CMPB BP, SI + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeBlockAsm12B - LEAL -1(DI), DI - DECL BX + LEAL -1(SI), SI + DECL BP JNZ repeat_extend_back_loop_encodeBlockAsm12B repeat_extend_back_end_encodeBlockAsm12B: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B - MOVL DI, BX - MOVL DI, 12(SP) + MOVL SI, R8 + MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + SUBL BP, R8 + MOVL R8, BP SUBL $0x01, BP JC 
emit_literal_done_repeat_emit_encodeBlockAsm12B CMPL BP, $0x3c @@ -1563,87 +1561,86 @@ one_byte_repeat_emit_encodeBlockAsm12B: ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm12B: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(R8*1), BP NOP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_tail: - TESTQ BP, BP + TESTQ R8, R8 JEQ memmove_end_copy_repeat_emit_encodeBlockAsm12B - CMPQ BP, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4 - CMPQ BP, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 - CMPQ BP, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_256through2048 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(BP*1) + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R9), R10 + MOVL R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_3: - 
MOVW (R9), R11 + MOVW (R9), R10 MOVB 2(R9), R9 - MOVW R11, (AX) + MOVW R10, (AX) MOVB R9, 2(AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(BP*1) + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R9), R10 + MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(BP*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_65through128: @@ -1651,18 +1648,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_65through128: MOVOU 16(R9), X1 MOVOU 32(R9), X2 MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 
MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_129through256: @@ -1674,14 +1671,14 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_129through256: MOVOU 80(R9), X5 MOVOU 96(R9), X6 MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -128(R9)(R8*1), X8 + MOVOU -112(R9)(R8*1), X9 + MOVOU -96(R9)(R8*1), X10 + MOVOU -80(R9)(R8*1), X11 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -1690,18 +1687,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(R8*1) + MOVOU X9, -112(AX)(R8*1) + MOVOU X10, -96(AX)(R8*1) + MOVOU X11, -80(AX)(R8*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_256through2048: - LEAQ -256(BP), BP + LEAQ -256(R8), R8 MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU 32(R9), X2 @@ -1734,81 +1731,81 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - 
CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 LEAQ 256(R9), R9 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_tail memmove_end_copy_repeat_emit_encodeBlockAsm12B: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeBlockAsm12B: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm12B matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL R8, R8 JZ repeat_extend_forward_end_encodeBlockAsm12B matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm12B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm12B: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - TESTL R8, R8 + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm12B emit_repeat_again_match_repeat_encodeBlockAsm12B: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE 
repeat_two_match_repeat_encodeBlockAsm12B CMPL DI, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm12B cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm12B - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_match_repeat_encodeBlockAsm12B - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_match_repeat_encodeBlockAsm12B - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -1816,79 +1813,79 @@ cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: JMP emit_repeat_again_match_repeat_encodeBlockAsm12B repeat_five_match_repeat_encodeBlockAsm12B: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_four_match_repeat_encodeBlockAsm12B: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_three_match_repeat_encodeBlockAsm12B: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_match_repeat_encodeBlockAsm12B: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_match_repeat_encodeBlockAsm12B: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B 
repeat_as_copy_encodeBlockAsm12B: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm12B four_bytes_loop_back_repeat_as_copy_encodeBlockAsm12B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm12B MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm12B emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm12B_emit_copy - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -1896,84 +1893,84 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_four_repeat_as_copy_encodeBlockAsm12B_emit_copy: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP 
repeat_end_emit_encodeBlockAsm12B repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm12B + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm12B four_bytes_remain_repeat_as_copy_encodeBlockAsm12B: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeBlockAsm12B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm12B two_byte_offset_repeat_as_copy_encodeBlockAsm12B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - CMPL BX, $0x00000104 + CMPL 
BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -1981,68 +1978,68 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12B_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_four_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12B 
two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B emit_copy_three_repeat_as_copy_encodeBlockAsm12B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm12B: @@ -2052,16 +2049,16 @@ repeat_end_emit_encodeBlockAsm12B: JMP search_loop_encodeBlockAsm12B no_repeat_found_encodeBlockAsm12B: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeBlockAsm12B - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm12B - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeBlockAsm12B MOVL 20(SP), CX JMP search_loop_encodeBlockAsm12B @@ -2071,46 +2068,46 @@ candidate3_match_encodeBlockAsm12B: JMP candidate_match_encodeBlockAsm12B candidate2_match_encodeBlockAsm12B: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeBlockAsm12B: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeBlockAsm12B match_extend_back_loop_encodeBlockAsm12B: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeBlockAsm12B - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE 
match_extend_back_end_encodeBlockAsm12B LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeBlockAsm12B JMP match_extend_back_loop_encodeBlockAsm12B match_extend_back_end_encodeBlockAsm12B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm12B: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeBlockAsm12B - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeBlockAsm12B CMPL DI, $0x3c @@ -2127,11 +2124,11 @@ match_dst_size_check_encodeBlockAsm12B: JMP memmove_match_emit_encodeBlockAsm12B four_bytes_match_emit_encodeBlockAsm12B: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeBlockAsm12B @@ -2153,8 +2150,7 @@ one_byte_match_emit_encodeBlockAsm12B: ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm12B: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_tail: @@ -2181,55 +2177,55 @@ emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_tail: JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_256through2048 emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP 
memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -2237,14 +2233,14 @@ emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU 
(SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -2256,22 +2252,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -2292,22 +2288,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_129through256: emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 
208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -2325,92 +2321,92 @@ emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_tail memmove_end_copy_match_emit_encodeBlockAsm12B: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeBlockAsm12B: match_nolit_loop_encodeBlockAsm12B: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm12B matchlen_loopback_match_nolit_encodeBlockAsm12B: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeBlockAsm12B - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm12B matchlen_loop_match_nolit_encodeBlockAsm12B: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm12B matchlen_single_match_nolit_encodeBlockAsm12B: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeBlockAsm12B matchlen_single_loopback_match_nolit_encodeBlockAsm12B: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeBlockAsm12B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ 
matchlen_single_loopback_match_nolit_encodeBlockAsm12B match_nolit_end_encodeBlockAsm12B: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm12B four_bytes_loop_back_match_nolit_encodeBlockAsm12B: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm12B MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm12B emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm12B_emit_copy - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm12B_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -2418,84 +2414,84 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy: JMP emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy repeat_five_match_nolit_encodeBlockAsm12B_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_four_match_nolit_encodeBlockAsm12B_emit_copy: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, 
AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_three_match_nolit_encodeBlockAsm12B_emit_copy: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_match_nolit_encodeBlockAsm12B_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B - JMP four_bytes_loop_back_match_nolit_encodeBlockAsm12B + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm12B four_bytes_remain_match_nolit_encodeBlockAsm12B: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm12B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B two_byte_offset_match_nolit_encodeBlockAsm12B: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT 
repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm12B_emit_copy_short - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm12B_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -2503,67 +2499,67 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: JMP emit_repeat_again_match_nolit_encodeBlockAsm12B_emit_copy_short repeat_five_match_nolit_encodeBlockAsm12B_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_four_match_nolit_encodeBlockAsm12B_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B - JMP two_byte_offset_match_nolit_encodeBlockAsm12B + JMP two_byte_offset_match_nolit_encodeBlockAsm12B two_byte_offset_short_match_nolit_encodeBlockAsm12B: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE 
emit_copy_three_match_nolit_encodeBlockAsm12B CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm12B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B emit_copy_three_match_nolit_encodeBlockAsm12B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -2577,22 +2573,22 @@ match_nolit_emitcopy_end_encodeBlockAsm12B: RET match_nolit_dst_ok_encodeBlockAsm12B: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - SHLQ $0x18, R12 - IMULQ R10, R12 - SHRQ $0x34, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x000000cf1bbcdcbb, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x18, DI + IMULQ BP, DI + SHRQ $0x34, DI + SHLQ $0x18, R8 + IMULQ BP, R8 + SHRQ $0x34, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeBlockAsm12B INCL CX JMP search_loop_encodeBlockAsm12B @@ -2867,8 +2863,8 @@ zero_loop_encodeBlockAsm10B: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -2878,68 +2874,68 @@ zero_loop_encodeBlockAsm10B: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm10B: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x05, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT 
emit_remainder_encodeBlockAsm10B - MOVL BX, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x36, R11 - SHLQ $0x18, R12 - IMULQ R10, R12 - SHRQ $0x36, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x36, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + SHLQ $0x18, R10 + IMULQ R8, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeBlockAsm10B - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, BX - SUBL 16(SP), BX + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP JZ repeat_extend_back_end_encodeBlockAsm10B repeat_extend_back_loop_encodeBlockAsm10B: - CMPL DI, R8 + CMPL SI, DI JLE repeat_extend_back_end_encodeBlockAsm10B - MOVB -1(DX)(BX*1), BP - MOVB -1(DX)(DI*1), SI - CMPB BP, SI + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeBlockAsm10B - LEAL -1(DI), DI - DECL BX + LEAL -1(SI), SI + DECL BP JNZ repeat_extend_back_loop_encodeBlockAsm10B repeat_extend_back_end_encodeBlockAsm10B: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B - MOVL DI, BX - MOVL DI, 12(SP) + MOVL SI, R8 + MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + SUBL BP, R8 + MOVL R8, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeBlockAsm10B CMPL BP, $0x3c @@ -2982,87 +2978,86 
@@ one_byte_repeat_emit_encodeBlockAsm10B: ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm10B: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(R8*1), BP NOP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_tail: - TESTQ BP, BP + TESTQ R8, R8 JEQ memmove_end_copy_repeat_emit_encodeBlockAsm10B - CMPQ BP, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4 - CMPQ BP, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 - CMPQ BP, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_256through2048 emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(BP*1) + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R9), R10 + MOVL R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_3: - MOVW (R9), R11 + MOVW (R9), R10 MOVB 2(R9), R9 - MOVW R11, (AX) + MOVW R10, (AX) MOVB 
R9, 2(AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(BP*1) + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R9), R10 + MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(BP*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_65through128: @@ -3070,18 +3065,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_65through128: MOVOU 16(R9), X1 MOVOU 32(R9), X2 MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, 
-64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_129through256: @@ -3093,14 +3088,14 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_129through256: MOVOU 80(R9), X5 MOVOU 96(R9), X6 MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -128(R9)(R8*1), X8 + MOVOU -112(R9)(R8*1), X9 + MOVOU -96(R9)(R8*1), X10 + MOVOU -80(R9)(R8*1), X11 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -3109,18 +3104,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(R8*1) + MOVOU X9, -112(AX)(R8*1) + MOVOU X10, -96(AX)(R8*1) + MOVOU X11, -80(AX)(R8*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_256through2048: - LEAQ -256(BP), BP + LEAQ -256(R8), R8 MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU 32(R9), X2 @@ -3153,81 +3148,81 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 LEAQ 256(R9), R9 LEAQ 256(AX), AX 
JGE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_tail memmove_end_copy_repeat_emit_encodeBlockAsm10B: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeBlockAsm10B: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm10B matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL R8, R8 JZ repeat_extend_forward_end_encodeBlockAsm10B matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm10B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm10B: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - TESTL R8, R8 + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm10B emit_repeat_again_match_repeat_encodeBlockAsm10B: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm10B CMPL DI, $0x0c JGE 
cant_repeat_two_offset_match_repeat_encodeBlockAsm10B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm10B cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm10B - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_match_repeat_encodeBlockAsm10B - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_match_repeat_encodeBlockAsm10B - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -3235,79 +3230,79 @@ cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: JMP emit_repeat_again_match_repeat_encodeBlockAsm10B repeat_five_match_repeat_encodeBlockAsm10B: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_four_match_repeat_encodeBlockAsm10B: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_three_match_repeat_encodeBlockAsm10B: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_match_repeat_encodeBlockAsm10B: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_offset_match_repeat_encodeBlockAsm10B: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_as_copy_encodeBlockAsm10B: - CMPL BP, $0x00010000 + CMPL SI, 
$0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm10B four_bytes_loop_back_repeat_as_copy_encodeBlockAsm10B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm10B MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm10B emit_repeat_again_repeat_as_copy_encodeBlockAsm10B_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm10B_emit_copy - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm10B_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -3315,84 +3310,84 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm10B_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm10B_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_four_repeat_as_copy_encodeBlockAsm10B_emit_copy: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm10B 
repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm10B + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm10B four_bytes_remain_repeat_as_copy_encodeBlockAsm10B: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeBlockAsm10B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm10B two_byte_offset_repeat_as_copy_encodeBlockAsm10B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX emit_repeat_again_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT 
repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm10B_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -3400,68 +3395,68 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm10B_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_four_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10B 
two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B emit_copy_three_repeat_as_copy_encodeBlockAsm10B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm10B: @@ -3471,16 +3466,16 @@ repeat_end_emit_encodeBlockAsm10B: JMP search_loop_encodeBlockAsm10B no_repeat_found_encodeBlockAsm10B: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeBlockAsm10B - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm10B - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeBlockAsm10B MOVL 20(SP), CX JMP search_loop_encodeBlockAsm10B @@ -3490,46 +3485,46 @@ candidate3_match_encodeBlockAsm10B: JMP candidate_match_encodeBlockAsm10B candidate2_match_encodeBlockAsm10B: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeBlockAsm10B: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeBlockAsm10B match_extend_back_loop_encodeBlockAsm10B: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeBlockAsm10B - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE 
match_extend_back_end_encodeBlockAsm10B LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeBlockAsm10B JMP match_extend_back_loop_encodeBlockAsm10B match_extend_back_end_encodeBlockAsm10B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm10B: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeBlockAsm10B - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeBlockAsm10B CMPL DI, $0x3c @@ -3546,11 +3541,11 @@ match_dst_size_check_encodeBlockAsm10B: JMP memmove_match_emit_encodeBlockAsm10B four_bytes_match_emit_encodeBlockAsm10B: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeBlockAsm10B @@ -3572,8 +3567,7 @@ one_byte_match_emit_encodeBlockAsm10B: ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm10B: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_tail: @@ -3600,55 +3594,55 @@ emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_tail: JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_256through2048 emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP 
memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -3656,14 +3650,14 @@ emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU 
(SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -3675,22 +3669,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -3711,22 +3705,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_129through256: emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 
208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -3744,92 +3738,92 @@ emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_tail memmove_end_copy_match_emit_encodeBlockAsm10B: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeBlockAsm10B: match_nolit_loop_encodeBlockAsm10B: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm10B matchlen_loopback_match_nolit_encodeBlockAsm10B: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeBlockAsm10B - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm10B matchlen_loop_match_nolit_encodeBlockAsm10B: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm10B matchlen_single_match_nolit_encodeBlockAsm10B: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeBlockAsm10B matchlen_single_loopback_match_nolit_encodeBlockAsm10B: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeBlockAsm10B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ 
matchlen_single_loopback_match_nolit_encodeBlockAsm10B match_nolit_end_encodeBlockAsm10B: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm10B four_bytes_loop_back_match_nolit_encodeBlockAsm10B: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm10B MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm10B emit_repeat_again_match_nolit_encodeBlockAsm10B_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm10B_emit_copy - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm10B_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -3837,84 +3831,84 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy: JMP emit_repeat_again_match_nolit_encodeBlockAsm10B_emit_copy repeat_five_match_nolit_encodeBlockAsm10B_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_four_match_nolit_encodeBlockAsm10B_emit_copy: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, 
AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_three_match_nolit_encodeBlockAsm10B_emit_copy: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_match_nolit_encodeBlockAsm10B_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B - JMP four_bytes_loop_back_match_nolit_encodeBlockAsm10B + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm10B four_bytes_remain_match_nolit_encodeBlockAsm10B: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm10B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B two_byte_offset_match_nolit_encodeBlockAsm10B: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX emit_repeat_again_match_nolit_encodeBlockAsm10B_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT 
repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm10B_emit_copy_short - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm10B_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -3922,67 +3916,67 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: JMP emit_repeat_again_match_nolit_encodeBlockAsm10B_emit_copy_short repeat_five_match_nolit_encodeBlockAsm10B_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_four_match_nolit_encodeBlockAsm10B_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B - JMP two_byte_offset_match_nolit_encodeBlockAsm10B + JMP two_byte_offset_match_nolit_encodeBlockAsm10B two_byte_offset_short_match_nolit_encodeBlockAsm10B: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE 
emit_copy_three_match_nolit_encodeBlockAsm10B CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm10B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B emit_copy_three_match_nolit_encodeBlockAsm10B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -3996,22 +3990,22 @@ match_nolit_emitcopy_end_encodeBlockAsm10B: RET match_nolit_dst_ok_encodeBlockAsm10B: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x36, R11 - SHLQ $0x18, R12 - IMULQ R10, R12 - SHRQ $0x36, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x000000cf1bbcdcbb, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x18, DI + IMULQ BP, DI + SHRQ $0x36, DI + SHLQ $0x18, R8 + IMULQ BP, R8 + SHRQ $0x36, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeBlockAsm10B INCL CX JMP search_loop_encodeBlockAsm10B @@ -4286,8 +4280,8 @@ zero_loop_encodeBlockAsm8B: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -4297,68 +4291,68 @@ zero_loop_encodeBlockAsm8B: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm8B: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x04, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x04, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT 
emit_remainder_encodeBlockAsm8B - MOVL BX, 20(SP) - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x38, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeBlockAsm8B - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, BX - SUBL 16(SP), BX + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP JZ repeat_extend_back_end_encodeBlockAsm8B repeat_extend_back_loop_encodeBlockAsm8B: - CMPL DI, R8 + CMPL SI, DI JLE repeat_extend_back_end_encodeBlockAsm8B - MOVB -1(DX)(BX*1), BP - MOVB -1(DX)(DI*1), SI - CMPB BP, SI + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeBlockAsm8B - LEAL -1(DI), DI - DECL BX + LEAL -1(SI), SI + DECL BP JNZ repeat_extend_back_loop_encodeBlockAsm8B repeat_extend_back_end_encodeBlockAsm8B: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B - MOVL DI, BX - MOVL DI, 12(SP) + MOVL SI, R8 + MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + SUBL BP, R8 + MOVL R8, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeBlockAsm8B CMPL BP, $0x3c @@ -4401,87 +4395,86 @@ 
one_byte_repeat_emit_encodeBlockAsm8B: ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm8B: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(R8*1), BP NOP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_tail: - TESTQ BP, BP + TESTQ R8, R8 JEQ memmove_end_copy_repeat_emit_encodeBlockAsm8B - CMPQ BP, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4 - CMPQ BP, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8 - CMPQ BP, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_256through2048 emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(BP*1) + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R9 + MOVB R10, (AX) + MOVB R9, -1(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R9), R10 + MOVL R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_3: - MOVW (R9), R11 + MOVW (R9), R10 MOVB 2(R9), R9 - MOVW R11, (AX) + MOVW R10, (AX) MOVB R9, 2(AX) JMP 
memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(BP*1) + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R9 + MOVL R10, (AX) + MOVL R9, -4(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R9), R10 + MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(BP*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R9 + MOVQ R10, (AX) + MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32: MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_65through128: @@ -4489,18 +4482,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_65through128: MOVOU 16(R9), X1 MOVOU 32(R9), X2 MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, 
-48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_129through256: @@ -4512,14 +4505,14 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_129through256: MOVOU 80(R9), X5 MOVOU 96(R9), X6 MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -128(R9)(R8*1), X8 + MOVOU -112(R9)(R8*1), X9 + MOVOU -96(R9)(R8*1), X10 + MOVOU -80(R9)(R8*1), X11 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -4528,18 +4521,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(R8*1) + MOVOU X9, -112(AX)(R8*1) + MOVOU X10, -96(AX)(R8*1) + MOVOU X11, -80(AX)(R8*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_256through2048: - LEAQ -256(BP), BP + LEAQ -256(R8), R8 MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU 32(R9), X2 @@ -4572,81 +4565,81 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 LEAQ 256(R9), R9 LEAQ 256(AX), AX JGE 
emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_tail memmove_end_copy_repeat_emit_encodeBlockAsm8B: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeBlockAsm8B: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm8B matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL R8, R8 JZ repeat_extend_forward_end_encodeBlockAsm8B matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm8B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm8B: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - TESTL R8, R8 + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm8B emit_repeat_again_match_repeat_encodeBlockAsm8B: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm8B CMPL DI, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B 
- CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm8B cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm8B - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_match_repeat_encodeBlockAsm8B - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_match_repeat_encodeBlockAsm8B - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -4654,79 +4647,79 @@ cant_repeat_two_offset_match_repeat_encodeBlockAsm8B: JMP emit_repeat_again_match_repeat_encodeBlockAsm8B repeat_five_match_repeat_encodeBlockAsm8B: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_four_match_repeat_encodeBlockAsm8B: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_three_match_repeat_encodeBlockAsm8B: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_two_match_repeat_encodeBlockAsm8B: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_two_offset_match_repeat_encodeBlockAsm8B: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_as_copy_encodeBlockAsm8B: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm8B 
four_bytes_loop_back_repeat_as_copy_encodeBlockAsm8B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm8B MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm8B emit_repeat_again_repeat_as_copy_encodeBlockAsm8B_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm8B_emit_copy - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm8B_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -4734,84 +4727,84 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm8B_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm8B_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_four_repeat_as_copy_encodeBlockAsm8B_emit_copy: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB 
BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm8B + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm8B four_bytes_remain_repeat_as_copy_encodeBlockAsm8B: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeBlockAsm8B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm8B two_byte_offset_repeat_as_copy_encodeBlockAsm8B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX emit_repeat_again_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT 
repeat_four_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm8B_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -4819,68 +4812,68 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm8B_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_four_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8B two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B - CMPL BP, $0x00000800 + CMPL 
SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8B emit_copy_three_repeat_as_copy_encodeBlockAsm8B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm8B: @@ -4890,16 +4883,16 @@ repeat_end_emit_encodeBlockAsm8B: JMP search_loop_encodeBlockAsm8B no_repeat_found_encodeBlockAsm8B: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeBlockAsm8B - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm8B - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeBlockAsm8B MOVL 20(SP), CX JMP search_loop_encodeBlockAsm8B @@ -4909,46 +4902,46 @@ candidate3_match_encodeBlockAsm8B: JMP candidate_match_encodeBlockAsm8B candidate2_match_encodeBlockAsm8B: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeBlockAsm8B: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeBlockAsm8B match_extend_back_loop_encodeBlockAsm8B: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeBlockAsm8B - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeBlockAsm8B LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeBlockAsm8B JMP match_extend_back_loop_encodeBlockAsm8B 
match_extend_back_end_encodeBlockAsm8B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm8B: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeBlockAsm8B - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeBlockAsm8B CMPL DI, $0x3c @@ -4965,11 +4958,11 @@ match_dst_size_check_encodeBlockAsm8B: JMP memmove_match_emit_encodeBlockAsm8B four_bytes_match_emit_encodeBlockAsm8B: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeBlockAsm8B @@ -4991,8 +4984,7 @@ one_byte_match_emit_encodeBlockAsm8B: ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm8B: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_tail: @@ -5019,55 +5011,55 @@ emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_tail: JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_256through2048 emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW 
R9, (AX) + MOVB SI, 2(AX) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -5075,14 +5067,14 @@ emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ 
-5094,22 +5086,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeBlockAsm8B emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -5130,22 +5122,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_129through256: emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -5163,92 +5155,92 @@ emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 
240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_tail memmove_end_copy_match_emit_encodeBlockAsm8B: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeBlockAsm8B: match_nolit_loop_encodeBlockAsm8B: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm8B matchlen_loopback_match_nolit_encodeBlockAsm8B: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeBlockAsm8B - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm8B matchlen_loop_match_nolit_encodeBlockAsm8B: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm8B matchlen_single_match_nolit_encodeBlockAsm8B: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeBlockAsm8B matchlen_single_loopback_match_nolit_encodeBlockAsm8B: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeBlockAsm8B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8B match_nolit_end_encodeBlockAsm8B: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm8B 
four_bytes_loop_back_match_nolit_encodeBlockAsm8B: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm8B MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm8B emit_repeat_again_match_nolit_encodeBlockAsm8B_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm8B_emit_copy - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm8B_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -5256,84 +5248,84 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy: JMP emit_repeat_again_match_nolit_encodeBlockAsm8B_emit_copy repeat_five_match_nolit_encodeBlockAsm8B_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_four_match_nolit_encodeBlockAsm8B_emit_copy: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_three_match_nolit_encodeBlockAsm8B_emit_copy: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B 
repeat_two_match_nolit_encodeBlockAsm8B_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B - JMP four_bytes_loop_back_match_nolit_encodeBlockAsm8B + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm8B four_bytes_remain_match_nolit_encodeBlockAsm8B: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm8B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B two_byte_offset_match_nolit_encodeBlockAsm8B: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX emit_repeat_again_match_nolit_encodeBlockAsm8B_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm8B_emit_copy_short - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm8B_emit_copy_short - LEAL -16842747(BX), BX + LEAL 
-16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -5341,67 +5333,67 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: JMP emit_repeat_again_match_nolit_encodeBlockAsm8B_emit_copy_short repeat_five_match_nolit_encodeBlockAsm8B_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_four_match_nolit_encodeBlockAsm8B_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8B - JMP two_byte_offset_match_nolit_encodeBlockAsm8B + JMP two_byte_offset_match_nolit_encodeBlockAsm8B two_byte_offset_short_match_nolit_encodeBlockAsm8B: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm8B CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm8B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP 
match_nolit_emitcopy_end_encodeBlockAsm8B emit_copy_three_match_nolit_encodeBlockAsm8B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -5415,22 +5407,22 @@ match_nolit_emitcopy_end_encodeBlockAsm8B: RET match_nolit_dst_ok_encodeBlockAsm8B: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x38, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x9e3779b1, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x20, DI + IMULQ BP, DI + SHRQ $0x38, DI + SHLQ $0x20, R8 + IMULQ BP, R8 + SHRQ $0x38, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeBlockAsm8B INCL CX JMP search_loop_encodeBlockAsm8B @@ -5705,8 +5697,8 @@ zero_loop_encodeBlockAsmAvx: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -5716,68 +5708,68 @@ zero_loop_encodeBlockAsmAvx: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsmAvx: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x05, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeBlockAsmAvx - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - SHLQ $0x10, R12 - IMULQ R10, R12 - SHRQ $0x32, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, 
R11 - SHRQ $0x10, R11 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x10, R10 + IMULQ R8, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeBlockAsmAvx - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, BX - SUBL 16(SP), BX + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP JZ repeat_extend_back_end_encodeBlockAsmAvx repeat_extend_back_loop_encodeBlockAsmAvx: - CMPL DI, R8 + CMPL SI, DI JLE repeat_extend_back_end_encodeBlockAsmAvx - MOVB -1(DX)(BX*1), BP - MOVB -1(DX)(DI*1), SI - CMPB BP, SI + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeBlockAsmAvx - LEAL -1(DI), DI - DECL BX + LEAL -1(SI), SI + DECL BP JNZ repeat_extend_back_loop_encodeBlockAsmAvx repeat_extend_back_end_encodeBlockAsmAvx: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeBlockAsmAvx - MOVL DI, BX - MOVL DI, 12(SP) + MOVL SI, R8 + MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + SUBL BP, R8 + MOVL R8, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeBlockAsmAvx CMPL BP, $0x3c @@ -5820,87 +5812,86 @@ one_byte_repeat_emit_encodeBlockAsmAvx: ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsmAvx: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(R8*1), BP NOP emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_tail: - TESTQ BP, BP + TESTQ R8, R8 JEQ memmove_end_copy_repeat_emit_encodeBlockAsmAvx - CMPQ BP, $0x02 + CMPQ R8, $0x02 JBE 
emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_4 - CMPQ BP, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_8 - CMPQ BP, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_avxUnaligned emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(BP*1) + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R11 + MOVB R10, (AX) + MOVB R11, -1(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R9), R10 + MOVL R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (R9), R10 + MOVB 2(R9), R11 + MOVW R10, (AX) + MOVB R11, 2(AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(BP*1) + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R11 + MOVL R10, (AX) + MOVL R11, -4(AX)(R8*1) 
JMP memmove_end_copy_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R9), R10 + MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(BP*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R11 + MOVQ R10, (AX) + MOVQ R11, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_17through32: MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_65through128: @@ -5908,18 +5899,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_65through128: MOVOU 16(R9), X1 MOVOU 32(R9), X2 MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsmAvx 
emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_129through256: @@ -5931,14 +5922,14 @@ emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_129through256: MOVOU 80(R9), X5 MOVOU 96(R9), X6 MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -128(R9)(R8*1), X8 + MOVOU -112(R9)(R8*1), X9 + MOVOU -96(R9)(R8*1), X10 + MOVOU -80(R9)(R8*1), X11 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -5947,18 +5938,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(R8*1) + MOVOU X9, -112(AX)(R8*1) + MOVOU X10, -96(AX)(R8*1) + MOVOU X11, -80(AX)(R8*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsmAvx emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_256through2048: - LEAQ -256(BP), BP + LEAQ -256(R8), R8 MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU 32(R9), X2 @@ -5991,128 +5982,128 @@ emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 LEAQ 256(R9), R9 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_tail emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_avxUnaligned: - LEAQ (R9)(BP*1), R12 - MOVQ AX, R14 - 
MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (R9)(R8*1), R11 + MOVQ AX, R13 + MOVOU -128(R11), X5 + MOVOU -112(R11), X6 + MOVQ $0x00000080, R10 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, BP - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 + MOVOU -96(R11), X7 + MOVOU -80(R11), X8 + MOVQ AX, R12 + SUBQ R13, R12 + MOVOU -64(R11), X9 + MOVOU -48(R11), X10 + SUBQ R12, R8 + MOVOU -32(R11), X11 + MOVOU -16(R11), X12 VMOVDQU (R9), Y4 - ADDQ R13, R9 - SUBQ R11, BP + ADDQ R12, R9 + SUBQ R10, R8 emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_gobble_128_loop: VMOVDQU (R9), Y0 VMOVDQU 32(R9), Y1 VMOVDQU 64(R9), Y2 VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + ADDQ R10, R9 VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, BP + ADDQ R10, AX + SUBQ R10, R8 JA emit_lit_memmove_repeat_emit_encodeBlockAsmAvx_memmove_gobble_128_loop - ADDQ R11, BP - ADDQ AX, BP - VMOVDQU Y4, (R14) + ADDQ R10, R8 + ADDQ AX, R8 + VMOVDQU Y4, (R13) VZEROUPPER - MOVOU X5, -128(BP) - MOVOU X6, -112(BP) - MOVOU X7, -96(BP) - MOVOU X8, -80(BP) - MOVOU X9, -64(BP) - MOVOU X10, -48(BP) - MOVOU X11, -32(BP) - MOVOU X12, -16(BP) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_repeat_emit_encodeBlockAsmAvx: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeBlockAsmAvx: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ 
(R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsmAvx matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL R8, R8 JZ repeat_extend_forward_end_encodeBlockAsmAvx matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsmAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsmAvx: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - TESTL R8, R8 + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI JZ repeat_as_copy_encodeBlockAsmAvx emit_repeat_again_match_repeat_encodeBlockAsmAvx: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_match_repeat_encodeBlockAsmAvx CMPL DI, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsmAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsmAvx cant_repeat_two_offset_match_repeat_encodeBlockAsmAvx: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsmAvx - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_match_repeat_encodeBlockAsmAvx - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_match_repeat_encodeBlockAsmAvx - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -6120,79 +6111,79 @@ cant_repeat_two_offset_match_repeat_encodeBlockAsmAvx: JMP 
emit_repeat_again_match_repeat_encodeBlockAsmAvx repeat_five_match_repeat_encodeBlockAsmAvx: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_four_match_repeat_encodeBlockAsmAvx: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_three_match_repeat_encodeBlockAsmAvx: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_match_repeat_encodeBlockAsmAvx: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_offset_match_repeat_encodeBlockAsmAvx: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_as_copy_encodeBlockAsmAvx: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsmAvx four_bytes_loop_back_repeat_as_copy_encodeBlockAsmAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsmAvx MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsmAvx emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy 
- CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsmAvx_emit_copy - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsmAvx_emit_copy - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -6200,84 +6191,84 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_four_repeat_as_copy_encodeBlockAsmAvx_emit_copy: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_three_repeat_as_copy_encodeBlockAsmAvx_emit_copy: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP 
repeat_end_emit_encodeBlockAsmAvx - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsmAvx + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsmAvx four_bytes_remain_repeat_as_copy_encodeBlockAsmAvx: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeBlockAsmAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsmAvx two_byte_offset_repeat_as_copy_encodeBlockAsmAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsmAvx MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -6285,68 +6276,68 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL 
$0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_four_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_three_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsmAvx repeat_two_offset_repeat_as_copy_encodeBlockAsmAvx_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsmAvx - JMP two_byte_offset_repeat_as_copy_encodeBlockAsmAvx + JMP two_byte_offset_repeat_as_copy_encodeBlockAsmAvx two_byte_offset_short_repeat_as_copy_encodeBlockAsmAvx: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsmAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsmAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsmAvx emit_copy_three_repeat_as_copy_encodeBlockAsmAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX 
repeat_end_emit_encodeBlockAsmAvx: @@ -6356,16 +6347,16 @@ repeat_end_emit_encodeBlockAsmAvx: JMP search_loop_encodeBlockAsmAvx no_repeat_found_encodeBlockAsmAvx: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeBlockAsmAvx - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeBlockAsmAvx - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeBlockAsmAvx MOVL 20(SP), CX JMP search_loop_encodeBlockAsmAvx @@ -6375,46 +6366,46 @@ candidate3_match_encodeBlockAsmAvx: JMP candidate_match_encodeBlockAsmAvx candidate2_match_encodeBlockAsmAvx: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeBlockAsmAvx: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeBlockAsmAvx match_extend_back_loop_encodeBlockAsmAvx: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeBlockAsmAvx - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeBlockAsmAvx LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeBlockAsmAvx JMP match_extend_back_loop_encodeBlockAsmAvx match_extend_back_end_encodeBlockAsmAvx: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsmAvx MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsmAvx: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeBlockAsmAvx - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL 
$0x01, DI JC emit_literal_done_match_emit_encodeBlockAsmAvx CMPL DI, $0x3c @@ -6431,11 +6422,11 @@ match_dst_size_check_encodeBlockAsmAvx: JMP memmove_match_emit_encodeBlockAsmAvx four_bytes_match_emit_encodeBlockAsmAvx: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeBlockAsmAvx @@ -6457,8 +6448,7 @@ one_byte_match_emit_encodeBlockAsmAvx: ADDQ $0x01, AX memmove_match_emit_encodeBlockAsmAvx: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_tail: @@ -6485,55 +6475,55 @@ emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_tail: JMP emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_avxUnaligned emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), R10 + MOVW R9, (AX) + MOVB R10, 2(AX) JMP memmove_end_copy_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), R10 + MOVL R9, (AX) + MOVL R10, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsmAvx 
emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -6541,14 +6531,14 @@ emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -6560,22 +6550,22 @@ emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeBlockAsmAvx emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), 
X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -6596,22 +6586,22 @@ emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_129through256: emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -6629,139 +6619,139 @@ emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_tail emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_avxUnaligned: - LEAQ (R9)(R8*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (SI)(R8*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ 
$0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, R8 - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 SUBQ R11, R8 + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (SI), Y4 + ADDQ R11, SI + SUBQ R9, R8 emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU 64(SI), Y2 + VMOVDQU 96(SI), Y3 + ADDQ R9, SI VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, R8 + ADDQ R9, AX + SUBQ R9, R8 JA emit_lit_memmove_match_emit_encodeBlockAsmAvx_memmove_gobble_128_loop - ADDQ R11, R8 + ADDQ R9, R8 ADDQ AX, R8 - VMOVDQU Y4, (R14) + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(R8) - MOVOU X6, -112(R8) - MOVOU X7, -96(R8) - MOVOU X8, -80(R8) - MOVOU X9, -64(R8) - MOVOU X10, -48(R8) - MOVOU X11, -32(R8) - MOVOU X12, -16(R8) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_match_emit_encodeBlockAsmAvx: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeBlockAsmAvx: match_nolit_loop_encodeBlockAsmAvx: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsmAvx matchlen_loopback_match_nolit_encodeBlockAsmAvx: - MOVQ 
(R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeBlockAsmAvx - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsmAvx matchlen_loop_match_nolit_encodeBlockAsmAvx: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsmAvx matchlen_single_match_nolit_encodeBlockAsmAvx: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeBlockAsmAvx matchlen_single_loopback_match_nolit_encodeBlockAsmAvx: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeBlockAsmAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsmAvx match_nolit_end_encodeBlockAsmAvx: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsmAvx four_bytes_loop_back_match_nolit_encodeBlockAsmAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsmAvx MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsmAvx emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy - CMPL BX, 
$0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -6769,84 +6759,84 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy: JMP emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx - JMP four_bytes_loop_back_match_nolit_encodeBlockAsmAvx + JMP four_bytes_loop_back_match_nolit_encodeBlockAsmAvx four_bytes_remain_match_nolit_encodeBlockAsmAvx: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsmAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) 
MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx two_byte_offset_match_nolit_encodeBlockAsmAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsmAvx MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy_short - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy_short - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy_short - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -6854,67 +6844,67 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short: JMP emit_repeat_again_match_nolit_encodeBlockAsmAvx_emit_copy_short repeat_five_match_nolit_encodeBlockAsmAvx_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_four_match_nolit_encodeBlockAsmAvx_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_three_match_nolit_encodeBlockAsmAvx_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 
2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_two_match_nolit_encodeBlockAsmAvx_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx repeat_two_offset_match_nolit_encodeBlockAsmAvx_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx - JMP two_byte_offset_match_nolit_encodeBlockAsmAvx + JMP two_byte_offset_match_nolit_encodeBlockAsmAvx two_byte_offset_short_match_nolit_encodeBlockAsmAvx: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsmAvx CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsmAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsmAvx emit_copy_three_match_nolit_encodeBlockAsmAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -6928,22 +6918,22 @@ match_nolit_emitcopy_end_encodeBlockAsmAvx: RET match_nolit_dst_ok_encodeBlockAsmAvx: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x0000cf1bbcdcbf9b, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - SHLQ $0x10, R12 - IMULQ R10, R12 - SHRQ $0x32, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x0000cf1bbcdcbf9b, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x10, DI + IMULQ BP, DI + SHRQ $0x32, DI + SHLQ $0x10, R8 + IMULQ BP, R8 + SHRQ $0x32, R8 + LEAL 
-2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeBlockAsmAvx INCL CX JMP search_loop_encodeBlockAsmAvx @@ -7224,14 +7214,14 @@ emit_lit_memmove_emit_remainder_encodeBlockAsmAvx_memmove_gobble_128_loop: ADDQ AX, BX VMOVDQU Y4, (R8) VZEROUPPER - MOVOU X5, -128(BX) - MOVOU X6, -112(BX) - MOVOU X7, -96(BX) - MOVOU X8, -80(BX) - MOVOU X9, -64(BX) - MOVOU X10, -48(BX) - MOVOU X11, -32(BX) - MOVOU X12, -16(BX) + MOVOU X5, -128(BX) + MOVOU X6, -112(BX) + MOVOU X7, -96(BX) + MOVOU X8, -80(BX) + MOVOU X9, -64(BX) + MOVOU X10, -48(BX) + MOVOU X11, -32(BX) + MOVOU X12, -16(BX) memmove_end_copy_emit_remainder_encodeBlockAsmAvx: MOVQ DX, AX @@ -7265,8 +7255,8 @@ zero_loop_encodeBlockAsm12BAvx: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -7276,68 +7266,68 @@ zero_loop_encodeBlockAsm12BAvx: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm12BAvx: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x05, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeBlockAsm12BAvx - MOVL BX, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - SHLQ $0x18, R12 - IMULQ R10, R12 - SHRQ $0x34, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + 
SHLQ $0x18, R10 + IMULQ R8, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeBlockAsm12BAvx - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, BX - SUBL 16(SP), BX + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP JZ repeat_extend_back_end_encodeBlockAsm12BAvx repeat_extend_back_loop_encodeBlockAsm12BAvx: - CMPL DI, R8 + CMPL SI, DI JLE repeat_extend_back_end_encodeBlockAsm12BAvx - MOVB -1(DX)(BX*1), BP - MOVB -1(DX)(DI*1), SI - CMPB BP, SI + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeBlockAsm12BAvx - LEAL -1(DI), DI - DECL BX + LEAL -1(SI), SI + DECL BP JNZ repeat_extend_back_loop_encodeBlockAsm12BAvx repeat_extend_back_end_encodeBlockAsm12BAvx: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeBlockAsm12BAvx - MOVL DI, BX - MOVL DI, 12(SP) + MOVL SI, R8 + MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + SUBL BP, R8 + MOVL R8, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeBlockAsm12BAvx CMPL BP, $0x3c @@ -7380,87 +7370,86 @@ one_byte_repeat_emit_encodeBlockAsm12BAvx: ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm12BAvx: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(R8*1), BP NOP emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_tail: - TESTQ BP, BP + TESTQ R8, R8 JEQ memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx - CMPQ BP, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_4 - CMPQ BP, $0x08 + CMPQ R8, $0x08 JB 
emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_8 - CMPQ BP, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(BP*1) + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R11 + MOVB R10, (AX) + MOVB R11, -1(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R9), R10 + MOVL R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (R9), R10 + MOVB 2(R9), R11 + MOVW R10, (AX) + MOVB R11, 2(AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(BP*1) + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R11 + MOVL R10, (AX) + MOVL R11, -4(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R9), R10 + MOVQ R10, (AX) JMP 
memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(BP*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R11 + MOVQ R10, (AX) + MOVQ R11, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_17through32: MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_65through128: @@ -7468,18 +7457,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_65through128: MOVOU 16(R9), X1 MOVOU 32(R9), X2 MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_129through256: @@ -7491,14 +7480,14 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_129through256: MOVOU 80(R9), X5 MOVOU 
96(R9), X6 MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -128(R9)(R8*1), X8 + MOVOU -112(R9)(R8*1), X9 + MOVOU -96(R9)(R8*1), X10 + MOVOU -80(R9)(R8*1), X11 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -7507,18 +7496,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(R8*1) + MOVOU X9, -112(AX)(R8*1) + MOVOU X10, -96(AX)(R8*1) + MOVOU X11, -80(AX)(R8*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_256through2048: - LEAQ -256(BP), BP + LEAQ -256(R8), R8 MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU 32(R9), X2 @@ -7551,128 +7540,128 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 LEAQ 256(R9), R9 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_tail emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned: - LEAQ (R9)(BP*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (R9)(R8*1), R11 + MOVQ AX, R13 + MOVOU -128(R11), X5 + MOVOU -112(R11), X6 + MOVQ $0x00000080, R10 ANDQ 
$0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, BP - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 + MOVOU -96(R11), X7 + MOVOU -80(R11), X8 + MOVQ AX, R12 + SUBQ R13, R12 + MOVOU -64(R11), X9 + MOVOU -48(R11), X10 + SUBQ R12, R8 + MOVOU -32(R11), X11 + MOVOU -16(R11), X12 VMOVDQU (R9), Y4 - ADDQ R13, R9 - SUBQ R11, BP + ADDQ R12, R9 + SUBQ R10, R8 emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop: VMOVDQU (R9), Y0 VMOVDQU 32(R9), Y1 VMOVDQU 64(R9), Y2 VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + ADDQ R10, R9 VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, BP + ADDQ R10, AX + SUBQ R10, R8 JA emit_lit_memmove_repeat_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop - ADDQ R11, BP - ADDQ AX, BP - VMOVDQU Y4, (R14) + ADDQ R10, R8 + ADDQ AX, R8 + VMOVDQU Y4, (R13) VZEROUPPER - MOVOU X5, -128(BP) - MOVOU X6, -112(BP) - MOVOU X7, -96(BP) - MOVOU X8, -80(BP) - MOVOU X9, -64(BP) - MOVOU X10, -48(BP) - MOVOU X11, -32(BP) - MOVOU X12, -16(BP) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_repeat_emit_encodeBlockAsm12BAvx: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeBlockAsm12BAvx: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL 
(BX)(R12*1), BX + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm12BAvx matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL R8, R8 JZ repeat_extend_forward_end_encodeBlockAsm12BAvx matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm12BAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm12BAvx: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - TESTL R8, R8 + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm12BAvx emit_repeat_again_match_repeat_encodeBlockAsm12BAvx: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm12BAvx CMPL DI, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm12BAvx cant_repeat_two_offset_match_repeat_encodeBlockAsm12BAvx: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm12BAvx - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_match_repeat_encodeBlockAsm12BAvx - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_match_repeat_encodeBlockAsm12BAvx - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -7680,79 +7669,79 @@ cant_repeat_two_offset_match_repeat_encodeBlockAsm12BAvx: JMP emit_repeat_again_match_repeat_encodeBlockAsm12BAvx repeat_five_match_repeat_encodeBlockAsm12BAvx: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, 
SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_four_match_repeat_encodeBlockAsm12BAvx: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_three_match_repeat_encodeBlockAsm12BAvx: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_match_repeat_encodeBlockAsm12BAvx: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_offset_match_repeat_encodeBlockAsm12BAvx: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_as_copy_encodeBlockAsm12BAvx: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm12BAvx four_bytes_loop_back_repeat_as_copy_encodeBlockAsm12BAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm12BAvx MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm12BAvx emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT 
repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -7760,84 +7749,84 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_four_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_three_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP 
repeat_end_emit_encodeBlockAsm12BAvx - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm12BAvx + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm12BAvx four_bytes_remain_repeat_as_copy_encodeBlockAsm12BAvx: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeBlockAsm12BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm12BAvx two_byte_offset_repeat_as_copy_encodeBlockAsm12BAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12BAvx MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -7845,68 +7834,68 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP 
+ MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_four_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_three_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12BAvx repeat_two_offset_repeat_as_copy_encodeBlockAsm12BAvx_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12BAvx - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12BAvx + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm12BAvx two_byte_offset_short_repeat_as_copy_encodeBlockAsm12BAvx: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12BAvx emit_copy_three_repeat_as_copy_encodeBlockAsm12BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB 
$0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm12BAvx: @@ -7916,16 +7905,16 @@ repeat_end_emit_encodeBlockAsm12BAvx: JMP search_loop_encodeBlockAsm12BAvx no_repeat_found_encodeBlockAsm12BAvx: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeBlockAsm12BAvx - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm12BAvx - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeBlockAsm12BAvx MOVL 20(SP), CX JMP search_loop_encodeBlockAsm12BAvx @@ -7935,46 +7924,46 @@ candidate3_match_encodeBlockAsm12BAvx: JMP candidate_match_encodeBlockAsm12BAvx candidate2_match_encodeBlockAsm12BAvx: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeBlockAsm12BAvx: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeBlockAsm12BAvx match_extend_back_loop_encodeBlockAsm12BAvx: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeBlockAsm12BAvx - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeBlockAsm12BAvx LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeBlockAsm12BAvx JMP match_extend_back_loop_encodeBlockAsm12BAvx match_extend_back_end_encodeBlockAsm12BAvx: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm12BAvx MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm12BAvx: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeBlockAsm12BAvx - MOVL R8, BP - MOVL R8, 
12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeBlockAsm12BAvx CMPL DI, $0x3c @@ -7991,11 +7980,11 @@ match_dst_size_check_encodeBlockAsm12BAvx: JMP memmove_match_emit_encodeBlockAsm12BAvx four_bytes_match_emit_encodeBlockAsm12BAvx: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeBlockAsm12BAvx @@ -8017,8 +8006,7 @@ one_byte_match_emit_encodeBlockAsm12BAvx: ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm12BAvx: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_tail: @@ -8045,55 +8033,55 @@ emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_tail: JMP emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), R10 + MOVW R9, (AX) + MOVB R10, 2(AX) JMP memmove_end_copy_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), R10 + MOVL R9, (AX) + MOVL R10, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12BAvx 
emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -8101,14 +8089,14 @@ emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -8120,22 +8108,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeBlockAsm12BAvx emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), 
X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -8156,22 +8144,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_129through256: emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -8189,139 +8177,139 @@ emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_tail 
emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_avxUnaligned: - LEAQ (R9)(R8*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (SI)(R8*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, R8 - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 SUBQ R11, R8 + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (SI), Y4 + ADDQ R11, SI + SUBQ R9, R8 emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU 64(SI), Y2 + VMOVDQU 96(SI), Y3 + ADDQ R9, SI VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, R8 + ADDQ R9, AX + SUBQ R9, R8 JA emit_lit_memmove_match_emit_encodeBlockAsm12BAvx_memmove_gobble_128_loop - ADDQ R11, R8 + ADDQ R9, R8 ADDQ AX, R8 - VMOVDQU Y4, (R14) + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(R8) - MOVOU X6, -112(R8) - MOVOU X7, -96(R8) - MOVOU X8, -80(R8) - MOVOU X9, -64(R8) - MOVOU X10, -48(R8) - MOVOU X11, -32(R8) - MOVOU X12, -16(R8) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_match_emit_encodeBlockAsm12BAvx: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeBlockAsm12BAvx: match_nolit_loop_encodeBlockAsm12BAvx: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), 
R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm12BAvx matchlen_loopback_match_nolit_encodeBlockAsm12BAvx: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeBlockAsm12BAvx - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm12BAvx matchlen_loop_match_nolit_encodeBlockAsm12BAvx: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm12BAvx matchlen_single_match_nolit_encodeBlockAsm12BAvx: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeBlockAsm12BAvx matchlen_single_loopback_match_nolit_encodeBlockAsm12BAvx: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeBlockAsm12BAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm12BAvx match_nolit_end_encodeBlockAsm12BAvx: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm12BAvx four_bytes_loop_back_match_nolit_encodeBlockAsm12BAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm12BAvx MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm12BAvx emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy - CMPL DI, $0x0c + 
CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -8329,84 +8317,84 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy: JMP emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP 
match_nolit_emitcopy_end_encodeBlockAsm12BAvx - JMP four_bytes_loop_back_match_nolit_encodeBlockAsm12BAvx + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm12BAvx four_bytes_remain_match_nolit_encodeBlockAsm12BAvx: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm12BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx two_byte_offset_match_nolit_encodeBlockAsm12BAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm12BAvx MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy_short - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy_short - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy_short - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -8414,67 +8402,67 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: JMP emit_repeat_again_match_nolit_encodeBlockAsm12BAvx_emit_copy_short repeat_five_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 
2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_four_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_three_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_two_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx repeat_two_offset_match_nolit_encodeBlockAsm12BAvx_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx - JMP two_byte_offset_match_nolit_encodeBlockAsm12BAvx + JMP two_byte_offset_match_nolit_encodeBlockAsm12BAvx two_byte_offset_short_match_nolit_encodeBlockAsm12BAvx: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm12BAvx CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm12BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12BAvx emit_copy_three_match_nolit_encodeBlockAsm12BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -8488,22 +8476,22 @@ match_nolit_emitcopy_end_encodeBlockAsm12BAvx: RET match_nolit_dst_ok_encodeBlockAsm12BAvx: - MOVQ -2(DX)(CX*1), 
R9 - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - SHLQ $0x18, R12 - IMULQ R10, R12 - SHRQ $0x34, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x000000cf1bbcdcbb, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x18, DI + IMULQ BP, DI + SHRQ $0x34, DI + SHLQ $0x18, R8 + IMULQ BP, R8 + SHRQ $0x34, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeBlockAsm12BAvx INCL CX JMP search_loop_encodeBlockAsm12BAvx @@ -8784,14 +8772,14 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm12BAvx_memmove_gobble_128_loop: ADDQ AX, BX VMOVDQU Y4, (R8) VZEROUPPER - MOVOU X5, -128(BX) - MOVOU X6, -112(BX) - MOVOU X7, -96(BX) - MOVOU X8, -80(BX) - MOVOU X9, -64(BX) - MOVOU X10, -48(BX) - MOVOU X11, -32(BX) - MOVOU X12, -16(BX) + MOVOU X5, -128(BX) + MOVOU X6, -112(BX) + MOVOU X7, -96(BX) + MOVOU X8, -80(BX) + MOVOU X9, -64(BX) + MOVOU X10, -48(BX) + MOVOU X11, -32(BX) + MOVOU X12, -16(BX) memmove_end_copy_emit_remainder_encodeBlockAsm12BAvx: MOVQ DX, AX @@ -8825,8 +8813,8 @@ zero_loop_encodeBlockAsm10BAvx: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -8836,68 +8824,68 @@ zero_loop_encodeBlockAsm10BAvx: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm10BAvx: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x05, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeBlockAsm10BAvx - MOVL BX, 20(SP) - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x20, R11 - IMULQ R10, 
R11 - SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x36, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x36, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeBlockAsm10BAvx - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, BX - SUBL 16(SP), BX + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP JZ repeat_extend_back_end_encodeBlockAsm10BAvx repeat_extend_back_loop_encodeBlockAsm10BAvx: - CMPL DI, R8 + CMPL SI, DI JLE repeat_extend_back_end_encodeBlockAsm10BAvx - MOVB -1(DX)(BX*1), BP - MOVB -1(DX)(DI*1), SI - CMPB BP, SI + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeBlockAsm10BAvx - LEAL -1(DI), DI - DECL BX + LEAL -1(SI), SI + DECL BP JNZ repeat_extend_back_loop_encodeBlockAsm10BAvx repeat_extend_back_end_encodeBlockAsm10BAvx: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeBlockAsm10BAvx - MOVL DI, BX - MOVL DI, 12(SP) + MOVL SI, R8 + MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + SUBL BP, R8 + MOVL R8, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeBlockAsm10BAvx CMPL BP, $0x3c @@ -8940,87 +8928,86 @@ one_byte_repeat_emit_encodeBlockAsm10BAvx: ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm10BAvx: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + 
LEAQ (AX)(R8*1), BP NOP emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_tail: - TESTQ BP, BP + TESTQ R8, R8 JEQ memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx - CMPQ BP, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_4 - CMPQ BP, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_8 - CMPQ BP, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_avxUnaligned emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(BP*1) + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R11 + MOVB R10, (AX) + MOVB R11, -1(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R9), R10 + MOVL R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (R9), R10 + MOVB 2(R9), R11 + MOVW R10, (AX) + MOVB R11, 2(AX) JMP 
memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(BP*1) + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R11 + MOVL R10, (AX) + MOVL R11, -4(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R9), R10 + MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(BP*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R11 + MOVQ R10, (AX) + MOVQ R11, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_17through32: MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_65through128: @@ -9028,18 +9015,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_65through128: MOVOU 16(R9), X1 MOVOU 32(R9), X2 MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU 
X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_129through256: @@ -9051,14 +9038,14 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_129through256: MOVOU 80(R9), X5 MOVOU 96(R9), X6 MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -128(R9)(R8*1), X8 + MOVOU -112(R9)(R8*1), X9 + MOVOU -96(R9)(R8*1), X10 + MOVOU -80(R9)(R8*1), X11 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -9067,18 +9054,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(R8*1) + MOVOU X9, -112(AX)(R8*1) + MOVOU X10, -96(AX)(R8*1) + MOVOU X11, -80(AX)(R8*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_256through2048: - LEAQ -256(BP), BP + LEAQ -256(R8), R8 MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU 32(R9), X2 @@ -9111,128 +9098,128 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 + CMPQ R8, 
$0x00000100 LEAQ 256(R9), R9 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_tail emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_avxUnaligned: - LEAQ (R9)(BP*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (R9)(R8*1), R11 + MOVQ AX, R13 + MOVOU -128(R11), X5 + MOVOU -112(R11), X6 + MOVQ $0x00000080, R10 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, BP - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 + MOVOU -96(R11), X7 + MOVOU -80(R11), X8 + MOVQ AX, R12 + SUBQ R13, R12 + MOVOU -64(R11), X9 + MOVOU -48(R11), X10 + SUBQ R12, R8 + MOVOU -32(R11), X11 + MOVOU -16(R11), X12 VMOVDQU (R9), Y4 - ADDQ R13, R9 - SUBQ R11, BP + ADDQ R12, R9 + SUBQ R10, R8 emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_gobble_128_loop: VMOVDQU (R9), Y0 VMOVDQU 32(R9), Y1 VMOVDQU 64(R9), Y2 VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + ADDQ R10, R9 VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, BP + ADDQ R10, AX + SUBQ R10, R8 JA emit_lit_memmove_repeat_emit_encodeBlockAsm10BAvx_memmove_gobble_128_loop - ADDQ R11, BP - ADDQ AX, BP - VMOVDQU Y4, (R14) + ADDQ R10, R8 + ADDQ AX, R8 + VMOVDQU Y4, (R13) VZEROUPPER - MOVOU X5, -128(BP) - MOVOU X6, -112(BP) - MOVOU X7, -96(BP) - MOVOU X8, -80(BP) - MOVOU X9, -64(BP) - MOVOU X10, -48(BP) - MOVOU X11, -32(BP) - MOVOU X12, -16(BP) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_repeat_emit_encodeBlockAsm10BAvx: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeBlockAsm10BAvx: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ 
(DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm10BAvx matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL R8, R8 JZ repeat_extend_forward_end_encodeBlockAsm10BAvx matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm10BAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm10BAvx: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - TESTL R8, R8 + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm10BAvx emit_repeat_again_match_repeat_encodeBlockAsm10BAvx: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm10BAvx CMPL DI, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm10BAvx cant_repeat_two_offset_match_repeat_encodeBlockAsm10BAvx: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm10BAvx - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT 
repeat_four_match_repeat_encodeBlockAsm10BAvx - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_match_repeat_encodeBlockAsm10BAvx - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -9240,79 +9227,79 @@ cant_repeat_two_offset_match_repeat_encodeBlockAsm10BAvx: JMP emit_repeat_again_match_repeat_encodeBlockAsm10BAvx repeat_five_match_repeat_encodeBlockAsm10BAvx: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_four_match_repeat_encodeBlockAsm10BAvx: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_three_match_repeat_encodeBlockAsm10BAvx: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_two_match_repeat_encodeBlockAsm10BAvx: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_two_offset_match_repeat_encodeBlockAsm10BAvx: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_as_copy_encodeBlockAsm10BAvx: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm10BAvx four_bytes_loop_back_repeat_as_copy_encodeBlockAsm10BAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm10BAvx MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ 
$0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm10BAvx emit_repeat_again_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -9320,84 +9307,84 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_four_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_three_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_two_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) 
+ SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_two_offset_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10BAvx - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm10BAvx + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm10BAvx four_bytes_remain_repeat_as_copy_encodeBlockAsm10BAvx: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeBlockAsm10BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm10BAvx two_byte_offset_repeat_as_copy_encodeBlockAsm10BAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10BAvx MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX emit_repeat_again_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT 
repeat_five_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -9405,68 +9392,68 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_four_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_three_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_two_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10BAvx repeat_two_offset_repeat_as_copy_encodeBlockAsm10BAvx_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10BAvx - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10BAvx + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm10BAvx two_byte_offset_short_repeat_as_copy_encodeBlockAsm10BAvx: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE 
emit_copy_three_repeat_as_copy_encodeBlockAsm10BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10BAvx emit_copy_three_repeat_as_copy_encodeBlockAsm10BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm10BAvx: @@ -9476,16 +9463,16 @@ repeat_end_emit_encodeBlockAsm10BAvx: JMP search_loop_encodeBlockAsm10BAvx no_repeat_found_encodeBlockAsm10BAvx: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeBlockAsm10BAvx - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm10BAvx - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeBlockAsm10BAvx MOVL 20(SP), CX JMP search_loop_encodeBlockAsm10BAvx @@ -9495,46 +9482,46 @@ candidate3_match_encodeBlockAsm10BAvx: JMP candidate_match_encodeBlockAsm10BAvx candidate2_match_encodeBlockAsm10BAvx: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeBlockAsm10BAvx: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeBlockAsm10BAvx match_extend_back_loop_encodeBlockAsm10BAvx: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeBlockAsm10BAvx - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeBlockAsm10BAvx LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeBlockAsm10BAvx JMP 
match_extend_back_loop_encodeBlockAsm10BAvx match_extend_back_end_encodeBlockAsm10BAvx: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm10BAvx MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm10BAvx: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeBlockAsm10BAvx - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeBlockAsm10BAvx CMPL DI, $0x3c @@ -9551,11 +9538,11 @@ match_dst_size_check_encodeBlockAsm10BAvx: JMP memmove_match_emit_encodeBlockAsm10BAvx four_bytes_match_emit_encodeBlockAsm10BAvx: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeBlockAsm10BAvx @@ -9577,8 +9564,7 @@ one_byte_match_emit_encodeBlockAsm10BAvx: ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm10BAvx: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_tail: @@ -9605,55 +9591,55 @@ emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_tail: JMP emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_avxUnaligned emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10BAvx emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm10BAvx 
emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), R10 + MOVW R9, (AX) + MOVB R10, 2(AX) JMP memmove_end_copy_match_emit_encodeBlockAsm10BAvx emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), R10 + MOVL R9, (AX) + MOVL R10, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10BAvx emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm10BAvx emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10BAvx emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10BAvx emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -9661,14 +9647,14 @@ emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeBlockAsm10BAvx emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + 
MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -9680,22 +9666,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeBlockAsm10BAvx emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -9716,22 +9702,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_129through256: emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 
192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -9749,139 +9735,139 @@ emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_tail emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_avxUnaligned: - LEAQ (R9)(R8*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (SI)(R8*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, R8 - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 SUBQ R11, R8 + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (SI), Y4 + ADDQ R11, SI + SUBQ R9, R8 emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU 64(SI), Y2 + VMOVDQU 96(SI), Y3 + ADDQ R9, SI VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, R8 + ADDQ R9, AX + SUBQ R9, R8 JA emit_lit_memmove_match_emit_encodeBlockAsm10BAvx_memmove_gobble_128_loop - ADDQ R11, R8 + ADDQ R9, R8 ADDQ AX, R8 - VMOVDQU Y4, (R14) + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(R8) - MOVOU X6, -112(R8) - MOVOU X7, -96(R8) - MOVOU X8, -80(R8) - MOVOU X9, -64(R8) - MOVOU X10, -48(R8) - MOVOU X11, -32(R8) - MOVOU X12, -16(R8) + MOVOU X5, -128(R8) + MOVOU 
X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_match_emit_encodeBlockAsm10BAvx: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeBlockAsm10BAvx: match_nolit_loop_encodeBlockAsm10BAvx: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm10BAvx matchlen_loopback_match_nolit_encodeBlockAsm10BAvx: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeBlockAsm10BAvx - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm10BAvx matchlen_loop_match_nolit_encodeBlockAsm10BAvx: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm10BAvx matchlen_single_match_nolit_encodeBlockAsm10BAvx: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeBlockAsm10BAvx matchlen_single_loopback_match_nolit_encodeBlockAsm10BAvx: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeBlockAsm10BAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm10BAvx match_nolit_end_encodeBlockAsm10BAvx: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm10BAvx four_bytes_loop_back_match_nolit_encodeBlockAsm10BAvx: - CMPL BX, 
$0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm10BAvx MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm10BAvx emit_repeat_again_match_nolit_encodeBlockAsm10BAvx_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm10BAvx_emit_copy - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10BAvx_emit_copy CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm10BAvx_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm10BAvx_emit_copy: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm10BAvx_emit_copy - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm10BAvx_emit_copy - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm10BAvx_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -9889,84 +9875,84 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm10BAvx_emit_copy: JMP emit_repeat_again_match_nolit_encodeBlockAsm10BAvx_emit_copy repeat_five_match_nolit_encodeBlockAsm10BAvx_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10BAvx repeat_four_match_nolit_encodeBlockAsm10BAvx_emit_copy: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10BAvx repeat_three_match_nolit_encodeBlockAsm10BAvx_emit_copy: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP 
match_nolit_emitcopy_end_encodeBlockAsm10BAvx repeat_two_match_nolit_encodeBlockAsm10BAvx_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10BAvx repeat_two_offset_match_nolit_encodeBlockAsm10BAvx_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10BAvx - JMP four_bytes_loop_back_match_nolit_encodeBlockAsm10BAvx + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm10BAvx four_bytes_remain_match_nolit_encodeBlockAsm10BAvx: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm10BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10BAvx two_byte_offset_match_nolit_encodeBlockAsm10BAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm10BAvx MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX emit_repeat_again_match_nolit_encodeBlockAsm10BAvx_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm10BAvx_emit_copy_short - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10BAvx_emit_copy_short CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm10BAvx_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm10BAvx_emit_copy_short: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm10BAvx_emit_copy_short - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm10BAvx_emit_copy_short - CMPL BX, $0x0100ffff + CMPL 
R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm10BAvx_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -9974,67 +9960,67 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm10BAvx_emit_copy_short: JMP emit_repeat_again_match_nolit_encodeBlockAsm10BAvx_emit_copy_short repeat_five_match_nolit_encodeBlockAsm10BAvx_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10BAvx repeat_four_match_nolit_encodeBlockAsm10BAvx_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10BAvx repeat_three_match_nolit_encodeBlockAsm10BAvx_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10BAvx repeat_two_match_nolit_encodeBlockAsm10BAvx_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10BAvx repeat_two_offset_match_nolit_encodeBlockAsm10BAvx_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10BAvx - JMP two_byte_offset_match_nolit_encodeBlockAsm10BAvx + JMP two_byte_offset_match_nolit_encodeBlockAsm10BAvx two_byte_offset_short_match_nolit_encodeBlockAsm10BAvx: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm10BAvx CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm10BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + 
LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10BAvx emit_copy_three_match_nolit_encodeBlockAsm10BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -10048,22 +10034,22 @@ match_nolit_emitcopy_end_encodeBlockAsm10BAvx: RET match_nolit_dst_ok_encodeBlockAsm10BAvx: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x36, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x9e3779b1, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x20, DI + IMULQ BP, DI + SHRQ $0x36, DI + SHLQ $0x20, R8 + IMULQ BP, R8 + SHRQ $0x36, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeBlockAsm10BAvx INCL CX JMP search_loop_encodeBlockAsm10BAvx @@ -10344,14 +10330,14 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm10BAvx_memmove_gobble_128_loop: ADDQ AX, BX VMOVDQU Y4, (R8) VZEROUPPER - MOVOU X5, -128(BX) - MOVOU X6, -112(BX) - MOVOU X7, -96(BX) - MOVOU X8, -80(BX) - MOVOU X9, -64(BX) - MOVOU X10, -48(BX) - MOVOU X11, -32(BX) - MOVOU X12, -16(BX) + MOVOU X5, -128(BX) + MOVOU X6, -112(BX) + MOVOU X7, -96(BX) + MOVOU X8, -80(BX) + MOVOU X9, -64(BX) + MOVOU X10, -48(BX) + MOVOU X11, -32(BX) + MOVOU X12, -16(BX) memmove_end_copy_emit_remainder_encodeBlockAsm10BAvx: MOVQ DX, AX @@ -10385,8 +10371,8 @@ zero_loop_encodeBlockAsm8BAvx: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -10396,68 
+10382,68 @@ zero_loop_encodeBlockAsm8BAvx: MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm8BAvx: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x04, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x04, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeBlockAsm8BAvx - MOVL BX, 20(SP) - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x38, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeBlockAsm8BAvx - LEAL 1(CX), DI - MOVL 12(SP), R8 - MOVL DI, BX - SUBL 16(SP), BX + LEAL 1(CX), SI + MOVL 12(SP), DI + MOVL SI, BP + SUBL 16(SP), BP JZ repeat_extend_back_end_encodeBlockAsm8BAvx repeat_extend_back_loop_encodeBlockAsm8BAvx: - CMPL DI, R8 + CMPL SI, DI JLE repeat_extend_back_end_encodeBlockAsm8BAvx - MOVB -1(DX)(BX*1), BP - MOVB -1(DX)(DI*1), SI - CMPB BP, SI + MOVB -1(DX)(BP*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeBlockAsm8BAvx - LEAL -1(DI), DI - DECL BX + LEAL -1(SI), SI + DECL BP JNZ repeat_extend_back_loop_encodeBlockAsm8BAvx 
repeat_extend_back_end_encodeBlockAsm8BAvx: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeBlockAsm8BAvx - MOVL DI, BX - MOVL DI, 12(SP) + MOVL SI, R8 + MOVL SI, 12(SP) LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + SUBL BP, R8 + MOVL R8, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeBlockAsm8BAvx CMPL BP, $0x3c @@ -10500,87 +10486,86 @@ one_byte_repeat_emit_encodeBlockAsm8BAvx: ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm8BAvx: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(R8*1), BP NOP emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_tail: - TESTQ BP, BP + TESTQ R8, R8 JEQ memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx - CMPQ BP, $0x02 + CMPQ R8, $0x02 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ R8, $0x04 JB emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_4 - CMPQ BP, $0x08 + CMPQ R8, $0x08 JB emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_8 - CMPQ BP, $0x10 + CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ R8, $0x40 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ R8, $0x80 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_avxUnaligned emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(BP*1) + MOVB (R9), R10 + MOVB -1(R9)(R8*1), R11 + MOVB 
R10, (AX) + MOVB R11, -1(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R9), R10 + MOVL R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (R9), R10 + MOVB 2(R9), R11 + MOVW R10, (AX) + MOVB R11, 2(AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(BP*1) + MOVL (R9), R10 + MOVL -4(R9)(R8*1), R11 + MOVL R10, (AX) + MOVL R11, -4(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R9), R10 + MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(BP*1) + MOVQ (R9), R10 + MOVQ -8(R9)(R8*1), R11 + MOVQ R10, (AX) + MOVQ R11, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_17through32: MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU -32(R9)(R8*1), X2 + MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(R8*1) + MOVOU X3, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx 
emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_65through128: @@ -10588,18 +10573,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_65through128: MOVOU 16(R9), X1 MOVOU 32(R9), X2 MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_129through256: @@ -10611,14 +10596,14 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_129through256: MOVOU 80(R9), X5 MOVOU 96(R9), X6 MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU -128(R9)(R8*1), X8 + MOVOU -112(R9)(R8*1), X9 + MOVOU -96(R9)(R8*1), X10 + MOVOU -80(R9)(R8*1), X11 + MOVOU -64(R9)(R8*1), X12 + MOVOU -48(R9)(R8*1), X13 + MOVOU -32(R9)(R8*1), X14 + MOVOU -16(R9)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -10627,18 +10612,18 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(R8*1) + MOVOU X9, -112(AX)(R8*1) + MOVOU X10, -96(AX)(R8*1) + MOVOU X11, -80(AX)(R8*1) + MOVOU X12, -64(AX)(R8*1) 
+ MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_256through2048: - LEAQ -256(BP), BP + LEAQ -256(R8), R8 MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU 32(R9), X2 @@ -10671,128 +10656,128 @@ emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 + CMPQ R8, $0x00000100 LEAQ 256(R9), R9 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_tail emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_avxUnaligned: - LEAQ (R9)(BP*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (R9)(R8*1), R11 + MOVQ AX, R13 + MOVOU -128(R11), X5 + MOVOU -112(R11), X6 + MOVQ $0x00000080, R10 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, BP - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 + MOVOU -96(R11), X7 + MOVOU -80(R11), X8 + MOVQ AX, R12 + SUBQ R13, R12 + MOVOU -64(R11), X9 + MOVOU -48(R11), X10 + SUBQ R12, R8 + MOVOU -32(R11), X11 + MOVOU -16(R11), X12 VMOVDQU (R9), Y4 - ADDQ R13, R9 - SUBQ R11, BP + ADDQ R12, R9 + SUBQ R10, R8 emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_gobble_128_loop: VMOVDQU (R9), Y0 VMOVDQU 32(R9), Y1 VMOVDQU 64(R9), Y2 VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + ADDQ R10, R9 VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, BP + ADDQ R10, AX + SUBQ R10, R8 JA emit_lit_memmove_repeat_emit_encodeBlockAsm8BAvx_memmove_gobble_128_loop - ADDQ R11, BP - ADDQ AX, BP - VMOVDQU Y4, (R14) + ADDQ R10, R8 + ADDQ AX, R8 + VMOVDQU Y4, (R13) VZEROUPPER - MOVOU X5, -128(BP) - MOVOU X6, -112(BP) - MOVOU X7, -96(BP) - 
MOVOU X8, -80(BP) - MOVOU X9, -64(BP) - MOVOU X10, -48(BP) - MOVOU X11, -32(BP) - MOVOU X12, -16(BP) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_repeat_emit_encodeBlockAsm8BAvx: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeBlockAsm8BAvx: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), R8 + SUBL CX, R8 + LEAQ (DX)(CX*1), R9 + LEAQ (DX)(BP*1), BP + XORL R11, R11 + CMPL R8, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R9)(R11*1), R10 + XORQ (BP)(R11*1), R10 + TESTQ R10, R10 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R10, R10 + SARQ $0x03, R10 + LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm8BAvx matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(R8), R8 + LEAL 8(R11), R11 + CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL R8, R8 JZ repeat_extend_forward_end_encodeBlockAsm8BAvx matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R9)(R11*1), R10 + CMPB (BP)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm8BAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R11), R11 + DECL R8 JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeBlockAsm8BAvx: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - TESTL R8, R8 + ADDL R11, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm8BAvx emit_repeat_again_match_repeat_encodeBlockAsm8BAvx: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, 
DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm8BAvx CMPL DI, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm8BAvx cant_repeat_two_offset_match_repeat_encodeBlockAsm8BAvx: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm8BAvx - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_match_repeat_encodeBlockAsm8BAvx - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_match_repeat_encodeBlockAsm8BAvx - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -10800,79 +10785,79 @@ cant_repeat_two_offset_match_repeat_encodeBlockAsm8BAvx: JMP emit_repeat_again_match_repeat_encodeBlockAsm8BAvx repeat_five_match_repeat_encodeBlockAsm8BAvx: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_four_match_repeat_encodeBlockAsm8BAvx: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_three_match_repeat_encodeBlockAsm8BAvx: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_two_match_repeat_encodeBlockAsm8BAvx: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_two_offset_match_repeat_encodeBlockAsm8BAvx: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + 
MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_as_copy_encodeBlockAsm8BAvx: - CMPL BP, $0x00010000 + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm8BAvx four_bytes_loop_back_repeat_as_copy_encodeBlockAsm8BAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm8BAvx MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm8BAvx emit_repeat_again_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -10880,84 +10865,84 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy repeat_five_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_four_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy: - LEAL -256(BX), BX + 
LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_three_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_two_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_two_offset_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8BAvx - JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm8BAvx + JMP four_bytes_loop_back_repeat_as_copy_encodeBlockAsm8BAvx four_bytes_remain_repeat_as_copy_encodeBlockAsm8BAvx: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeBlockAsm8BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm8BAvx two_byte_offset_repeat_as_copy_encodeBlockAsm8BAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8BAvx MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX emit_repeat_again_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX + MOVL BP, DI + LEAL -4(BP), BP CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JLT 
repeat_two_offset_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short: - CMPL BX, $0x00000104 + CMPL BP, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short - CMPL BX, $0x00010100 + CMPL BP, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short - CMPL BX, $0x0100ffff + CMPL BP, $0x0100ffff JLT repeat_five_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(BP), BP MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -10965,68 +10950,68 @@ cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short: JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short repeat_five_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(BP), BP + MOVL BP, SI MOVW $0x001d, (AX) - MOVW BX, 2(AX) - SARL $0x10, BP - MOVB BP, 4(AX) + MOVW BP, 2(AX) + SARL $0x10, SI + MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_four_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(BP), BP MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW BP, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_three_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(BP), BP MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB BP, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_two_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, BP + ORL $0x01, BP + MOVW BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8BAvx repeat_two_offset_repeat_as_copy_encodeBlockAsm8BAvx_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX - MOVB BP, 1(AX) - SARL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + XORQ DI, DI + LEAL 1(DI)(BP*4), BP + MOVB SI, 1(AX) + SARL $0x08, SI + SHLL $0x05, SI + ORL SI, 
BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8BAvx - JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8BAvx + JMP two_byte_offset_repeat_as_copy_encodeBlockAsm8BAvx two_byte_offset_short_repeat_as_copy_encodeBlockAsm8BAvx: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm8BAvx emit_copy_three_repeat_as_copy_encodeBlockAsm8BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm8BAvx: @@ -11036,16 +11021,16 @@ repeat_end_emit_encodeBlockAsm8BAvx: JMP search_loop_encodeBlockAsm8BAvx no_repeat_found_encodeBlockAsm8BAvx: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeBlockAsm8BAvx - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm8BAvx - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeBlockAsm8BAvx MOVL 20(SP), CX JMP search_loop_encodeBlockAsm8BAvx @@ -11055,46 +11040,46 @@ candidate3_match_encodeBlockAsm8BAvx: JMP candidate_match_encodeBlockAsm8BAvx candidate2_match_encodeBlockAsm8BAvx: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeBlockAsm8BAvx: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ 
match_extend_back_end_encodeBlockAsm8BAvx match_extend_back_loop_encodeBlockAsm8BAvx: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeBlockAsm8BAvx - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeBlockAsm8BAvx LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeBlockAsm8BAvx JMP match_extend_back_loop_encodeBlockAsm8BAvx match_extend_back_end_encodeBlockAsm8BAvx: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm8BAvx MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm8BAvx: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeBlockAsm8BAvx - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeBlockAsm8BAvx CMPL DI, $0x3c @@ -11111,11 +11096,11 @@ match_dst_size_check_encodeBlockAsm8BAvx: JMP memmove_match_emit_encodeBlockAsm8BAvx four_bytes_match_emit_encodeBlockAsm8BAvx: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeBlockAsm8BAvx @@ -11137,8 +11122,7 @@ one_byte_match_emit_encodeBlockAsm8BAvx: ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm8BAvx: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_tail: @@ -11165,55 +11149,55 @@ emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_tail: JMP emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_avxUnaligned emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R12 - MOVB R11, (AX) - MOVB R12, 
-1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8BAvx emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm8BAvx emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), R10 + MOVW R9, (AX) + MOVB R10, 2(AX) JMP memmove_end_copy_match_emit_encodeBlockAsm8BAvx emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), R10 + MOVL R9, (AX) + MOVL R10, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8BAvx emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm8BAvx emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8BAvx emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm8BAvx emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -11221,14 +11205,14 @@ 
emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeBlockAsm8BAvx emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -11240,22 +11224,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeBlockAsm8BAvx emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -11276,22 +11260,22 @@ emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_129through256: emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - 
MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -11309,139 +11293,139 @@ emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_tail emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_avxUnaligned: - LEAQ (R9)(R8*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (SI)(R8*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, R8 - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 SUBQ R11, R8 + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (SI), Y4 + ADDQ R11, SI + SUBQ R9, R8 emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU 64(SI), Y2 + VMOVDQU 96(SI), Y3 + ADDQ R9, SI VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, R8 + ADDQ 
R9, AX + SUBQ R9, R8 JA emit_lit_memmove_match_emit_encodeBlockAsm8BAvx_memmove_gobble_128_loop - ADDQ R11, R8 + ADDQ R9, R8 ADDQ AX, R8 - VMOVDQU Y4, (R14) + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(R8) - MOVOU X6, -112(R8) - MOVOU X7, -96(R8) - MOVOU X8, -80(R8) - MOVOU X9, -64(R8) - MOVOU X10, -48(R8) - MOVOU X11, -32(R8) - MOVOU X12, -16(R8) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_match_emit_encodeBlockAsm8BAvx: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeBlockAsm8BAvx: match_nolit_loop_encodeBlockAsm8BAvx: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeBlockAsm8BAvx matchlen_loopback_match_nolit_encodeBlockAsm8BAvx: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeBlockAsm8BAvx - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm8BAvx matchlen_loop_match_nolit_encodeBlockAsm8BAvx: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm8BAvx matchlen_single_match_nolit_encodeBlockAsm8BAvx: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeBlockAsm8BAvx matchlen_single_loopback_match_nolit_encodeBlockAsm8BAvx: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeBlockAsm8BAvx - 
LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeBlockAsm8BAvx match_nolit_end_encodeBlockAsm8BAvx: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm8BAvx four_bytes_loop_back_match_nolit_encodeBlockAsm8BAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm8BAvx MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm8BAvx emit_repeat_again_match_nolit_encodeBlockAsm8BAvx_emit_copy: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm8BAvx_emit_copy - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8BAvx_emit_copy CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm8BAvx_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm8BAvx_emit_copy: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm8BAvx_emit_copy - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm8BAvx_emit_copy - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm8BAvx_emit_copy - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -11449,84 +11433,84 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm8BAvx_emit_copy: JMP emit_repeat_again_match_nolit_encodeBlockAsm8BAvx_emit_copy repeat_five_match_nolit_encodeBlockAsm8BAvx_emit_copy: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx repeat_four_match_nolit_encodeBlockAsm8BAvx_emit_copy: - LEAL 
-256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx repeat_three_match_nolit_encodeBlockAsm8BAvx_emit_copy: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx repeat_two_match_nolit_encodeBlockAsm8BAvx_emit_copy: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx repeat_two_offset_match_nolit_encodeBlockAsm8BAvx_emit_copy: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx - JMP four_bytes_loop_back_match_nolit_encodeBlockAsm8BAvx + JMP four_bytes_loop_back_match_nolit_encodeBlockAsm8BAvx four_bytes_remain_match_nolit_encodeBlockAsm8BAvx: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm8BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx two_byte_offset_match_nolit_encodeBlockAsm8BAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm8BAvx MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX emit_repeat_again_match_nolit_encodeBlockAsm8BAvx_emit_copy_short: - MOVL BX, DI - LEAL -4(BX), BX - CMPL DI, $0x08 + MOVL R9, SI + LEAL -4(R9), R9 + CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm8BAvx_emit_copy_short - CMPL DI, $0x0c + CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8BAvx_emit_copy_short CMPL BP, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm8BAvx_emit_copy_short 
cant_repeat_two_offset_match_nolit_encodeBlockAsm8BAvx_emit_copy_short: - CMPL BX, $0x00000104 + CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm8BAvx_emit_copy_short - CMPL BX, $0x00010100 + CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm8BAvx_emit_copy_short - CMPL BX, $0x0100ffff + CMPL R9, $0x0100ffff JLT repeat_five_match_nolit_encodeBlockAsm8BAvx_emit_copy_short - LEAL -16842747(BX), BX + LEAL -16842747(R9), R9 MOVW $0x001d, (AX) MOVW $0xfffb, 2(AX) MOVB $0xff, 4(AX) @@ -11534,67 +11518,67 @@ cant_repeat_two_offset_match_nolit_encodeBlockAsm8BAvx_emit_copy_short: JMP emit_repeat_again_match_nolit_encodeBlockAsm8BAvx_emit_copy_short repeat_five_match_nolit_encodeBlockAsm8BAvx_emit_copy_short: - LEAL -65536(BX), BX - MOVL BX, BP + LEAL -65536(R9), R9 + MOVL R9, BP MOVW $0x001d, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) SARL $0x10, BP MOVB BP, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx repeat_four_match_nolit_encodeBlockAsm8BAvx_emit_copy_short: - LEAL -256(BX), BX + LEAL -256(R9), R9 MOVW $0x0019, (AX) - MOVW BX, 2(AX) + MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx repeat_three_match_nolit_encodeBlockAsm8BAvx_emit_copy_short: - LEAL -4(BX), BX + LEAL -4(R9), R9 MOVW $0x0015, (AX) - MOVB BL, 2(AX) + MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx repeat_two_match_nolit_encodeBlockAsm8BAvx_emit_copy_short: - SHLL $0x02, BX - ORL $0x01, BX - MOVW BX, (AX) + SHLL $0x02, R9 + ORL $0x01, R9 + MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx repeat_two_offset_match_nolit_encodeBlockAsm8BAvx_emit_copy_short: - XORQ R9, R9 - LEAL 1(R9)(BX*4), BX + XORQ SI, SI + LEAL 1(SI)(R9*4), R9 MOVB BP, 1(AX) SARL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx - JMP two_byte_offset_match_nolit_encodeBlockAsm8BAvx + JMP 
two_byte_offset_match_nolit_encodeBlockAsm8BAvx two_byte_offset_short_match_nolit_encodeBlockAsm8BAvx: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm8BAvx CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm8BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm8BAvx emit_copy_three_match_nolit_encodeBlockAsm8BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -11608,22 +11592,22 @@ match_nolit_emitcopy_end_encodeBlockAsm8BAvx: RET match_nolit_dst_ok_encodeBlockAsm8BAvx: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x38, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x9e3779b1, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x20, DI + IMULQ BP, DI + SHRQ $0x38, DI + SHLQ $0x20, R8 + IMULQ BP, R8 + SHRQ $0x38, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeBlockAsm8BAvx INCL CX JMP search_loop_encodeBlockAsm8BAvx @@ -11904,14 +11888,14 @@ emit_lit_memmove_emit_remainder_encodeBlockAsm8BAvx_memmove_gobble_128_loop: ADDQ AX, BX VMOVDQU Y4, (R8) VZEROUPPER - MOVOU X5, -128(BX) - MOVOU X6, -112(BX) - MOVOU X7, -96(BX) - MOVOU X8, -80(BX) - MOVOU X9, -64(BX) - MOVOU X10, -48(BX) - MOVOU X11, -32(BX) - MOVOU X12, -16(BX) + MOVOU X5, -128(BX) + MOVOU X6, -112(BX) + MOVOU X7, -96(BX) + MOVOU X8, -80(BX) + MOVOU X9, -64(BX) + MOVOU X10, -48(BX) + MOVOU X11, -32(BX) + MOVOU X12, -16(BX) 
memmove_end_copy_emit_remainder_encodeBlockAsm8BAvx: MOVQ DX, AX @@ -11945,8 +11929,8 @@ zero_loop_encodeSnappyBlockAsm: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -11956,68 +11940,68 @@ zero_loop_encodeSnappyBlockAsm: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x06, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x06, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeSnappyBlockAsm - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - SHLQ $0x10, R12 - IMULQ R10, R12 - SHRQ $0x32, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x10, R10 + IMULQ R8, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeSnappyBlockAsm - LEAL 1(CX), DI - MOVL 12(SP), BX - MOVL DI, BP - SUBL 16(SP), BP + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI JZ repeat_extend_back_end_encodeSnappyBlockAsm repeat_extend_back_loop_encodeSnappyBlockAsm: - CMPL DI, BX + CMPL SI, BP JLE 
repeat_extend_back_end_encodeSnappyBlockAsm - MOVB -1(DX)(BP*1), SI - MOVB -1(DX)(DI*1), R8 - CMPB SI, R8 + MOVB -1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeSnappyBlockAsm - LEAL -1(DI), DI - DECL BP + LEAL -1(SI), SI + DECL DI JNZ repeat_extend_back_loop_encodeSnappyBlockAsm repeat_extend_back_end_encodeSnappyBlockAsm: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm - MOVL DI, BX - MOVL DI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + MOVL DI, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeSnappyBlockAsm CMPL BP, $0x3c @@ -12034,11 +12018,11 @@ repeat_extend_back_end_encodeSnappyBlockAsm: JMP memmove_repeat_emit_encodeSnappyBlockAsm four_bytes_repeat_emit_encodeSnappyBlockAsm: - MOVL BP, R8 - SHRL $0x10, R8 + MOVL BP, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW BP, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_repeat_emit_encodeSnappyBlockAsm @@ -12060,125 +12044,124 @@ one_byte_repeat_emit_encodeSnappyBlockAsm: ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(DI*1), BP NOP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_tail: - TESTQ BP, BP + TESTQ DI, DI JEQ memmove_end_copy_repeat_emit_encodeSnappyBlockAsm - CMPQ BP, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_4 - CMPQ BP, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 - CMPQ BP, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ DI, 
$0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_256through2048 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(BP*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R8), R9 + MOVL R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(BP*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R8), R9 + MOVQ R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(BP*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) JMP 
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + 
MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -12187,34 +12170,34 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(DI*1) + MOVOU X9, -112(AX)(DI*1) + MOVOU X10, -96(AX)(DI*1) + MOVOU X11, -80(AX)(DI*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_256through2048: - LEAQ -256(BP), BP - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -12231,113 +12214,113 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 
224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 - LEAQ 256(R9), R9 + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_tail memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL DI, DI JZ repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 JNE repeat_extend_forward_end_encodeSnappyBlockAsm - LEAL 1(BX), BX - DECL R9 + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeSnappyBlockAsm: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - CMPL BP, $0x00010000 + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm 
four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeSnappyBlockAsm - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeSnappyBlockAsm two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm: @@ -12347,16 +12330,16 @@ repeat_end_emit_encodeSnappyBlockAsm: JMP search_loop_encodeSnappyBlockAsm no_repeat_found_encodeSnappyBlockAsm: - CMPL 
(DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeSnappyBlockAsm - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeSnappyBlockAsm - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeSnappyBlockAsm MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm @@ -12366,46 +12349,46 @@ candidate3_match_encodeSnappyBlockAsm: JMP candidate_match_encodeSnappyBlockAsm candidate2_match_encodeSnappyBlockAsm: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeSnappyBlockAsm: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeSnappyBlockAsm match_extend_back_loop_encodeSnappyBlockAsm: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeSnappyBlockAsm - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeSnappyBlockAsm LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeSnappyBlockAsm JMP match_extend_back_loop_encodeSnappyBlockAsm match_extend_back_end_encodeSnappyBlockAsm: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeSnappyBlockAsm CMPL DI, $0x3c @@ -12422,11 +12405,11 @@ 
match_dst_size_check_encodeSnappyBlockAsm: JMP memmove_match_emit_encodeSnappyBlockAsm four_bytes_match_emit_encodeSnappyBlockAsm: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeSnappyBlockAsm @@ -12448,8 +12431,7 @@ one_byte_match_emit_encodeSnappyBlockAsm: ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_tail: @@ -12476,55 +12458,55 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_tail: JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_256through2048 emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm 
emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -12532,14 +12514,14 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -12551,22 +12533,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU 
-64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -12587,22 +12569,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_129through256: emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -12620,113 +12602,113 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_tail memmove_end_copy_match_emit_encodeSnappyBlockAsm: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeSnappyBlockAsm: match_nolit_loop_encodeSnappyBlockAsm: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + 
SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm matchlen_loopback_match_nolit_encodeSnappyBlockAsm: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm matchlen_loop_match_nolit_encodeSnappyBlockAsm: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm matchlen_single_match_nolit_encodeSnappyBlockAsm: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeSnappyBlockAsm matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeSnappyBlockAsm - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm match_nolit_end_encodeSnappyBlockAsm: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeSnappyBlockAsm four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm four_bytes_remain_match_nolit_encodeSnappyBlockAsm: - TESTL 
BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm two_byte_offset_match_nolit_encodeSnappyBlockAsm: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm emit_copy_three_match_nolit_encodeSnappyBlockAsm: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -12740,22 +12722,22 @@ match_nolit_emitcopy_end_encodeSnappyBlockAsm: RET match_nolit_dst_ok_encodeSnappyBlockAsm: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x0000cf1bbcdcbf9b, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - SHLQ $0x10, R12 - IMULQ R10, R12 - SHRQ $0x32, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x0000cf1bbcdcbf9b, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x10, DI + IMULQ BP, DI + SHRQ $0x32, DI + SHLQ $0x10, R8 + IMULQ BP, R8 + SHRQ $0x32, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ 
match_nolit_loop_encodeSnappyBlockAsm INCL CX JMP search_loop_encodeSnappyBlockAsm @@ -13030,8 +13012,8 @@ zero_loop_encodeSnappyBlockAsm12B: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -13041,68 +13023,68 @@ zero_loop_encodeSnappyBlockAsm12B: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm12B: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x05, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeSnappyBlockAsm12B - MOVL BX, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - SHLQ $0x18, R12 - IMULQ R10, R12 - SHRQ $0x34, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + SHLQ $0x18, R10 + IMULQ R8, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeSnappyBlockAsm12B - LEAL 1(CX), DI - MOVL 12(SP), BX - MOVL DI, BP - SUBL 16(SP), BP + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI JZ repeat_extend_back_end_encodeSnappyBlockAsm12B 
repeat_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL DI, BX + CMPL SI, BP JLE repeat_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(DX)(BP*1), SI - MOVB -1(DX)(DI*1), R8 - CMPB SI, R8 + MOVB -1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeSnappyBlockAsm12B - LEAL -1(DI), DI - DECL BP + LEAL -1(SI), SI + DECL DI JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B repeat_extend_back_end_encodeSnappyBlockAsm12B: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B - MOVL DI, BX - MOVL DI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + MOVL DI, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B CMPL BP, $0x3c @@ -13119,11 +13101,11 @@ repeat_extend_back_end_encodeSnappyBlockAsm12B: JMP memmove_repeat_emit_encodeSnappyBlockAsm12B four_bytes_repeat_emit_encodeSnappyBlockAsm12B: - MOVL BP, R8 - SHRL $0x10, R8 + MOVL BP, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW BP, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_repeat_emit_encodeSnappyBlockAsm12B @@ -13145,125 +13127,124 @@ one_byte_repeat_emit_encodeSnappyBlockAsm12B: ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(DI*1), BP NOP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_tail: - TESTQ BP, BP + TESTQ DI, DI JEQ memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B - CMPQ BP, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_4 - CMPQ BP, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 - 
CMPQ BP, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_256through2048 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(BP*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R8), R9 + MOVL R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(BP*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R8), R9 + MOVQ R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_9through16: - MOVQ (R9), 
R11 - MOVQ -8(R9)(BP*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(BP*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU 
-112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -13272,34 +13253,34 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(DI*1) + MOVOU X9, -112(AX)(DI*1) + MOVOU X10, -96(AX)(DI*1) + MOVOU X11, -80(AX)(DI*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_256through2048: - LEAQ -256(BP), BP - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), 
X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -13316,113 +13297,113 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_256through2048 MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 - LEAQ 256(R9), R9 + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_tail memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL DI, DI JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeSnappyBlockAsm12B: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - 
CMPL BP, $0x00010000 + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm12B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm12B MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm12B JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm12B four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm12B: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeSnappyBlockAsm12B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeSnappyBlockAsm12B two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm12B emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 
1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm12B: @@ -13432,16 +13413,16 @@ repeat_end_emit_encodeSnappyBlockAsm12B: JMP search_loop_encodeSnappyBlockAsm12B no_repeat_found_encodeSnappyBlockAsm12B: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeSnappyBlockAsm12B - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeSnappyBlockAsm12B - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeSnappyBlockAsm12B MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm12B @@ -13451,46 +13432,46 @@ candidate3_match_encodeSnappyBlockAsm12B: JMP candidate_match_encodeSnappyBlockAsm12B candidate2_match_encodeSnappyBlockAsm12B: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeSnappyBlockAsm12B: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeSnappyBlockAsm12B match_extend_back_loop_encodeSnappyBlockAsm12B: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeSnappyBlockAsm12B - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeSnappyBlockAsm12B LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeSnappyBlockAsm12B JMP match_extend_back_loop_encodeSnappyBlockAsm12B match_extend_back_end_encodeSnappyBlockAsm12B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm12B: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B - MOVL R8, BP - 
MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeSnappyBlockAsm12B CMPL DI, $0x3c @@ -13507,11 +13488,11 @@ match_dst_size_check_encodeSnappyBlockAsm12B: JMP memmove_match_emit_encodeSnappyBlockAsm12B four_bytes_match_emit_encodeSnappyBlockAsm12B: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeSnappyBlockAsm12B @@ -13533,8 +13514,7 @@ one_byte_match_emit_encodeSnappyBlockAsm12B: ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm12B: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_tail: @@ -13561,55 +13541,55 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_tail: JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_256through2048 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) 
JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -13617,14 +13597,14 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -13636,22 +13616,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B 
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -13672,22 +13652,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_129through256: emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -13705,113 +13685,113 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE 
emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_tail memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeSnappyBlockAsm12B: match_nolit_loop_encodeSnappyBlockAsm12B: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm12B matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm12B matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B matchlen_single_match_nolit_encodeSnappyBlockAsm12B: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeSnappyBlockAsm12B matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeSnappyBlockAsm12B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12B match_nolit_end_encodeSnappyBlockAsm12B: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeSnappyBlockAsm12B 
four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm12B: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm12B MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm12B JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm12B four_bytes_remain_match_nolit_encodeSnappyBlockAsm12B: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm12B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -13825,22 +13805,22 @@ match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: RET match_nolit_dst_ok_encodeSnappyBlockAsm12B: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - SHLQ $0x18, R12 - IMULQ R10, 
R12 - SHRQ $0x34, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x000000cf1bbcdcbb, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x18, DI + IMULQ BP, DI + SHRQ $0x34, DI + SHLQ $0x18, R8 + IMULQ BP, R8 + SHRQ $0x34, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeSnappyBlockAsm12B INCL CX JMP search_loop_encodeSnappyBlockAsm12B @@ -14115,8 +14095,8 @@ zero_loop_encodeSnappyBlockAsm10B: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -14126,68 +14106,68 @@ zero_loop_encodeSnappyBlockAsm10B: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm10B: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x05, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeSnappyBlockAsm10B - MOVL BX, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x36, R11 - SHLQ $0x18, R12 - IMULQ R10, R12 - SHRQ $0x36, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x36, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + SHLQ $0x18, R10 + IMULQ R8, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) 
+ MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeSnappyBlockAsm10B - LEAL 1(CX), DI - MOVL 12(SP), BX - MOVL DI, BP - SUBL 16(SP), BP + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI JZ repeat_extend_back_end_encodeSnappyBlockAsm10B repeat_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL DI, BX + CMPL SI, BP JLE repeat_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(BP*1), SI - MOVB -1(DX)(DI*1), R8 - CMPB SI, R8 + MOVB -1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeSnappyBlockAsm10B - LEAL -1(DI), DI - DECL BP + LEAL -1(SI), SI + DECL DI JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B repeat_extend_back_end_encodeSnappyBlockAsm10B: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B - MOVL DI, BX - MOVL DI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + MOVL DI, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B CMPL BP, $0x3c @@ -14204,11 +14184,11 @@ repeat_extend_back_end_encodeSnappyBlockAsm10B: JMP memmove_repeat_emit_encodeSnappyBlockAsm10B four_bytes_repeat_emit_encodeSnappyBlockAsm10B: - MOVL BP, R8 - SHRL $0x10, R8 + MOVL BP, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW BP, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_repeat_emit_encodeSnappyBlockAsm10B @@ -14230,125 +14210,124 @@ one_byte_repeat_emit_encodeSnappyBlockAsm10B: ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(DI*1), BP NOP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_tail: - TESTQ BP, BP + TESTQ DI, DI JEQ memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B - CMPQ BP, $0x02 + CMPQ DI, $0x02 JBE 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_4 - CMPQ BP, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8 - CMPQ BP, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_256through2048 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(BP*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R8), R9 + MOVL R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R9 - MOVL R11, (AX) - MOVL R9, 
-4(AX)(BP*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R8), R9 + MOVQ R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(BP*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + 
MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -14357,34 +14336,34 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(DI*1) + MOVOU X9, -112(AX)(DI*1) + MOVOU X10, -96(AX)(DI*1) + MOVOU X11, -80(AX)(DI*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_256through2048: - LEAQ -256(BP), BP - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 
176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -14401,113 +14380,113 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_256through2048 MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 - LEAQ 256(R9), R9 + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_tail memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL DI, DI JZ 
repeat_extend_forward_end_encodeSnappyBlockAsm10B matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeSnappyBlockAsm10B: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - CMPL BP, $0x00010000 + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm10B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm10B MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm10B JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm10B four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm10B: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeSnappyBlockAsm10B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeSnappyBlockAsm10B two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 
1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm10B emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm10B: @@ -14517,16 +14496,16 @@ repeat_end_emit_encodeSnappyBlockAsm10B: JMP search_loop_encodeSnappyBlockAsm10B no_repeat_found_encodeSnappyBlockAsm10B: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeSnappyBlockAsm10B - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeSnappyBlockAsm10B - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeSnappyBlockAsm10B MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm10B @@ -14536,46 +14515,46 @@ candidate3_match_encodeSnappyBlockAsm10B: JMP candidate_match_encodeSnappyBlockAsm10B candidate2_match_encodeSnappyBlockAsm10B: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeSnappyBlockAsm10B: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeSnappyBlockAsm10B match_extend_back_loop_encodeSnappyBlockAsm10B: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeSnappyBlockAsm10B - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeSnappyBlockAsm10B LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeSnappyBlockAsm10B JMP match_extend_back_loop_encodeSnappyBlockAsm10B 
match_extend_back_end_encodeSnappyBlockAsm10B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm10B: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeSnappyBlockAsm10B CMPL DI, $0x3c @@ -14592,11 +14571,11 @@ match_dst_size_check_encodeSnappyBlockAsm10B: JMP memmove_match_emit_encodeSnappyBlockAsm10B four_bytes_match_emit_encodeSnappyBlockAsm10B: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeSnappyBlockAsm10B @@ -14618,8 +14597,7 @@ one_byte_match_emit_encodeSnappyBlockAsm10B: ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm10B: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_tail: @@ -14646,55 +14624,55 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_tail: JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_256through2048 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B 
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -14702,14 +14680,14 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 
- MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -14721,22 +14699,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -14750,29 +14728,29 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_129through256: MOVOU X10, -96(AX)(R8*1) MOVOU X11, -80(AX)(R8*1) MOVOU X12, -64(AX)(R8*1) - MOVOU X13, -48(AX)(R8*1) - MOVOU X14, -32(AX)(R8*1) - MOVOU X15, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_256through2048: - LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), 
X14 - MOVOU 240(R9), X15 + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_256through2048: + LEAQ -256(R8), R8 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -14790,113 +14768,113 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_tail memmove_end_copy_match_emit_encodeSnappyBlockAsm10B: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeSnappyBlockAsm10B: match_nolit_loop_encodeSnappyBlockAsm10B: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm10B matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP 
match_nolit_end_encodeSnappyBlockAsm10B matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B matchlen_single_match_nolit_encodeSnappyBlockAsm10B: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeSnappyBlockAsm10B matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeSnappyBlockAsm10B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10B match_nolit_end_encodeSnappyBlockAsm10B: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeSnappyBlockAsm10B four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm10B: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm10B MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm10B JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm10B four_bytes_remain_match_nolit_encodeSnappyBlockAsm10B: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm10B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B two_byte_offset_match_nolit_encodeSnappyBlockAsm10B: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE 
emit_copy_three_match_nolit_encodeSnappyBlockAsm10B CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B emit_copy_three_match_nolit_encodeSnappyBlockAsm10B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -14910,22 +14888,22 @@ match_nolit_emitcopy_end_encodeSnappyBlockAsm10B: RET match_nolit_dst_ok_encodeSnappyBlockAsm10B: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x36, R11 - SHLQ $0x18, R12 - IMULQ R10, R12 - SHRQ $0x36, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x000000cf1bbcdcbb, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x18, DI + IMULQ BP, DI + SHRQ $0x36, DI + SHLQ $0x18, R8 + IMULQ BP, R8 + SHRQ $0x36, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeSnappyBlockAsm10B INCL CX JMP search_loop_encodeSnappyBlockAsm10B @@ -15200,8 +15178,8 @@ zero_loop_encodeSnappyBlockAsm8B: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -15211,68 +15189,68 @@ zero_loop_encodeSnappyBlockAsm8B: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm8B: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x04, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x04, BP + LEAL 
4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeSnappyBlockAsm8B - MOVL BX, 20(SP) - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x38, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeSnappyBlockAsm8B - LEAL 1(CX), DI - MOVL 12(SP), BX - MOVL DI, BP - SUBL 16(SP), BP + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI JZ repeat_extend_back_end_encodeSnappyBlockAsm8B repeat_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL DI, BX + CMPL SI, BP JLE repeat_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(DX)(BP*1), SI - MOVB -1(DX)(DI*1), R8 - CMPB SI, R8 + MOVB -1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeSnappyBlockAsm8B - LEAL -1(DI), DI - DECL BP + LEAL -1(SI), SI + DECL DI JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B repeat_extend_back_end_encodeSnappyBlockAsm8B: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B - MOVL DI, BX - MOVL DI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + MOVL DI, 
BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B CMPL BP, $0x3c @@ -15289,11 +15267,11 @@ repeat_extend_back_end_encodeSnappyBlockAsm8B: JMP memmove_repeat_emit_encodeSnappyBlockAsm8B four_bytes_repeat_emit_encodeSnappyBlockAsm8B: - MOVL BP, R8 - SHRL $0x10, R8 + MOVL BP, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW BP, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_repeat_emit_encodeSnappyBlockAsm8B @@ -15315,125 +15293,124 @@ one_byte_repeat_emit_encodeSnappyBlockAsm8B: ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(DI*1), BP NOP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_tail: - TESTQ BP, BP + TESTQ DI, DI JEQ memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B - CMPQ BP, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_4 - CMPQ BP, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 - CMPQ BP, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_256through2048 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(BP*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R8 + MOVB R9, (AX) + MOVB R8, -1(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R8), R9 + MOVL R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (R8), R9 + MOVB 2(R8), R8 + MOVW R9, (AX) + MOVB R8, 2(AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(BP*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R8 + MOVL R9, (AX) + MOVL R8, -4(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R8), R9 + MOVQ R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(BP*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R8 + MOVQ R9, (AX) + MOVQ R8, -8(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU (R8), X0 + MOVOU 
16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -15442,34 +15419,34 @@ 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(DI*1) + MOVOU X9, -112(AX)(DI*1) + MOVOU X10, -96(AX)(DI*1) + MOVOU X11, -80(AX)(DI*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_256through2048: - LEAQ -256(BP), BP - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -15486,113 +15463,113 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 - LEAQ 256(R9), R9 + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_tail memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: ADDL $0x05, CX - 
MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL DI, DI JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeSnappyBlockAsm8B: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - CMPL BP, $0x00010000 + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm8B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm8B MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm8B JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm8B four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm8B: 
- TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeSnappyBlockAsm8B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeSnappyBlockAsm8B two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm8B emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm8B: @@ -15602,16 +15579,16 @@ repeat_end_emit_encodeSnappyBlockAsm8B: JMP search_loop_encodeSnappyBlockAsm8B no_repeat_found_encodeSnappyBlockAsm8B: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeSnappyBlockAsm8B - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeSnappyBlockAsm8B - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ 
candidate3_match_encodeSnappyBlockAsm8B MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm8B @@ -15621,46 +15598,46 @@ candidate3_match_encodeSnappyBlockAsm8B: JMP candidate_match_encodeSnappyBlockAsm8B candidate2_match_encodeSnappyBlockAsm8B: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeSnappyBlockAsm8B: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeSnappyBlockAsm8B match_extend_back_loop_encodeSnappyBlockAsm8B: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeSnappyBlockAsm8B - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeSnappyBlockAsm8B LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeSnappyBlockAsm8B JMP match_extend_back_loop_encodeSnappyBlockAsm8B match_extend_back_end_encodeSnappyBlockAsm8B: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm8B: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeSnappyBlockAsm8B CMPL DI, $0x3c @@ -15677,11 +15654,11 @@ match_dst_size_check_encodeSnappyBlockAsm8B: JMP memmove_match_emit_encodeSnappyBlockAsm8B four_bytes_match_emit_encodeSnappyBlockAsm8B: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeSnappyBlockAsm8B @@ -15703,8 +15680,7 @@ one_byte_match_emit_encodeSnappyBlockAsm8B: 
ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm8B: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_tail: @@ -15731,55 +15707,55 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_tail: JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_256through2048 emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R9 - MOVB R11, (AX) - MOVB R9, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), SI + MOVB R9, (AX) + MOVB SI, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R9 - MOVW R11, (AX) - MOVB R9, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), SI + MOVW R9, (AX) + MOVB SI, 2(AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R9 - MOVL R11, (AX) - MOVL R9, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), SI + MOVL R9, (AX) + MOVL SI, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R9 - MOVQ R11, (AX) - MOVQ R9, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), SI + MOVQ R9, (AX) + MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), 
X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -15787,14 +15763,14 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -15806,22 +15782,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 
+ MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -15842,22 +15818,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_129through256: emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -15875,113 +15851,113 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_tail memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeSnappyBlockAsm8B: match_nolit_loop_encodeSnappyBlockAsm8B: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm8B 
matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm8B matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B matchlen_single_match_nolit_encodeSnappyBlockAsm8B: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeSnappyBlockAsm8B matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeSnappyBlockAsm8B - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8B match_nolit_end_encodeSnappyBlockAsm8B: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeSnappyBlockAsm8B four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm8B: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm8B MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm8B JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm8B four_bytes_remain_match_nolit_encodeSnappyBlockAsm8B: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm8B - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: - CMPL 
BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -15995,22 +15971,22 @@ match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: RET match_nolit_dst_ok_encodeSnappyBlockAsm8B: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x38, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x9e3779b1, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x20, DI + IMULQ BP, DI + SHRQ $0x38, DI + SHLQ $0x20, R8 + IMULQ BP, R8 + SHRQ $0x38, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeSnappyBlockAsm8B INCL CX JMP search_loop_encodeSnappyBlockAsm8B @@ -16285,8 +16261,8 @@ zero_loop_encodeSnappyBlockAsmAvx: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -16296,68 +16272,68 @@ 
zero_loop_encodeSnappyBlockAsmAvx: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsmAvx: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x06, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x06, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeSnappyBlockAsmAvx - MOVL BX, 20(SP) - MOVQ $0x0000cf1bbcdcbf9b, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - SHLQ $0x10, R12 - IMULQ R10, R12 - SHRQ $0x32, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x0000cf1bbcdcbf9b, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + SHLQ $0x10, R10 + IMULQ R8, R10 + SHRQ $0x32, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x10, R9 + IMULQ R8, R9 + SHRQ $0x32, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeSnappyBlockAsmAvx - LEAL 1(CX), DI - MOVL 12(SP), BX - MOVL DI, BP - SUBL 16(SP), BP + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI JZ repeat_extend_back_end_encodeSnappyBlockAsmAvx repeat_extend_back_loop_encodeSnappyBlockAsmAvx: - CMPL DI, BX + CMPL SI, BP JLE repeat_extend_back_end_encodeSnappyBlockAsmAvx - MOVB -1(DX)(BP*1), SI - MOVB -1(DX)(DI*1), R8 - CMPB SI, R8 + MOVB -1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeSnappyBlockAsmAvx - LEAL -1(DI), DI - DECL BP + LEAL -1(SI), SI + DECL DI JNZ 
repeat_extend_back_loop_encodeSnappyBlockAsmAvx repeat_extend_back_end_encodeSnappyBlockAsmAvx: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsmAvx - MOVL DI, BX - MOVL DI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + MOVL DI, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeSnappyBlockAsmAvx CMPL BP, $0x3c @@ -16374,11 +16350,11 @@ repeat_extend_back_end_encodeSnappyBlockAsmAvx: JMP memmove_repeat_emit_encodeSnappyBlockAsmAvx four_bytes_repeat_emit_encodeSnappyBlockAsmAvx: - MOVL BP, R8 - SHRL $0x10, R8 + MOVL BP, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW BP, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_repeat_emit_encodeSnappyBlockAsmAvx @@ -16400,125 +16376,124 @@ one_byte_repeat_emit_encodeSnappyBlockAsmAvx: ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsmAvx: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(DI*1), BP NOP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_tail: - TESTQ BP, BP + TESTQ DI, DI JEQ memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx - CMPQ BP, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_4 - CMPQ BP, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_8 - CMPQ BP, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_33through64 - CMPQ BP, $0x80 + 
CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_avxUnaligned emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(BP*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R8), R9 + MOVL R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (R8), R9 + MOVB 2(R8), R10 + MOVW R9, (AX) + MOVB R10, 2(AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(BP*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R10 + MOVL R9, (AX) + MOVL R10, -4(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R8), R9 + MOVQ R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(BP*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU (R8), X0 + MOVOU 
-16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 
+ MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -16527,34 +16502,34 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(DI*1) + MOVOU X9, -112(AX)(DI*1) + MOVOU X10, -96(AX)(DI*1) + MOVOU X11, -80(AX)(DI*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_256through2048: - LEAQ -256(BP), BP - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -16571,160 +16546,160 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_256through2048 MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 - LEAQ 256(R9), R9 + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(AX), AX JGE 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_tail emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_avxUnaligned: - LEAQ (R9)(BP*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (R8)(DI*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, BP - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 - SUBQ R11, BP + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 + SUBQ R11, DI + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (R8), Y4 + ADDQ R11, R8 + SUBQ R9, DI emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU 64(R8), Y2 + VMOVDQU 96(R8), Y3 + ADDQ R9, R8 VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, BP + ADDQ R9, AX + SUBQ R9, DI JA emit_lit_memmove_repeat_emit_encodeSnappyBlockAsmAvx_memmove_gobble_128_loop - ADDQ R11, BP - ADDQ AX, BP - VMOVDQU Y4, (R14) + ADDQ R9, DI + ADDQ AX, DI + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(BP) - MOVOU X6, -112(BP) - MOVOU X7, -96(BP) - MOVOU X8, -80(BP) - MOVOU X9, -64(BP) - MOVOU X10, -48(BP) - MOVOU X11, -32(BP) - MOVOU X12, -16(BP) + MOVOU X5, -128(DI) + MOVOU X6, -112(DI) + MOVOU X7, -96(DI) + MOVOU X8, -80(DI) + MOVOU X9, -64(DI) + MOVOU X10, -48(DI) + MOVOU X11, -32(DI) + MOVOU X12, -16(DI) memmove_end_copy_repeat_emit_encodeSnappyBlockAsmAvx: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsmAvx: ADDL $0x05, CX - 
MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsmAvx matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL DI, DI JZ repeat_extend_forward_end_encodeSnappyBlockAsmAvx matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 JNE repeat_extend_forward_end_encodeSnappyBlockAsmAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeSnappyBlockAsmAvx: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - CMPL BP, $0x00010000 + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsmAvx four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsmAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsmAvx MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsmAvx JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsmAvx 
four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsmAvx: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeSnappyBlockAsmAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeSnappyBlockAsmAvx two_byte_offset_repeat_as_copy_encodeSnappyBlockAsmAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsmAvx MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsmAvx two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsmAvx: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsmAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsmAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsmAvx emit_copy_three_repeat_as_copy_encodeSnappyBlockAsmAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsmAvx: @@ -16734,16 +16709,16 @@ repeat_end_emit_encodeSnappyBlockAsmAvx: JMP search_loop_encodeSnappyBlockAsmAvx no_repeat_found_encodeSnappyBlockAsmAvx: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeSnappyBlockAsmAvx - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeSnappyBlockAsmAvx - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 
24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeSnappyBlockAsmAvx MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsmAvx @@ -16753,46 +16728,46 @@ candidate3_match_encodeSnappyBlockAsmAvx: JMP candidate_match_encodeSnappyBlockAsmAvx candidate2_match_encodeSnappyBlockAsmAvx: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeSnappyBlockAsmAvx: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeSnappyBlockAsmAvx match_extend_back_loop_encodeSnappyBlockAsmAvx: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeSnappyBlockAsmAvx - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeSnappyBlockAsmAvx LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeSnappyBlockAsmAvx JMP match_extend_back_loop_encodeSnappyBlockAsmAvx match_extend_back_end_encodeSnappyBlockAsmAvx: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsmAvx MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsmAvx: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsmAvx - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeSnappyBlockAsmAvx CMPL DI, $0x3c @@ -16809,11 +16784,11 @@ match_dst_size_check_encodeSnappyBlockAsmAvx: JMP memmove_match_emit_encodeSnappyBlockAsmAvx four_bytes_match_emit_encodeSnappyBlockAsmAvx: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP 
memmove_match_emit_encodeSnappyBlockAsmAvx @@ -16835,8 +16810,7 @@ one_byte_match_emit_encodeSnappyBlockAsmAvx: ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsmAvx: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_tail: @@ -16863,55 +16837,55 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_tail: JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_avxUnaligned emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), R10 + MOVW R9, (AX) + MOVB R10, 2(AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), R10 + MOVL R9, (AX) + MOVL R10, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(R8*1) JMP 
memmove_end_copy_match_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -16919,14 +16893,14 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -16938,22 +16912,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + 
MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -16973,23 +16947,23 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_129through256: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsmAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_256through2048: - LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + LEAQ -256(R8), R8 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -17007,160 +16981,160 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_256through2048: MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_tail emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_avxUnaligned: - LEAQ (R9)(R8*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (SI)(R8*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU 
-80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, R8 - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 SUBQ R11, R8 + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (SI), Y4 + ADDQ R11, SI + SUBQ R9, R8 emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU 64(SI), Y2 + VMOVDQU 96(SI), Y3 + ADDQ R9, SI VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, R8 + ADDQ R9, AX + SUBQ R9, R8 JA emit_lit_memmove_match_emit_encodeSnappyBlockAsmAvx_memmove_gobble_128_loop - ADDQ R11, R8 + ADDQ R9, R8 ADDQ AX, R8 - VMOVDQU Y4, (R14) + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(R8) - MOVOU X6, -112(R8) - MOVOU X7, -96(R8) - MOVOU X8, -80(R8) - MOVOU X9, -64(R8) - MOVOU X10, -48(R8) - MOVOU X11, -32(R8) - MOVOU X12, -16(R8) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_match_emit_encodeSnappyBlockAsmAvx: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeSnappyBlockAsmAvx: match_nolit_loop_encodeSnappyBlockAsmAvx: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsmAvx matchlen_loopback_match_nolit_encodeSnappyBlockAsmAvx: - MOVQ (R10)(BX*1), R12 - XORQ 
(R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsmAvx - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsmAvx matchlen_loop_match_nolit_encodeSnappyBlockAsmAvx: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsmAvx matchlen_single_match_nolit_encodeSnappyBlockAsmAvx: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeSnappyBlockAsmAvx matchlen_single_loopback_match_nolit_encodeSnappyBlockAsmAvx: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeSnappyBlockAsmAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsmAvx match_nolit_end_encodeSnappyBlockAsmAvx: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeSnappyBlockAsmAvx four_bytes_loop_back_match_nolit_encodeSnappyBlockAsmAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsmAvx MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeSnappyBlockAsmAvx JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsmAvx four_bytes_remain_match_nolit_encodeSnappyBlockAsmAvx: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsmAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsmAvx two_byte_offset_match_nolit_encodeSnappyBlockAsmAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE 
two_byte_offset_short_match_nolit_encodeSnappyBlockAsmAvx MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsmAvx two_byte_offset_short_match_nolit_encodeSnappyBlockAsmAvx: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsmAvx CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsmAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsmAvx emit_copy_three_match_nolit_encodeSnappyBlockAsmAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -17174,22 +17148,22 @@ match_nolit_emitcopy_end_encodeSnappyBlockAsmAvx: RET match_nolit_dst_ok_encodeSnappyBlockAsmAvx: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x0000cf1bbcdcbf9b, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x10, R11 - IMULQ R10, R11 - SHRQ $0x32, R11 - SHLQ $0x10, R12 - IMULQ R10, R12 - SHRQ $0x32, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x0000cf1bbcdcbf9b, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x10, DI + IMULQ BP, DI + SHRQ $0x32, DI + SHLQ $0x10, R8 + IMULQ BP, R8 + SHRQ $0x32, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeSnappyBlockAsmAvx INCL CX JMP search_loop_encodeSnappyBlockAsmAvx @@ -17470,14 +17444,14 @@ emit_lit_memmove_emit_remainder_encodeSnappyBlockAsmAvx_memmove_gobble_128_loop: ADDQ AX, BX VMOVDQU Y4, (R8) VZEROUPPER - MOVOU X5, -128(BX) - MOVOU X6, -112(BX) - MOVOU X7, -96(BX) - MOVOU X8, -80(BX) - MOVOU X9, -64(BX) - MOVOU X10, -48(BX) - 
MOVOU X11, -32(BX) - MOVOU X12, -16(BX) + MOVOU X5, -128(BX) + MOVOU X6, -112(BX) + MOVOU X7, -96(BX) + MOVOU X8, -80(BX) + MOVOU X9, -64(BX) + MOVOU X10, -48(BX) + MOVOU X11, -32(BX) + MOVOU X12, -16(BX) memmove_end_copy_emit_remainder_encodeSnappyBlockAsmAvx: MOVQ DX, AX @@ -17511,8 +17485,8 @@ zero_loop_encodeSnappyBlockAsm12BAvx: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -17522,68 +17496,68 @@ zero_loop_encodeSnappyBlockAsm12BAvx: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm12BAvx: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x05, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeSnappyBlockAsm12BAvx - MOVL BX, 20(SP) - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - SHLQ $0x18, R12 - IMULQ R10, R12 - SHRQ $0x34, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x000000cf1bbcdcbb, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + SHLQ $0x18, R10 + IMULQ R8, R10 + SHRQ $0x34, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x18, R9 + IMULQ R8, R9 + SHRQ $0x34, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeSnappyBlockAsm12BAvx - LEAL 1(CX), DI - MOVL 
12(SP), BX - MOVL DI, BP - SUBL 16(SP), BP + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI JZ repeat_extend_back_end_encodeSnappyBlockAsm12BAvx repeat_extend_back_loop_encodeSnappyBlockAsm12BAvx: - CMPL DI, BX + CMPL SI, BP JLE repeat_extend_back_end_encodeSnappyBlockAsm12BAvx - MOVB -1(DX)(BP*1), SI - MOVB -1(DX)(DI*1), R8 - CMPB SI, R8 + MOVB -1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeSnappyBlockAsm12BAvx - LEAL -1(DI), DI - DECL BP + LEAL -1(SI), SI + DECL DI JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12BAvx repeat_extend_back_end_encodeSnappyBlockAsm12BAvx: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12BAvx - MOVL DI, BX - MOVL DI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + MOVL DI, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeSnappyBlockAsm12BAvx CMPL BP, $0x3c @@ -17600,11 +17574,11 @@ repeat_extend_back_end_encodeSnappyBlockAsm12BAvx: JMP memmove_repeat_emit_encodeSnappyBlockAsm12BAvx four_bytes_repeat_emit_encodeSnappyBlockAsm12BAvx: - MOVL BP, R8 - SHRL $0x10, R8 + MOVL BP, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW BP, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_repeat_emit_encodeSnappyBlockAsm12BAvx @@ -17626,125 +17600,124 @@ one_byte_repeat_emit_encodeSnappyBlockAsm12BAvx: ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm12BAvx: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(DI*1), BP NOP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_tail: - TESTQ BP, BP + TESTQ DI, DI JEQ memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx - CMPQ BP, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_3 JBE 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_4 - CMPQ BP, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_8 - CMPQ BP, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_avxUnaligned emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(BP*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R8), R9 + MOVL R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (R8), R9 + MOVB 2(R8), R10 + MOVW R9, (AX) + MOVB R10, 2(AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(BP*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R10 + MOVL R9, (AX) + MOVL R10, -4(AX)(DI*1) JMP 
memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R8), R9 + MOVQ R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(BP*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, 
-32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -17753,34 +17726,34 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_129through2 MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(DI*1) + MOVOU X9, -112(AX)(DI*1) + MOVOU X10, -96(AX)(DI*1) + MOVOU X11, -80(AX)(DI*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_256through2048: - LEAQ -256(BP), BP - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 
- MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -17797,160 +17770,160 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_256through2 MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 - LEAQ 256(R9), R9 + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_tail emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_avxUnaligned: - LEAQ (R9)(BP*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (R8)(DI*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, BP - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 - SUBQ R11, BP + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 + SUBQ R11, DI + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (R8), Y4 + ADDQ R11, R8 + SUBQ R9, DI emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU 64(R8), Y2 + VMOVDQU 96(R8), Y3 + ADDQ R9, R8 VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ 
R11, BP + ADDQ R9, AX + SUBQ R9, DI JA emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12BAvx_memmove_gobble_128_loop - ADDQ R11, BP - ADDQ AX, BP - VMOVDQU Y4, (R14) + ADDQ R9, DI + ADDQ AX, DI + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(BP) - MOVOU X6, -112(BP) - MOVOU X7, -96(BP) - MOVOU X8, -80(BP) - MOVOU X9, -64(BP) - MOVOU X10, -48(BP) - MOVOU X11, -32(BP) - MOVOU X12, -16(BP) + MOVOU X5, -128(DI) + MOVOU X6, -112(DI) + MOVOU X7, -96(DI) + MOVOU X8, -80(DI) + MOVOU X9, -64(DI) + MOVOU X10, -48(DI) + MOVOU X11, -32(DI) + MOVOU X12, -16(DI) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12BAvx: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm12BAvx: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12BAvx matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL DI, DI JZ repeat_extend_forward_end_encodeSnappyBlockAsm12BAvx matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 JNE repeat_extend_forward_end_encodeSnappyBlockAsm12BAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_repeat_extend 
repeat_extend_forward_end_encodeSnappyBlockAsm12BAvx: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - CMPL BP, $0x00010000 + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12BAvx four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm12BAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm12BAvx MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm12BAvx JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm12BAvx four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm12BAvx: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeSnappyBlockAsm12BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeSnappyBlockAsm12BAvx two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12BAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12BAvx MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12BAvx two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12BAvx: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm12BAvx 
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm12BAvx: @@ -17960,16 +17933,16 @@ repeat_end_emit_encodeSnappyBlockAsm12BAvx: JMP search_loop_encodeSnappyBlockAsm12BAvx no_repeat_found_encodeSnappyBlockAsm12BAvx: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeSnappyBlockAsm12BAvx - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeSnappyBlockAsm12BAvx - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeSnappyBlockAsm12BAvx MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm12BAvx @@ -17979,46 +17952,46 @@ candidate3_match_encodeSnappyBlockAsm12BAvx: JMP candidate_match_encodeSnappyBlockAsm12BAvx candidate2_match_encodeSnappyBlockAsm12BAvx: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeSnappyBlockAsm12BAvx: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeSnappyBlockAsm12BAvx match_extend_back_loop_encodeSnappyBlockAsm12BAvx: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeSnappyBlockAsm12BAvx - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeSnappyBlockAsm12BAvx LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeSnappyBlockAsm12BAvx JMP match_extend_back_loop_encodeSnappyBlockAsm12BAvx match_extend_back_end_encodeSnappyBlockAsm12BAvx: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL 
match_dst_size_check_encodeSnappyBlockAsm12BAvx MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm12BAvx: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12BAvx - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeSnappyBlockAsm12BAvx CMPL DI, $0x3c @@ -18035,11 +18008,11 @@ match_dst_size_check_encodeSnappyBlockAsm12BAvx: JMP memmove_match_emit_encodeSnappyBlockAsm12BAvx four_bytes_match_emit_encodeSnappyBlockAsm12BAvx: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeSnappyBlockAsm12BAvx @@ -18061,8 +18034,7 @@ one_byte_match_emit_encodeSnappyBlockAsm12BAvx: ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm12BAvx: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_tail: @@ -18089,55 +18061,55 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_tail: JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_avxUnaligned emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), R10 + 
MOVW R9, (AX) + MOVB R10, 2(AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), R10 + MOVL R9, (AX) + MOVL R10, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -18145,14 +18117,14 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 
-64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -18164,22 +18136,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_65through128 JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -18200,22 +18172,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_129through25 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), 
X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -18233,160 +18205,160 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_256through20 MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_tail emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_avxUnaligned: - LEAQ (R9)(R8*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (SI)(R8*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, R8 - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 SUBQ R11, R8 + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (SI), Y4 + ADDQ R11, SI + SUBQ R9, R8 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU 64(SI), Y2 + VMOVDQU 96(SI), Y3 + ADDQ R9, SI VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, R8 + ADDQ R9, AX + SUBQ R9, R8 JA emit_lit_memmove_match_emit_encodeSnappyBlockAsm12BAvx_memmove_gobble_128_loop - ADDQ R11, R8 + ADDQ R9, R8 ADDQ AX, R8 - VMOVDQU Y4, (R14) + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(R8) - MOVOU X6, -112(R8) - MOVOU X7, -96(R8) - MOVOU X8, -80(R8) - MOVOU X9, -64(R8) - MOVOU X10, -48(R8) - MOVOU X11, -32(R8) - MOVOU X12, -16(R8) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + 
MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_match_emit_encodeSnappyBlockAsm12BAvx: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeSnappyBlockAsm12BAvx: match_nolit_loop_encodeSnappyBlockAsm12BAvx: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm12BAvx matchlen_loopback_match_nolit_encodeSnappyBlockAsm12BAvx: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12BAvx - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm12BAvx matchlen_loop_match_nolit_encodeSnappyBlockAsm12BAvx: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12BAvx matchlen_single_match_nolit_encodeSnappyBlockAsm12BAvx: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeSnappyBlockAsm12BAvx matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12BAvx: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeSnappyBlockAsm12BAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm12BAvx match_nolit_end_encodeSnappyBlockAsm12BAvx: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL 
two_byte_offset_match_nolit_encodeSnappyBlockAsm12BAvx four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm12BAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm12BAvx MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm12BAvx JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm12BAvx four_bytes_remain_match_nolit_encodeSnappyBlockAsm12BAvx: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm12BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12BAvx two_byte_offset_match_nolit_encodeSnappyBlockAsm12BAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12BAvx MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12BAvx two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12BAvx: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12BAvx CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12BAvx emit_copy_three_match_nolit_encodeSnappyBlockAsm12BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -18400,22 +18372,22 @@ match_nolit_emitcopy_end_encodeSnappyBlockAsm12BAvx: RET match_nolit_dst_ok_encodeSnappyBlockAsm12BAvx: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x000000cf1bbcdcbb, R10 - MOVQ R9, R11 - SHRQ 
$0x10, R9 - MOVQ R9, R12 - SHLQ $0x18, R11 - IMULQ R10, R11 - SHRQ $0x34, R11 - SHLQ $0x18, R12 - IMULQ R10, R12 - SHRQ $0x34, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x000000cf1bbcdcbb, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x18, DI + IMULQ BP, DI + SHRQ $0x34, DI + SHLQ $0x18, R8 + IMULQ BP, R8 + SHRQ $0x34, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeSnappyBlockAsm12BAvx INCL CX JMP search_loop_encodeSnappyBlockAsm12BAvx @@ -18696,14 +18668,14 @@ emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12BAvx_memmove_gobble_128_lo ADDQ AX, BX VMOVDQU Y4, (R8) VZEROUPPER - MOVOU X5, -128(BX) - MOVOU X6, -112(BX) - MOVOU X7, -96(BX) - MOVOU X8, -80(BX) - MOVOU X9, -64(BX) - MOVOU X10, -48(BX) - MOVOU X11, -32(BX) - MOVOU X12, -16(BX) + MOVOU X5, -128(BX) + MOVOU X6, -112(BX) + MOVOU X7, -96(BX) + MOVOU X8, -80(BX) + MOVOU X9, -64(BX) + MOVOU X10, -48(BX) + MOVOU X11, -32(BX) + MOVOU X12, -16(BX) memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12BAvx: MOVQ DX, AX @@ -18737,8 +18709,8 @@ zero_loop_encodeSnappyBlockAsm10BAvx: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -18748,68 +18720,68 @@ zero_loop_encodeSnappyBlockAsm10BAvx: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm10BAvx: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x05, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x05, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeSnappyBlockAsm10BAvx - MOVL BX, 20(SP) - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x20, R11 - IMULQ R10, R11 - 
SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x36, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x36, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x36, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x36, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ $0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeSnappyBlockAsm10BAvx - LEAL 1(CX), DI - MOVL 12(SP), BX - MOVL DI, BP - SUBL 16(SP), BP + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI JZ repeat_extend_back_end_encodeSnappyBlockAsm10BAvx repeat_extend_back_loop_encodeSnappyBlockAsm10BAvx: - CMPL DI, BX + CMPL SI, BP JLE repeat_extend_back_end_encodeSnappyBlockAsm10BAvx - MOVB -1(DX)(BP*1), SI - MOVB -1(DX)(DI*1), R8 - CMPB SI, R8 + MOVB -1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeSnappyBlockAsm10BAvx - LEAL -1(DI), DI - DECL BP + LEAL -1(SI), SI + DECL DI JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10BAvx repeat_extend_back_end_encodeSnappyBlockAsm10BAvx: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10BAvx - MOVL DI, BX - MOVL DI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + MOVL DI, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeSnappyBlockAsm10BAvx CMPL BP, $0x3c @@ -18826,11 +18798,11 @@ repeat_extend_back_end_encodeSnappyBlockAsm10BAvx: JMP 
memmove_repeat_emit_encodeSnappyBlockAsm10BAvx four_bytes_repeat_emit_encodeSnappyBlockAsm10BAvx: - MOVL BP, R8 - SHRL $0x10, R8 + MOVL BP, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW BP, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_repeat_emit_encodeSnappyBlockAsm10BAvx @@ -18852,125 +18824,124 @@ one_byte_repeat_emit_encodeSnappyBlockAsm10BAvx: ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm10BAvx: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(DI*1), BP NOP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_tail: - TESTQ BP, BP + TESTQ DI, DI JEQ memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx - CMPQ BP, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ DI, $0x04 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_4 - CMPQ BP, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_8 - CMPQ BP, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_avxUnaligned emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(BP*1) + MOVB 
(R8), R9 + MOVB -1(R8)(DI*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R8), R9 + MOVL R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (R8), R9 + MOVB 2(R8), R10 + MOVW R9, (AX) + MOVB R10, 2(AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(BP*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R10 + MOVL R9, (AX) + MOVL R10, -4(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R8), R9 + MOVQ R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(BP*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) 
- MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -18979,34 +18950,34 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_129through2 MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, 
-128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(DI*1) + MOVOU X9, -112(AX)(DI*1) + MOVOU X10, -96(AX)(DI*1) + MOVOU X11, -80(AX)(DI*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_256through2048: - LEAQ -256(BP), BP - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -19023,160 +18994,160 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_256through2 MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 - LEAQ 256(R9), R9 + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_tail emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_avxUnaligned: - LEAQ (R9)(BP*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (R8)(DI*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ 
$0x00000080, R9 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, BP - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 - SUBQ R11, BP + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 + SUBQ R11, DI + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (R8), Y4 + ADDQ R11, R8 + SUBQ R9, DI emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU 64(R8), Y2 + VMOVDQU 96(R8), Y3 + ADDQ R9, R8 VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, BP + ADDQ R9, AX + SUBQ R9, DI JA emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10BAvx_memmove_gobble_128_loop - ADDQ R11, BP - ADDQ AX, BP - VMOVDQU Y4, (R14) + ADDQ R9, DI + ADDQ AX, DI + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(BP) - MOVOU X6, -112(BP) - MOVOU X7, -96(BP) - MOVOU X8, -80(BP) - MOVOU X9, -64(BP) - MOVOU X10, -48(BP) - MOVOU X11, -32(BP) - MOVOU X12, -16(BP) + MOVOU X5, -128(DI) + MOVOU X6, -112(DI) + MOVOU X7, -96(DI) + MOVOU X8, -80(DI) + MOVOU X9, -64(DI) + MOVOU X10, -48(DI) + MOVOU X11, -32(DI) + MOVOU X12, -16(DI) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10BAvx: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm10BAvx: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - 
TESTQ R12, R12 + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10BAvx matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL DI, DI JZ repeat_extend_forward_end_encodeSnappyBlockAsm10BAvx matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 JNE repeat_extend_forward_end_encodeSnappyBlockAsm10BAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_repeat_extend repeat_extend_forward_end_encodeSnappyBlockAsm10BAvx: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - CMPL BP, $0x00010000 + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10BAvx four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm10BAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm10BAvx MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm10BAvx JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm10BAvx four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm10BAvx: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeSnappyBlockAsm10BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeSnappyBlockAsm10BAvx two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10BAvx: - CMPL BX, $0x40 + CMPL BP, 
$0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10BAvx MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10BAvx two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10BAvx: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm10BAvx emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm10BAvx: @@ -19186,16 +19157,16 @@ repeat_end_emit_encodeSnappyBlockAsm10BAvx: JMP search_loop_encodeSnappyBlockAsm10BAvx no_repeat_found_encodeSnappyBlockAsm10BAvx: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeSnappyBlockAsm10BAvx - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeSnappyBlockAsm10BAvx - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeSnappyBlockAsm10BAvx MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm10BAvx @@ -19205,46 +19176,46 @@ candidate3_match_encodeSnappyBlockAsm10BAvx: JMP candidate_match_encodeSnappyBlockAsm10BAvx candidate2_match_encodeSnappyBlockAsm10BAvx: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL 
BP, BX + MOVL DI, BP candidate_match_encodeSnappyBlockAsm10BAvx: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeSnappyBlockAsm10BAvx match_extend_back_loop_encodeSnappyBlockAsm10BAvx: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeSnappyBlockAsm10BAvx - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeSnappyBlockAsm10BAvx LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeSnappyBlockAsm10BAvx JMP match_extend_back_loop_encodeSnappyBlockAsm10BAvx match_extend_back_end_encodeSnappyBlockAsm10BAvx: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm10BAvx MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm10BAvx: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10BAvx - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeSnappyBlockAsm10BAvx CMPL DI, $0x3c @@ -19261,11 +19232,11 @@ match_dst_size_check_encodeSnappyBlockAsm10BAvx: JMP memmove_match_emit_encodeSnappyBlockAsm10BAvx four_bytes_match_emit_encodeSnappyBlockAsm10BAvx: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeSnappyBlockAsm10BAvx @@ -19287,8 +19258,7 @@ one_byte_match_emit_encodeSnappyBlockAsm10BAvx: ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm10BAvx: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_tail: @@ -19315,55 +19285,55 @@ 
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_tail: JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_avxUnaligned emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), R10 + MOVW R9, (AX) + MOVB R10, 2(AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), R10 + MOVL R9, (AX) + MOVL R10, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10BAvx 
emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -19371,41 +19341,41 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 - MOVOU X0, (AX) - MOVOU X1, 16(AX) - MOVOU X2, 32(AX) - MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(R8*1) - MOVOU X13, -48(AX)(R8*1) - MOVOU X14, -32(AX)(R8*1) - MOVOU X15, -16(AX)(R8*1) - JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10BAvx - -emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 + MOVOU X0, (AX) + MOVOU X1, 16(AX) + MOVOU X2, 32(AX) + MOVOU X3, 48(AX) + MOVOU X12, -64(AX)(R8*1) + MOVOU X13, -48(AX)(R8*1) + MOVOU X14, -32(AX)(R8*1) + MOVOU X15, -16(AX)(R8*1) + JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10BAvx + +emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_129through256: + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 
32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -19426,22 +19396,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_129through25 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -19459,160 +19429,160 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_256through20 MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_tail emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_avxUnaligned: - LEAQ (R9)(R8*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (SI)(R8*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU 
-80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, R8 - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 SUBQ R11, R8 + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (SI), Y4 + ADDQ R11, SI + SUBQ R9, R8 emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU 64(SI), Y2 + VMOVDQU 96(SI), Y3 + ADDQ R9, SI VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, R8 + ADDQ R9, AX + SUBQ R9, R8 JA emit_lit_memmove_match_emit_encodeSnappyBlockAsm10BAvx_memmove_gobble_128_loop - ADDQ R11, R8 + ADDQ R9, R8 ADDQ AX, R8 - VMOVDQU Y4, (R14) + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(R8) - MOVOU X6, -112(R8) - MOVOU X7, -96(R8) - MOVOU X8, -80(R8) - MOVOU X9, -64(R8) - MOVOU X10, -48(R8) - MOVOU X11, -32(R8) - MOVOU X12, -16(R8) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, -64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_match_emit_encodeSnappyBlockAsm10BAvx: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeSnappyBlockAsm10BAvx: match_nolit_loop_encodeSnappyBlockAsm10BAvx: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm10BAvx matchlen_loopback_match_nolit_encodeSnappyBlockAsm10BAvx: - MOVQ 
(R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10BAvx - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm10BAvx matchlen_loop_match_nolit_encodeSnappyBlockAsm10BAvx: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10BAvx matchlen_single_match_nolit_encodeSnappyBlockAsm10BAvx: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeSnappyBlockAsm10BAvx matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10BAvx: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeSnappyBlockAsm10BAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm10BAvx match_nolit_end_encodeSnappyBlockAsm10BAvx: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeSnappyBlockAsm10BAvx four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm10BAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm10BAvx MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm10BAvx JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm10BAvx four_bytes_remain_match_nolit_encodeSnappyBlockAsm10BAvx: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm10BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10BAvx 
two_byte_offset_match_nolit_encodeSnappyBlockAsm10BAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10BAvx MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10BAvx two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10BAvx: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10BAvx CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10BAvx emit_copy_three_match_nolit_encodeSnappyBlockAsm10BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -19626,22 +19596,22 @@ match_nolit_emitcopy_end_encodeSnappyBlockAsm10BAvx: RET match_nolit_dst_ok_encodeSnappyBlockAsm10BAvx: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x36, R11 - SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x36, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x9e3779b1, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x20, DI + IMULQ BP, DI + SHRQ $0x36, DI + SHLQ $0x20, R8 + IMULQ BP, R8 + SHRQ $0x36, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeSnappyBlockAsm10BAvx INCL CX JMP search_loop_encodeSnappyBlockAsm10BAvx @@ -19922,14 +19892,14 @@ emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10BAvx_memmove_gobble_128_lo ADDQ AX, BX VMOVDQU Y4, (R8) VZEROUPPER - MOVOU X5, 
-128(BX) - MOVOU X6, -112(BX) - MOVOU X7, -96(BX) - MOVOU X8, -80(BX) - MOVOU X9, -64(BX) - MOVOU X10, -48(BX) - MOVOU X11, -32(BX) - MOVOU X12, -16(BX) + MOVOU X5, -128(BX) + MOVOU X6, -112(BX) + MOVOU X7, -96(BX) + MOVOU X8, -80(BX) + MOVOU X9, -64(BX) + MOVOU X10, -48(BX) + MOVOU X11, -32(BX) + MOVOU X12, -16(BX) memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10BAvx: MOVQ DX, AX @@ -19963,8 +19933,8 @@ zero_loop_encodeSnappyBlockAsm8BAvx: MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -5(CX), DX - LEAQ -8(CX), R9 - MOVL R9, 8(SP) + LEAQ -8(CX), BP + MOVL BP, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX @@ -19974,68 +19944,68 @@ zero_loop_encodeSnappyBlockAsm8BAvx: MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm8BAvx: - MOVQ (DX)(CX*1), R9 - MOVL CX, BX - SUBL 12(SP), BX - SHRL $0x04, BX - LEAL 4(CX)(BX*1), BX - MOVL 8(SP), BP - CMPL BX, BP + MOVQ (DX)(CX*1), SI + MOVL CX, BP + SUBL 12(SP), BP + SHRL $0x04, BP + LEAL 4(CX)(BP*1), BP + MOVL 8(SP), DI + CMPL BP, DI JGT emit_remainder_encodeSnappyBlockAsm8BAvx - MOVL BX, 20(SP) - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - MOVQ R9, R12 - SHRQ $0x08, R12 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x38, R12 - MOVL 24(SP)(R11*4), BX - MOVL 24(SP)(R12*4), BP - MOVL CX, 24(SP)(R11*4) - LEAL 1(CX), DI - MOVL DI, 24(SP)(R12*4) - MOVQ R9, R11 - SHRQ $0x10, R11 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - MOVL CX, DI - SUBL 16(SP), DI - MOVL 1(DX)(DI*1), DI - MOVQ R9, R10 + MOVL BP, 20(SP) + MOVQ $0x9e3779b1, R8 + MOVQ SI, R9 + MOVQ SI, R10 SHRQ $0x08, R10 - CMPL R10, DI + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + SHLQ $0x20, R10 + IMULQ R8, R10 + SHRQ $0x38, R10 + MOVL 24(SP)(R9*4), BP + MOVL 24(SP)(R10*4), DI + MOVL CX, 24(SP)(R9*4) + LEAL 1(CX), R9 + MOVL R9, 24(SP)(R10*4) + MOVQ SI, R9 + SHRQ $0x10, R9 + SHLQ $0x20, R9 + IMULQ R8, R9 + SHRQ $0x38, R9 + MOVL CX, R8 + SUBL 16(SP), R8 + MOVL 1(DX)(R8*1), R10 + MOVQ SI, R8 + SHRQ 
$0x08, R8 + CMPL R8, R10 JNE no_repeat_found_encodeSnappyBlockAsm8BAvx - LEAL 1(CX), DI - MOVL 12(SP), BX - MOVL DI, BP - SUBL 16(SP), BP + LEAL 1(CX), SI + MOVL 12(SP), BP + MOVL SI, DI + SUBL 16(SP), DI JZ repeat_extend_back_end_encodeSnappyBlockAsm8BAvx repeat_extend_back_loop_encodeSnappyBlockAsm8BAvx: - CMPL DI, BX + CMPL SI, BP JLE repeat_extend_back_end_encodeSnappyBlockAsm8BAvx - MOVB -1(DX)(BP*1), SI - MOVB -1(DX)(DI*1), R8 - CMPB SI, R8 + MOVB -1(DX)(DI*1), BL + MOVB -1(DX)(SI*1), R8 + CMPB BL, R8 JNE repeat_extend_back_end_encodeSnappyBlockAsm8BAvx - LEAL -1(DI), DI - DECL BP + LEAL -1(SI), SI + DECL DI JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8BAvx repeat_extend_back_end_encodeSnappyBlockAsm8BAvx: MOVL 12(SP), BP - CMPL BP, DI + CMPL BP, SI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8BAvx - MOVL DI, BX - MOVL DI, 12(SP) - LEAQ (DX)(BP*1), R9 - SUBL BP, BX - MOVL BX, BP + MOVL SI, DI + MOVL SI, 12(SP) + LEAQ (DX)(BP*1), R8 + SUBL BP, DI + MOVL DI, BP SUBL $0x01, BP JC emit_literal_done_repeat_emit_encodeSnappyBlockAsm8BAvx CMPL BP, $0x3c @@ -20052,11 +20022,11 @@ repeat_extend_back_end_encodeSnappyBlockAsm8BAvx: JMP memmove_repeat_emit_encodeSnappyBlockAsm8BAvx four_bytes_repeat_emit_encodeSnappyBlockAsm8BAvx: - MOVL BP, R8 - SHRL $0x10, R8 + MOVL BP, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW BP, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_repeat_emit_encodeSnappyBlockAsm8BAvx @@ -20078,125 +20048,124 @@ one_byte_repeat_emit_encodeSnappyBlockAsm8BAvx: ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm8BAvx: - LEAQ (AX)(BX*1), R10 - MOVL BX, BP + LEAQ (AX)(DI*1), BP NOP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_tail: - TESTQ BP, BP + TESTQ DI, DI JEQ memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx - CMPQ BP, $0x02 + CMPQ DI, $0x02 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ DI, $0x04 JB 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_3 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_4 - CMPQ BP, $0x08 + CMPQ DI, $0x08 JB emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_5through7 JE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_8 - CMPQ BP, $0x10 + CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ DI, $0x40 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ DI, $0x80 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ DI, $0x00000100 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_129through256 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_avxUnaligned emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(BP*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(BP*1) + MOVB (R8), R9 + MOVB -1(R8)(DI*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (R8), R9 + MOVL R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (R8), R9 + MOVB 2(R8), R10 + MOVW R9, (AX) + MOVB R10, 2(AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(BP*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(BP*1) + MOVL (R8), R9 + MOVL -4(R8)(DI*1), R10 + MOVL R9, (AX) 
+ MOVL R10, -4(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (R8), R9 + MOVQ R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(BP*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(BP*1) + MOVQ (R8), R9 + MOVQ -8(R8)(DI*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(BP*1), X1 + MOVOU (R8), X0 + MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(BP*1), X2 - MOVOU -16(R9)(BP*1), X3 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU -32(R8)(DI*1), X2 + MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(DI*1) + MOVOU X3, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, 
-48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(BP*1), X8 - MOVOU -112(R9)(BP*1), X9 - MOVOU -96(R9)(BP*1), X10 - MOVOU -80(R9)(BP*1), X11 - MOVOU -64(R9)(BP*1), X12 - MOVOU -48(R9)(BP*1), X13 - MOVOU -32(R9)(BP*1), X14 - MOVOU -16(R9)(BP*1), X15 + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU -128(R8)(DI*1), X8 + MOVOU -112(R8)(DI*1), X9 + MOVOU -96(R8)(DI*1), X10 + MOVOU -80(R8)(DI*1), X11 + MOVOU -64(R8)(DI*1), X12 + MOVOU -48(R8)(DI*1), X13 + MOVOU -32(R8)(DI*1), X14 + MOVOU -16(R8)(DI*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -20205,34 +20174,34 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_129through25 MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(DI*1) + MOVOU X9, -112(AX)(DI*1) + MOVOU X10, -96(AX)(DI*1) + MOVOU X11, -80(AX)(DI*1) + MOVOU X12, -64(AX)(DI*1) + MOVOU X13, -48(AX)(DI*1) + MOVOU X14, -32(AX)(DI*1) + MOVOU X15, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_256through2048: - LEAQ -256(BP), BP - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), 
X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + LEAQ -256(DI), DI + MOVOU (R8), X0 + MOVOU 16(R8), X1 + MOVOU 32(R8), X2 + MOVOU 48(R8), X3 + MOVOU 64(R8), X4 + MOVOU 80(R8), X5 + MOVOU 96(R8), X6 + MOVOU 112(R8), X7 + MOVOU 128(R8), X8 + MOVOU 144(R8), X9 + MOVOU 160(R8), X10 + MOVOU 176(R8), X11 + MOVOU 192(R8), X12 + MOVOU 208(R8), X13 + MOVOU 224(R8), X14 + MOVOU 240(R8), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -20249,160 +20218,160 @@ emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_256through20 MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 - LEAQ 256(R9), R9 + CMPQ DI, $0x00000100 + LEAQ 256(R8), R8 LEAQ 256(AX), AX JGE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_move_256through2048 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_tail emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_avxUnaligned: - LEAQ (R9)(BP*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (R8)(DI*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, BP - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 - SUBQ R11, BP + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 + SUBQ R11, DI + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (R8), Y4 + ADDQ R11, R8 + SUBQ R9, DI emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU 64(R8), Y2 + VMOVDQU 96(R8), Y3 + ADDQ R9, R8 VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - 
ADDQ R11, AX - SUBQ R11, BP + ADDQ R9, AX + SUBQ R9, DI JA emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8BAvx_memmove_gobble_128_loop - ADDQ R11, BP - ADDQ AX, BP - VMOVDQU Y4, (R14) + ADDQ R9, DI + ADDQ AX, DI + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(BP) - MOVOU X6, -112(BP) - MOVOU X7, -96(BP) - MOVOU X8, -80(BP) - MOVOU X9, -64(BP) - MOVOU X10, -48(BP) - MOVOU X11, -32(BP) - MOVOU X12, -16(BP) + MOVOU X5, -128(DI) + MOVOU X6, -112(DI) + MOVOU X7, -96(DI) + MOVOU X8, -80(DI) + MOVOU X9, -64(DI) + MOVOU X10, -48(DI) + MOVOU X11, -32(DI) + MOVOU X12, -16(DI) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8BAvx: - MOVQ R10, AX + MOVQ BP, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm8BAvx: ADDL $0x05, CX - MOVL CX, BX - SUBL 16(SP), BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + MOVL CX, BP + SUBL 16(SP), BP + MOVQ src_len+32(FP), DI + SUBL CX, DI + LEAQ (DX)(CX*1), R8 + LEAQ (DX)(BP*1), BP + XORL R10, R10 + CMPL DI, $0x08 JL matchlen_single_repeat_extend matchlen_loopback_repeat_extend: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (R8)(R10*1), R9 + XORQ (BP)(R10*1), R9 + TESTQ R9, R9 JZ matchlen_loop_repeat_extend - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R9, R9 + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm8BAvx matchlen_loop_repeat_extend: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(DI), DI + LEAL 8(R10), R10 + CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend matchlen_single_repeat_extend: - TESTL R9, R9 + TESTL DI, DI JZ repeat_extend_forward_end_encodeSnappyBlockAsm8BAvx matchlen_single_loopback_repeat_extend: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (R8)(R10*1), R9 + CMPB (BP)(R10*1), R9 JNE repeat_extend_forward_end_encodeSnappyBlockAsm8BAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R10), R10 + DECL DI JNZ matchlen_single_loopback_repeat_extend 
repeat_extend_forward_end_encodeSnappyBlockAsm8BAvx: - ADDL BX, CX - MOVL CX, BX - SUBL DI, BX - MOVL 16(SP), BP - CMPL BP, $0x00010000 + ADDL R10, CX + MOVL CX, BP + SUBL SI, BP + MOVL 16(SP), SI + CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8BAvx four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm8BAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm8BAvx MOVB $0xff, (AX) - MOVL BP, 1(AX) - LEAL -64(BX), BX + MOVL SI, 1(AX) + LEAL -64(BP), BP ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL BP, $0x04 JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm8BAvx JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm8BAvx four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm8BAvx: - TESTL BX, BX + TESTL BP, BP JZ repeat_end_emit_encodeSnappyBlockAsm8BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVL BP, 1(AX) + MOVB $0x03, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeSnappyBlockAsm8BAvx two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8BAvx: - CMPL BX, $0x40 + CMPL BP, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8BAvx MOVB $0xee, (AX) - MOVW BP, 1(AX) - LEAL -60(BX), BX + MOVW SI, 1(AX) + LEAL -60(BP), BP ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8BAvx two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8BAvx: - CMPL BX, $0x0c + CMPL BP, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8BAvx - CMPL BP, $0x00000800 + CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX - MOVB BP, 1(AX) - SHRL $0x08, BP - SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + MOVB $0x01, BL + LEAL -16(BX)(BP*4), BP + MOVB SI, 1(AX) + SHRL $0x08, SI + SHLL $0x05, SI + ORL SI, BP + MOVB BP, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm8BAvx 
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) - MOVW BP, 1(AX) + MOVB $0x02, BL + LEAL -4(BX)(BP*4), BP + MOVB BP, (AX) + MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm8BAvx: @@ -20412,16 +20381,16 @@ repeat_end_emit_encodeSnappyBlockAsm8BAvx: JMP search_loop_encodeSnappyBlockAsm8BAvx no_repeat_found_encodeSnappyBlockAsm8BAvx: - CMPL (DX)(BX*1), R9 + CMPL (DX)(BP*1), SI JEQ candidate_match_encodeSnappyBlockAsm8BAvx - SHRQ $0x08, R9 - MOVL 24(SP)(R11*4), BX - LEAL 2(CX), DI - CMPL (DX)(BP*1), R9 + SHRQ $0x08, SI + MOVL 24(SP)(R9*4), BP + LEAL 2(CX), R8 + CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeSnappyBlockAsm8BAvx - MOVL DI, 24(SP)(R11*4) - SHRQ $0x08, R9 - CMPL (DX)(BX*1), R9 + MOVL R8, 24(SP)(R9*4) + SHRQ $0x08, SI + CMPL (DX)(BP*1), SI JEQ candidate3_match_encodeSnappyBlockAsm8BAvx MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm8BAvx @@ -20431,46 +20400,46 @@ candidate3_match_encodeSnappyBlockAsm8BAvx: JMP candidate_match_encodeSnappyBlockAsm8BAvx candidate2_match_encodeSnappyBlockAsm8BAvx: - MOVL DI, 24(SP)(R11*4) + MOVL R8, 24(SP)(R9*4) INCL CX - MOVL BP, BX + MOVL DI, BP candidate_match_encodeSnappyBlockAsm8BAvx: - MOVL 12(SP), BP - TESTL BX, BX + MOVL 12(SP), SI + TESTL BP, BP JZ match_extend_back_end_encodeSnappyBlockAsm8BAvx match_extend_back_loop_encodeSnappyBlockAsm8BAvx: - CMPL CX, BP + CMPL CX, SI JLE match_extend_back_end_encodeSnappyBlockAsm8BAvx - MOVB -1(DX)(BX*1), SI + MOVB -1(DX)(BP*1), BL MOVB -1(DX)(CX*1), DI - CMPB SI, DI + CMPB BL, DI JNE match_extend_back_end_encodeSnappyBlockAsm8BAvx LEAL -1(CX), CX - DECL BX + DECL BP JZ match_extend_back_end_encodeSnappyBlockAsm8BAvx JMP match_extend_back_loop_encodeSnappyBlockAsm8BAvx match_extend_back_end_encodeSnappyBlockAsm8BAvx: - MOVL CX, DI - SUBL 12(SP), DI - LEAQ 4(AX)(DI*1), DI - CMPQ DI, (SP) + MOVL CX, SI + SUBL 12(SP), SI + LEAQ 4(AX)(SI*1), SI + CMPQ SI, (SP) JL 
match_dst_size_check_encodeSnappyBlockAsm8BAvx MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm8BAvx: - MOVL CX, R8 + MOVL CX, SI MOVL 12(SP), DI - CMPL DI, R8 + CMPL DI, SI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8BAvx - MOVL R8, BP - MOVL R8, 12(SP) - LEAQ (DX)(DI*1), R9 - SUBL DI, BP - MOVL BP, DI + MOVL SI, R8 + MOVL SI, 12(SP) + LEAQ (DX)(DI*1), SI + SUBL DI, R8 + MOVL R8, DI SUBL $0x01, DI JC emit_literal_done_match_emit_encodeSnappyBlockAsm8BAvx CMPL DI, $0x3c @@ -20487,11 +20456,11 @@ match_dst_size_check_encodeSnappyBlockAsm8BAvx: JMP memmove_match_emit_encodeSnappyBlockAsm8BAvx four_bytes_match_emit_encodeSnappyBlockAsm8BAvx: - MOVL DI, R8 - SHRL $0x10, R8 + MOVL DI, R9 + SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) - MOVB R8, 3(AX) + MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_match_emit_encodeSnappyBlockAsm8BAvx @@ -20513,8 +20482,7 @@ one_byte_match_emit_encodeSnappyBlockAsm8BAvx: ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm8BAvx: - LEAQ (AX)(BP*1), R10 - MOVL BP, R8 + LEAQ (AX)(R8*1), DI NOP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_tail: @@ -20541,55 +20509,55 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_tail: JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_avxUnaligned emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_1or2: - MOVB (R9), R11 - MOVB -1(R9)(R8*1), R12 - MOVB R11, (AX) - MOVB R12, -1(AX)(R8*1) + MOVB (SI), R9 + MOVB -1(SI)(R8*1), R10 + MOVB R9, (AX) + MOVB R10, -1(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_4: - MOVL (R9), R11 - MOVL R11, (AX) + MOVL (SI), R9 + MOVL R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_3: - MOVW (R9), R11 - MOVB 2(R9), R12 - MOVW R11, (AX) - MOVB R12, 2(AX) + MOVW (SI), R9 + MOVB 2(SI), R10 + MOVW R9, (AX) + MOVB 
R10, 2(AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_5through7: - MOVL (R9), R11 - MOVL -4(R9)(R8*1), R12 - MOVL R11, (AX) - MOVL R12, -4(AX)(R8*1) + MOVL (SI), R9 + MOVL -4(SI)(R8*1), R10 + MOVL R9, (AX) + MOVL R10, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_8: - MOVQ (R9), R11 - MOVQ R11, (AX) + MOVQ (SI), R9 + MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_9through16: - MOVQ (R9), R11 - MOVQ -8(R9)(R8*1), R12 - MOVQ R11, (AX) - MOVQ R12, -8(AX)(R8*1) + MOVQ (SI), R9 + MOVQ -8(SI)(R8*1), R10 + MOVQ R9, (AX) + MOVQ R10, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_17through32: - MOVOU (R9), X0 - MOVOU -16(R9)(R8*1), X1 + MOVOU (SI), X0 + MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_33through64: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU -32(R9)(R8*1), X2 - MOVOU -16(R9)(R8*1), X3 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU -32(SI)(R8*1), X2 + MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) @@ -20597,14 +20565,14 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_33through64: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_65through128: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + 
MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -20616,22 +20584,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_65through128: JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8BAvx emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_129through256: - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU -128(R9)(R8*1), X8 - MOVOU -112(R9)(R8*1), X9 - MOVOU -96(R9)(R8*1), X10 - MOVOU -80(R9)(R8*1), X11 - MOVOU -64(R9)(R8*1), X12 - MOVOU -48(R9)(R8*1), X13 - MOVOU -32(R9)(R8*1), X14 - MOVOU -16(R9)(R8*1), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU -128(SI)(R8*1), X8 + MOVOU -112(SI)(R8*1), X9 + MOVOU -96(SI)(R8*1), X10 + MOVOU -80(SI)(R8*1), X11 + MOVOU -64(SI)(R8*1), X12 + MOVOU -48(SI)(R8*1), X13 + MOVOU -32(SI)(R8*1), X14 + MOVOU -16(SI)(R8*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -20652,22 +20620,22 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_129through256 emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_256through2048: LEAQ -256(R8), R8 - MOVOU (R9), X0 - MOVOU 16(R9), X1 - MOVOU 32(R9), X2 - MOVOU 48(R9), X3 - MOVOU 64(R9), X4 - MOVOU 80(R9), X5 - MOVOU 96(R9), X6 - MOVOU 112(R9), X7 - MOVOU 128(R9), X8 - MOVOU 144(R9), X9 - MOVOU 160(R9), X10 - MOVOU 176(R9), X11 - MOVOU 192(R9), X12 - MOVOU 208(R9), X13 - MOVOU 224(R9), X14 - MOVOU 240(R9), X15 + MOVOU (SI), X0 + MOVOU 16(SI), X1 + MOVOU 32(SI), X2 + MOVOU 48(SI), X3 + MOVOU 64(SI), X4 + MOVOU 80(SI), X5 + MOVOU 96(SI), X6 + MOVOU 112(SI), X7 + MOVOU 128(SI), X8 + MOVOU 144(SI), X9 + MOVOU 160(SI), X10 + MOVOU 176(SI), X11 + MOVOU 192(SI), X12 + MOVOU 208(SI), X13 + MOVOU 224(SI), X14 + MOVOU 240(SI), X15 MOVOU X0, (AX) MOVOU X1, 
16(AX) MOVOU X2, 32(AX) @@ -20685,160 +20653,160 @@ emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_256through204 MOVOU X14, 224(AX) MOVOU X15, 240(AX) CMPQ R8, $0x00000100 - LEAQ 256(R9), R9 + LEAQ 256(SI), SI LEAQ 256(AX), AX JGE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_move_256through2048 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_tail emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_avxUnaligned: - LEAQ (R9)(R8*1), R12 - MOVQ AX, R14 - MOVOU -128(R12), X5 - MOVOU -112(R12), X6 - MOVQ $0x00000080, R11 + LEAQ (SI)(R8*1), R10 + MOVQ AX, R12 + MOVOU -128(R10), X5 + MOVOU -112(R10), X6 + MOVQ $0x00000080, R9 ANDQ $0xffffffe0, AX ADDQ $0x20, AX - MOVOU -96(R12), X7 - MOVOU -80(R12), X8 - MOVQ AX, R13 - SUBQ R14, R13 - MOVOU -64(R12), X9 - MOVOU -48(R12), X10 - SUBQ R13, R8 - MOVOU -32(R12), X11 - MOVOU -16(R12), X12 - VMOVDQU (R9), Y4 - ADDQ R13, R9 + MOVOU -96(R10), X7 + MOVOU -80(R10), X8 + MOVQ AX, R11 + SUBQ R12, R11 + MOVOU -64(R10), X9 + MOVOU -48(R10), X10 SUBQ R11, R8 + MOVOU -32(R10), X11 + MOVOU -16(R10), X12 + VMOVDQU (SI), Y4 + ADDQ R11, SI + SUBQ R9, R8 emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_gobble_128_loop: - VMOVDQU (R9), Y0 - VMOVDQU 32(R9), Y1 - VMOVDQU 64(R9), Y2 - VMOVDQU 96(R9), Y3 - ADDQ R11, R9 + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU 64(SI), Y2 + VMOVDQU 96(SI), Y3 + ADDQ R9, SI VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ R11, AX - SUBQ R11, R8 + ADDQ R9, AX + SUBQ R9, R8 JA emit_lit_memmove_match_emit_encodeSnappyBlockAsm8BAvx_memmove_gobble_128_loop - ADDQ R11, R8 + ADDQ R9, R8 ADDQ AX, R8 - VMOVDQU Y4, (R14) + VMOVDQU Y4, (R12) VZEROUPPER - MOVOU X5, -128(R8) - MOVOU X6, -112(R8) - MOVOU X7, -96(R8) - MOVOU X8, -80(R8) - MOVOU X9, -64(R8) - MOVOU X10, -48(R8) - MOVOU X11, -32(R8) - MOVOU X12, -16(R8) + MOVOU X5, -128(R8) + MOVOU X6, -112(R8) + MOVOU X7, -96(R8) + MOVOU X8, -80(R8) + MOVOU X9, 
-64(R8) + MOVOU X10, -48(R8) + MOVOU X11, -32(R8) + MOVOU X12, -16(R8) memmove_end_copy_match_emit_encodeSnappyBlockAsm8BAvx: - MOVQ R10, AX + MOVQ DI, AX emit_literal_done_match_emit_encodeSnappyBlockAsm8BAvx: match_nolit_loop_encodeSnappyBlockAsm8BAvx: - MOVL CX, BP - SUBL BX, BP - MOVL BP, 16(SP) + MOVL CX, SI + SUBL BP, SI + MOVL SI, 16(SP) ADDL $0x04, CX - ADDL $0x04, BX - MOVQ src_len+32(FP), R9 - SUBL CX, R9 - LEAQ (DX)(CX*1), R10 - LEAQ (DX)(BX*1), R11 - XORL BX, BX - CMPL R9, $0x08 + ADDL $0x04, BP + MOVQ src_len+32(FP), SI + SUBL CX, SI + LEAQ (DX)(CX*1), DI + LEAQ (DX)(BP*1), BP + XORL R9, R9 + CMPL SI, $0x08 JL matchlen_single_match_nolit_encodeSnappyBlockAsm8BAvx matchlen_loopback_match_nolit_encodeSnappyBlockAsm8BAvx: - MOVQ (R10)(BX*1), R12 - XORQ (R11)(BX*1), R12 - TESTQ R12, R12 + MOVQ (DI)(R9*1), R8 + XORQ (BP)(R9*1), R8 + TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8BAvx - BSFQ R12, R12 - SARQ $0x03, R12 - LEAL (BX)(R12*1), BX + BSFQ R8, R8 + SARQ $0x03, R8 + LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm8BAvx matchlen_loop_match_nolit_encodeSnappyBlockAsm8BAvx: - LEAL -8(R9), R9 - LEAL 8(BX), BX - CMPL R9, $0x08 + LEAL -8(SI), SI + LEAL 8(R9), R9 + CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8BAvx matchlen_single_match_nolit_encodeSnappyBlockAsm8BAvx: - TESTL R9, R9 + TESTL SI, SI JZ match_nolit_end_encodeSnappyBlockAsm8BAvx matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8BAvx: - MOVB (R10)(BX*1), R12 - CMPB (R11)(BX*1), R12 + MOVB (DI)(R9*1), R8 + CMPB (BP)(R9*1), R8 JNE match_nolit_end_encodeSnappyBlockAsm8BAvx - LEAL 1(BX), BX - DECL R9 + LEAL 1(R9), R9 + DECL SI JNZ matchlen_single_loopback_match_nolit_encodeSnappyBlockAsm8BAvx match_nolit_end_encodeSnappyBlockAsm8BAvx: - ADDL BX, CX + ADDL R9, CX MOVL 16(SP), BP - ADDL $0x04, BX + ADDL $0x04, R9 CMPL BP, $0x00010000 JL two_byte_offset_match_nolit_encodeSnappyBlockAsm8BAvx 
four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm8BAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm8BAvx MOVB $0xff, (AX) MOVL BP, 1(AX) - LEAL -64(BX), BX + LEAL -64(R9), R9 ADDQ $0x05, AX - CMPL BX, $0x04 + CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm8BAvx JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm8BAvx four_bytes_remain_match_nolit_encodeSnappyBlockAsm8BAvx: - TESTL BX, BX + TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm8BAvx - MOVB $0x03, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x03, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVL BP, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8BAvx two_byte_offset_match_nolit_encodeSnappyBlockAsm8BAvx: - CMPL BX, $0x40 + CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8BAvx MOVB $0xee, (AX) MOVW BP, 1(AX) - LEAL -60(BX), BX + LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8BAvx two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8BAvx: - CMPL BX, $0x0c + CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8BAvx CMPL BP, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8BAvx - MOVB $0x01, SI - LEAL -16(SI)(BX*4), BX + MOVB $0x01, BL + LEAL -16(BX)(R9*4), R9 MOVB BP, 1(AX) SHRL $0x08, BP SHLL $0x05, BP - ORL BP, BX - MOVB BL, (AX) + ORL BP, R9 + MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8BAvx emit_copy_three_match_nolit_encodeSnappyBlockAsm8BAvx: - MOVB $0x02, SI - LEAL -4(SI)(BX*4), BX - MOVB BL, (AX) + MOVB $0x02, BL + LEAL -4(BX)(R9*4), R9 + MOVB R9, (AX) MOVW BP, 1(AX) ADDQ $0x03, AX @@ -20852,22 +20820,22 @@ match_nolit_emitcopy_end_encodeSnappyBlockAsm8BAvx: RET match_nolit_dst_ok_encodeSnappyBlockAsm8BAvx: - MOVQ -2(DX)(CX*1), R9 - MOVQ $0x9e3779b1, R10 - MOVQ R9, R11 - SHRQ $0x10, R9 - MOVQ R9, R12 - SHLQ $0x20, R11 - IMULQ R10, R11 - SHRQ $0x38, R11 - 
SHLQ $0x20, R12 - IMULQ R10, R12 - SHRQ $0x38, R12 - LEAL -2(CX), BP - MOVL 24(SP)(R12*4), BX - MOVL BP, 24(SP)(R11*4) - MOVL CX, 24(SP)(R12*4) - CMPL (DX)(BX*1), R9 + MOVQ -2(DX)(CX*1), SI + MOVQ $0x9e3779b1, BP + MOVQ SI, DI + SHRQ $0x10, SI + MOVQ SI, R8 + SHLQ $0x20, DI + IMULQ BP, DI + SHRQ $0x38, DI + SHLQ $0x20, R8 + IMULQ BP, R8 + SHRQ $0x38, R8 + LEAL -2(CX), R9 + MOVL 24(SP)(R8*4), BP + MOVL R9, 24(SP)(DI*4) + MOVL CX, 24(SP)(R8*4) + CMPL (DX)(BP*1), SI JEQ match_nolit_loop_encodeSnappyBlockAsm8BAvx INCL CX JMP search_loop_encodeSnappyBlockAsm8BAvx @@ -21148,14 +21116,14 @@ emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8BAvx_memmove_gobble_128_loo ADDQ AX, BX VMOVDQU Y4, (R8) VZEROUPPER - MOVOU X5, -128(BX) - MOVOU X6, -112(BX) - MOVOU X7, -96(BX) - MOVOU X8, -80(BX) - MOVOU X9, -64(BX) - MOVOU X10, -48(BX) - MOVOU X11, -32(BX) - MOVOU X12, -16(BX) + MOVOU X5, -128(BX) + MOVOU X6, -112(BX) + MOVOU X7, -96(BX) + MOVOU X8, -80(BX) + MOVOU X9, -64(BX) + MOVOU X10, -48(BX) + MOVOU X11, -32(BX) + MOVOU X12, -16(BX) memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8BAvx: MOVQ DX, AX @@ -21221,86 +21189,85 @@ one_byte_standalone: ADDQ $0x01, AX memmove_standalone: - MOVL DX, BP NOP emit_lit_memmove_standalone_memmove_tail: - TESTQ BP, BP + TESTQ DX, DX JEQ emit_literal_end_standalone - CMPQ BP, $0x02 + CMPQ DX, $0x02 JBE emit_lit_memmove_standalone_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ DX, $0x04 JB emit_lit_memmove_standalone_memmove_move_3 JBE emit_lit_memmove_standalone_memmove_move_4 - CMPQ BP, $0x08 + CMPQ DX, $0x08 JB emit_lit_memmove_standalone_memmove_move_5through7 JE emit_lit_memmove_standalone_memmove_move_8 - CMPQ BP, $0x10 + CMPQ DX, $0x10 JBE emit_lit_memmove_standalone_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ DX, $0x20 JBE emit_lit_memmove_standalone_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ DX, $0x40 JBE emit_lit_memmove_standalone_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ DX, $0x80 JBE 
emit_lit_memmove_standalone_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ DX, $0x00000100 JBE emit_lit_memmove_standalone_memmove_move_129through256 JMP emit_lit_memmove_standalone_memmove_move_256through2048 emit_lit_memmove_standalone_memmove_move_1or2: - MOVB (CX), DL - MOVB -1(CX)(BP*1), CL - MOVB DL, (AX) - MOVB CL, -1(AX)(BP*1) + MOVB (CX), BP + MOVB -1(CX)(DX*1), CL + MOVB BP, (AX) + MOVB CL, -1(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_4: - MOVL (CX), DX - MOVL DX, (AX) + MOVL (CX), BP + MOVL BP, (AX) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_3: - MOVW (CX), DX + MOVW (CX), BP MOVB 2(CX), CL - MOVW DX, (AX) + MOVW BP, (AX) MOVB CL, 2(AX) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_5through7: - MOVL (CX), DX - MOVL -4(CX)(BP*1), CX - MOVL DX, (AX) - MOVL CX, -4(AX)(BP*1) + MOVL (CX), BP + MOVL -4(CX)(DX*1), CX + MOVL BP, (AX) + MOVL CX, -4(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_8: - MOVQ (CX), DX - MOVQ DX, (AX) + MOVQ (CX), BP + MOVQ BP, (AX) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_9through16: - MOVQ (CX), DX - MOVQ -8(CX)(BP*1), CX - MOVQ DX, (AX) - MOVQ CX, -8(AX)(BP*1) + MOVQ (CX), BP + MOVQ -8(CX)(DX*1), CX + MOVQ BP, (AX) + MOVQ CX, -8(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_17through32: MOVOU (CX), X0 - MOVOU -16(CX)(BP*1), X1 + MOVOU -16(CX)(DX*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 - MOVOU -32(CX)(BP*1), X2 - MOVOU -16(CX)(BP*1), X3 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) JMP emit_literal_end_standalone 
emit_lit_memmove_standalone_memmove_move_65through128: @@ -21308,18 +21275,18 @@ emit_lit_memmove_standalone_memmove_move_65through128: MOVOU 16(CX), X1 MOVOU 32(CX), X2 MOVOU 48(CX), X3 - MOVOU -64(CX)(BP*1), X12 - MOVOU -48(CX)(BP*1), X13 - MOVOU -32(CX)(BP*1), X14 - MOVOU -16(CX)(BP*1), X15 + MOVOU -64(CX)(DX*1), X12 + MOVOU -48(CX)(DX*1), X13 + MOVOU -32(CX)(DX*1), X14 + MOVOU -16(CX)(DX*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(DX*1) + MOVOU X13, -48(AX)(DX*1) + MOVOU X14, -32(AX)(DX*1) + MOVOU X15, -16(AX)(DX*1) JMP emit_literal_end_standalone emit_lit_memmove_standalone_memmove_move_129through256: @@ -21331,14 +21298,14 @@ emit_lit_memmove_standalone_memmove_move_129through256: MOVOU 80(CX), X5 MOVOU 96(CX), X6 MOVOU 112(CX), X7 - MOVOU -128(CX)(BP*1), X8 - MOVOU -112(CX)(BP*1), X9 - MOVOU -96(CX)(BP*1), X10 - MOVOU -80(CX)(BP*1), X11 - MOVOU -64(CX)(BP*1), X12 - MOVOU -48(CX)(BP*1), X13 - MOVOU -32(CX)(BP*1), X14 - MOVOU -16(CX)(BP*1), X15 + MOVOU -128(CX)(DX*1), X8 + MOVOU -112(CX)(DX*1), X9 + MOVOU -96(CX)(DX*1), X10 + MOVOU -80(CX)(DX*1), X11 + MOVOU -64(CX)(DX*1), X12 + MOVOU -48(CX)(DX*1), X13 + MOVOU -32(CX)(DX*1), X14 + MOVOU -16(CX)(DX*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -21347,18 +21314,18 @@ emit_lit_memmove_standalone_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(DX*1) + MOVOU X9, -112(AX)(DX*1) + MOVOU X10, -96(AX)(DX*1) + MOVOU X11, -80(AX)(DX*1) + MOVOU X12, -64(AX)(DX*1) + MOVOU X13, -48(AX)(DX*1) + MOVOU X14, -32(AX)(DX*1) + MOVOU X15, -16(AX)(DX*1) JMP emit_literal_end_standalone 
emit_lit_memmove_standalone_memmove_move_256through2048: - LEAQ -256(BP), BP + LEAQ -256(DX), DX MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU 32(CX), X2 @@ -21391,7 +21358,7 @@ emit_lit_memmove_standalone_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 + CMPQ DX, $0x00000100 LEAQ 256(CX), CX LEAQ 256(AX), AX JGE emit_lit_memmove_standalone_memmove_move_256through2048 @@ -21456,86 +21423,85 @@ one_byte_standalone: ADDQ $0x01, AX memmove_standalone: - MOVL DX, BP NOP emit_lit_memmove_standalone_memmove_tail: - TESTQ BP, BP + TESTQ DX, DX JEQ emit_literal_end_avx_standalone - CMPQ BP, $0x02 + CMPQ DX, $0x02 JBE emit_lit_memmove_standalone_memmove_move_1or2 - CMPQ BP, $0x04 + CMPQ DX, $0x04 JB emit_lit_memmove_standalone_memmove_move_3 JBE emit_lit_memmove_standalone_memmove_move_4 - CMPQ BP, $0x08 + CMPQ DX, $0x08 JB emit_lit_memmove_standalone_memmove_move_5through7 JE emit_lit_memmove_standalone_memmove_move_8 - CMPQ BP, $0x10 + CMPQ DX, $0x10 JBE emit_lit_memmove_standalone_memmove_move_9through16 - CMPQ BP, $0x20 + CMPQ DX, $0x20 JBE emit_lit_memmove_standalone_memmove_move_17through32 - CMPQ BP, $0x40 + CMPQ DX, $0x40 JBE emit_lit_memmove_standalone_memmove_move_33through64 - CMPQ BP, $0x80 + CMPQ DX, $0x80 JBE emit_lit_memmove_standalone_memmove_move_65through128 - CMPQ BP, $0x00000100 + CMPQ DX, $0x00000100 JBE emit_lit_memmove_standalone_memmove_move_129through256 JMP emit_lit_memmove_standalone_memmove_avxUnaligned emit_lit_memmove_standalone_memmove_move_1or2: - MOVB (CX), DL - MOVB -1(CX)(BP*1), SI - MOVB DL, (AX) - MOVB SI, -1(AX)(BP*1) + MOVB (CX), BP + MOVB -1(CX)(DX*1), SI + MOVB BP, (AX) + MOVB SI, -1(AX)(DX*1) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_4: - MOVL (CX), DX - MOVL DX, (AX) + MOVL (CX), BP + MOVL BP, (AX) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_3: - MOVW (CX), DX + MOVW (CX), BP MOVB 2(CX), SI - MOVW DX, (AX) + MOVW 
BP, (AX) MOVB SI, 2(AX) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_5through7: - MOVL (CX), DX - MOVL -4(CX)(BP*1), SI - MOVL DX, (AX) - MOVL SI, -4(AX)(BP*1) + MOVL (CX), BP + MOVL -4(CX)(DX*1), SI + MOVL BP, (AX) + MOVL SI, -4(AX)(DX*1) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_8: - MOVQ (CX), DX - MOVQ DX, (AX) + MOVQ (CX), BP + MOVQ BP, (AX) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_9through16: - MOVQ (CX), DX - MOVQ -8(CX)(BP*1), SI - MOVQ DX, (AX) - MOVQ SI, -8(AX)(BP*1) + MOVQ (CX), BP + MOVQ -8(CX)(DX*1), SI + MOVQ BP, (AX) + MOVQ SI, -8(AX)(DX*1) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_17through32: MOVOU (CX), X0 - MOVOU -16(CX)(BP*1), X1 + MOVOU -16(CX)(DX*1), X1 MOVOU X0, (AX) - MOVOU X1, -16(AX)(BP*1) + MOVOU X1, -16(AX)(DX*1) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 - MOVOU -32(CX)(BP*1), X2 - MOVOU -16(CX)(BP*1), X3 + MOVOU -32(CX)(DX*1), X2 + MOVOU -16(CX)(DX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) - MOVOU X2, -32(AX)(BP*1) - MOVOU X3, -16(AX)(BP*1) + MOVOU X2, -32(AX)(DX*1) + MOVOU X3, -16(AX)(DX*1) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_65through128: @@ -21543,18 +21509,18 @@ emit_lit_memmove_standalone_memmove_move_65through128: MOVOU 16(CX), X1 MOVOU 32(CX), X2 MOVOU 48(CX), X3 - MOVOU -64(CX)(BP*1), X12 - MOVOU -48(CX)(BP*1), X13 - MOVOU -32(CX)(BP*1), X14 - MOVOU -16(CX)(BP*1), X15 + MOVOU -64(CX)(DX*1), X12 + MOVOU -48(CX)(DX*1), X13 + MOVOU -32(CX)(DX*1), X14 + MOVOU -16(CX)(DX*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) MOVOU X3, 48(AX) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X12, -64(AX)(DX*1) + MOVOU X13, -48(AX)(DX*1) + MOVOU X14, -32(AX)(DX*1) + MOVOU X15, -16(AX)(DX*1) JMP 
emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_129through256: @@ -21566,14 +21532,14 @@ emit_lit_memmove_standalone_memmove_move_129through256: MOVOU 80(CX), X5 MOVOU 96(CX), X6 MOVOU 112(CX), X7 - MOVOU -128(CX)(BP*1), X8 - MOVOU -112(CX)(BP*1), X9 - MOVOU -96(CX)(BP*1), X10 - MOVOU -80(CX)(BP*1), X11 - MOVOU -64(CX)(BP*1), X12 - MOVOU -48(CX)(BP*1), X13 - MOVOU -32(CX)(BP*1), X14 - MOVOU -16(CX)(BP*1), X15 + MOVOU -128(CX)(DX*1), X8 + MOVOU -112(CX)(DX*1), X9 + MOVOU -96(CX)(DX*1), X10 + MOVOU -80(CX)(DX*1), X11 + MOVOU -64(CX)(DX*1), X12 + MOVOU -48(CX)(DX*1), X13 + MOVOU -32(CX)(DX*1), X14 + MOVOU -16(CX)(DX*1), X15 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, 32(AX) @@ -21582,18 +21548,18 @@ emit_lit_memmove_standalone_memmove_move_129through256: MOVOU X5, 80(AX) MOVOU X6, 96(AX) MOVOU X7, 112(AX) - MOVOU X8, -128(AX)(BP*1) - MOVOU X9, -112(AX)(BP*1) - MOVOU X10, -96(AX)(BP*1) - MOVOU X11, -80(AX)(BP*1) - MOVOU X12, -64(AX)(BP*1) - MOVOU X13, -48(AX)(BP*1) - MOVOU X14, -32(AX)(BP*1) - MOVOU X15, -16(AX)(BP*1) + MOVOU X8, -128(AX)(DX*1) + MOVOU X9, -112(AX)(DX*1) + MOVOU X10, -96(AX)(DX*1) + MOVOU X11, -80(AX)(DX*1) + MOVOU X12, -64(AX)(DX*1) + MOVOU X13, -48(AX)(DX*1) + MOVOU X14, -32(AX)(DX*1) + MOVOU X15, -16(AX)(DX*1) JMP emit_literal_end_avx_standalone emit_lit_memmove_standalone_memmove_move_256through2048: - LEAQ -256(BP), BP + LEAQ -256(DX), DX MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU 32(CX), X2 @@ -21626,18 +21592,18 @@ emit_lit_memmove_standalone_memmove_move_256through2048: MOVOU X13, 208(AX) MOVOU X14, 224(AX) MOVOU X15, 240(AX) - CMPQ BP, $0x00000100 + CMPQ DX, $0x00000100 LEAQ 256(CX), CX LEAQ 256(AX), AX JGE emit_lit_memmove_standalone_memmove_move_256through2048 JMP emit_lit_memmove_standalone_memmove_tail emit_lit_memmove_standalone_memmove_avxUnaligned: - LEAQ (CX)(BP*1), SI + LEAQ (CX)(DX*1), SI MOVQ AX, R8 MOVOU -128(SI), X5 MOVOU -112(SI), X6 - MOVQ $0x00000080, DX + MOVQ $0x00000080, BP ANDQ $0xffffffe0, AX ADDQ $0x20, 
AX MOVOU -96(SI), X7 @@ -21646,38 +21612,38 @@ emit_lit_memmove_standalone_memmove_avxUnaligned: SUBQ R8, DI MOVOU -64(SI), X9 MOVOU -48(SI), X10 - SUBQ DI, BP + SUBQ DI, DX MOVOU -32(SI), X11 MOVOU -16(SI), X12 VMOVDQU (CX), Y4 ADDQ DI, CX - SUBQ DX, BP + SUBQ BP, DX emit_lit_memmove_standalone_memmove_gobble_128_loop: VMOVDQU (CX), Y0 VMOVDQU 32(CX), Y1 VMOVDQU 64(CX), Y2 VMOVDQU 96(CX), Y3 - ADDQ DX, CX + ADDQ BP, CX VMOVDQA Y0, (AX) VMOVDQA Y1, 32(AX) VMOVDQA Y2, 64(AX) VMOVDQA Y3, 96(AX) - ADDQ DX, AX - SUBQ DX, BP + ADDQ BP, AX + SUBQ BP, DX JA emit_lit_memmove_standalone_memmove_gobble_128_loop - ADDQ DX, BP - ADDQ AX, BP + ADDQ BP, DX + ADDQ AX, DX VMOVDQU Y4, (R8) VZEROUPPER - MOVOU X5, -128(BP) - MOVOU X6, -112(BP) - MOVOU X7, -96(BP) - MOVOU X8, -80(BP) - MOVOU X9, -64(BP) - MOVOU X10, -48(BP) - MOVOU X11, -32(BP) - MOVOU X12, -16(BP) + MOVOU X5, -128(DX) + MOVOU X6, -112(DX) + MOVOU X7, -96(DX) + MOVOU X8, -80(DX) + MOVOU X9, -64(DX) + MOVOU X10, -48(DX) + MOVOU X11, -32(DX) + MOVOU X12, -16(DX) emit_literal_end_avx_standalone: MOVQ BX, ret+48(FP) @@ -21856,7 +21822,7 @@ repeat_two_offset_standalone_emit_copy: ADDQ $0x02, BX ADDQ $0x02, AX JMP gen_emit_copy_end - JMP four_bytes_loop_back_standalone + JMP four_bytes_loop_back_standalone four_bytes_remain_standalone: TESTL DX, DX @@ -21949,7 +21915,7 @@ repeat_two_offset_standalone_emit_copy_short: ADDQ $0x02, BX ADDQ $0x02, AX JMP gen_emit_copy_end - JMP two_byte_offset_standalone + JMP two_byte_offset_standalone two_byte_offset_short_standalone: CMPL DX, $0x0c @@ -21979,6 +21945,76 @@ gen_emit_copy_end: MOVQ BX, ret+40(FP) RET +// func emitCopyNoRepeat(dst []byte, offset int, length int) int +TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48 + XORQ BX, BX + MOVQ dst_base+0(FP), AX + MOVQ offset+24(FP), CX + MOVQ length+32(FP), DX + CMPL CX, $0x00010000 + JL two_byte_offset_standalone_snappy + +four_bytes_loop_back_standalone_snappy: + CMPL DX, $0x40 + JLE four_bytes_remain_standalone_snappy + MOVB $0xff, (AX) 
+ MOVL CX, 1(AX) + LEAL -64(DX), DX + ADDQ $0x05, BX + ADDQ $0x05, AX + CMPL DX, $0x04 + JL four_bytes_remain_standalone_snappy + JMP four_bytes_loop_back_standalone_snappy + +four_bytes_remain_standalone_snappy: + TESTL DX, DX + JZ gen_emit_copy_end_snappy + MOVB $0x03, BP + LEAL -4(BP)(DX*4), DX + MOVB DL, (AX) + MOVL CX, 1(AX) + ADDQ $0x05, BX + ADDQ $0x05, AX + JMP gen_emit_copy_end_snappy + +two_byte_offset_standalone_snappy: + CMPL DX, $0x40 + JLE two_byte_offset_short_standalone_snappy + MOVB $0xee, (AX) + MOVW CX, 1(AX) + LEAL -60(DX), DX + ADDQ $0x03, AX + ADDQ $0x03, BX + JMP two_byte_offset_standalone_snappy + +two_byte_offset_short_standalone_snappy: + CMPL DX, $0x0c + JGE emit_copy_three_standalone_snappy + CMPL CX, $0x00000800 + JGE emit_copy_three_standalone_snappy + MOVB $0x01, BP + LEAL -16(BP)(DX*4), DX + MOVB CL, 1(AX) + SHRL $0x08, CX + SHLL $0x05, CX + ORL CX, DX + MOVB DL, (AX) + ADDQ $0x02, BX + ADDQ $0x02, AX + JMP gen_emit_copy_end_snappy + +emit_copy_three_standalone_snappy: + MOVB $0x02, BP + LEAL -4(BP)(DX*4), DX + MOVB DL, (AX) + MOVW CX, 1(AX) + ADDQ $0x03, BX + ADDQ $0x03, AX + +gen_emit_copy_end_snappy: + MOVQ BX, ret+40(FP) + RET + // func matchLen(a []byte, b []byte) int TEXT ·matchLen(SB), NOSPLIT, $0-56 MOVQ a_base+0(FP), AX diff --git a/s2/gen.go b/s2/gen.go index 2f9e833be2..74ea7bc0ba 100644 --- a/s2/gen.go +++ b/s2/gen.go @@ -42,6 +42,7 @@ func main() { genEmitLiteral() genEmitRepeat() genEmitCopy() + genEmitCopyNoRepeat() genMatchLen() Generate() } @@ -1110,6 +1111,37 @@ func genEmitCopy() { RET() } +// emitCopy writes a copy chunk and returns the number of bytes written. 
+// +// It assumes that: +// dst is long enough to hold the encoded bytes +// 1 <= offset && offset <= math.MaxUint32 +// 4 <= length && length <= 1 << 24 + +// genEmitCopyNoRepeat generates a standalone emitCopyNoRepeat (emitCopy in snappy mode, without repeat codes) +func genEmitCopyNoRepeat() { + TEXT("emitCopyNoRepeat", NOSPLIT, "func(dst []byte, offset, length int) int") + Doc("emitCopyNoRepeat writes a copy chunk and returns the number of bytes written.", "", + "It assumes that:", + " dst is long enough to hold the encoded bytes", + " 1 <= offset && offset <= math.MaxUint32", + " 4 <= length && length <= 1 << 24", "") + Pragma("noescape") + + dstBase, offset, length, retval := GP64(), GP64(), GP64(), GP64() + + // retval := 0 (running count of bytes written) + XORQ(retval, retval) + + Load(Param("dst").Base(), dstBase) + Load(Param("offset"), offset) + Load(Param("length"), length) + emitCopy("standalone_snappy", length, offset, retval, dstBase, "gen_emit_copy_end_snappy", true) + Label("gen_emit_copy_end_snappy") + Store(retval, ReturnIndex(0)) + RET() +} + const ( tagLiteral = 0x00 tagCopy1 = 0x01 From 5657952f31d3cf28ea4c43814c566ecabdae3e63 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Wed, 11 Mar 2020 09:12:30 +0100 Subject: [PATCH 2/2] Move code. --- s2/encode.go | 44 ++++++++++++++++++++++++++++++++++++++++++++ s2/encode_all.go | 44 -------------------------------------------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/s2/encode.go b/s2/encode.go index ff2f39b1bb..0d2c8e21df 100644 --- a/s2/encode.go +++ b/s2/encode.go @@ -100,6 +100,50 @@ func EncodeBetter(dst, src []byte) []byte { return dst[:d] } +// EncodeSnappy returns the encoded form of src. The returned slice may be a sub- +// slice of dst if dst was large enough to hold the entire encoded block. +// Otherwise, a newly allocated slice will be returned. +// +// The output is Snappy compatible and will likely decompress faster. +// +// The dst and src must not overlap. It is valid to pass a nil dst. 
+// +// The blocks will require the same amount of memory to decode as encoding, +// and do not allow for concurrent decoding. +// Also note that blocks do not contain CRC information, so corruption may be undetected. +// +// If you need to encode larger amounts of data, consider using +// the streaming interface which gives all of these features. +func EncodeSnappy(dst, src []byte) []byte { + if n := MaxEncodedLen(len(src)); n < 0 { + panic(ErrTooLarge) + } else if cap(dst) < n { + dst = make([]byte, n) + } else { + dst = dst[:n] + } + + // The block starts with the varint-encoded length of the decompressed bytes. + d := binary.PutUvarint(dst, uint64(len(src))) + + if len(src) == 0 { + return dst[:d] + } + if len(src) < minNonLiteralBlockSize { + d += emitLiteral(dst[d:], src) + return dst[:d] + } + + n := encodeBlockSnappy(dst[d:], src) + if n > 0 { + d += n + return dst[:d] + } + // Not compressible + d += emitLiteral(dst[d:], src) + return dst[:d] +} + // ConcatBlocks will concatenate the supplied blocks and append them to the supplied destination. // If the destination is nil or too small, a new will be allocated. // The blocks are not validated, so garbage in = garbage out. diff --git a/s2/encode_all.go b/s2/encode_all.go index b2d79085cc..237221b469 100644 --- a/s2/encode_all.go +++ b/s2/encode_all.go @@ -11,50 +11,6 @@ import ( "math/bits" ) -// EncodeSnappy returns the encoded form of src. The returned slice may be a sub- -// slice of dst if dst was large enough to hold the entire encoded block. -// Otherwise, a newly allocated slice will be returned. -// -// The output is Snappy compatible and will likely decompress faster. -// -// The dst and src must not overlap. It is valid to pass a nil dst. -// -// The blocks will require the same amount of memory to decode as encoding, -// and does not make for concurrent decoding. -// Also note that blocks do not contain CRC information, so corruption may be undetected. 
-// -// If you need to encode larger amounts of data, consider using -// the streaming interface which gives all of these features. -func EncodeSnappy(dst, src []byte) []byte { - if n := MaxEncodedLen(len(src)); n < 0 { - panic(ErrTooLarge) - } else if cap(dst) < n { - dst = make([]byte, n) - } else { - dst = dst[:n] - } - - // The block starts with the varint-encoded length of the decompressed bytes. - d := binary.PutUvarint(dst, uint64(len(src))) - - if len(src) == 0 { - return dst[:d] - } - if len(src) < minNonLiteralBlockSize { - d += emitLiteral(dst[d:], src) - return dst[:d] - } - - n := encodeBlockSnappy(dst[d:], src) - if n > 0 { - d += n - return dst[:d] - } - // Not compressible - d += emitLiteral(dst[d:], src) - return dst[:d] -} - func load32(b []byte, i int) uint32 { b = b[i:] b = b[:4]