diff --git a/s2/_generate/gen.go b/s2/_generate/gen.go index e191847695..56c5536059 100644 --- a/s2/_generate/gen.go +++ b/s2/_generate/gen.go @@ -2743,16 +2743,48 @@ func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) re } Label("avx2_continue_" + name) + Label("matchlen_loopback_16_" + name) + tmp2 := GP64() + CMPL(len.As32(), U8(16)) + JB(LabelRef("matchlen_match8_" + name)) + MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp) + MOVQ(Mem{Base: a, Index: matched, Scale: 1, Disp: 8}, tmp2) + XORQ(Mem{Base: b, Index: matched, Scale: 1}, tmp) + JNZ(LabelRef("matchlen_bsf_8_" + name)) + XORQ(Mem{Base: b, Index: matched, Scale: 1, Disp: 8}, tmp2) + JNZ(LabelRef("matchlen_bsf_16" + name)) + // All 8 byte matched, update and loop. + LEAL(Mem{Base: len, Disp: -16}, len.As32()) + LEAL(Mem{Base: matched, Disp: 16}, matched) + JMP(LabelRef("matchlen_loopback_16_" + name)) + + Label("matchlen_bsf_16" + name) + // Not all match. + Comment("#ifdef GOAMD64_v3") + // 2016 BMI :TZCNT r64, r64 L: 0.57ns= 2.0c T: 0.29ns= 1.00c + // 315 AMD64 :BSF r64, r64 L: 0.88ns= 3.1c T: 0.86ns= 3.00c + TZCNTQ(tmp2, tmp2) + Comment("#else") + BSFQ(tmp2, tmp2) + Comment("#endif") + + SARQ(U8(3), tmp2) + LEAL(Mem{Base: matched, Index: tmp2, Scale: 1, Disp: 8}, matched) + JMP(end) + + Label("matchlen_match8_" + name) CMPL(len.As32(), U8(8)) JB(LabelRef("matchlen_match4_" + name)) - - Label("matchlen_loopback_" + name) MOVQ(Mem{Base: a, Index: matched, Scale: 1}, tmp) XORQ(Mem{Base: b, Index: matched, Scale: 1}, tmp) - TESTQ(tmp, tmp) - JZ(LabelRef("matchlen_loop_" + name)) - // Not all match. + JNZ(LabelRef("matchlen_bsf_8_" + name)) + // All 8 byte matched, update and loop. + LEAL(Mem{Base: len, Disp: -8}, len.As32()) + LEAL(Mem{Base: matched, Disp: 8}, matched) + JMP(LabelRef("matchlen_match4_" + name)) + Label("matchlen_bsf_8_" + name) + // Not all match. Comment("#ifdef GOAMD64_v3") // 2016 BMI :TZCNT r64, r64 L: 0.57ns= 2.0c T: 0.29ns= 1.00c // 315 AMD64 :BSF r64, r64 L: 0.88ns= 3.1c T: 0.86ns= 3.00c @@ -2765,13 +2797,6 @@ func (o options) matchLen(name string, a, b, len reg.GPVirtual, end LabelRef) re LEAL(Mem{Base: matched, Index: tmp, Scale: 1}, matched) JMP(end) - // All 8 byte matched, update and loop. - Label("matchlen_loop_" + name) - LEAL(Mem{Base: len, Disp: -8}, len.As32()) - LEAL(Mem{Base: matched, Disp: 8}, matched) - CMPL(len.As32(), U8(8)) - JAE(LabelRef("matchlen_loopback_" + name)) - // Less than 8 bytes left. // Test 4 bytes... Label("matchlen_match4_" + name) diff --git a/s2/encodeblock_amd64.s b/s2/encodeblock_amd64.s index 54031aa313..5f110d1940 100644 --- a/s2/encodeblock_amd64.s +++ b/s2/encodeblock_amd64.s @@ -249,15 +249,43 @@ emit_literal_done_repeat_emit_encodeBlockAsm: // matchLen XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm + XORQ 8(BX)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm + +matchlen_bsf_16repeat_extend_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm + +matchlen_match8_repeat_extend_encodeBlockAsm: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm + MOVQ (R9)(R11*1), R10 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm -matchlen_loopback_repeat_extend_encodeBlockAsm: - MOVQ (R9)(R11*1), R10 - XORQ (BX)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBlockAsm - +matchlen_bsf_8_repeat_extend_encodeBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -269,12 +297,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm: LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm -matchlen_loop_repeat_extend_encodeBlockAsm: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JAE matchlen_loopback_repeat_extend_encodeBlockAsm - matchlen_match4_repeat_extend_encodeBlockAsm: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm @@ -851,15 +873,43 @@ match_nolit_loop_encodeBlockAsm: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeBlockAsm: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm + +matchlen_bsf_16match_nolit_encodeBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeBlockAsm + +matchlen_match8_match_nolit_encodeBlockAsm: CMPL SI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeBlockAsm -matchlen_loopback_match_nolit_encodeBlockAsm: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeBlockAsm - +matchlen_bsf_8_match_nolit_encodeBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -871,12 +921,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm: LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm -matchlen_loop_match_nolit_encodeBlockAsm: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_encodeBlockAsm - matchlen_match4_match_nolit_encodeBlockAsm: CMPL SI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm @@ -1610,15 +1654,43 @@ emit_literal_done_repeat_emit_encodeBlockAsm4MB: // matchLen XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm4MB + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB + XORQ 8(BX)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm4MB + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm4MB + +matchlen_bsf_16repeat_extend_encodeBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm4MB + +matchlen_match8_repeat_extend_encodeBlockAsm4MB: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm4MB + MOVQ (R9)(R11*1), R10 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm4MB -matchlen_loopback_repeat_extend_encodeBlockAsm4MB: - MOVQ (R9)(R11*1), R10 - XORQ (BX)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB - +matchlen_bsf_8_repeat_extend_encodeBlockAsm4MB: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -1630,12 +1702,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm4MB: LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm4MB -matchlen_loop_repeat_extend_encodeBlockAsm4MB: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JAE matchlen_loopback_repeat_extend_encodeBlockAsm4MB - matchlen_match4_repeat_extend_encodeBlockAsm4MB: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm4MB @@ -2162,15 +2228,43 @@ match_nolit_loop_encodeBlockAsm4MB: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeBlockAsm4MB: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm4MB + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm4MB + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm4MB + +matchlen_bsf_16match_nolit_encodeBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeBlockAsm4MB + +matchlen_match8_match_nolit_encodeBlockAsm4MB: CMPL SI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm4MB + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm4MB + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeBlockAsm4MB -matchlen_loopback_match_nolit_encodeBlockAsm4MB: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeBlockAsm4MB - +matchlen_bsf_8_match_nolit_encodeBlockAsm4MB: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -2182,12 +2276,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm4MB: LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm4MB -matchlen_loop_match_nolit_encodeBlockAsm4MB: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_encodeBlockAsm4MB - matchlen_match4_match_nolit_encodeBlockAsm4MB: CMPL SI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm4MB @@ -2873,15 +2961,43 @@ emit_literal_done_repeat_emit_encodeBlockAsm12B: // matchLen XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm12B: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm12B + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B + XORQ 8(BX)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm12B + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm12B + +matchlen_bsf_16repeat_extend_encodeBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm12B + +matchlen_match8_repeat_extend_encodeBlockAsm12B: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm12B + MOVQ (R9)(R11*1), R10 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm12B + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm12B -matchlen_loopback_repeat_extend_encodeBlockAsm12B: - MOVQ (R9)(R11*1), R10 - XORQ (BX)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBlockAsm12B - +matchlen_bsf_8_repeat_extend_encodeBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -2893,12 +3009,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm12B: LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm12B -matchlen_loop_repeat_extend_encodeBlockAsm12B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JAE matchlen_loopback_repeat_extend_encodeBlockAsm12B - matchlen_match4_repeat_extend_encodeBlockAsm12B: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm12B @@ -3303,15 +3413,43 @@ match_nolit_loop_encodeBlockAsm12B: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeBlockAsm12B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm12B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm12B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm12B + +matchlen_bsf_16match_nolit_encodeBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeBlockAsm12B + +matchlen_match8_match_nolit_encodeBlockAsm12B: CMPL SI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm12B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm12B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeBlockAsm12B -matchlen_loopback_match_nolit_encodeBlockAsm12B: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeBlockAsm12B - +matchlen_bsf_8_match_nolit_encodeBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -3323,12 +3461,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm12B: LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm12B -matchlen_loop_match_nolit_encodeBlockAsm12B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_encodeBlockAsm12B - matchlen_match4_match_nolit_encodeBlockAsm12B: CMPL SI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm12B @@ -3904,15 +4036,43 @@ emit_literal_done_repeat_emit_encodeBlockAsm10B: // matchLen XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm10B: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm10B + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B + XORQ 8(BX)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm10B + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm10B + +matchlen_bsf_16repeat_extend_encodeBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm10B + +matchlen_match8_repeat_extend_encodeBlockAsm10B: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm10B + MOVQ (R9)(R11*1), R10 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm10B + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm10B -matchlen_loopback_repeat_extend_encodeBlockAsm10B: - MOVQ (R9)(R11*1), R10 - XORQ (BX)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBlockAsm10B - +matchlen_bsf_8_repeat_extend_encodeBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -3924,12 +4084,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm10B: LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm10B -matchlen_loop_repeat_extend_encodeBlockAsm10B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JAE matchlen_loopback_repeat_extend_encodeBlockAsm10B - matchlen_match4_repeat_extend_encodeBlockAsm10B: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm10B @@ -4334,15 +4488,43 @@ match_nolit_loop_encodeBlockAsm10B: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeBlockAsm10B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm10B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm10B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm10B + +matchlen_bsf_16match_nolit_encodeBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeBlockAsm10B + +matchlen_match8_match_nolit_encodeBlockAsm10B: CMPL SI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm10B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm10B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeBlockAsm10B -matchlen_loopback_match_nolit_encodeBlockAsm10B: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeBlockAsm10B - +matchlen_bsf_8_match_nolit_encodeBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -4354,12 +4536,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm10B: LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm10B -matchlen_loop_match_nolit_encodeBlockAsm10B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_encodeBlockAsm10B - matchlen_match4_match_nolit_encodeBlockAsm10B: CMPL SI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm10B @@ -4935,15 +5111,43 @@ emit_literal_done_repeat_emit_encodeBlockAsm8B: // matchLen XORL R11, R11 + +matchlen_loopback_16_repeat_extend_encodeBlockAsm8B: + CMPL R8, $0x10 + JB matchlen_match8_repeat_extend_encodeBlockAsm8B + MOVQ (R9)(R11*1), R10 + MOVQ 8(R9)(R11*1), R12 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B + XORQ 8(BX)(R11*1), R12 + JNZ matchlen_bsf_16repeat_extend_encodeBlockAsm8B + LEAL -16(R8), R8 + LEAL 16(R11), R11 + JMP matchlen_loopback_16_repeat_extend_encodeBlockAsm8B + +matchlen_bsf_16repeat_extend_encodeBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP repeat_extend_forward_end_encodeBlockAsm8B + +matchlen_match8_repeat_extend_encodeBlockAsm8B: CMPL R8, $0x08 JB matchlen_match4_repeat_extend_encodeBlockAsm8B + MOVQ (R9)(R11*1), R10 + XORQ (BX)(R11*1), R10 + JNZ matchlen_bsf_8_repeat_extend_encodeBlockAsm8B + LEAL -8(R8), R8 + LEAL 8(R11), R11 + JMP matchlen_match4_repeat_extend_encodeBlockAsm8B -matchlen_loopback_repeat_extend_encodeBlockAsm8B: - MOVQ (R9)(R11*1), R10 - XORQ (BX)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_repeat_extend_encodeBlockAsm8B - +matchlen_bsf_8_repeat_extend_encodeBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -4955,12 +5159,6 @@ matchlen_loopback_repeat_extend_encodeBlockAsm8B: LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm8B -matchlen_loop_repeat_extend_encodeBlockAsm8B: - LEAL -8(R8), R8 - LEAL 8(R11), R11 - CMPL R8, $0x08 - JAE matchlen_loopback_repeat_extend_encodeBlockAsm8B - matchlen_match4_repeat_extend_encodeBlockAsm8B: CMPL R8, $0x04 JB matchlen_match2_repeat_extend_encodeBlockAsm8B @@ -5351,15 +5549,43 @@ match_nolit_loop_encodeBlockAsm8B: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeBlockAsm8B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeBlockAsm8B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeBlockAsm8B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeBlockAsm8B + +matchlen_bsf_16match_nolit_encodeBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeBlockAsm8B + +matchlen_match8_match_nolit_encodeBlockAsm8B: CMPL SI, $0x08 JB matchlen_match4_match_nolit_encodeBlockAsm8B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeBlockAsm8B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeBlockAsm8B -matchlen_loopback_match_nolit_encodeBlockAsm8B: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeBlockAsm8B - +matchlen_bsf_8_match_nolit_encodeBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -5371,12 +5597,6 @@ matchlen_loopback_match_nolit_encodeBlockAsm8B: LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm8B -matchlen_loop_match_nolit_encodeBlockAsm8B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_encodeBlockAsm8B - matchlen_match4_match_nolit_encodeBlockAsm8B: CMPL SI, $0x04 JB matchlen_match2_match_nolit_encodeBlockAsm8B @@ -5854,15 +6074,43 @@ match_dst_size_check_encodeBetterBlockAsm: // matchLen XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm + +matchlen_match8_match_nolit_encodeBetterBlockAsm: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm -matchlen_loopback_match_nolit_encodeBetterBlockAsm: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm - +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -5874,12 +6122,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm: LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm -matchlen_loop_match_nolit_encodeBetterBlockAsm: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 - JAE matchlen_loopback_match_nolit_encodeBetterBlockAsm - matchlen_match4_match_nolit_encodeBetterBlockAsm: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm @@ -6926,15 +7168,43 @@ match_dst_size_check_encodeBetterBlockAsm4MB: // matchLen XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm4MB + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm4MB + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm4MB: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm4MB + +matchlen_match8_match_nolit_encodeBetterBlockAsm4MB: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm4MB -matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB - +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm4MB: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -6946,12 +7216,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm4MB -matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 - JAE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB - matchlen_match4_match_nolit_encodeBetterBlockAsm4MB: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm4MB @@ -7924,15 +8188,43 @@ match_dst_size_check_encodeBetterBlockAsm12B: // matchLen XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm12B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm12B + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm12B + +matchlen_match8_match_nolit_encodeBetterBlockAsm12B: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm12B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm12B -matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B - +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -7944,12 +8236,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm12B -matchlen_loop_match_nolit_encodeBetterBlockAsm12B: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 - JAE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B - matchlen_match4_match_nolit_encodeBetterBlockAsm12B: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm12B @@ -8775,15 +9061,43 @@ match_dst_size_check_encodeBetterBlockAsm10B: // matchLen XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm10B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm10B + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm10B + +matchlen_match8_match_nolit_encodeBetterBlockAsm10B: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm10B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm10B -matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B - +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -8795,12 +9109,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm10B: LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm10B -matchlen_loop_match_nolit_encodeBetterBlockAsm10B: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 - JAE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B - matchlen_match4_match_nolit_encodeBetterBlockAsm10B: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm10B @@ -9626,15 +9934,43 @@ match_dst_size_check_encodeBetterBlockAsm8B: // matchLen XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeBetterBlockAsm8B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeBetterBlockAsm8B + +matchlen_bsf_16match_nolit_encodeBetterBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeBetterBlockAsm8B + +matchlen_match8_match_nolit_encodeBetterBlockAsm8B: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeBetterBlockAsm8B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeBetterBlockAsm8B -matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B - +matchlen_bsf_8_match_nolit_encodeBetterBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -9646,12 +9982,6 @@ matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm8B -matchlen_loop_match_nolit_encodeBetterBlockAsm8B: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 - JAE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B - matchlen_match4_match_nolit_encodeBetterBlockAsm8B: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeBetterBlockAsm8B @@ -10575,31 +10905,53 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm: // matchLen XORL R10, R10 - CMPL DI, $0x08 - JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm - -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm: #ifdef GOAMD64_v3 - TZCNTQ R9, R9 + TZCNTQ R11, R11 #else - BSFQ R9, R9 + BSFQ R11, R11 #endif - SARQ $0x03, R9 - LEAL (R10)(R9*1), R10 + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm -matchlen_loop_repeat_extend_encodeSnappyBlockAsm: +matchlen_match8_repeat_extend_encodeSnappyBlockAsm: + CMPL DI, $0x08 + JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm LEAL -8(DI), DI LEAL 8(R10), R10 - CMPL DI, $0x08 - JAE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm + +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R9, R9 + +#else + BSFQ R9, R9 + +#endif + SARQ $0x03, R9 + LEAL (R10)(R9*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_match4_repeat_extend_encodeSnappyBlockAsm: CMPL DI, $0x04 @@ -10897,15 +11249,43 @@ match_nolit_loop_encodeSnappyBlockAsm: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm + +matchlen_match8_match_nolit_encodeSnappyBlockAsm: CMPL SI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm -matchlen_loopback_match_nolit_encodeSnappyBlockAsm: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm - +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -10917,12 +11297,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm: LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm -matchlen_loop_match_nolit_encodeSnappyBlockAsm: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_encodeSnappyBlockAsm - matchlen_match4_match_nolit_encodeSnappyBlockAsm: CMPL SI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm @@ -11437,15 +11811,43 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K: // matchLen XORL R10, R10 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm64K + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm64K: CMPL DI, $0x08 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K - +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R9, R9 @@ -11457,12 +11859,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K: LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K -matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JAE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K - matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K: CMPL DI, $0x04 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K @@ -11719,15 +12115,43 @@ match_nolit_loop_encodeSnappyBlockAsm64K: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm64K + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm64K + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm64K + +matchlen_match8_match_nolit_encodeSnappyBlockAsm64K: CMPL SI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm64K + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm64K -matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K - +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -11739,12 +12163,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K: LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm64K -matchlen_loop_match_nolit_encodeSnappyBlockAsm64K: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K - matchlen_match4_match_nolit_encodeSnappyBlockAsm64K: CMPL SI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm64K @@ -12219,15 +12637,43 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: // matchLen XORL R10, R10 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm12B + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm12B: CMPL DI, $0x08 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B - +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 @@ -12239,12 +12685,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B -matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JAE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B - matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B: CMPL DI, $0x04 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B @@ -12501,15 +12941,43 @@ match_nolit_loop_encodeSnappyBlockAsm12B: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm12B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm12B + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm12B + +matchlen_match8_match_nolit_encodeSnappyBlockAsm12B: CMPL SI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm12B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm12B -matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B - +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -12521,12 +12989,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm12B -matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B - matchlen_match4_match_nolit_encodeSnappyBlockAsm12B: CMPL SI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm12B @@ -13001,15 +13463,43 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B: // matchLen XORL R10, R10 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm10B + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm10B: CMPL DI, $0x08 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B - +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 @@ -13021,12 +13511,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B: LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B -matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JAE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B - matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B: CMPL DI, $0x04 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B @@ -13283,15 +13767,43 @@ match_nolit_loop_encodeSnappyBlockAsm10B: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm10B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm10B + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm10B + +matchlen_match8_match_nolit_encodeSnappyBlockAsm10B: CMPL SI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm10B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm10B -matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B - +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -13303,12 +13815,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B: LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm10B -matchlen_loop_match_nolit_encodeSnappyBlockAsm10B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B - matchlen_match4_match_nolit_encodeSnappyBlockAsm10B: CMPL SI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm10B @@ -13783,15 +14289,43 @@ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: // matchLen XORL R10, R10 + +matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_encodeSnappyBlockAsm8B + +matchlen_bsf_16repeat_extend_encodeSnappyBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B + +matchlen_match8_repeat_extend_encodeSnappyBlockAsm8B: CMPL DI, $0x08 JB matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B -matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B - +matchlen_bsf_8_repeat_extend_encodeSnappyBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R9, R9 @@ -13803,12 +14337,6 @@ matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B -matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JAE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B - matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B: CMPL DI, $0x04 JB matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B @@ -14063,15 +14591,43 @@ match_nolit_loop_encodeSnappyBlockAsm8B: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBlockAsm8B + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBlockAsm8B + +matchlen_bsf_16match_nolit_encodeSnappyBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_encodeSnappyBlockAsm8B + +matchlen_match8_match_nolit_encodeSnappyBlockAsm8B: CMPL SI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBlockAsm8B + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_encodeSnappyBlockAsm8B -matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B - +matchlen_bsf_8_match_nolit_encodeSnappyBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -14083,12 +14639,6 @@ matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm8B -matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B - matchlen_match4_match_nolit_encodeSnappyBlockAsm8B: CMPL SI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBlockAsm8B @@ -14473,15 +15023,43 @@ match_dst_size_check_encodeSnappyBetterBlockAsm: // matchLen XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm - +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -14493,12 +15071,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm: LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeSnappyBetterBlockAsm -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 - JAE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm - matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm @@ -15096,15 +15668,43 @@ match_dst_size_check_encodeSnappyBetterBlockAsm64K: // matchLen XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm64K + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm64K: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm64K + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm64K: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K - +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm64K: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -15116,12 +15716,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeSnappyBetterBlockAsm64K -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 - JAE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K - matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K @@ -15654,15 +16248,43 @@ match_dst_size_check_encodeSnappyBetterBlockAsm12B: // matchLen XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm12B + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm12B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm12B + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm12B: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B - +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm12B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -15674,12 +16296,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeSnappyBetterBlockAsm12B -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 - JAE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B - matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B @@ -16212,15 +16828,43 @@ match_dst_size_check_encodeSnappyBetterBlockAsm10B: // matchLen XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm10B + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm10B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm10B + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm10B: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B - +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm10B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -16232,12 +16876,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeSnappyBetterBlockAsm10B -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 - JAE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B - matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B @@ -16770,15 +17408,43 @@ match_dst_size_check_encodeSnappyBetterBlockAsm8B: // matchLen XORL R11, R11 + +matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B: + CMPL DI, $0x10 + JB matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B + MOVQ (R8)(R11*1), R10 + MOVQ 8(R8)(R11*1), R12 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B + XORQ 8(R9)(R11*1), R12 + JNZ matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B + LEAL -16(DI), DI + LEAL 16(R11), R11 + JMP matchlen_loopback_16_match_nolit_encodeSnappyBetterBlockAsm8B + +matchlen_bsf_16match_nolit_encodeSnappyBetterBlockAsm8B: +#ifdef GOAMD64_v3 + TZCNTQ R12, R12 + +#else + BSFQ R12, R12 + +#endif + SARQ $0x03, R12 + LEAL 8(R11)(R12*1), R11 + JMP match_nolit_end_encodeSnappyBetterBlockAsm8B + +matchlen_match8_match_nolit_encodeSnappyBetterBlockAsm8B: CMPL DI, $0x08 JB matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B + MOVQ (R8)(R11*1), R10 + XORQ (R9)(R11*1), R10 + JNZ matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B + LEAL -8(DI), DI + LEAL 8(R11), R11 + JMP matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B -matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: - MOVQ (R8)(R11*1), R10 - XORQ (R9)(R11*1), R10 - TESTQ R10, R10 - JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B - +matchlen_bsf_8_match_nolit_encodeSnappyBetterBlockAsm8B: #ifdef GOAMD64_v3 TZCNTQ R10, R10 @@ -16790,12 +17456,6 @@ matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeSnappyBetterBlockAsm8B -matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B: - LEAL -8(DI), DI - LEAL 8(R11), R11 - CMPL DI, $0x08 - JAE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B - matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: CMPL DI, $0x04 JB matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B @@ -17343,15 +18003,43 @@ emit_literal_done_repeat_emit_calcBlockSize: // matchLen XORL R10, R10 + +matchlen_loopback_16_repeat_extend_calcBlockSize: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_calcBlockSize + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSize + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_calcBlockSize + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_calcBlockSize + +matchlen_bsf_16repeat_extend_calcBlockSize: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_calcBlockSize + +matchlen_match8_repeat_extend_calcBlockSize: CMPL DI, $0x08 JB matchlen_match4_repeat_extend_calcBlockSize + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSize + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_calcBlockSize -matchlen_loopback_repeat_extend_calcBlockSize: - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_calcBlockSize - +matchlen_bsf_8_repeat_extend_calcBlockSize: #ifdef GOAMD64_v3 TZCNTQ R9, R9 @@ -17363,12 +18051,6 @@ matchlen_loopback_repeat_extend_calcBlockSize: LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_calcBlockSize -matchlen_loop_repeat_extend_calcBlockSize: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JAE matchlen_loopback_repeat_extend_calcBlockSize - matchlen_match4_repeat_extend_calcBlockSize: CMPL DI, $0x04 JB matchlen_match2_repeat_extend_calcBlockSize @@ -17554,15 +18236,43 @@ match_nolit_loop_calcBlockSize: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_calcBlockSize: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_calcBlockSize + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_calcBlockSize + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_calcBlockSize + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_calcBlockSize + +matchlen_bsf_16match_nolit_calcBlockSize: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_calcBlockSize + +matchlen_match8_match_nolit_calcBlockSize: CMPL SI, $0x08 JB matchlen_match4_match_nolit_calcBlockSize + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_calcBlockSize + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_calcBlockSize -matchlen_loopback_match_nolit_calcBlockSize: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_calcBlockSize - +matchlen_bsf_8_match_nolit_calcBlockSize: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -17574,12 +18284,6 @@ matchlen_loopback_match_nolit_calcBlockSize: LEAL (R9)(R8*1), R9 JMP match_nolit_end_calcBlockSize -matchlen_loop_match_nolit_calcBlockSize: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_calcBlockSize - matchlen_match4_match_nolit_calcBlockSize: CMPL SI, $0x04 JB matchlen_match2_match_nolit_calcBlockSize @@ -17872,15 +18576,43 @@ emit_literal_done_repeat_emit_calcBlockSizeSmall: // matchLen XORL R10, R10 + +matchlen_loopback_16_repeat_extend_calcBlockSizeSmall: + CMPL DI, $0x10 + JB matchlen_match8_repeat_extend_calcBlockSizeSmall + MOVQ (R8)(R10*1), R9 + MOVQ 8(R8)(R10*1), R11 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall + XORQ 8(BX)(R10*1), R11 + JNZ matchlen_bsf_16repeat_extend_calcBlockSizeSmall + LEAL -16(DI), DI + LEAL 16(R10), R10 + JMP matchlen_loopback_16_repeat_extend_calcBlockSizeSmall + +matchlen_bsf_16repeat_extend_calcBlockSizeSmall: +#ifdef GOAMD64_v3 + TZCNTQ R11, R11 + +#else + BSFQ R11, R11 + +#endif + SARQ $0x03, R11 + LEAL 8(R10)(R11*1), R10 + JMP repeat_extend_forward_end_calcBlockSizeSmall + +matchlen_match8_repeat_extend_calcBlockSizeSmall: CMPL DI, $0x08 JB matchlen_match4_repeat_extend_calcBlockSizeSmall + MOVQ (R8)(R10*1), R9 + XORQ (BX)(R10*1), R9 + JNZ matchlen_bsf_8_repeat_extend_calcBlockSizeSmall + LEAL -8(DI), DI + LEAL 8(R10), R10 + JMP matchlen_match4_repeat_extend_calcBlockSizeSmall -matchlen_loopback_repeat_extend_calcBlockSizeSmall: - MOVQ (R8)(R10*1), R9 - XORQ (BX)(R10*1), R9 - TESTQ R9, R9 - JZ matchlen_loop_repeat_extend_calcBlockSizeSmall - +matchlen_bsf_8_repeat_extend_calcBlockSizeSmall: #ifdef GOAMD64_v3 TZCNTQ R9, R9 @@ -17892,12 +18624,6 @@ matchlen_loopback_repeat_extend_calcBlockSizeSmall: LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_calcBlockSizeSmall -matchlen_loop_repeat_extend_calcBlockSizeSmall: - LEAL -8(DI), DI - LEAL 8(R10), R10 - CMPL DI, $0x08 - JAE matchlen_loopback_repeat_extend_calcBlockSizeSmall - matchlen_match4_repeat_extend_calcBlockSizeSmall: CMPL DI, $0x04 JB matchlen_match2_repeat_extend_calcBlockSizeSmall @@ -18053,15 +18779,43 @@ match_nolit_loop_calcBlockSizeSmall: // matchLen XORL R9, R9 + +matchlen_loopback_16_match_nolit_calcBlockSizeSmall: + CMPL SI, $0x10 + JB matchlen_match8_match_nolit_calcBlockSizeSmall + MOVQ (DI)(R9*1), R8 + MOVQ 8(DI)(R9*1), R10 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall + XORQ 8(BX)(R9*1), R10 + JNZ matchlen_bsf_16match_nolit_calcBlockSizeSmall + LEAL -16(SI), SI + LEAL 16(R9), R9 + JMP matchlen_loopback_16_match_nolit_calcBlockSizeSmall + +matchlen_bsf_16match_nolit_calcBlockSizeSmall: +#ifdef GOAMD64_v3 + TZCNTQ R10, R10 + +#else + BSFQ R10, R10 + +#endif + SARQ $0x03, R10 + LEAL 8(R9)(R10*1), R9 + JMP match_nolit_end_calcBlockSizeSmall + +matchlen_match8_match_nolit_calcBlockSizeSmall: CMPL SI, $0x08 JB matchlen_match4_match_nolit_calcBlockSizeSmall + MOVQ (DI)(R9*1), R8 + XORQ (BX)(R9*1), R8 + JNZ matchlen_bsf_8_match_nolit_calcBlockSizeSmall + LEAL -8(SI), SI + LEAL 8(R9), R9 + JMP matchlen_match4_match_nolit_calcBlockSizeSmall -matchlen_loopback_match_nolit_calcBlockSizeSmall: - MOVQ (DI)(R9*1), R8 - XORQ (BX)(R9*1), R8 - TESTQ R8, R8 - JZ matchlen_loop_match_nolit_calcBlockSizeSmall - +matchlen_bsf_8_match_nolit_calcBlockSizeSmall: #ifdef GOAMD64_v3 TZCNTQ R8, R8 @@ -18073,12 +18827,6 @@ matchlen_loopback_match_nolit_calcBlockSizeSmall: LEAL (R9)(R8*1), R9 JMP match_nolit_end_calcBlockSizeSmall -matchlen_loop_match_nolit_calcBlockSizeSmall: - LEAL -8(SI), SI - LEAL 8(R9), R9 - CMPL SI, $0x08 - JAE matchlen_loopback_match_nolit_calcBlockSizeSmall - matchlen_match4_match_nolit_calcBlockSizeSmall: CMPL SI, $0x04 JB matchlen_match2_match_nolit_calcBlockSizeSmall @@ -18840,15 +19588,43 @@ TEXT ·matchLen(SB), NOSPLIT, $0-56 // matchLen XORL SI, SI + +matchlen_loopback_16_standalone: + CMPL DX, $0x10 + JB matchlen_match8_standalone + MOVQ (AX)(SI*1), BX + MOVQ 8(AX)(SI*1), DI + XORQ (CX)(SI*1), BX + JNZ matchlen_bsf_8_standalone + XORQ 8(CX)(SI*1), DI + JNZ matchlen_bsf_16standalone + LEAL -16(DX), DX + LEAL 16(SI), SI + JMP matchlen_loopback_16_standalone + +matchlen_bsf_16standalone: +#ifdef GOAMD64_v3 + TZCNTQ DI, DI + +#else + BSFQ DI, DI + +#endif + SARQ $0x03, DI + LEAL 8(SI)(DI*1), SI + JMP gen_match_len_end + +matchlen_match8_standalone: CMPL DX, $0x08 JB matchlen_match4_standalone + MOVQ (AX)(SI*1), BX + XORQ (CX)(SI*1), BX + JNZ matchlen_bsf_8_standalone + LEAL -8(DX), DX + LEAL 8(SI), SI + JMP matchlen_match4_standalone -matchlen_loopback_standalone: - MOVQ (AX)(SI*1), BX - XORQ (CX)(SI*1), BX - TESTQ BX, BX - JZ matchlen_loop_standalone - +matchlen_bsf_8_standalone: #ifdef GOAMD64_v3 TZCNTQ BX, BX @@ -18860,12 +19636,6 @@ matchlen_loopback_standalone: LEAL (SI)(BX*1), SI JMP gen_match_len_end -matchlen_loop_standalone: - LEAL -8(DX), DX - LEAL 8(SI), SI - CMPL DX, $0x08 - JAE matchlen_loopback_standalone - matchlen_match4_standalone: CMPL DX, $0x04 JB matchlen_match2_standalone