diff --git a/salsa20/salsa/_asm/go.mod b/salsa20/salsa/_asm/go.mod new file mode 100644 index 0000000000..0cf7f76881 --- /dev/null +++ b/salsa20/salsa/_asm/go.mod @@ -0,0 +1,14 @@ +module salsa20/salsa/_asm + +go 1.23 + +require ( + github.com/mmcloughlin/avo v0.6.0 + golang.org/x/crypto v0.26.0 +) + +require ( + golang.org/x/mod v0.20.0 // indirect + golang.org/x/sync v0.8.0 // indirect + golang.org/x/tools v0.24.0 // indirect +) diff --git a/salsa20/salsa/_asm/go.sum b/salsa20/salsa/_asm/go.sum new file mode 100644 index 0000000000..e5970800fb --- /dev/null +++ b/salsa20/salsa/_asm/go.sum @@ -0,0 +1,10 @@ +github.com/mmcloughlin/avo v0.6.0 h1:QH6FU8SKoTLaVs80GA8TJuLNkUYl4VokHKlPhVDg4YY= +github.com/mmcloughlin/avo v0.6.0/go.mod h1:8CoAGaCSYXtCPR+8y18Y9aB/kxb8JSS6FRI7mSkvD+8= +golang.org/x/crypto v0.26.0 h1:RrRspgV4mU+YwB4FYnuBoKsUapNIL5cohGAmSH3azsw= +golang.org/x/crypto v0.26.0/go.mod h1:GY7jblb9wI+FOo5y8/S2oY4zWP07AkOJ4+jxCqdqn54= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/tools v0.24.0 h1:J1shsA93PJUEVaUSaay7UXAyE8aimq3GW0pjlolpa24= +golang.org/x/tools v0.24.0/go.mod h1:YhNqVBIfWHdzvTLs0d8LCuMhkKUgSUKldakyV7W/WDQ= diff --git a/salsa20/salsa/_asm/salsa20_amd64_asm.go b/salsa20/salsa/_asm/salsa20_amd64_asm.go new file mode 100644 index 0000000000..6546791c4c --- /dev/null +++ b/salsa20/salsa/_asm/salsa20_amd64_asm.go @@ -0,0 +1,932 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// This code was translated into a form compatible with 6a from the public +// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html + +package main + +import ( + . 
"github.com/mmcloughlin/avo/build" + "github.com/mmcloughlin/avo/ir" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + _ "golang.org/x/crypto/salsa20/salsa" +) + +//go:generate go run . -out ../salsa20_amd64.s -pkg salsa + +func main() { + Package("golang.org/x/crypto/salsa20/salsa") + ConstraintExpr("amd64,!purego,gc") + salsa2020XORKeyStream() + Generate() +} + +func salsa2020XORKeyStream() { + Implement("salsa2020XORKeyStream") + Attributes(0) + AllocLocal(456) // frame = 424 + 32 byte alignment + Comment("This needs up to 64 bytes at 360(R12); hence the non-obvious frame size.") + + Load(Param("out"), RDI) + Load(Param("in"), RSI) + Load(Param("n"), RDX) + Load(Param("nonce"), RCX) + Load(Param("key"), R8) + + MOVQ(RSP, R12) + ADDQ(Imm(31), R12) + ANDQ(I32(^31), R12) + + MOVQ(RDX, R9) + MOVQ(RCX, RDX) + MOVQ(R8, R10) + CMPQ(R9, Imm(0)) + JBE(LabelRef("DONE")) + + START() + BYTESATLEAST256() + MAINLOOP1() + BYTESBETWEEN1AND255() + NOCOPY() + MAINLOOP2() + + Label("BYTESATLEAST64") + Label("DONE") + RET() + Label("BYTESATLEAST65") + SUBQ(Imm(64), R9) + ADDQ(Imm(64), RDI) + ADDQ(Imm(64), RSI) + JMP(LabelRef("BYTESBETWEEN1AND255")) +} + +func START() { + Label("START") + MOVL(Mem{Base: R10}.Offset(20), ECX) + MOVL(Mem{Base: R10}.Offset(0), R8L) + MOVL(Mem{Base: EDX}.Offset(0), EAX) + MOVL(Mem{Base: R10}.Offset(16), R11L) + MOVL(ECX, Mem{Base: R12}.Offset(0)) + MOVL(R8L, Mem{Base: R12}.Offset(4)) + MOVL(EAX, Mem{Base: R12}.Offset(8)) + MOVL(R11L, Mem{Base: R12}.Offset(12)) + MOVL(Mem{Base: EDX}.Offset(8), ECX) + MOVL(Mem{Base: R10}.Offset(24), R8L) + MOVL(Mem{Base: R10}.Offset(4), EAX) + MOVL(Mem{Base: EDX}.Offset(4), R11L) + MOVL(ECX, Mem{Base: R12}.Offset(16)) + MOVL(R8L, Mem{Base: R12}.Offset(20)) + MOVL(EAX, Mem{Base: R12}.Offset(24)) + MOVL(R11L, Mem{Base: R12}.Offset(28)) + MOVL(Mem{Base: EDX}.Offset(12), ECX) + MOVL(Mem{Base: R10}.Offset(12), EDX) + MOVL(Mem{Base: R10}.Offset(28), R8L) + MOVL(Mem{Base: R10}.Offset(8), EAX) + 
// START emits the state-setup section. It gathers 32-bit words from the key
// (R10) and the nonce (RDX) plus four constants into a 64-byte state block at
// 0..60(R12). If fewer than 256 bytes remain (R9) it branches to
// BYTESBETWEEN1AND255; otherwise it broadcasts state words across all four
// lanes of an XMM register and stores the results at 64..272(R12) for the
// four-blocks-at-a-time path that follows.
func START() {
	Label("START")
	// State bytes 0..15: key[20:24], key[0:4], nonce[0:4], key[16:20].
	MOVL(Mem{Base: R10}.Offset(20), ECX)
	MOVL(Mem{Base: R10}.Offset(0), R8L)
	MOVL(Mem{Base: EDX}.Offset(0), EAX)
	MOVL(Mem{Base: R10}.Offset(16), R11L)
	MOVL(ECX, Mem{Base: R12}.Offset(0))
	MOVL(R8L, Mem{Base: R12}.Offset(4))
	MOVL(EAX, Mem{Base: R12}.Offset(8))
	MOVL(R11L, Mem{Base: R12}.Offset(12))
	// State bytes 16..31: nonce[8:12], key[24:28], key[4:8], nonce[4:8].
	MOVL(Mem{Base: EDX}.Offset(8), ECX)
	MOVL(Mem{Base: R10}.Offset(24), R8L)
	MOVL(Mem{Base: R10}.Offset(4), EAX)
	MOVL(Mem{Base: EDX}.Offset(4), R11L)
	MOVL(ECX, Mem{Base: R12}.Offset(16))
	MOVL(R8L, Mem{Base: R12}.Offset(20))
	MOVL(EAX, Mem{Base: R12}.Offset(24))
	MOVL(R11L, Mem{Base: R12}.Offset(28))
	// State bytes 32..47: key[12:16], nonce[12:16], key[28:32], key[8:12].
	// The nonce pointer in RDX is overwritten by the second load below; the
	// last nonce read has already happened by then.
	MOVL(Mem{Base: EDX}.Offset(12), ECX)
	MOVL(Mem{Base: R10}.Offset(12), EDX)
	MOVL(Mem{Base: R10}.Offset(28), R8L)
	MOVL(Mem{Base: R10}.Offset(8), EAX)
	MOVL(EDX, Mem{Base: R12}.Offset(32))
	MOVL(ECX, Mem{Base: R12}.Offset(36))
	MOVL(R8L, Mem{Base: R12}.Offset(40))
	MOVL(EAX, Mem{Base: R12}.Offset(44))
	// State bytes 48..63: the four constants 0x61707865, 0x3320646e,
	// 0x79622d32, 0x6b206574 (ASCII "expand 32-byte k" read as
	// little-endian words).
	MOVQ(Imm(1634760805), RDX)
	MOVQ(Imm(857760878), RCX)
	MOVQ(Imm(2036477234), R8)
	MOVQ(Imm(1797285236), RAX)
	MOVL(EDX, Mem{Base: R12}.Offset(48))
	MOVL(ECX, Mem{Base: R12}.Offset(52))
	MOVL(R8L, Mem{Base: R12}.Offset(56))
	MOVL(EAX, Mem{Base: R12}.Offset(60))
	// Short inputs skip the broadcast setup entirely.
	CMPQ(R9, U32(256))
	JB(LabelRef("BYTESBETWEEN1AND255"))
	// PSHUFL with 0x00/0x55/0xAA/0xFF replicates word 0/1/2/3 of the source
	// into all four lanes. Broadcast the words of 48(R12)...
	MOVOA(Mem{Base: R12}.Offset(48), X0)
	PSHUFL(Imm(0x55), X0, X1)
	PSHUFL(Imm(0xAA), X0, X2)
	PSHUFL(Imm(0xFF), X0, X3)
	PSHUFL(Imm(0x00), X0, X0)
	MOVOA(X1, Mem{Base: R12}.Offset(64))
	MOVOA(X2, Mem{Base: R12}.Offset(80))
	MOVOA(X3, Mem{Base: R12}.Offset(96))
	MOVOA(X0, Mem{Base: R12}.Offset(112))
	// ...of 0(R12)...
	MOVOA(Mem{Base: R12}.Offset(0), X0)
	PSHUFL(Imm(0xAA), X0, X1)
	PSHUFL(Imm(0xFF), X0, X2)
	PSHUFL(Imm(0x00), X0, X3)
	PSHUFL(Imm(0x55), X0, X0)
	MOVOA(X1, Mem{Base: R12}.Offset(128))
	MOVOA(X2, Mem{Base: R12}.Offset(144))
	MOVOA(X3, Mem{Base: R12}.Offset(160))
	MOVOA(X0, Mem{Base: R12}.Offset(176))
	// ...of 16(R12), except word 0 (state offset 16)...
	MOVOA(Mem{Base: R12}.Offset(16), X0)
	PSHUFL(Imm(0xFF), X0, X1)
	PSHUFL(Imm(0x55), X0, X2)
	PSHUFL(Imm(0xAA), X0, X0)
	MOVOA(X1, Mem{Base: R12}.Offset(192))
	MOVOA(X2, Mem{Base: R12}.Offset(208))
	MOVOA(X0, Mem{Base: R12}.Offset(224))
	// ...and of 32(R12), except word 1 (state offset 36). The two skipped
	// words are the ones BYTESATLEAST256 expands per lane into 288(R12) and
	// 304(R12).
	MOVOA(Mem{Base: R12}.Offset(32), X0)
	PSHUFL(Imm(0x00), X0, X1)
	PSHUFL(Imm(0xAA), X0, X2)
	PSHUFL(Imm(0xFF), X0, X0)
	MOVOA(X1, Mem{Base: R12}.Offset(240))
	MOVOA(X2, Mem{Base: R12}.Offset(256))
	MOVOA(X0, Mem{Base: R12}.Offset(272))

}
// BYTESATLEAST256 emits the per-iteration setup for the path that handles 256
// bytes at a time. It treats the words at 16(R12) (low) and 36(R12) (high) as
// one 64-bit counter, writes counter+0..counter+3 into the four lanes at
// 288(R12) (low halves) and 304(R12) (high halves), stores counter+4 back to
// the state, saves the remaining byte count, and loads the sixteen working
// vectors X0..X15 for MAINLOOP1.
func BYTESATLEAST256() {
	Label("BYTESATLEAST256")
	// Lane 0 gets the current counter value.
	MOVL(Mem{Base: R12}.Offset(16), EDX)
	MOVL(Mem{Base: R12}.Offset(36), ECX)
	MOVL(EDX, Mem{Base: R12}.Offset(288))
	MOVL(ECX, Mem{Base: R12}.Offset(304))
	// Combine the halves into RDX so the increments carry into the high word.
	SHLQ(Imm(32), RCX)
	ADDQ(RCX, RDX)
	// Lane 1: counter + 1.
	ADDQ(Imm(1), RDX)
	MOVQ(RDX, RCX)
	SHRQ(Imm(32), RCX)
	MOVL(EDX, Mem{Base: R12}.Offset(292))
	MOVL(ECX, Mem{Base: R12}.Offset(308))
	// Lane 2: counter + 2.
	ADDQ(Imm(1), RDX)
	MOVQ(RDX, RCX)
	SHRQ(Imm(32), RCX)
	MOVL(EDX, Mem{Base: R12}.Offset(296))
	MOVL(ECX, Mem{Base: R12}.Offset(312))
	// Lane 3: counter + 3.
	ADDQ(Imm(1), RDX)
	MOVQ(RDX, RCX)
	SHRQ(Imm(32), RCX)
	MOVL(EDX, Mem{Base: R12}.Offset(300))
	MOVL(ECX, Mem{Base: R12}.Offset(316))
	// Store counter + 4 back into the state for the next pass.
	ADDQ(Imm(1), RDX)
	MOVQ(RDX, RCX)
	SHRQ(Imm(32), RCX)
	MOVL(EDX, Mem{Base: R12}.Offset(16))
	MOVL(ECX, Mem{Base: R12}.Offset(36))
	// R9 is reused as a temporary when MAINLOOP1 writes its output, so the
	// remaining byte count is parked at 352(R12).
	MOVQ(R9, Mem{Base: R12}.Offset(352))
	// RDX becomes the loop counter for MAINLOOP1, which subtracts 2 per pass.
	MOVQ(U32(20), RDX)
	MOVOA(Mem{Base: R12}.Offset(64), X0)
	MOVOA(Mem{Base: R12}.Offset(80), X1)
	MOVOA(Mem{Base: R12}.Offset(96), X2)
	MOVOA(Mem{Base: R12}.Offset(256), X3)
	MOVOA(Mem{Base: R12}.Offset(272), X4)
	MOVOA(Mem{Base: R12}.Offset(128), X5)
	MOVOA(Mem{Base: R12}.Offset(144), X6)
	MOVOA(Mem{Base: R12}.Offset(176), X7)
	MOVOA(Mem{Base: R12}.Offset(192), X8)
	MOVOA(Mem{Base: R12}.Offset(208), X9)
	MOVOA(Mem{Base: R12}.Offset(224), X10)
	MOVOA(Mem{Base: R12}.Offset(304), X11)
	MOVOA(Mem{Base: R12}.Offset(112), X12)
	MOVOA(Mem{Base: R12}.Offset(160), X13)
	MOVOA(Mem{Base: R12}.Offset(240), X14)
	MOVOA(Mem{Base: R12}.Offset(288), X15)
}
// MAINLOOP1 emits the loop that processes 256 bytes per pass, with each of
// the four 32-bit lanes of X0..X15 carrying one 64-byte block.
//
// The loop body is a sequence of add/shift/xor steps: a sum of two vectors is
// shifted left by k and right by 32-k (k = 7, 9, 13, 18) and both halves are
// XORed into a third vector, i.e. the target is XORed with the sum rotated
// left by k. Sixteen vectors plus two temporaries do not fit in sixteen XMM
// registers, so two values at a time are spilled to 320(R12) and 336(R12).
// RDX starts at 20 and is decremented by 2 per pass.
//
// After the loop, the broadcast input state saved at 64..304(R12) is added
// back, and the result is XORed with the input at SI and stored at DI, one
// 32-bit word at a time. Finally the byte count is restored and reduced by
// 256; the code loops to BYTESATLEAST256 while at least 256 bytes remain,
// jumps to DONE when none remain, and otherwise falls through to the section
// emitted next (BYTESBETWEEN1AND255).
func MAINLOOP1() {
	Label("MAINLOOP1")
	// Free X1 and X2 for use as temporaries.
	MOVOA(X1, Mem{Base: R12}.Offset(320))
	MOVOA(X2, Mem{Base: R12}.Offset(336))
	MOVOA(X13, X1)
	PADDL(X12, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(7), X1)
	PXOR(X1, X14)
	PSRLL(Imm(25), X2)
	PXOR(X2, X14)
	MOVOA(X7, X1)
	PADDL(X0, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(7), X1)
	PXOR(X1, X11)
	PSRLL(Imm(25), X2)
	PXOR(X2, X11)
	MOVOA(X12, X1)
	PADDL(X14, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(9), X1)
	PXOR(X1, X15)
	PSRLL(Imm(23), X2)
	PXOR(X2, X15)
	MOVOA(X0, X1)
	PADDL(X11, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(9), X1)
	PXOR(X1, X9)
	PSRLL(Imm(23), X2)
	PXOR(X2, X9)
	MOVOA(X14, X1)
	PADDL(X15, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(13), X1)
	PXOR(X1, X13)
	PSRLL(Imm(19), X2)
	PXOR(X2, X13)
	MOVOA(X11, X1)
	PADDL(X9, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(13), X1)
	PXOR(X1, X7)
	PSRLL(Imm(19), X2)
	PXOR(X2, X7)
	MOVOA(X15, X1)
	PADDL(X13, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(18), X1)
	PXOR(X1, X12)
	PSRLL(Imm(14), X2)
	PXOR(X2, X12)
	// Swap: reload the value spilled at 320(R12) and spill X12 in its place.
	MOVOA(Mem{Base: R12}.Offset(320), X1)
	MOVOA(X12, Mem{Base: R12}.Offset(320))
	MOVOA(X9, X2)
	PADDL(X7, X2)
	MOVOA(X2, X12)
	PSLLL(Imm(18), X2)
	PXOR(X2, X0)
	PSRLL(Imm(14), X12)
	PXOR(X12, X0)
	MOVOA(X5, X2)
	PADDL(X1, X2)
	MOVOA(X2, X12)
	PSLLL(Imm(7), X2)
	PXOR(X2, X3)
	PSRLL(Imm(25), X12)
	PXOR(X12, X3)
	// Swap: reload the value spilled at 336(R12) and spill X0 in its place.
	MOVOA(Mem{Base: R12}.Offset(336), X2)
	MOVOA(X0, Mem{Base: R12}.Offset(336))
	MOVOA(X6, X0)
	PADDL(X2, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(7), X0)
	PXOR(X0, X4)
	PSRLL(Imm(25), X12)
	PXOR(X12, X4)
	MOVOA(X1, X0)
	PADDL(X3, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(9), X0)
	PXOR(X0, X10)
	PSRLL(Imm(23), X12)
	PXOR(X12, X10)
	MOVOA(X2, X0)
	PADDL(X4, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(9), X0)
	PXOR(X0, X8)
	PSRLL(Imm(23), X12)
	PXOR(X12, X8)
	MOVOA(X3, X0)
	PADDL(X10, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(13), X0)
	PXOR(X0, X5)
	PSRLL(Imm(19), X12)
	PXOR(X12, X5)
	MOVOA(X4, X0)
	PADDL(X8, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(13), X0)
	PXOR(X0, X6)
	PSRLL(Imm(19), X12)
	PXOR(X12, X6)
	MOVOA(X10, X0)
	PADDL(X5, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(18), X0)
	PXOR(X0, X1)
	PSRLL(Imm(14), X12)
	PXOR(X12, X1)
	// Swap the 320(R12) spill slot with X1.
	MOVOA(Mem{Base: R12}.Offset(320), X0)
	MOVOA(X1, Mem{Base: R12}.Offset(320))
	MOVOA(X4, X1)
	PADDL(X0, X1)
	MOVOA(X1, X12)
	PSLLL(Imm(7), X1)
	PXOR(X1, X7)
	PSRLL(Imm(25), X12)
	PXOR(X12, X7)
	MOVOA(X8, X1)
	PADDL(X6, X1)
	MOVOA(X1, X12)
	PSLLL(Imm(18), X1)
	PXOR(X1, X2)
	PSRLL(Imm(14), X12)
	PXOR(X12, X2)
	// Swap the 336(R12) spill slot with X2.
	MOVOA(Mem{Base: R12}.Offset(336), X12)
	MOVOA(X2, Mem{Base: R12}.Offset(336))
	MOVOA(X14, X1)
	PADDL(X12, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(7), X1)
	PXOR(X1, X5)
	PSRLL(Imm(25), X2)
	PXOR(X2, X5)
	MOVOA(X0, X1)
	PADDL(X7, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(9), X1)
	PXOR(X1, X10)
	PSRLL(Imm(23), X2)
	PXOR(X2, X10)
	MOVOA(X12, X1)
	PADDL(X5, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(9), X1)
	PXOR(X1, X8)
	PSRLL(Imm(23), X2)
	PXOR(X2, X8)
	MOVOA(X7, X1)
	PADDL(X10, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(13), X1)
	PXOR(X1, X4)
	PSRLL(Imm(19), X2)
	PXOR(X2, X4)
	MOVOA(X5, X1)
	PADDL(X8, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(13), X1)
	PXOR(X1, X14)
	PSRLL(Imm(19), X2)
	PXOR(X2, X14)
	MOVOA(X10, X1)
	PADDL(X4, X1)
	MOVOA(X1, X2)
	PSLLL(Imm(18), X1)
	PXOR(X1, X0)
	PSRLL(Imm(14), X2)
	PXOR(X2, X0)
	// Swap the 320(R12) spill slot with X0.
	MOVOA(Mem{Base: R12}.Offset(320), X1)
	MOVOA(X0, Mem{Base: R12}.Offset(320))
	MOVOA(X8, X0)
	PADDL(X14, X0)
	MOVOA(X0, X2)
	PSLLL(Imm(18), X0)
	PXOR(X0, X12)
	PSRLL(Imm(14), X2)
	PXOR(X2, X12)
	MOVOA(X11, X0)
	PADDL(X1, X0)
	MOVOA(X0, X2)
	PSLLL(Imm(7), X0)
	PXOR(X0, X6)
	PSRLL(Imm(25), X2)
	PXOR(X2, X6)
	// Swap the 336(R12) spill slot with X12.
	MOVOA(Mem{Base: R12}.Offset(336), X2)
	MOVOA(X12, Mem{Base: R12}.Offset(336))
	MOVOA(X3, X0)
	PADDL(X2, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(7), X0)
	PXOR(X0, X13)
	PSRLL(Imm(25), X12)
	PXOR(X12, X13)
	MOVOA(X1, X0)
	PADDL(X6, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(9), X0)
	PXOR(X0, X15)
	PSRLL(Imm(23), X12)
	PXOR(X12, X15)
	MOVOA(X2, X0)
	PADDL(X13, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(9), X0)
	PXOR(X0, X9)
	PSRLL(Imm(23), X12)
	PXOR(X12, X9)
	MOVOA(X6, X0)
	PADDL(X15, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(13), X0)
	PXOR(X0, X11)
	PSRLL(Imm(19), X12)
	PXOR(X12, X11)
	MOVOA(X13, X0)
	PADDL(X9, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(13), X0)
	PXOR(X0, X3)
	PSRLL(Imm(19), X12)
	PXOR(X12, X3)
	MOVOA(X15, X0)
	PADDL(X11, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(18), X0)
	PXOR(X0, X1)
	PSRLL(Imm(14), X12)
	PXOR(X12, X1)
	MOVOA(X9, X0)
	PADDL(X3, X0)
	MOVOA(X0, X12)
	PSLLL(Imm(18), X0)
	PXOR(X0, X2)
	PSRLL(Imm(14), X12)
	PXOR(X12, X2)
	// Reload the two spilled values so all sixteen vectors are live again.
	MOVOA(Mem{Base: R12}.Offset(320), X12)
	MOVOA(Mem{Base: R12}.Offset(336), X0)
	// The MOVOA loads above do not touch the flags, and nothing follows the
	// SUBQ, so JA tests its result: loop while RDX is still above zero.
	SUBQ(Imm(2), RDX)
	JA(LabelRef("MAINLOOP1"))
	// Output words 0..3 of each block. Add the saved input vectors, then for
	// each lane: MOVD extracts lane 0, PSHUFL 0x39 rotates the next lane into
	// position, and the word is XORed with the input and stored. Lane i goes
	// to byte offset 64*i. EDX, ECX, R8 and R9 are used as temporaries.
	PADDL(Mem{Base: R12}.Offset(112), X12)
	PADDL(Mem{Base: R12}.Offset(176), X7)
	PADDL(Mem{Base: R12}.Offset(224), X10)
	PADDL(Mem{Base: R12}.Offset(272), X4)
	MOVD(X12, EDX)
	MOVD(X7, ECX)
	MOVD(X10, R8)
	MOVD(X4, R9)
	PSHUFL(Imm(0x39), X12, X12)
	PSHUFL(Imm(0x39), X7, X7)
	PSHUFL(Imm(0x39), X10, X10)
	PSHUFL(Imm(0x39), X4, X4)
	XORL(Mem{Base: SI}.Offset(0), EDX)
	XORL(Mem{Base: SI}.Offset(4), ECX)
	XORL(Mem{Base: SI}.Offset(8), R8L)
	XORL(Mem{Base: SI}.Offset(12), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(0))
	MOVL(ECX, Mem{Base: DI}.Offset(4))
	MOVL(R8L, Mem{Base: DI}.Offset(8))
	MOVL(R9L, Mem{Base: DI}.Offset(12))
	MOVD(X12, EDX)
	MOVD(X7, ECX)
	MOVD(X10, R8)
	MOVD(X4, R9)
	PSHUFL(Imm(0x39), X12, X12)
	PSHUFL(Imm(0x39), X7, X7)
	PSHUFL(Imm(0x39), X10, X10)
	PSHUFL(Imm(0x39), X4, X4)
	XORL(Mem{Base: SI}.Offset(64), EDX)
	XORL(Mem{Base: SI}.Offset(68), ECX)
	XORL(Mem{Base: SI}.Offset(72), R8L)
	XORL(Mem{Base: SI}.Offset(76), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(64))
	MOVL(ECX, Mem{Base: DI}.Offset(68))
	MOVL(R8L, Mem{Base: DI}.Offset(72))
	MOVL(R9L, Mem{Base: DI}.Offset(76))
	MOVD(X12, EDX)
	MOVD(X7, ECX)
	MOVD(X10, R8)
	MOVD(X4, R9)
	PSHUFL(Imm(0x39), X12, X12)
	PSHUFL(Imm(0x39), X7, X7)
	PSHUFL(Imm(0x39), X10, X10)
	PSHUFL(Imm(0x39), X4, X4)
	XORL(Mem{Base: SI}.Offset(128), EDX)
	XORL(Mem{Base: SI}.Offset(132), ECX)
	XORL(Mem{Base: SI}.Offset(136), R8L)
	XORL(Mem{Base: SI}.Offset(140), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(128))
	MOVL(ECX, Mem{Base: DI}.Offset(132))
	MOVL(R8L, Mem{Base: DI}.Offset(136))
	MOVL(R9L, Mem{Base: DI}.Offset(140))
	MOVD(X12, EDX)
	MOVD(X7, ECX)
	MOVD(X10, R8)
	MOVD(X4, R9)
	XORL(Mem{Base: SI}.Offset(192), EDX)
	XORL(Mem{Base: SI}.Offset(196), ECX)
	XORL(Mem{Base: SI}.Offset(200), R8L)
	XORL(Mem{Base: SI}.Offset(204), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(192))
	MOVL(ECX, Mem{Base: DI}.Offset(196))
	MOVL(R8L, Mem{Base: DI}.Offset(200))
	MOVL(R9L, Mem{Base: DI}.Offset(204))
	// Output words 4..7 of each block (byte offsets 16..28 + 64*lane).
	PADDL(Mem{Base: R12}.Offset(240), X14)
	PADDL(Mem{Base: R12}.Offset(64), X0)
	PADDL(Mem{Base: R12}.Offset(128), X5)
	PADDL(Mem{Base: R12}.Offset(192), X8)
	MOVD(X14, EDX)
	MOVD(X0, ECX)
	MOVD(X5, R8)
	MOVD(X8, R9)
	PSHUFL(Imm(0x39), X14, X14)
	PSHUFL(Imm(0x39), X0, X0)
	PSHUFL(Imm(0x39), X5, X5)
	PSHUFL(Imm(0x39), X8, X8)
	XORL(Mem{Base: SI}.Offset(16), EDX)
	XORL(Mem{Base: SI}.Offset(20), ECX)
	XORL(Mem{Base: SI}.Offset(24), R8L)
	XORL(Mem{Base: SI}.Offset(28), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(16))
	MOVL(ECX, Mem{Base: DI}.Offset(20))
	MOVL(R8L, Mem{Base: DI}.Offset(24))
	MOVL(R9L, Mem{Base: DI}.Offset(28))
	MOVD(X14, EDX)
	MOVD(X0, ECX)
	MOVD(X5, R8)
	MOVD(X8, R9)
	PSHUFL(Imm(0x39), X14, X14)
	PSHUFL(Imm(0x39), X0, X0)
	PSHUFL(Imm(0x39), X5, X5)
	PSHUFL(Imm(0x39), X8, X8)
	XORL(Mem{Base: SI}.Offset(80), EDX)
	XORL(Mem{Base: SI}.Offset(84), ECX)
	XORL(Mem{Base: SI}.Offset(88), R8L)
	XORL(Mem{Base: SI}.Offset(92), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(80))
	MOVL(ECX, Mem{Base: DI}.Offset(84))
	MOVL(R8L, Mem{Base: DI}.Offset(88))
	MOVL(R9L, Mem{Base: DI}.Offset(92))
	MOVD(X14, EDX)
	MOVD(X0, ECX)
	MOVD(X5, R8)
	MOVD(X8, R9)
	PSHUFL(Imm(0x39), X14, X14)
	PSHUFL(Imm(0x39), X0, X0)
	PSHUFL(Imm(0x39), X5, X5)
	PSHUFL(Imm(0x39), X8, X8)
	XORL(Mem{Base: SI}.Offset(144), EDX)
	XORL(Mem{Base: SI}.Offset(148), ECX)
	XORL(Mem{Base: SI}.Offset(152), R8L)
	XORL(Mem{Base: SI}.Offset(156), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(144))
	MOVL(ECX, Mem{Base: DI}.Offset(148))
	MOVL(R8L, Mem{Base: DI}.Offset(152))
	MOVL(R9L, Mem{Base: DI}.Offset(156))
	MOVD(X14, EDX)
	MOVD(X0, ECX)
	MOVD(X5, R8)
	MOVD(X8, R9)
	XORL(Mem{Base: SI}.Offset(208), EDX)
	XORL(Mem{Base: SI}.Offset(212), ECX)
	XORL(Mem{Base: SI}.Offset(216), R8L)
	XORL(Mem{Base: SI}.Offset(220), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(208))
	MOVL(ECX, Mem{Base: DI}.Offset(212))
	MOVL(R8L, Mem{Base: DI}.Offset(216))
	MOVL(R9L, Mem{Base: DI}.Offset(220))
	// Output words 8..11 of each block (byte offsets 32..44 + 64*lane).
	// 288(R12) and 304(R12) hold the per-lane counter halves.
	PADDL(Mem{Base: R12}.Offset(288), X15)
	PADDL(Mem{Base: R12}.Offset(304), X11)
	PADDL(Mem{Base: R12}.Offset(80), X1)
	PADDL(Mem{Base: R12}.Offset(144), X6)
	MOVD(X15, EDX)
	MOVD(X11, ECX)
	MOVD(X1, R8)
	MOVD(X6, R9)
	PSHUFL(Imm(0x39), X15, X15)
	PSHUFL(Imm(0x39), X11, X11)
	PSHUFL(Imm(0x39), X1, X1)
	PSHUFL(Imm(0x39), X6, X6)
	XORL(Mem{Base: SI}.Offset(32), EDX)
	XORL(Mem{Base: SI}.Offset(36), ECX)
	XORL(Mem{Base: SI}.Offset(40), R8L)
	XORL(Mem{Base: SI}.Offset(44), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(32))
	MOVL(ECX, Mem{Base: DI}.Offset(36))
	MOVL(R8L, Mem{Base: DI}.Offset(40))
	MOVL(R9L, Mem{Base: DI}.Offset(44))
	MOVD(X15, EDX)
	MOVD(X11, ECX)
	MOVD(X1, R8)
	MOVD(X6, R9)
	PSHUFL(Imm(0x39), X15, X15)
	PSHUFL(Imm(0x39), X11, X11)
	PSHUFL(Imm(0x39), X1, X1)
	PSHUFL(Imm(0x39), X6, X6)
	XORL(Mem{Base: SI}.Offset(96), EDX)
	XORL(Mem{Base: SI}.Offset(100), ECX)
	XORL(Mem{Base: SI}.Offset(104), R8L)
	XORL(Mem{Base: SI}.Offset(108), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(96))
	MOVL(ECX, Mem{Base: DI}.Offset(100))
	MOVL(R8L, Mem{Base: DI}.Offset(104))
	MOVL(R9L, Mem{Base: DI}.Offset(108))
	MOVD(X15, EDX)
	MOVD(X11, ECX)
	MOVD(X1, R8)
	MOVD(X6, R9)
	PSHUFL(Imm(0x39), X15, X15)
	PSHUFL(Imm(0x39), X11, X11)
	PSHUFL(Imm(0x39), X1, X1)
	PSHUFL(Imm(0x39), X6, X6)
	XORL(Mem{Base: SI}.Offset(160), EDX)
	XORL(Mem{Base: SI}.Offset(164), ECX)
	XORL(Mem{Base: SI}.Offset(168), R8L)
	XORL(Mem{Base: SI}.Offset(172), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(160))
	MOVL(ECX, Mem{Base: DI}.Offset(164))
	MOVL(R8L, Mem{Base: DI}.Offset(168))
	MOVL(R9L, Mem{Base: DI}.Offset(172))
	MOVD(X15, EDX)
	MOVD(X11, ECX)
	MOVD(X1, R8)
	MOVD(X6, R9)
	XORL(Mem{Base: SI}.Offset(224), EDX)
	XORL(Mem{Base: SI}.Offset(228), ECX)
	XORL(Mem{Base: SI}.Offset(232), R8L)
	XORL(Mem{Base: SI}.Offset(236), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(224))
	MOVL(ECX, Mem{Base: DI}.Offset(228))
	MOVL(R8L, Mem{Base: DI}.Offset(232))
	MOVL(R9L, Mem{Base: DI}.Offset(236))
	// Output words 12..15 of each block (byte offsets 48..60 + 64*lane).
	PADDL(Mem{Base: R12}.Offset(160), X13)
	PADDL(Mem{Base: R12}.Offset(208), X9)
	PADDL(Mem{Base: R12}.Offset(256), X3)
	PADDL(Mem{Base: R12}.Offset(96), X2)
	MOVD(X13, EDX)
	MOVD(X9, ECX)
	MOVD(X3, R8)
	MOVD(X2, R9)
	PSHUFL(Imm(0x39), X13, X13)
	PSHUFL(Imm(0x39), X9, X9)
	PSHUFL(Imm(0x39), X3, X3)
	PSHUFL(Imm(0x39), X2, X2)
	XORL(Mem{Base: SI}.Offset(48), EDX)
	XORL(Mem{Base: SI}.Offset(52), ECX)
	XORL(Mem{Base: SI}.Offset(56), R8L)
	XORL(Mem{Base: SI}.Offset(60), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(48))
	MOVL(ECX, Mem{Base: DI}.Offset(52))
	MOVL(R8L, Mem{Base: DI}.Offset(56))
	MOVL(R9L, Mem{Base: DI}.Offset(60))
	MOVD(X13, EDX)
	MOVD(X9, ECX)
	MOVD(X3, R8)
	MOVD(X2, R9)
	PSHUFL(Imm(0x39), X13, X13)
	PSHUFL(Imm(0x39), X9, X9)
	PSHUFL(Imm(0x39), X3, X3)
	PSHUFL(Imm(0x39), X2, X2)
	XORL(Mem{Base: SI}.Offset(112), EDX)
	XORL(Mem{Base: SI}.Offset(116), ECX)
	XORL(Mem{Base: SI}.Offset(120), R8L)
	XORL(Mem{Base: SI}.Offset(124), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(112))
	MOVL(ECX, Mem{Base: DI}.Offset(116))
	MOVL(R8L, Mem{Base: DI}.Offset(120))
	MOVL(R9L, Mem{Base: DI}.Offset(124))
	MOVD(X13, EDX)
	MOVD(X9, ECX)
	MOVD(X3, R8)
	MOVD(X2, R9)
	PSHUFL(Imm(0x39), X13, X13)
	PSHUFL(Imm(0x39), X9, X9)
	PSHUFL(Imm(0x39), X3, X3)
	PSHUFL(Imm(0x39), X2, X2)
	XORL(Mem{Base: SI}.Offset(176), EDX)
	XORL(Mem{Base: SI}.Offset(180), ECX)
	XORL(Mem{Base: SI}.Offset(184), R8L)
	XORL(Mem{Base: SI}.Offset(188), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(176))
	MOVL(ECX, Mem{Base: DI}.Offset(180))
	MOVL(R8L, Mem{Base: DI}.Offset(184))
	MOVL(R9L, Mem{Base: DI}.Offset(188))
	MOVD(X13, EDX)
	MOVD(X9, ECX)
	MOVD(X3, R8)
	MOVD(X2, R9)
	XORL(Mem{Base: SI}.Offset(240), EDX)
	XORL(Mem{Base: SI}.Offset(244), ECX)
	XORL(Mem{Base: SI}.Offset(248), R8L)
	XORL(Mem{Base: SI}.Offset(252), R9L)
	MOVL(EDX, Mem{Base: DI}.Offset(240))
	MOVL(ECX, Mem{Base: DI}.Offset(244))
	MOVL(R8L, Mem{Base: DI}.Offset(248))
	MOVL(R9L, Mem{Base: DI}.Offset(252))
	// R9 was clobbered as a temporary above; restore the remaining byte
	// count saved by BYTESATLEAST256 and consume the 256 bytes just written.
	MOVQ(Mem{Base: R12}.Offset(352), R9)
	SUBQ(U32(256), R9)
	ADDQ(U32(256), RSI)
	ADDQ(U32(256), RDI)
	CMPQ(R9, U32(256))
	JAE(LabelRef("BYTESATLEAST256"))
	CMPQ(R9, Imm(0))
	JBE(LabelRef("DONE"))
}
// BYTESBETWEEN1AND255 emits the entry to the one-block-at-a-time path. When
// at least 64 bytes remain it jumps straight to NOCOPY. Otherwise the partial
// input is copied into the 64-byte scratch area at 360(R12) and both SI and
// DI are redirected there, so the block code can read and write a full 64
// bytes; the real output pointer is kept in RDX for the copy-back emitted at
// the end of MAINLOOP2.
func BYTESBETWEEN1AND255() {
	Label("BYTESBETWEEN1AND255")
	CMPQ(R9, Imm(64))
	JAE(LabelRef("NOCOPY"))
	// Remember the caller's output pointer, then copy R9 bytes from the
	// input (SI) into the scratch buffer.
	MOVQ(RDI, RDX)
	LEAQ(Mem{Base: R12}.Offset(360), RDI)
	MOVQ(R9, RCX)
	// Hack to get Avo to emit:
	// REP; MOVSB
	Instruction(&ir.Instruction{Opcode: "REP; MOVSB"})
	// The copy advanced RDI; point both input and output at the start of the
	// scratch buffer.
	LEAQ(Mem{Base: R12}.Offset(360), RDI)
	LEAQ(Mem{Base: R12}.Offset(360), RSI)
}

// NOCOPY emits the setup for processing a single 64-byte block: it saves the
// remaining byte count at 352(R12), loads the four 16-byte rows of the state
// at 48, 0, 16 and 32(R12) into X0..X3, seeds X4 with a copy of X1, and
// initialises the loop counter in RCX to 20 for MAINLOOP2.
func NOCOPY() {
	Label("NOCOPY")
	MOVQ(R9, Mem{Base: R12}.Offset(352))
	MOVOA(Mem{Base: R12}.Offset(48), X0)
	MOVOA(Mem{Base: R12}.Offset(0), X1)
	MOVOA(Mem{Base: R12}.Offset(16), X2)
	MOVOA(Mem{Base: R12}.Offset(32), X3)
	MOVOA(X1, X4)
	MOVQ(U32(20), RCX)
}
// MAINLOOP2 emits the loop that produces one 64-byte block, followed by the
// output and bookkeeping code for that block.
//
// The state lives in X0..X3 with X4..X6 as temporaries. Each step adds two
// rows, XORs the sum shifted left by k and right by 32-k (k = 7, 9, 13, 18)
// into a third row, and PSHUFL (0x93, 0x4E, 0x39) rotates the lanes of a row
// between steps. The body contains four such 7/9/13/18 groups and RCX
// (initially 20) is decremented by 4 per pass.
//
// After the loop the original state rows are added back, the words are XORed
// with the input at SI and stored at DI, the 64-bit counter held at 16(R12)
// and 36(R12) is incremented, and the remaining length decides what happens
// next: more than 64 bytes -> BYTESATLEAST65, exactly 64 -> BYTESATLEAST64,
// fewer -> copy the partial result from the scratch buffer to the real
// output and fall through to the code emitted after this section.
func MAINLOOP2() {
	Label("MAINLOOP2")
	PADDL(X0, X4)
	MOVOA(X0, X5)
	MOVOA(X4, X6)
	PSLLL(Imm(7), X4)
	PSRLL(Imm(25), X6)
	PXOR(X4, X3)
	PXOR(X6, X3)
	PADDL(X3, X5)
	MOVOA(X3, X4)
	MOVOA(X5, X6)
	PSLLL(Imm(9), X5)
	PSRLL(Imm(23), X6)
	PXOR(X5, X2)
	PSHUFL(Imm(0x93), X3, X3)
	PXOR(X6, X2)
	PADDL(X2, X4)
	MOVOA(X2, X5)
	MOVOA(X4, X6)
	PSLLL(Imm(13), X4)
	PSRLL(Imm(19), X6)
	PXOR(X4, X1)
	PSHUFL(Imm(0x4E), X2, X2)
	PXOR(X6, X1)
	PADDL(X1, X5)
	MOVOA(X3, X4)
	MOVOA(X5, X6)
	PSLLL(Imm(18), X5)
	PSRLL(Imm(14), X6)
	PXOR(X5, X0)
	PSHUFL(Imm(0x39), X1, X1)
	PXOR(X6, X0)
	// Second group: same pattern with the roles of X1 and X3 exchanged.
	PADDL(X0, X4)
	MOVOA(X0, X5)
	MOVOA(X4, X6)
	PSLLL(Imm(7), X4)
	PSRLL(Imm(25), X6)
	PXOR(X4, X1)
	PXOR(X6, X1)
	PADDL(X1, X5)
	MOVOA(X1, X4)
	MOVOA(X5, X6)
	PSLLL(Imm(9), X5)
	PSRLL(Imm(23), X6)
	PXOR(X5, X2)
	PSHUFL(Imm(0x93), X1, X1)
	PXOR(X6, X2)
	PADDL(X2, X4)
	MOVOA(X2, X5)
	MOVOA(X4, X6)
	PSLLL(Imm(13), X4)
	PSRLL(Imm(19), X6)
	PXOR(X4, X3)
	PSHUFL(Imm(0x4E), X2, X2)
	PXOR(X6, X3)
	PADDL(X3, X5)
	MOVOA(X1, X4)
	MOVOA(X5, X6)
	PSLLL(Imm(18), X5)
	PSRLL(Imm(14), X6)
	PXOR(X5, X0)
	PSHUFL(Imm(0x39), X3, X3)
	PXOR(X6, X0)
	// Third group: repeats the first.
	PADDL(X0, X4)
	MOVOA(X0, X5)
	MOVOA(X4, X6)
	PSLLL(Imm(7), X4)
	PSRLL(Imm(25), X6)
	PXOR(X4, X3)
	PXOR(X6, X3)
	PADDL(X3, X5)
	MOVOA(X3, X4)
	MOVOA(X5, X6)
	PSLLL(Imm(9), X5)
	PSRLL(Imm(23), X6)
	PXOR(X5, X2)
	PSHUFL(Imm(0x93), X3, X3)
	PXOR(X6, X2)
	PADDL(X2, X4)
	MOVOA(X2, X5)
	MOVOA(X4, X6)
	PSLLL(Imm(13), X4)
	PSRLL(Imm(19), X6)
	PXOR(X4, X1)
	PSHUFL(Imm(0x4E), X2, X2)
	PXOR(X6, X1)
	PADDL(X1, X5)
	MOVOA(X3, X4)
	MOVOA(X5, X6)
	PSLLL(Imm(18), X5)
	PSRLL(Imm(14), X6)
	PXOR(X5, X0)
	PSHUFL(Imm(0x39), X1, X1)
	PXOR(X6, X0)
	// Fourth group: repeats the second, with the loop-counter update and a
	// clear of X7 interleaved.
	PADDL(X0, X4)
	MOVOA(X0, X5)
	MOVOA(X4, X6)
	PSLLL(Imm(7), X4)
	PSRLL(Imm(25), X6)
	PXOR(X4, X1)
	PXOR(X6, X1)
	PADDL(X1, X5)
	MOVOA(X1, X4)
	MOVOA(X5, X6)
	PSLLL(Imm(9), X5)
	PSRLL(Imm(23), X6)
	PXOR(X5, X2)
	PSHUFL(Imm(0x93), X1, X1)
	PXOR(X6, X2)
	PADDL(X2, X4)
	MOVOA(X2, X5)
	MOVOA(X4, X6)
	PSLLL(Imm(13), X4)
	PSRLL(Imm(19), X6)
	PXOR(X4, X3)
	PSHUFL(Imm(0x4E), X2, X2)
	PXOR(X6, X3)
	// The JA below consumes the flags set here; the SSE instructions in
	// between do not modify the flags.
	SUBQ(Imm(4), RCX)
	PADDL(X3, X5)
	MOVOA(X1, X4)
	MOVOA(X5, X6)
	PSLLL(Imm(18), X5)
	// NOTE(review): X7 is zeroed here but is not read anywhere in the
	// single-block path visible in this file — presumably a leftover from
	// the original translated sources; confirm before removing.
	PXOR(X7, X7)
	PSRLL(Imm(14), X6)
	PXOR(X5, X0)
	PSHUFL(Imm(0x39), X3, X3)
	PXOR(X6, X0)
	JA(LabelRef("MAINLOOP2"))
	// Add the original state rows back in.
	PADDL(Mem{Base: R12}.Offset(48), X0)
	PADDL(Mem{Base: R12}.Offset(0), X1)
	PADDL(Mem{Base: R12}.Offset(16), X2)
	PADDL(Mem{Base: R12}.Offset(32), X3)
	// Emit the block one lane at a time: MOVD extracts lane 0, PSHUFL 0x39
	// rotates the next lane down. The words are XORed with the input and
	// stored at the byte offsets given below, which are not sequential.
	// ECX, R8, R9 and EAX are temporaries; RDX is left untouched.
	MOVD(X0, ECX)
	MOVD(X1, R8)
	MOVD(X2, R9)
	MOVD(X3, EAX)
	PSHUFL(Imm(0x39), X0, X0)
	PSHUFL(Imm(0x39), X1, X1)
	PSHUFL(Imm(0x39), X2, X2)
	PSHUFL(Imm(0x39), X3, X3)
	XORL(Mem{Base: SI}.Offset(0), ECX)
	XORL(Mem{Base: SI}.Offset(48), R8L)
	XORL(Mem{Base: SI}.Offset(32), R9L)
	XORL(Mem{Base: SI}.Offset(16), EAX)
	MOVL(ECX, Mem{Base: DI}.Offset(0))
	MOVL(R8L, Mem{Base: DI}.Offset(48))
	MOVL(R9L, Mem{Base: DI}.Offset(32))
	MOVL(EAX, Mem{Base: DI}.Offset(16))
	MOVD(X0, ECX)
	MOVD(X1, R8)
	MOVD(X2, R9)
	MOVD(X3, EAX)
	PSHUFL(Imm(0x39), X0, X0)
	PSHUFL(Imm(0x39), X1, X1)
	PSHUFL(Imm(0x39), X2, X2)
	PSHUFL(Imm(0x39), X3, X3)
	XORL(Mem{Base: SI}.Offset(20), ECX)
	XORL(Mem{Base: SI}.Offset(4), R8L)
	XORL(Mem{Base: SI}.Offset(52), R9L)
	XORL(Mem{Base: SI}.Offset(36), EAX)
	MOVL(ECX, Mem{Base: DI}.Offset(20))
	MOVL(R8L, Mem{Base: DI}.Offset(4))
	MOVL(R9L, Mem{Base: DI}.Offset(52))
	MOVL(EAX, Mem{Base: DI}.Offset(36))
	MOVD(X0, ECX)
	MOVD(X1, R8)
	MOVD(X2, R9)
	MOVD(X3, EAX)
	PSHUFL(Imm(0x39), X0, X0)
	PSHUFL(Imm(0x39), X1, X1)
	PSHUFL(Imm(0x39), X2, X2)
	PSHUFL(Imm(0x39), X3, X3)
	XORL(Mem{Base: SI}.Offset(40), ECX)
	XORL(Mem{Base: SI}.Offset(24), R8L)
	XORL(Mem{Base: SI}.Offset(8), R9L)
	XORL(Mem{Base: SI}.Offset(56), EAX)
	MOVL(ECX, Mem{Base: DI}.Offset(40))
	MOVL(R8L, Mem{Base: DI}.Offset(24))
	MOVL(R9L, Mem{Base: DI}.Offset(8))
	MOVL(EAX, Mem{Base: DI}.Offset(56))
	MOVD(X0, ECX)
	MOVD(X1, R8)
	MOVD(X2, R9)
	MOVD(X3, EAX)
	XORL(Mem{Base: SI}.Offset(60), ECX)
	XORL(Mem{Base: SI}.Offset(44), R8L)
	XORL(Mem{Base: SI}.Offset(28), R9L)
	XORL(Mem{Base: SI}.Offset(12), EAX)
	MOVL(ECX, Mem{Base: DI}.Offset(60))
	MOVL(R8L, Mem{Base: DI}.Offset(44))
	MOVL(R9L, Mem{Base: DI}.Offset(28))
	MOVL(EAX, Mem{Base: DI}.Offset(12))
	// R9 was used as a temporary; restore the remaining byte count saved by
	// NOCOPY.
	MOVQ(Mem{Base: R12}.Offset(352), R9)
	// Increment the 64-bit counter whose low word is at 16(R12) and high
	// word at 36(R12).
	MOVL(Mem{Base: R12}.Offset(16), ECX)
	MOVL(Mem{Base: R12}.Offset(36), R8L)
	ADDQ(Imm(1), RCX)
	SHLQ(Imm(32), R8)
	ADDQ(R8, RCX)
	MOVQ(RCX, R8)
	SHRQ(Imm(32), R8)
	MOVL(ECX, Mem{Base: R12}.Offset(16))
	MOVL(R8L, Mem{Base: R12}.Offset(36))
	// Both branches test the same comparison: JA is taken for > 64, and the
	// JAE that follows can then only be taken for == 64.
	CMPQ(R9, Imm(64))
	JA(LabelRef("BYTESATLEAST65"))
	JAE(LabelRef("BYTESATLEAST64"))
	// Fewer than 64 bytes were requested: DI points at the scratch buffer
	// and RDX holds the real output pointer saved in BYTESBETWEEN1AND255.
	// Copy the R9 valid bytes out.
	MOVQ(RDI, RSI)
	MOVQ(RDX, RDI)
	MOVQ(R9, RCX)
	// Hack to get Avo to emit:
	// REP; MOVSB
	Instruction(&ir.Instruction{Opcode: "REP; MOVSB"})
}
//go:build amd64 && !purego && gc -// This code was translated into a form compatible with 6a from the public -// domain sources in SUPERCOP: https://bench.cr.yp.to/supercop.html +// func salsa2020XORKeyStream(out *byte, in *byte, n uint64, nonce *byte, key *byte) +// Requires: SSE2 +TEXT ·salsa2020XORKeyStream(SB), $456-40 + // This needs up to 64 bytes at 360(R12); hence the non-obvious frame size. + MOVQ out+0(FP), DI + MOVQ in+8(FP), SI + MOVQ n+16(FP), DX + MOVQ nonce+24(FP), CX + MOVQ key+32(FP), R8 + MOVQ SP, R12 + ADDQ $0x1f, R12 + ANDQ $-32, R12 + MOVQ DX, R9 + MOVQ CX, DX + MOVQ R8, R10 + CMPQ R9, $0x00 + JBE DONE + MOVL 20(R10), CX + MOVL (R10), R8 + MOVL (DX), AX + MOVL 16(R10), R11 + MOVL CX, (R12) + MOVL R8, 4(R12) + MOVL AX, 8(R12) + MOVL R11, 12(R12) + MOVL 8(DX), CX + MOVL 24(R10), R8 + MOVL 4(R10), AX + MOVL 4(DX), R11 + MOVL CX, 16(R12) + MOVL R8, 20(R12) + MOVL AX, 24(R12) + MOVL R11, 28(R12) + MOVL 12(DX), CX + MOVL 12(R10), DX + MOVL 28(R10), R8 + MOVL 8(R10), AX + MOVL DX, 32(R12) + MOVL CX, 36(R12) + MOVL R8, 40(R12) + MOVL AX, 44(R12) + MOVQ $0x61707865, DX + MOVQ $0x3320646e, CX + MOVQ $0x79622d32, R8 + MOVQ $0x6b206574, AX + MOVL DX, 48(R12) + MOVL CX, 52(R12) + MOVL R8, 56(R12) + MOVL AX, 60(R12) + CMPQ R9, $0x00000100 + JB BYTESBETWEEN1AND255 + MOVOA 48(R12), X0 + PSHUFL $0x55, X0, X1 + PSHUFL $0xaa, X0, X2 + PSHUFL $0xff, X0, X3 + PSHUFL $0x00, X0, X0 + MOVOA X1, 64(R12) + MOVOA X2, 80(R12) + MOVOA X3, 96(R12) + MOVOA X0, 112(R12) + MOVOA (R12), X0 + PSHUFL $0xaa, X0, X1 + PSHUFL $0xff, X0, X2 + PSHUFL $0x00, X0, X3 + PSHUFL $0x55, X0, X0 + MOVOA X1, 128(R12) + MOVOA X2, 144(R12) + MOVOA X3, 160(R12) + MOVOA X0, 176(R12) + MOVOA 16(R12), X0 + PSHUFL $0xff, X0, X1 + PSHUFL $0x55, X0, X2 + PSHUFL $0xaa, X0, X0 + MOVOA X1, 192(R12) + MOVOA X2, 208(R12) + MOVOA X0, 224(R12) + MOVOA 32(R12), X0 + PSHUFL $0x00, X0, X1 + PSHUFL $0xaa, X0, X2 + PSHUFL $0xff, X0, X0 + MOVOA X1, 240(R12) + MOVOA X2, 256(R12) + MOVOA X0, 272(R12) -// func 
salsa2020XORKeyStream(out, in *byte, n uint64, nonce, key *byte) -// This needs up to 64 bytes at 360(R12); hence the non-obvious frame size. -TEXT ·salsa2020XORKeyStream(SB),0,$456-40 // frame = 424 + 32 byte alignment - MOVQ out+0(FP),DI - MOVQ in+8(FP),SI - MOVQ n+16(FP),DX - MOVQ nonce+24(FP),CX - MOVQ key+32(FP),R8 +BYTESATLEAST256: + MOVL 16(R12), DX + MOVL 36(R12), CX + MOVL DX, 288(R12) + MOVL CX, 304(R12) + SHLQ $0x20, CX + ADDQ CX, DX + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 292(R12) + MOVL CX, 308(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 296(R12) + MOVL CX, 312(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 300(R12) + MOVL CX, 316(R12) + ADDQ $0x01, DX + MOVQ DX, CX + SHRQ $0x20, CX + MOVL DX, 16(R12) + MOVL CX, 36(R12) + MOVQ R9, 352(R12) + MOVQ $0x00000014, DX + MOVOA 64(R12), X0 + MOVOA 80(R12), X1 + MOVOA 96(R12), X2 + MOVOA 256(R12), X3 + MOVOA 272(R12), X4 + MOVOA 128(R12), X5 + MOVOA 144(R12), X6 + MOVOA 176(R12), X7 + MOVOA 192(R12), X8 + MOVOA 208(R12), X9 + MOVOA 224(R12), X10 + MOVOA 304(R12), X11 + MOVOA 112(R12), X12 + MOVOA 160(R12), X13 + MOVOA 240(R12), X14 + MOVOA 288(R12), X15 - MOVQ SP,R12 - ADDQ $31, R12 - ANDQ $~31, R12 +MAINLOOP1: + MOVOA X1, 320(R12) + MOVOA X2, 336(R12) + MOVOA X13, X1 + PADDL X12, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X14 + PSRLL $0x19, X2 + PXOR X2, X14 + MOVOA X7, X1 + PADDL X0, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X11 + PSRLL $0x19, X2 + PXOR X2, X11 + MOVOA X12, X1 + PADDL X14, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X15 + PSRLL $0x17, X2 + PXOR X2, X15 + MOVOA X0, X1 + PADDL X11, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X9 + PSRLL $0x17, X2 + PXOR X2, X9 + MOVOA X14, X1 + PADDL X15, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X13 + PSRLL $0x13, X2 + PXOR X2, X13 + MOVOA X11, X1 + PADDL X9, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X7 + PSRLL $0x13, X2 + PXOR X2, X7 + MOVOA X15, X1 + PADDL X13, X1 + MOVOA X1, X2 
+ PSLLL $0x12, X1 + PXOR X1, X12 + PSRLL $0x0e, X2 + PXOR X2, X12 + MOVOA 320(R12), X1 + MOVOA X12, 320(R12) + MOVOA X9, X2 + PADDL X7, X2 + MOVOA X2, X12 + PSLLL $0x12, X2 + PXOR X2, X0 + PSRLL $0x0e, X12 + PXOR X12, X0 + MOVOA X5, X2 + PADDL X1, X2 + MOVOA X2, X12 + PSLLL $0x07, X2 + PXOR X2, X3 + PSRLL $0x19, X12 + PXOR X12, X3 + MOVOA 336(R12), X2 + MOVOA X0, 336(R12) + MOVOA X6, X0 + PADDL X2, X0 + MOVOA X0, X12 + PSLLL $0x07, X0 + PXOR X0, X4 + PSRLL $0x19, X12 + PXOR X12, X4 + MOVOA X1, X0 + PADDL X3, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X10 + PSRLL $0x17, X12 + PXOR X12, X10 + MOVOA X2, X0 + PADDL X4, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X8 + PSRLL $0x17, X12 + PXOR X12, X8 + MOVOA X3, X0 + PADDL X10, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X5 + PSRLL $0x13, X12 + PXOR X12, X5 + MOVOA X4, X0 + PADDL X8, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X6 + PSRLL $0x13, X12 + PXOR X12, X6 + MOVOA X10, X0 + PADDL X5, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X1 + PSRLL $0x0e, X12 + PXOR X12, X1 + MOVOA 320(R12), X0 + MOVOA X1, 320(R12) + MOVOA X4, X1 + PADDL X0, X1 + MOVOA X1, X12 + PSLLL $0x07, X1 + PXOR X1, X7 + PSRLL $0x19, X12 + PXOR X12, X7 + MOVOA X8, X1 + PADDL X6, X1 + MOVOA X1, X12 + PSLLL $0x12, X1 + PXOR X1, X2 + PSRLL $0x0e, X12 + PXOR X12, X2 + MOVOA 336(R12), X12 + MOVOA X2, 336(R12) + MOVOA X14, X1 + PADDL X12, X1 + MOVOA X1, X2 + PSLLL $0x07, X1 + PXOR X1, X5 + PSRLL $0x19, X2 + PXOR X2, X5 + MOVOA X0, X1 + PADDL X7, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X10 + PSRLL $0x17, X2 + PXOR X2, X10 + MOVOA X12, X1 + PADDL X5, X1 + MOVOA X1, X2 + PSLLL $0x09, X1 + PXOR X1, X8 + PSRLL $0x17, X2 + PXOR X2, X8 + MOVOA X7, X1 + PADDL X10, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X4 + PSRLL $0x13, X2 + PXOR X2, X4 + MOVOA X5, X1 + PADDL X8, X1 + MOVOA X1, X2 + PSLLL $0x0d, X1 + PXOR X1, X14 + PSRLL $0x13, X2 + PXOR X2, X14 + MOVOA X10, X1 + PADDL X4, X1 + MOVOA X1, X2 + PSLLL $0x12, X1 + PXOR X1, X0 + PSRLL 
$0x0e, X2 + PXOR X2, X0 + MOVOA 320(R12), X1 + MOVOA X0, 320(R12) + MOVOA X8, X0 + PADDL X14, X0 + MOVOA X0, X2 + PSLLL $0x12, X0 + PXOR X0, X12 + PSRLL $0x0e, X2 + PXOR X2, X12 + MOVOA X11, X0 + PADDL X1, X0 + MOVOA X0, X2 + PSLLL $0x07, X0 + PXOR X0, X6 + PSRLL $0x19, X2 + PXOR X2, X6 + MOVOA 336(R12), X2 + MOVOA X12, 336(R12) + MOVOA X3, X0 + PADDL X2, X0 + MOVOA X0, X12 + PSLLL $0x07, X0 + PXOR X0, X13 + PSRLL $0x19, X12 + PXOR X12, X13 + MOVOA X1, X0 + PADDL X6, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X15 + PSRLL $0x17, X12 + PXOR X12, X15 + MOVOA X2, X0 + PADDL X13, X0 + MOVOA X0, X12 + PSLLL $0x09, X0 + PXOR X0, X9 + PSRLL $0x17, X12 + PXOR X12, X9 + MOVOA X6, X0 + PADDL X15, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X11 + PSRLL $0x13, X12 + PXOR X12, X11 + MOVOA X13, X0 + PADDL X9, X0 + MOVOA X0, X12 + PSLLL $0x0d, X0 + PXOR X0, X3 + PSRLL $0x13, X12 + PXOR X12, X3 + MOVOA X15, X0 + PADDL X11, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X1 + PSRLL $0x0e, X12 + PXOR X12, X1 + MOVOA X9, X0 + PADDL X3, X0 + MOVOA X0, X12 + PSLLL $0x12, X0 + PXOR X0, X2 + PSRLL $0x0e, X12 + PXOR X12, X2 + MOVOA 320(R12), X12 + MOVOA 336(R12), X0 + SUBQ $0x02, DX + JA MAINLOOP1 + PADDL 112(R12), X12 + PADDL 176(R12), X7 + PADDL 224(R12), X10 + PADDL 272(R12), X4 + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL (SI), DX + XORL 4(SI), CX + XORL 8(SI), R8 + XORL 12(SI), R9 + MOVL DX, (DI) + MOVL CX, 4(DI) + MOVL R8, 8(DI) + MOVL R9, 12(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, X10, X10 + PSHUFL $0x39, X4, X4 + XORL 64(SI), DX + XORL 68(SI), CX + XORL 72(SI), R8 + XORL 76(SI), R9 + MOVL DX, 64(DI) + MOVL CX, 68(DI) + MOVL R8, 72(DI) + MOVL R9, 76(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + PSHUFL $0x39, X12, X12 + PSHUFL $0x39, X7, X7 + PSHUFL $0x39, 
X10, X10 + PSHUFL $0x39, X4, X4 + XORL 128(SI), DX + XORL 132(SI), CX + XORL 136(SI), R8 + XORL 140(SI), R9 + MOVL DX, 128(DI) + MOVL CX, 132(DI) + MOVL R8, 136(DI) + MOVL R9, 140(DI) + MOVD X12, DX + MOVD X7, CX + MOVD X10, R8 + MOVD X4, R9 + XORL 192(SI), DX + XORL 196(SI), CX + XORL 200(SI), R8 + XORL 204(SI), R9 + MOVL DX, 192(DI) + MOVL CX, 196(DI) + MOVL R8, 200(DI) + MOVL R9, 204(DI) + PADDL 240(R12), X14 + PADDL 64(R12), X0 + PADDL 128(R12), X5 + PADDL 192(R12), X8 + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 16(SI), DX + XORL 20(SI), CX + XORL 24(SI), R8 + XORL 28(SI), R9 + MOVL DX, 16(DI) + MOVL CX, 20(DI) + MOVL R8, 24(DI) + MOVL R9, 28(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 80(SI), DX + XORL 84(SI), CX + XORL 88(SI), R8 + XORL 92(SI), R9 + MOVL DX, 80(DI) + MOVL CX, 84(DI) + MOVL R8, 88(DI) + MOVL R9, 92(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + PSHUFL $0x39, X14, X14 + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X5, X5 + PSHUFL $0x39, X8, X8 + XORL 144(SI), DX + XORL 148(SI), CX + XORL 152(SI), R8 + XORL 156(SI), R9 + MOVL DX, 144(DI) + MOVL CX, 148(DI) + MOVL R8, 152(DI) + MOVL R9, 156(DI) + MOVD X14, DX + MOVD X0, CX + MOVD X5, R8 + MOVD X8, R9 + XORL 208(SI), DX + XORL 212(SI), CX + XORL 216(SI), R8 + XORL 220(SI), R9 + MOVL DX, 208(DI) + MOVL CX, 212(DI) + MOVL R8, 216(DI) + MOVL R9, 220(DI) + PADDL 288(R12), X15 + PADDL 304(R12), X11 + PADDL 80(R12), X1 + PADDL 144(R12), X6 + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 32(SI), DX + XORL 36(SI), CX + XORL 40(SI), R8 + XORL 44(SI), R9 + MOVL DX, 32(DI) + MOVL CX, 36(DI) + MOVL R8, 40(DI) + MOVL R9, 44(DI) + MOVD X15, DX + MOVD X11, CX + MOVD 
X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 96(SI), DX + XORL 100(SI), CX + XORL 104(SI), R8 + XORL 108(SI), R9 + MOVL DX, 96(DI) + MOVL CX, 100(DI) + MOVL R8, 104(DI) + MOVL R9, 108(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + PSHUFL $0x39, X15, X15 + PSHUFL $0x39, X11, X11 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X6, X6 + XORL 160(SI), DX + XORL 164(SI), CX + XORL 168(SI), R8 + XORL 172(SI), R9 + MOVL DX, 160(DI) + MOVL CX, 164(DI) + MOVL R8, 168(DI) + MOVL R9, 172(DI) + MOVD X15, DX + MOVD X11, CX + MOVD X1, R8 + MOVD X6, R9 + XORL 224(SI), DX + XORL 228(SI), CX + XORL 232(SI), R8 + XORL 236(SI), R9 + MOVL DX, 224(DI) + MOVL CX, 228(DI) + MOVL R8, 232(DI) + MOVL R9, 236(DI) + PADDL 160(R12), X13 + PADDL 208(R12), X9 + PADDL 256(R12), X3 + PADDL 96(R12), X2 + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 48(SI), DX + XORL 52(SI), CX + XORL 56(SI), R8 + XORL 60(SI), R9 + MOVL DX, 48(DI) + MOVL CX, 52(DI) + MOVL R8, 56(DI) + MOVL R9, 60(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 112(SI), DX + XORL 116(SI), CX + XORL 120(SI), R8 + XORL 124(SI), R9 + MOVL DX, 112(DI) + MOVL CX, 116(DI) + MOVL R8, 120(DI) + MOVL R9, 124(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + PSHUFL $0x39, X13, X13 + PSHUFL $0x39, X9, X9 + PSHUFL $0x39, X3, X3 + PSHUFL $0x39, X2, X2 + XORL 176(SI), DX + XORL 180(SI), CX + XORL 184(SI), R8 + XORL 188(SI), R9 + MOVL DX, 176(DI) + MOVL CX, 180(DI) + MOVL R8, 184(DI) + MOVL R9, 188(DI) + MOVD X13, DX + MOVD X9, CX + MOVD X3, R8 + MOVD X2, R9 + XORL 240(SI), DX + XORL 244(SI), CX + XORL 248(SI), R8 + XORL 252(SI), R9 + MOVL DX, 240(DI) + MOVL CX, 244(DI) + MOVL R8, 248(DI) + MOVL R9, 252(DI) + MOVQ 352(R12), R9 + 
SUBQ $0x00000100, R9 + ADDQ $0x00000100, SI + ADDQ $0x00000100, DI + CMPQ R9, $0x00000100 + JAE BYTESATLEAST256 + CMPQ R9, $0x00 + JBE DONE - MOVQ DX,R9 - MOVQ CX,DX - MOVQ R8,R10 - CMPQ R9,$0 - JBE DONE - START: - MOVL 20(R10),CX - MOVL 0(R10),R8 - MOVL 0(DX),AX - MOVL 16(R10),R11 - MOVL CX,0(R12) - MOVL R8, 4 (R12) - MOVL AX, 8 (R12) - MOVL R11, 12 (R12) - MOVL 8(DX),CX - MOVL 24(R10),R8 - MOVL 4(R10),AX - MOVL 4(DX),R11 - MOVL CX,16(R12) - MOVL R8, 20 (R12) - MOVL AX, 24 (R12) - MOVL R11, 28 (R12) - MOVL 12(DX),CX - MOVL 12(R10),DX - MOVL 28(R10),R8 - MOVL 8(R10),AX - MOVL DX,32(R12) - MOVL CX, 36 (R12) - MOVL R8, 40 (R12) - MOVL AX, 44 (R12) - MOVQ $1634760805,DX - MOVQ $857760878,CX - MOVQ $2036477234,R8 - MOVQ $1797285236,AX - MOVL DX,48(R12) - MOVL CX, 52 (R12) - MOVL R8, 56 (R12) - MOVL AX, 60 (R12) - CMPQ R9,$256 - JB BYTESBETWEEN1AND255 - MOVOA 48(R12),X0 - PSHUFL $0X55,X0,X1 - PSHUFL $0XAA,X0,X2 - PSHUFL $0XFF,X0,X3 - PSHUFL $0X00,X0,X0 - MOVOA X1,64(R12) - MOVOA X2,80(R12) - MOVOA X3,96(R12) - MOVOA X0,112(R12) - MOVOA 0(R12),X0 - PSHUFL $0XAA,X0,X1 - PSHUFL $0XFF,X0,X2 - PSHUFL $0X00,X0,X3 - PSHUFL $0X55,X0,X0 - MOVOA X1,128(R12) - MOVOA X2,144(R12) - MOVOA X3,160(R12) - MOVOA X0,176(R12) - MOVOA 16(R12),X0 - PSHUFL $0XFF,X0,X1 - PSHUFL $0X55,X0,X2 - PSHUFL $0XAA,X0,X0 - MOVOA X1,192(R12) - MOVOA X2,208(R12) - MOVOA X0,224(R12) - MOVOA 32(R12),X0 - PSHUFL $0X00,X0,X1 - PSHUFL $0XAA,X0,X2 - PSHUFL $0XFF,X0,X0 - MOVOA X1,240(R12) - MOVOA X2,256(R12) - MOVOA X0,272(R12) - BYTESATLEAST256: - MOVL 16(R12),DX - MOVL 36 (R12),CX - MOVL DX,288(R12) - MOVL CX,304(R12) - SHLQ $32,CX - ADDQ CX,DX - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 292 (R12) - MOVL CX, 308 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 296 (R12) - MOVL CX, 312 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX, 300 (R12) - MOVL CX, 316 (R12) - ADDQ $1,DX - MOVQ DX,CX - SHRQ $32,CX - MOVL DX,16(R12) - MOVL CX, 36 (R12) - MOVQ R9,352(R12) - MOVQ $20,DX - MOVOA 
64(R12),X0 - MOVOA 80(R12),X1 - MOVOA 96(R12),X2 - MOVOA 256(R12),X3 - MOVOA 272(R12),X4 - MOVOA 128(R12),X5 - MOVOA 144(R12),X6 - MOVOA 176(R12),X7 - MOVOA 192(R12),X8 - MOVOA 208(R12),X9 - MOVOA 224(R12),X10 - MOVOA 304(R12),X11 - MOVOA 112(R12),X12 - MOVOA 160(R12),X13 - MOVOA 240(R12),X14 - MOVOA 288(R12),X15 - MAINLOOP1: - MOVOA X1,320(R12) - MOVOA X2,336(R12) - MOVOA X13,X1 - PADDL X12,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X14 - PSRLL $25,X2 - PXOR X2,X14 - MOVOA X7,X1 - PADDL X0,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X11 - PSRLL $25,X2 - PXOR X2,X11 - MOVOA X12,X1 - PADDL X14,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X15 - PSRLL $23,X2 - PXOR X2,X15 - MOVOA X0,X1 - PADDL X11,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X9 - PSRLL $23,X2 - PXOR X2,X9 - MOVOA X14,X1 - PADDL X15,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X13 - PSRLL $19,X2 - PXOR X2,X13 - MOVOA X11,X1 - PADDL X9,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X7 - PSRLL $19,X2 - PXOR X2,X7 - MOVOA X15,X1 - PADDL X13,X1 - MOVOA X1,X2 - PSLLL $18,X1 - PXOR X1,X12 - PSRLL $14,X2 - PXOR X2,X12 - MOVOA 320(R12),X1 - MOVOA X12,320(R12) - MOVOA X9,X2 - PADDL X7,X2 - MOVOA X2,X12 - PSLLL $18,X2 - PXOR X2,X0 - PSRLL $14,X12 - PXOR X12,X0 - MOVOA X5,X2 - PADDL X1,X2 - MOVOA X2,X12 - PSLLL $7,X2 - PXOR X2,X3 - PSRLL $25,X12 - PXOR X12,X3 - MOVOA 336(R12),X2 - MOVOA X0,336(R12) - MOVOA X6,X0 - PADDL X2,X0 - MOVOA X0,X12 - PSLLL $7,X0 - PXOR X0,X4 - PSRLL $25,X12 - PXOR X12,X4 - MOVOA X1,X0 - PADDL X3,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X10 - PSRLL $23,X12 - PXOR X12,X10 - MOVOA X2,X0 - PADDL X4,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X8 - PSRLL $23,X12 - PXOR X12,X8 - MOVOA X3,X0 - PADDL X10,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X5 - PSRLL $19,X12 - PXOR X12,X5 - MOVOA X4,X0 - PADDL X8,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X6 - PSRLL $19,X12 - PXOR X12,X6 - MOVOA X10,X0 - PADDL X5,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X1 - PSRLL $14,X12 - PXOR X12,X1 - MOVOA 320(R12),X0 - MOVOA X1,320(R12) 
- MOVOA X4,X1 - PADDL X0,X1 - MOVOA X1,X12 - PSLLL $7,X1 - PXOR X1,X7 - PSRLL $25,X12 - PXOR X12,X7 - MOVOA X8,X1 - PADDL X6,X1 - MOVOA X1,X12 - PSLLL $18,X1 - PXOR X1,X2 - PSRLL $14,X12 - PXOR X12,X2 - MOVOA 336(R12),X12 - MOVOA X2,336(R12) - MOVOA X14,X1 - PADDL X12,X1 - MOVOA X1,X2 - PSLLL $7,X1 - PXOR X1,X5 - PSRLL $25,X2 - PXOR X2,X5 - MOVOA X0,X1 - PADDL X7,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X10 - PSRLL $23,X2 - PXOR X2,X10 - MOVOA X12,X1 - PADDL X5,X1 - MOVOA X1,X2 - PSLLL $9,X1 - PXOR X1,X8 - PSRLL $23,X2 - PXOR X2,X8 - MOVOA X7,X1 - PADDL X10,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X4 - PSRLL $19,X2 - PXOR X2,X4 - MOVOA X5,X1 - PADDL X8,X1 - MOVOA X1,X2 - PSLLL $13,X1 - PXOR X1,X14 - PSRLL $19,X2 - PXOR X2,X14 - MOVOA X10,X1 - PADDL X4,X1 - MOVOA X1,X2 - PSLLL $18,X1 - PXOR X1,X0 - PSRLL $14,X2 - PXOR X2,X0 - MOVOA 320(R12),X1 - MOVOA X0,320(R12) - MOVOA X8,X0 - PADDL X14,X0 - MOVOA X0,X2 - PSLLL $18,X0 - PXOR X0,X12 - PSRLL $14,X2 - PXOR X2,X12 - MOVOA X11,X0 - PADDL X1,X0 - MOVOA X0,X2 - PSLLL $7,X0 - PXOR X0,X6 - PSRLL $25,X2 - PXOR X2,X6 - MOVOA 336(R12),X2 - MOVOA X12,336(R12) - MOVOA X3,X0 - PADDL X2,X0 - MOVOA X0,X12 - PSLLL $7,X0 - PXOR X0,X13 - PSRLL $25,X12 - PXOR X12,X13 - MOVOA X1,X0 - PADDL X6,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X15 - PSRLL $23,X12 - PXOR X12,X15 - MOVOA X2,X0 - PADDL X13,X0 - MOVOA X0,X12 - PSLLL $9,X0 - PXOR X0,X9 - PSRLL $23,X12 - PXOR X12,X9 - MOVOA X6,X0 - PADDL X15,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X11 - PSRLL $19,X12 - PXOR X12,X11 - MOVOA X13,X0 - PADDL X9,X0 - MOVOA X0,X12 - PSLLL $13,X0 - PXOR X0,X3 - PSRLL $19,X12 - PXOR X12,X3 - MOVOA X15,X0 - PADDL X11,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X1 - PSRLL $14,X12 - PXOR X12,X1 - MOVOA X9,X0 - PADDL X3,X0 - MOVOA X0,X12 - PSLLL $18,X0 - PXOR X0,X2 - PSRLL $14,X12 - PXOR X12,X2 - MOVOA 320(R12),X12 - MOVOA 336(R12),X0 - SUBQ $2,DX - JA MAINLOOP1 - PADDL 112(R12),X12 - PADDL 176(R12),X7 - PADDL 224(R12),X10 - PADDL 272(R12),X4 - MOVD X12,DX - 
MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 0(SI),DX - XORL 4(SI),CX - XORL 8(SI),R8 - XORL 12(SI),R9 - MOVL DX,0(DI) - MOVL CX,4(DI) - MOVL R8,8(DI) - MOVL R9,12(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 64(SI),DX - XORL 68(SI),CX - XORL 72(SI),R8 - XORL 76(SI),R9 - MOVL DX,64(DI) - MOVL CX,68(DI) - MOVL R8,72(DI) - MOVL R9,76(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - PSHUFL $0X39,X12,X12 - PSHUFL $0X39,X7,X7 - PSHUFL $0X39,X10,X10 - PSHUFL $0X39,X4,X4 - XORL 128(SI),DX - XORL 132(SI),CX - XORL 136(SI),R8 - XORL 140(SI),R9 - MOVL DX,128(DI) - MOVL CX,132(DI) - MOVL R8,136(DI) - MOVL R9,140(DI) - MOVD X12,DX - MOVD X7,CX - MOVD X10,R8 - MOVD X4,R9 - XORL 192(SI),DX - XORL 196(SI),CX - XORL 200(SI),R8 - XORL 204(SI),R9 - MOVL DX,192(DI) - MOVL CX,196(DI) - MOVL R8,200(DI) - MOVL R9,204(DI) - PADDL 240(R12),X14 - PADDL 64(R12),X0 - PADDL 128(R12),X5 - PADDL 192(R12),X8 - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 16(SI),DX - XORL 20(SI),CX - XORL 24(SI),R8 - XORL 28(SI),R9 - MOVL DX,16(DI) - MOVL CX,20(DI) - MOVL R8,24(DI) - MOVL R9,28(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 80(SI),DX - XORL 84(SI),CX - XORL 88(SI),R8 - XORL 92(SI),R9 - MOVL DX,80(DI) - MOVL CX,84(DI) - MOVL R8,88(DI) - MOVL R9,92(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - PSHUFL $0X39,X14,X14 - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X5,X5 - PSHUFL $0X39,X8,X8 - XORL 144(SI),DX - XORL 148(SI),CX - XORL 152(SI),R8 - XORL 156(SI),R9 - MOVL DX,144(DI) - MOVL CX,148(DI) - MOVL R8,152(DI) - MOVL R9,156(DI) - MOVD X14,DX - MOVD X0,CX - MOVD X5,R8 - MOVD X8,R9 - XORL 
208(SI),DX - XORL 212(SI),CX - XORL 216(SI),R8 - XORL 220(SI),R9 - MOVL DX,208(DI) - MOVL CX,212(DI) - MOVL R8,216(DI) - MOVL R9,220(DI) - PADDL 288(R12),X15 - PADDL 304(R12),X11 - PADDL 80(R12),X1 - PADDL 144(R12),X6 - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 32(SI),DX - XORL 36(SI),CX - XORL 40(SI),R8 - XORL 44(SI),R9 - MOVL DX,32(DI) - MOVL CX,36(DI) - MOVL R8,40(DI) - MOVL R9,44(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 96(SI),DX - XORL 100(SI),CX - XORL 104(SI),R8 - XORL 108(SI),R9 - MOVL DX,96(DI) - MOVL CX,100(DI) - MOVL R8,104(DI) - MOVL R9,108(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - PSHUFL $0X39,X15,X15 - PSHUFL $0X39,X11,X11 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X6,X6 - XORL 160(SI),DX - XORL 164(SI),CX - XORL 168(SI),R8 - XORL 172(SI),R9 - MOVL DX,160(DI) - MOVL CX,164(DI) - MOVL R8,168(DI) - MOVL R9,172(DI) - MOVD X15,DX - MOVD X11,CX - MOVD X1,R8 - MOVD X6,R9 - XORL 224(SI),DX - XORL 228(SI),CX - XORL 232(SI),R8 - XORL 236(SI),R9 - MOVL DX,224(DI) - MOVL CX,228(DI) - MOVL R8,232(DI) - MOVL R9,236(DI) - PADDL 160(R12),X13 - PADDL 208(R12),X9 - PADDL 256(R12),X3 - PADDL 96(R12),X2 - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 48(SI),DX - XORL 52(SI),CX - XORL 56(SI),R8 - XORL 60(SI),R9 - MOVL DX,48(DI) - MOVL CX,52(DI) - MOVL R8,56(DI) - MOVL R9,60(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 - PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 112(SI),DX - XORL 116(SI),CX - XORL 120(SI),R8 - XORL 124(SI),R9 - MOVL DX,112(DI) - MOVL CX,116(DI) - MOVL R8,120(DI) - MOVL R9,124(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - PSHUFL $0X39,X13,X13 - PSHUFL $0X39,X9,X9 
- PSHUFL $0X39,X3,X3 - PSHUFL $0X39,X2,X2 - XORL 176(SI),DX - XORL 180(SI),CX - XORL 184(SI),R8 - XORL 188(SI),R9 - MOVL DX,176(DI) - MOVL CX,180(DI) - MOVL R8,184(DI) - MOVL R9,188(DI) - MOVD X13,DX - MOVD X9,CX - MOVD X3,R8 - MOVD X2,R9 - XORL 240(SI),DX - XORL 244(SI),CX - XORL 248(SI),R8 - XORL 252(SI),R9 - MOVL DX,240(DI) - MOVL CX,244(DI) - MOVL R8,248(DI) - MOVL R9,252(DI) - MOVQ 352(R12),R9 - SUBQ $256,R9 - ADDQ $256,SI - ADDQ $256,DI - CMPQ R9,$256 - JAE BYTESATLEAST256 - CMPQ R9,$0 - JBE DONE - BYTESBETWEEN1AND255: - CMPQ R9,$64 - JAE NOCOPY - MOVQ DI,DX - LEAQ 360(R12),DI - MOVQ R9,CX +BYTESBETWEEN1AND255: + CMPQ R9, $0x40 + JAE NOCOPY + MOVQ DI, DX + LEAQ 360(R12), DI + MOVQ R9, CX REP; MOVSB - LEAQ 360(R12),DI - LEAQ 360(R12),SI - NOCOPY: - MOVQ R9,352(R12) - MOVOA 48(R12),X0 - MOVOA 0(R12),X1 - MOVOA 16(R12),X2 - MOVOA 32(R12),X3 - MOVOA X1,X4 - MOVQ $20,CX - MAINLOOP2: - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X3 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X3,X3 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X1 - PSHUFL $0X4E,X2,X2 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X1,X1 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X1 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X1,X1 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X3 - PSHUFL $0X4E,X2,X2 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X3,X3 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X3 - PXOR X6,X3 - PADDL X3,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL 
$0X93,X3,X3 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X1 - PSHUFL $0X4E,X2,X2 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X3,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X1,X1 - PXOR X6,X0 - PADDL X0,X4 - MOVOA X0,X5 - MOVOA X4,X6 - PSLLL $7,X4 - PSRLL $25,X6 - PXOR X4,X1 - PXOR X6,X1 - PADDL X1,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $9,X5 - PSRLL $23,X6 - PXOR X5,X2 - PSHUFL $0X93,X1,X1 - PXOR X6,X2 - PADDL X2,X4 - MOVOA X2,X5 - MOVOA X4,X6 - PSLLL $13,X4 - PSRLL $19,X6 - PXOR X4,X3 - PSHUFL $0X4E,X2,X2 - PXOR X6,X3 - SUBQ $4,CX - PADDL X3,X5 - MOVOA X1,X4 - MOVOA X5,X6 - PSLLL $18,X5 - PXOR X7,X7 - PSRLL $14,X6 - PXOR X5,X0 - PSHUFL $0X39,X3,X3 - PXOR X6,X0 - JA MAINLOOP2 - PADDL 48(R12),X0 - PADDL 0(R12),X1 - PADDL 16(R12),X2 - PADDL 32(R12),X3 - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 0(SI),CX - XORL 48(SI),R8 - XORL 32(SI),R9 - XORL 16(SI),AX - MOVL CX,0(DI) - MOVL R8,48(DI) - MOVL R9,32(DI) - MOVL AX,16(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 20(SI),CX - XORL 4(SI),R8 - XORL 52(SI),R9 - XORL 36(SI),AX - MOVL CX,20(DI) - MOVL R8,4(DI) - MOVL R9,52(DI) - MOVL AX,36(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - PSHUFL $0X39,X0,X0 - PSHUFL $0X39,X1,X1 - PSHUFL $0X39,X2,X2 - PSHUFL $0X39,X3,X3 - XORL 40(SI),CX - XORL 24(SI),R8 - XORL 8(SI),R9 - XORL 56(SI),AX - MOVL CX,40(DI) - MOVL R8,24(DI) - MOVL R9,8(DI) - MOVL AX,56(DI) - MOVD X0,CX - MOVD X1,R8 - MOVD X2,R9 - MOVD X3,AX - XORL 60(SI),CX - XORL 44(SI),R8 - XORL 28(SI),R9 - XORL 12(SI),AX - MOVL CX,60(DI) - MOVL R8,44(DI) - MOVL R9,28(DI) - MOVL AX,12(DI) - MOVQ 352(R12),R9 - MOVL 16(R12),CX - MOVL 36 (R12),R8 - ADDQ $1,CX - SHLQ $32,R8 - ADDQ R8,CX - MOVQ CX,R8 - SHRQ $32,R8 - MOVL CX,16(R12) - MOVL R8, 36 (R12) - CMPQ 
R9,$64 - JA BYTESATLEAST65 - JAE BYTESATLEAST64 - MOVQ DI,SI - MOVQ DX,DI - MOVQ R9,CX + LEAQ 360(R12), DI + LEAQ 360(R12), SI + +NOCOPY: + MOVQ R9, 352(R12) + MOVOA 48(R12), X0 + MOVOA (R12), X1 + MOVOA 16(R12), X2 + MOVOA 32(R12), X3 + MOVOA X1, X4 + MOVQ $0x00000014, CX + +MAINLOOP2: + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X3 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X3, X3 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X1 + PSHUFL $0x4e, X2, X2 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X1, X1 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X1 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X1, X1 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X3 + PSHUFL $0x4e, X2, X2 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X3, X3 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X3 + PXOR X6, X3 + PADDL X3, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR X5, X2 + PSHUFL $0x93, X3, X3 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X1 + PSHUFL $0x4e, X2, X2 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X3, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X1, X1 + PXOR X6, X0 + PADDL X0, X4 + MOVOA X0, X5 + MOVOA X4, X6 + PSLLL $0x07, X4 + PSRLL $0x19, X6 + PXOR X4, X1 + PXOR X6, X1 + PADDL X1, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x09, X5 + PSRLL $0x17, X6 + PXOR 
X5, X2 + PSHUFL $0x93, X1, X1 + PXOR X6, X2 + PADDL X2, X4 + MOVOA X2, X5 + MOVOA X4, X6 + PSLLL $0x0d, X4 + PSRLL $0x13, X6 + PXOR X4, X3 + PSHUFL $0x4e, X2, X2 + PXOR X6, X3 + SUBQ $0x04, CX + PADDL X3, X5 + MOVOA X1, X4 + MOVOA X5, X6 + PSLLL $0x12, X5 + PXOR X7, X7 + PSRLL $0x0e, X6 + PXOR X5, X0 + PSHUFL $0x39, X3, X3 + PXOR X6, X0 + JA MAINLOOP2 + PADDL 48(R12), X0 + PADDL (R12), X1 + PADDL 16(R12), X2 + PADDL 32(R12), X3 + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL (SI), CX + XORL 48(SI), R8 + XORL 32(SI), R9 + XORL 16(SI), AX + MOVL CX, (DI) + MOVL R8, 48(DI) + MOVL R9, 32(DI) + MOVL AX, 16(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL 20(SI), CX + XORL 4(SI), R8 + XORL 52(SI), R9 + XORL 36(SI), AX + MOVL CX, 20(DI) + MOVL R8, 4(DI) + MOVL R9, 52(DI) + MOVL AX, 36(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + PSHUFL $0x39, X0, X0 + PSHUFL $0x39, X1, X1 + PSHUFL $0x39, X2, X2 + PSHUFL $0x39, X3, X3 + XORL 40(SI), CX + XORL 24(SI), R8 + XORL 8(SI), R9 + XORL 56(SI), AX + MOVL CX, 40(DI) + MOVL R8, 24(DI) + MOVL R9, 8(DI) + MOVL AX, 56(DI) + MOVD X0, CX + MOVD X1, R8 + MOVD X2, R9 + MOVD X3, AX + XORL 60(SI), CX + XORL 44(SI), R8 + XORL 28(SI), R9 + XORL 12(SI), AX + MOVL CX, 60(DI) + MOVL R8, 44(DI) + MOVL R9, 28(DI) + MOVL AX, 12(DI) + MOVQ 352(R12), R9 + MOVL 16(R12), CX + MOVL 36(R12), R8 + ADDQ $0x01, CX + SHLQ $0x20, R8 + ADDQ R8, CX + MOVQ CX, R8 + SHRQ $0x20, R8 + MOVL CX, 16(R12) + MOVL R8, 36(R12) + CMPQ R9, $0x40 + JA BYTESATLEAST65 + JAE BYTESATLEAST64 + MOVQ DI, SI + MOVQ DX, DI + MOVQ R9, CX REP; MOVSB - BYTESATLEAST64: - DONE: + +BYTESATLEAST64: +DONE: RET - BYTESATLEAST65: - SUBQ $64,R9 - ADDQ $64,DI - ADDQ $64,SI - JMP BYTESBETWEEN1AND255 + +BYTESATLEAST65: + SUBQ $0x40, R9 + ADDQ $0x40, DI + ADDQ $0x40, SI + 
JMP BYTESBETWEEN1AND255