Skip to content

Commit

Permalink
Optimize x86/aarch64 MD5 implementation (#2137)
Browse files Browse the repository at this point in the history
As suggested in https://github.com/animetosho/md5-optimisation?tab=readme-ov-file#dependency-shortcut-in-g-function, we can delay the dependency on 'x' by recognizing that ((x & z) | (y & ~z)) is equivalent to ((x & z) + (y & ~z)) in this scenario: the two terms are masked by z and ~z respectively, so they can never have the same bit set and the addition produces no carries. We can therefore perform those additions independently, leaving our dependency on x to the final addition. This speeds it up around 5% on both platforms.
  • Loading branch information
olivergillespie authored Jan 28, 2025
1 parent d15dc5c commit ea58b3f
Show file tree
Hide file tree
Showing 8 changed files with 306 additions and 307 deletions.
128 changes: 64 additions & 64 deletions crypto/fipsmodule/md5/asm/md5-armv8.pl

Large diffs are not rendered by default.

5 changes: 2 additions & 3 deletions crypto/fipsmodule/md5/asm/md5-x86_64.pl
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ sub round1_step
# %r10d = X[k_next]
# %r11d = z' (copy of z for the next step)
# %r12d = z' (copy of z for the next step)
# Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC)
sub round2_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
Expand All @@ -52,9 +51,9 @@ sub round2_step
and $x, %r12d /* x & z */
and $y, %r11d /* y & (not z) */
mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
or %r11d, %r12d /* (y & (not z)) | (x & z) */
add %r11d, $dst /* dst += (y & (not z)) */
mov $y, %r11d /* (NEXT STEP) z' = $y */
add %r12d, $dst /* dst += ... */
add %r12d, $dst /* dst += (x & z) */
mov $y, %r12d /* (NEXT STEP) z' = $y */
rol \$$s, $dst /* dst <<< s */
add $x, $dst /* dst += x */
Expand Down
128 changes: 64 additions & 64 deletions generated-src/ios-aarch64/crypto/fipsmodule/md5-armv8.S
Original file line number Diff line number Diff line change
Expand Up @@ -192,165 +192,165 @@ md5_blocks_loop:
add w9, w9, w13 // Add constant 0x49b40821
add w9, w9, w6 // Add aux function result
ror w9, w9, #10 // Rotate left s=22 bits
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x49b40821, s=22, M[15])
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x2562 // Load lower half of constant 0xf61e2562
movk x13, #0xf61e, lsl #16 // Load upper half of constant 0xf61e2562
add w4, w4, w20 // Add dest value
add w4, w4, w13 // Add constant 0xf61e2562
add w4, w4, w6 // Add aux function result
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xf61e2562, s=5, M[1])
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xb340 // Load lower half of constant 0xc040b340
movk x13, #0xc040, lsl #16 // Load upper half of constant 0xc040b340
add w17, w17, w7 // Add dest value
add w17, w17, w13 // Add constant 0xc040b340
add w17, w17, w6 // Add aux function result
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc040b340, s=9, M[6])
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x5a51 // Load lower half of constant 0x265e5a51
movk x13, #0x265e, lsl #16 // Load upper half of constant 0x265e5a51
add w8, w8, w25 // Add dest value
add w8, w8, w13 // Add constant 0x265e5a51
add w8, w8, w6 // Add aux function result
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x265e5a51, s=14, M[11])
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xc7aa // Load lower half of constant 0xe9b6c7aa
movk x13, #0xe9b6, lsl #16 // Load upper half of constant 0xe9b6c7aa
add w9, w9, w15 // Add dest value
add w9, w9, w13 // Add constant 0xe9b6c7aa
add w9, w9, w6 // Add aux function result
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe9b6c7aa, s=20, M[0])
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x105d // Load lower half of constant 0xd62f105d
movk x13, #0xd62f, lsl #16 // Load upper half of constant 0xd62f105d
add w4, w4, w22 // Add dest value
add w4, w4, w13 // Add constant 0xd62f105d
add w4, w4, w6 // Add aux function result
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xd62f105d, s=5, M[5])
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x1453 // Load lower half of constant 0x2441453
movk x13, #0x244, lsl #16 // Load upper half of constant 0x2441453
add w17, w17, w16 // Add dest value
add w17, w17, w13 // Add constant 0x2441453
add w17, w17, w6 // Add aux function result
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0x2441453, s=9, M[10])
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xe681 // Load lower half of constant 0xd8a1e681
movk x13, #0xd8a1, lsl #16 // Load upper half of constant 0xd8a1e681
add w8, w8, w27 // Add dest value
add w8, w8, w13 // Add constant 0xd8a1e681
add w8, w8, w6 // Add aux function result
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xd8a1e681, s=14, M[15])
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xfbc8 // Load lower half of constant 0xe7d3fbc8
movk x13, #0xe7d3, lsl #16 // Load upper half of constant 0xe7d3fbc8
add w9, w9, w14 // Add dest value
add w9, w9, w13 // Add constant 0xe7d3fbc8
add w9, w9, w6 // Add aux function result
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe7d3fbc8, s=20, M[4])
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xcde6 // Load lower half of constant 0x21e1cde6
movk x13, #0x21e1, lsl #16 // Load upper half of constant 0x21e1cde6
add w4, w4, w24 // Add dest value
add w4, w4, w13 // Add constant 0x21e1cde6
add w4, w4, w6 // Add aux function result
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0x21e1cde6, s=5, M[9])
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x7d6 // Load lower half of constant 0xc33707d6
movk x13, #0xc337, lsl #16 // Load upper half of constant 0xc33707d6
add w17, w17, w12 // Add dest value
add w17, w17, w13 // Add constant 0xc33707d6
add w17, w17, w6 // Add aux function result
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc33707d6, s=9, M[14])
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xd87 // Load lower half of constant 0xf4d50d87
movk x13, #0xf4d5, lsl #16 // Load upper half of constant 0xf4d50d87
add w8, w8, w21 // Add dest value
add w8, w8, w13 // Add constant 0xf4d50d87
add w8, w8, w6 // Add aux function result
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xf4d50d87, s=14, M[3])
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x14ed // Load lower half of constant 0x455a14ed
movk x13, #0x455a, lsl #16 // Load upper half of constant 0x455a14ed
add w9, w9, w5 // Add dest value
add w9, w9, w13 // Add constant 0x455a14ed
add w9, w9, w6 // Add aux function result
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
ror w9, w9, #12 // Rotate left s=20 bits
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x8, x17 // Aux function round 2 (~z & y)
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x455a14ed, s=20, M[8])
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xe905 // Load lower half of constant 0xa9e3e905
movk x13, #0xa9e3, lsl #16 // Load upper half of constant 0xa9e3e905
add w4, w4, w26 // Add dest value
add w4, w4, w13 // Add constant 0xa9e3e905
add w4, w4, w6 // Add aux function result
and x13, x9, x17 // Aux function round 2 (x & z)
add w4, w4, w6 // Add (~z & y)
add w4, w4, w13 // Add (x & z)
ror w4, w4, #27 // Rotate left s=5 bits
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x9, x8 // Aux function round 2 (~z & y)
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xa9e3e905, s=5, M[13])
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0xa3f8 // Load lower half of constant 0xfcefa3f8
movk x13, #0xfcef, lsl #16 // Load upper half of constant 0xfcefa3f8
add w17, w17, w3 // Add dest value
add w17, w17, w13 // Add constant 0xfcefa3f8
add w17, w17, w6 // Add aux function result
and x13, x4, x8 // Aux function round 2 (x & z)
add w17, w17, w6 // Add (~z & y)
add w17, w17, w13 // Add (x & z)
ror w17, w17, #23 // Rotate left s=9 bits
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x4, x9 // Aux function round 2 (~z & y)
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xfcefa3f8, s=9, M[2])
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x2d9 // Load lower half of constant 0x676f02d9
movk x13, #0x676f, lsl #16 // Load upper half of constant 0x676f02d9
add w8, w8, w23 // Add dest value
add w8, w8, w13 // Add constant 0x676f02d9
add w8, w8, w6 // Add aux function result
and x13, x17, x9 // Aux function round 2 (x & z)
add w8, w8, w6 // Add (~z & y)
add w8, w8, w13 // Add (x & z)
ror w8, w8, #18 // Rotate left s=14 bits
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
bic x6, x17, x4 // Aux function round 2 (~z & y)
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x676f02d9, s=14, M[7])
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
movz x13, #0x4c8a // Load lower half of constant 0x8d2a4c8a
movk x13, #0x8d2a, lsl #16 // Load upper half of constant 0x8d2a4c8a
add w9, w9, w11 // Add dest value
add w9, w9, w13 // Add constant 0x8d2a4c8a
add w9, w9, w6 // Add aux function result
and x13, x8, x4 // Aux function round 2 (x & z)
add w9, w9, w6 // Add (~z & y)
add w9, w9, w13 // Add (x & z)
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
ror w9, w9, #12 // Rotate left s=20 bits
movz x10, #0x3942 // Load lower half of constant 0xfffa3942
Expand Down
Loading

0 comments on commit ea58b3f

Please sign in to comment.