diff --git a/lib/std/crypto/sha2.zig b/lib/std/crypto/sha2.zig
index b7a78c4b44e5..9cdf8edcf180 100644
--- a/lib/std/crypto/sha2.zig
+++ b/lib/std/crypto/sha2.zig
@@ -1,4 +1,5 @@
 const std = @import("../std.zig");
+const builtin = @import("builtin");
 const mem = std.mem;
 const math = std.math;
 const htest = @import("test.zig");
@@ -16,10 +17,9 @@ const RoundParam256 = struct {
     g: usize,
     h: usize,
     i: usize,
-    k: u32,
 };
 
-fn roundParam256(a: usize, b: usize, c: usize, d: usize, e: usize, f: usize, g: usize, h: usize, i: usize, k: u32) RoundParam256 {
+fn roundParam256(a: usize, b: usize, c: usize, d: usize, e: usize, f: usize, g: usize, h: usize, i: usize) RoundParam256 {
     return RoundParam256{
         .a = a,
         .b = b,
@@ -30,7 +30,6 @@ fn roundParam256(a: usize, b: usize, c: usize, d: usize, e: usize, f: usize, g:
         .g = g,
         .h = h,
         .i = i,
-        .k = k,
     };
 }
 
@@ -70,6 +69,19 @@ const Sha256Params = Sha2Params32{
     .digest_bits = 256,
 };
 
+/// Four `u32` lanes (128 bits); the operand type of the inline-assembly
+/// fast paths in `round`.
+const v4u32 = @Vector(4, u32);
+
+// TODO: Remove once https://github.com/ziglang/zig/issues/868 is resolved.
+/// Returns the `is_comptime` flag of a tuple field initialised from a local
+/// `var`; presumably this is true only while being evaluated at compile time.
+/// `round` takes its inline-assembly fast paths only when this returns false.
+fn isComptime() bool {
+    var a: u8 = 0;
+    return @typeInfo(@TypeOf(.{a})).Struct.fields[0].is_comptime;
+}
+
 /// SHA-224
 pub const Sha224 = Sha2x32(Sha224Params);
 
@@ -83,7 +95,7 @@ fn Sha2x32(comptime params: Sha2Params32) type {
         pub const digest_length = params.digest_bits / 8;
         pub const Options = struct {};
 
-        s: [8]u32,
+        s: [8]u32 align(16),
         // Streaming Cache
         buf: [64]u8 = undefined,
         buf_len: u8 = 0,
@@ -168,17 +180,136 @@ fn Sha2x32(comptime params: Sha2Params32) type {
             }
         }
 
+        /// The 64 SHA-256 round constants, indexed by round number. They
+        /// replace the per-entry `k` field formerly carried by `RoundParam256`.
+        const W = [64]u32{
+            0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+            0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+            0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+            0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+            0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+            0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+            0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+            0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
+        };
+
+        /// Folds the 64-byte block `b` into the state `d.s`. Uses the target's
+        /// SHA instructions when they are available and `isComptime()` is
+        /// false; otherwise falls through to the portable rounds below.
         fn round(d: *Self, b: *const [64]u8) void {
-            var s: [64]u32 = undefined;
+            // Message schedule. It is 16-byte aligned so that the fast paths
+            // below can view it as `[16]v4u32`.
+            var s: [64]u32 align(16) = undefined;
+            // Load the block as sixteen big-endian words.
+            for (@ptrCast(*align(1) const [16]u32, b)) |*elem, i| {
+                s[i] = mem.readIntBig(u32, mem.asBytes(elem));
+            }
 
-            var i: usize = 0;
-            while (i < 16) : (i += 1) {
-                s[i] = 0;
-                s[i] |= @as(u32, b[i * 4 + 0]) << 24;
-                s[i] |= @as(u32, b[i * 4 + 1]) << 16;
-                s[i] |= @as(u32, b[i * 4 + 2]) << 8;
-                s[i] |= @as(u32, b[i * 4 + 3]) << 0;
+            // Hardware fast paths; each one returns once `d.s` is updated.
+            if (!isComptime()) {
+                switch (builtin.cpu.arch) {
+                    // Armv8 SHA2 instructions: each iteration handles four words.
+                    .aarch64 => if (comptime std.Target.aarch64.featureSetHas(builtin.cpu.features, .sha2)) {
+                        var x: v4u32 = d.s[0..4].*;
+                        var y: v4u32 = d.s[4..8].*;
+                        const s_v = @ptrCast(*[16]v4u32, &s);
+
+                        comptime var k: u8 = 0;
+                        inline while (k < 16) : (k += 1) {
+                            if (k > 3) {
+                                // Extend the schedule: vector k from the previous four vectors.
+                                s_v[k] = asm (
+                                    \\sha256su0.4s %[w0_3], %[w4_7]
+                                    \\sha256su1.4s %[w0_3], %[w8_11], %[w12_15]
+                                    : [w0_3] "=w" (-> v4u32),
+                                    : [_] "0" (s_v[k - 4]),
+                                      [w4_7] "w" (s_v[k - 3]),
+                                      [w8_11] "w" (s_v[k - 2]),
+                                      [w12_15] "w" (s_v[k - 1]),
+                                );
+                            }
+
+                            const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
+                            // v0 keeps the pre-update x, which sha256h2 still needs.
+                            asm volatile (
+                                \\mov.4s v0, %[x]
+                                \\sha256h.4s %[x], %[y], %[w]
+                                \\sha256h2.4s %[y], v0, %[w]
+                                : [x] "=w" (x),
+                                  [y] "=w" (y),
+                                : [_] "0" (x),
+                                  [_] "1" (y),
+                                  [w] "w" (w),
+                                : "v0"
+                            );
+                        }
+
+                        d.s[0..4].* = x +% @as(v4u32, d.s[0..4].*);
+                        d.s[4..8].* = y +% @as(v4u32, d.s[4..8].*);
+                        return;
+                    },
+                    // The schedule code below uses the three-operand `vpalignr`,
+                    // which is VEX-encoded, so AVX is required in addition to SHA.
+                    // SHA-capable CPUs without AVX fall through to the portable rounds.
+                    .x86_64 => if (comptime (std.Target.x86.featureSetHas(builtin.cpu.features, .sha) and std.Target.x86.featureSetHas(builtin.cpu.features, .avx))) {
+                        // State is packed low lane first as {s5, s4, s1, s0} and {s7, s6, s3, s2}.
+                        var x: v4u32 = [_]u32{ d.s[5], d.s[4], d.s[1], d.s[0] };
+                        var y: v4u32 = [_]u32{ d.s[7], d.s[6], d.s[3], d.s[2] };
+                        const s_v = @ptrCast(*[16]v4u32, &s);
+
+                        comptime var k: u8 = 0;
+                        inline while (k < 16) : (k += 1) {
+                            if (k < 12) {
+                                // Compute schedule vector k + 4 ahead of its use.
+                                var tmp = s_v[k];
+                                s_v[k + 4] = asm (
+                                    \\ sha256msg1 %[w4_7], %[tmp]
+                                    \\ vpalignr $0x4, %[w8_11], %[w12_15], %[result]
+                                    \\ paddd %[tmp], %[result]
+                                    \\ sha256msg2 %[w12_15], %[result]
+                                    : [tmp] "=&x" (tmp),
+                                      [result] "=&x" (-> v4u32),
+                                    : [_] "0" (tmp),
+                                      [w4_7] "x" (s_v[k + 1]),
+                                      [w8_11] "x" (s_v[k + 2]),
+                                      [w12_15] "x" (s_v[k + 3]),
+                                );
+                            }
+
+                            const w: v4u32 = s_v[k] +% @as(v4u32, W[4 * k ..][0..4].*);
+                            // sha256rnds2 takes its word+constant input in xmm0; the
+                            // second call is given the upper 64 bits of w shifted down.
+                            y = asm ("sha256rnds2 %[x], %[y]"
+                                : [y] "=x" (-> v4u32),
+                                : [_] "0" (y),
+                                  [x] "x" (x),
+                                  [_] "{xmm0}" (w),
+                            );
+
+                            x = asm ("sha256rnds2 %[y], %[x]"
+                                : [x] "=x" (-> v4u32),
+                                : [_] "0" (x),
+                                  [y] "x" (y),
+                                  [_] "{xmm0}" (@bitCast(v4u32, @bitCast(u128, w) >> 64)),
+                            );
+                        }
+
+                        d.s[0] +%= x[3];
+                        d.s[1] +%= x[2];
+                        d.s[4] +%= x[1];
+                        d.s[5] +%= x[0];
+                        d.s[2] +%= y[3];
+                        d.s[3] +%= y[2];
+                        d.s[6] +%= y[1];
+                        d.s[7] +%= y[0];
+                        return;
+                    },
+                    else => {},
+                }
             }
+
+            // Portable path: extend the schedule in scalar code.
+            var i: usize = 16;
             while (i < 64) : (i += 1) {
                 s[i] = s[i - 16] +% s[i - 7] +% (math.rotr(u32, s[i - 15], @as(u32, 7)) ^ math.rotr(u32, s[i - 15], @as(u32, 18)) ^ (s[i - 15] >> 3)) +% (math.rotr(u32, s[i - 2], @as(u32, 17)) ^ math.rotr(u32, s[i - 2], @as(u32, 19)) ^ (s[i - 2] >> 10));
             }
@@ -195,73 +326,73 @@ fn Sha2x32(comptime params: Sha2Params32) type {
             };
 
             const round0 = comptime [_]RoundParam256{
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 0, 0x428A2F98),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 1, 0x71374491),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 2, 0xB5C0FBCF),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 3, 0xE9B5DBA5),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 4, 0x3956C25B),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 5, 0x59F111F1),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 6, 0x923F82A4),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 7, 0xAB1C5ED5),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 8, 0xD807AA98),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 9, 0x12835B01),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 10, 0x243185BE),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 11, 0x550C7DC3),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 12, 0x72BE5D74),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 13, 0x80DEB1FE),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 14, 0x9BDC06A7),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 15, 0xC19BF174),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 16, 0xE49B69C1),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 17, 0xEFBE4786),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 18, 0x0FC19DC6),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 19, 0x240CA1CC),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 20, 0x2DE92C6F),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 21, 0x4A7484AA),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 22, 0x5CB0A9DC),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 23, 0x76F988DA),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 24, 0x983E5152),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 25, 0xA831C66D),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 26, 0xB00327C8),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 27, 0xBF597FC7),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 28, 0xC6E00BF3),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 29, 0xD5A79147),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 30, 0x06CA6351),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 31, 0x14292967),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 32, 0x27B70A85),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 33, 0x2E1B2138),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 34, 0x4D2C6DFC),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 35, 0x53380D13),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 36, 0x650A7354),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 37, 0x766A0ABB),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 38, 0x81C2C92E),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 39, 0x92722C85),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 40, 0xA2BFE8A1),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 41, 0xA81A664B),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 42, 0xC24B8B70),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 43, 0xC76C51A3),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 44, 0xD192E819),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 45, 0xD6990624),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 46, 0xF40E3585),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 47, 0x106AA070),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 48, 0x19A4C116),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 49, 0x1E376C08),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 50, 0x2748774C),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 51, 0x34B0BCB5),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 52, 0x391C0CB3),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 53, 0x4ED8AA4A),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 54, 0x5B9CCA4F),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 55, 0x682E6FF3),
-                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 56, 0x748F82EE),
-                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 57, 0x78A5636F),
-                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 58, 0x84C87814),
-                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 59, 0x8CC70208),
-                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 60, 0x90BEFFFA),
-                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 61, 0xA4506CEB),
-                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 62, 0xBEF9A3F7),
-                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 63, 0xC67178F2),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 0),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 1),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 2),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 3),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 4),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 5),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 6),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 7),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 8),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 9),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 10),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 11),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 12),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 13),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 14),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 15),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 16),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 17),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 18),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 19),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 20),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 21),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 22),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 23),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 24),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 25),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 26),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 27),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 28),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 29),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 30),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 31),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 32),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 33),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 34),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 35),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 36),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 37),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 38),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 39),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 40),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 41),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 42),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 43),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 44),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 45),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 46),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 47),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 48),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 49),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 50),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 51),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 52),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 53),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 54),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 55),
+                roundParam256(0, 1, 2, 3, 4, 5, 6, 7, 56),
+                roundParam256(7, 0, 1, 2, 3, 4, 5, 6, 57),
+                roundParam256(6, 7, 0, 1, 2, 3, 4, 5, 58),
+                roundParam256(5, 6, 7, 0, 1, 2, 3, 4, 59),
+                roundParam256(4, 5, 6, 7, 0, 1, 2, 3, 60),
+                roundParam256(3, 4, 5, 6, 7, 0, 1, 2, 61),
+                roundParam256(2, 3, 4, 5, 6, 7, 0, 1, 62),
+                roundParam256(1, 2, 3, 4, 5, 6, 7, 0, 63),
             };
 
             inline for (round0) |r| {
-                v[r.h] = v[r.h] +% (math.rotr(u32, v[r.e], @as(u32, 6)) ^ math.rotr(u32, v[r.e], @as(u32, 11)) ^ math.rotr(u32, v[r.e], @as(u32, 25))) +% (v[r.g] ^ (v[r.e] & (v[r.f] ^ v[r.g]))) +% r.k +% s[r.i];
+                v[r.h] = v[r.h] +% (math.rotr(u32, v[r.e], @as(u32, 6)) ^ math.rotr(u32, v[r.e], @as(u32, 11)) ^ math.rotr(u32, v[r.e], @as(u32, 25))) +% (v[r.g] ^ (v[r.e] & (v[r.f] ^ v[r.g]))) +% W[r.i] +% s[r.i];
 
                 v[r.d] = v[r.d] +% v[r.h];