From 16999f5730e92ae00e5b915bb28b39d8b50b69bf Mon Sep 17 00:00:00 2001 From: Tony Arcieri Date: Tue, 13 Aug 2019 16:32:42 -0700 Subject: [PATCH] [WIP] polyval: Initial implementation Implements POLYVAL using Shay Gueron's techniques for efficient field multiplications using PCLMULQDQ. More information on these techniques here: https://blog.quarkslab.com/reversing-a-finite-field-multiplication-optimization.html --- Cargo.toml | 2 +- polyval/Cargo.toml | 9 +++ polyval/benches/polyval.rs | 89 ++++++++++++++++++++ polyval/src/field/backend/mod.rs | 61 ++++++++++++++ polyval/src/field/backend/pclmulqdq.rs | 108 +++++++++++++++++++++++++ polyval/src/field/backend/soft.rs | 72 +++++++++++++++++ polyval/src/field/clmul.rs | 55 +++++++++++++ polyval/src/field/mod.rs | 99 +++++++++++++++++++++++ polyval/src/lib.rs | 104 +++++++++++++++++++++++- polyval/tests/lib.rs | 23 ++++++ 10 files changed, 620 insertions(+), 2 deletions(-) create mode 100644 polyval/benches/polyval.rs create mode 100644 polyval/src/field/backend/mod.rs create mode 100644 polyval/src/field/backend/pclmulqdq.rs create mode 100644 polyval/src/field/backend/soft.rs create mode 100644 polyval/src/field/clmul.rs create mode 100644 polyval/src/field/mod.rs create mode 100644 polyval/tests/lib.rs diff --git a/Cargo.toml b/Cargo.toml index 5c5a885..4fc543c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,5 +5,5 @@ members = [ "hmac", "pmac", "poly1305", - "polyval" + "polyval", ] diff --git a/polyval/Cargo.toml b/polyval/Cargo.toml index a0fc47d..5c5f9f2 100644 --- a/polyval/Cargo.toml +++ b/polyval/Cargo.toml @@ -10,6 +10,15 @@ keywords = ["aes-gcm-siv", "crypto", "ghash", "gcm", "universal-hashing"] categories = ["cryptography", "no-std"] [dependencies] +byteorder = { version = "1", default-features = false } +zeroize = { version = "0.9", optional = true, default-features = false } + +[dev-dependencies] +crypto-mac = { version = "0.7", features = ["dev"] } +hex-literal = "0.1" + +[features] +nightly = [] 
[badges] travis-ci = { repository = "RustCrypto/hashes" } diff --git a/polyval/benches/polyval.rs b/polyval/benches/polyval.rs new file mode 100644 index 0000000..a3cba0a --- /dev/null +++ b/polyval/benches/polyval.rs @@ -0,0 +1,89 @@ +#![feature(test)] +#[macro_use] +extern crate crypto_mac; +extern crate polyval; + +use crypto_mac::generic_array::{typenum::U16, GenericArray}; +use crypto_mac::MacResult; +use polyval::{FieldElement, Polyval}; +use std::{cmp::min, convert::TryInto}; + +bench!(PolyvalMac); + +/// POLYVAL isn't a traditional MAC and for that reason doesn't impl the +/// `crypto_mac::Mac` trait. +/// +/// This type is a newtype that impls a pseudo-MAC to leverage the benchmark +/// functionality. +/// +/// This is just for benchmarking! Don't copy and paste this into your program +/// unless you really know what you're doing!!! +#[derive(Clone)] +struct PolyvalMac { + poly: Polyval, + leftover: usize, + buffer: FieldElement, +} + +impl Mac for PolyvalMac { + type OutputSize = U16; + type KeySize = U16; + + fn new(key: &GenericArray) -> PolyvalMac { + let poly = Polyval::new(key.as_slice().try_into().unwrap()); + + PolyvalMac { + poly, + leftover: 0, + buffer: FieldElement::default(), + } + } + + fn input(&mut self, data: &[u8]) { + let mut m = data; + + if self.leftover > 0 { + let want = min(16 - self.leftover, m.len()); + + for (i, byte) in m.iter().cloned().enumerate().take(want) { + self.buffer[self.leftover + i] = byte; + } + + m = &m[want..]; + self.leftover += want; + + if self.leftover < 16 { + return; + } + + self.block(); + self.leftover = 0; + } + + while m.len() >= 16 { + self.block(); + m = &m[16..]; + } + + self.buffer[..m.len()].copy_from_slice(m); + self.leftover = m.len(); + } + + fn reset(&mut self) { + unimplemented!(); + } + + fn result(self) -> MacResult { + let mut mac = GenericArray::default(); + mac.copy_from_slice(&self.poly.result()); + MacResult::new(mac) + } +} + +impl PolyvalMac { + /// Input the current internal buffer 
into POLYVAL + fn block(&mut self) { + let elem = self.buffer; + self.poly.input(&elem) + } +} diff --git a/polyval/src/field/backend/mod.rs b/polyval/src/field/backend/mod.rs new file mode 100644 index 0000000..d7519d7 --- /dev/null +++ b/polyval/src/field/backend/mod.rs @@ -0,0 +1,61 @@ +//! Field arithmetic backends + +#[cfg(all( + target_feature = "pclmulqdq", + target_feature = "sse2", + target_feature = "sse4.1", + any(target_arch = "x86", target_arch = "x86_64") +))] +pub mod pclmulqdq; +pub mod soft; + +use super::clmul::Clmul; +use core::{ + mem, + ops::{BitXor, BitXorAssign}, +}; + +/// Mask value to load into XMM register when performing Montgomery reduction. +/// See: +const MASK: [u64; 2] = [0x1, 0xc200_0000_0000_0000]; + +/// Trait representing the arithmetic operations we expect on the XMM registers +pub trait Xmm: + BitXor + BitXorAssign + Clmul + Copy + From<[u64; 2]> + Into<[u64; 2]> +{ + /// Fast reduction modulo x^128 + x^127 + x^126 +x^121 + 1 (Gueron 2012) + /// Algorithm 4: "Montgomery reduction" + fn reduce(self) -> Self { + let mask = Self::from(MASK); + let a = mask.clmul(self, 0x01); + let b = self.rotate_left() ^ a; + let c = mask.clmul(b, 0x01); + b.rotate_left() ^ c + } + + /// Rotate the contents of the register left by 64-bits + fn rotate_left(self) -> Self { + let t1: [u64; 2] = self.into(); + let t2: [u32; 4] = unsafe { mem::transmute(t1) }; + let t3 = [t2[2], t2[3], t2[0], t2[1]]; + let t4: [u64; 2] = unsafe { mem::transmute(t3) }; + t4.into() + + } + + /// Shift the contents of the register right by 64-bits + fn shift_right(self) -> Self { + let mut u64x2: [u64; 2] = self.into(); + u64x2[1] = u64x2[0]; + u64x2[0] = 0; + u64x2.into() + } + + /// Shift the contents of the register left by 64-bits + fn shift_left(self) -> Self { + let mut u64x2: [u64; 2] = self.into(); + u64x2[0] = u64x2[1]; + u64x2[1] = 0; + u64x2.into() + } +} diff --git a/polyval/src/field/backend/pclmulqdq.rs b/polyval/src/field/backend/pclmulqdq.rs new file 
mode 100644 index 0000000..cf30580 --- /dev/null +++ b/polyval/src/field/backend/pclmulqdq.rs @@ -0,0 +1,108 @@ +//! Support for the VPCLMULQDQ CPU intrinsic on `x86` and `x86_64` target +//! architectures. + +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use super::Xmm; +use crate::field::clmul::{self, Clmul}; +use core::ops::{BitXor, BitXorAssign}; + +/// 2 x `u64` values loaded into a `__m128i` register +#[repr(align(16))] +#[derive(Copy, Clone)] +pub struct U64x2(__m128i); + +impl From<[u64; 2]> for U64x2 { + fn from(array: [u64; 2]) -> U64x2 { + unsafe { _mm_loadu_si128(array.as_ptr() as *const __m128i) }.into() + } +} + +impl From for [u64; 2] { + fn from(u64x2: U64x2) -> [u64; 2] { + let mut result = [0u64; 2]; + + unsafe { + _mm_storeu_si128(result.as_mut_ptr() as *mut __m128i, u64x2.0); + } + + result + } +} + +impl From<__m128i> for U64x2 { + fn from(mm: __m128i) -> U64x2 { + U64x2(mm) + } +} + +impl From for __m128i { + fn from(u64x2: U64x2) -> __m128i { + u64x2.0 + } +} + +impl BitXor for U64x2 { + type Output = Self; + + fn bitxor(self, rhs: Self) -> Self::Output { + U64x2(unsafe { xor(self.0, rhs.0) }) + } +} + +impl BitXorAssign for U64x2 { + fn bitxor_assign(&mut self, rhs: Self) { + // TODO(tarcieri): optimize + self.0 = unsafe { xor(self.0, rhs.0) }; + } +} + +impl Clmul for U64x2 { + fn clmul(self, rhs: Self, imm: I) -> Self + where + I: Into, + { + unsafe { vpclmulqdq(self.0, rhs.0, imm.into()) }.into() + } +} + +// TODO(tarcieri): optimized `rotate_left`, `shift_right`, `shift_left` +impl Xmm for U64x2 {} + +#[target_feature(enable = "sse2", enable = "sse4.1")] +unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")] +unsafe fn vpclmulqdq(a: __m128i, b: __m128i, op: clmul::PseudoOp) -> __m128i { + match op { + clmul::PseudoOp::PCLMULLQLQDQ => _mm_clmulepi64_si128(a, b, 0x00), + 
clmul::PseudoOp::PCLMULHQLQDQ => _mm_clmulepi64_si128(a, b, 0x01), + clmul::PseudoOp::PCLMULLQHQDQ => _mm_clmulepi64_si128(a, b, 0x10), + clmul::PseudoOp::PCLMULHQHQDQ => _mm_clmulepi64_si128(a, b, 0x11), + } +} + +#[cfg(test)] +mod tests { + use crate::field::{ + backend::soft, + clmul::{self, Clmul}, + }; + + #[test] + fn vclmul_emulation() { + let a: [u64; 2] = [0x00000000ada5f29b, 0]; + let b: [u64; 2] = [0x000000002d978a49, 0]; + let op = clmul::PseudoOp::from(0x00); + + let hard_result: [u64; 2] = super::U64x2::from(a).clmul(b.into(), op).into(); + let soft_result: [u64; 2] = soft::U64x2::from(a).clmul(b.into(), op).into(); + + assert_eq!(&hard_result, &soft_result); + } +} diff --git a/polyval/src/field/backend/soft.rs b/polyval/src/field/backend/soft.rs new file mode 100644 index 0000000..0c5df4e --- /dev/null +++ b/polyval/src/field/backend/soft.rs @@ -0,0 +1,72 @@ +//! Software emulation support for CLMUL hardware intrinsics. +//! +//! WARNING: Not constant time! Should be made constant-time or disabled by default. 
+ +use super::Xmm; +use field::clmul::{self, Clmul}; +use core::ops::{BitXor, BitXorAssign}; + +/// 2 x `u64` values +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub struct U64x2([u64; 2]); + +impl From<[u64; 2]> for U64x2 { + fn from(array: [u64; 2]) -> U64x2 { + U64x2(array) + } +} + +impl From for [u64; 2] { + fn from(u64x2: U64x2) -> [u64; 2] { + u64x2.0 + } +} + +impl BitXor for U64x2 { + type Output = Self; + + fn bitxor(self, rhs: Self) -> Self::Output { + U64x2([self.0[0] ^ rhs.0[0], self.0[1] ^ rhs.0[1]]) + } +} + +impl BitXorAssign for U64x2 { + fn bitxor_assign(&mut self, rhs: Self) { + self.0[0] ^= rhs.0[0]; + self.0[1] ^= rhs.0[1]; + } +} + +impl Clmul for U64x2 { + fn clmul(self, other: Self, imm: I) -> Self + where + I: Into, + { + let (a, b) = match imm.into() { + clmul::PseudoOp::PCLMULLQLQDQ => (self.0[0], other.0[0]), + clmul::PseudoOp::PCLMULHQLQDQ => (self.0[1], other.0[0]), + clmul::PseudoOp::PCLMULLQHQDQ => (self.0[0], other.0[1]), + clmul::PseudoOp::PCLMULHQHQDQ => (self.0[1], other.0[1]), + }; + + let mut result = [0u64; 2]; + + for i in 0..64 { + if b & (1 << i) != 0 { + result[1] ^= a; + } + + result[0] >>= 1; + + if result[1] & 1 != 0 { + result[0] ^= 1 << 63; + } + + result[1] >>= 1; + } + + result.into() + } +} + +impl Xmm for U64x2 {} diff --git a/polyval/src/field/clmul.rs b/polyval/src/field/clmul.rs new file mode 100644 index 0000000..7b9e5a7 --- /dev/null +++ b/polyval/src/field/clmul.rs @@ -0,0 +1,55 @@ +//! Carry-less multiplication support. +//! +//! Modern `x86` and `x86_64` CPUs support hardware instructions for +//! carry-less multiplication which are necessary for efficient implementations +//! of GHASH and POLYVAL. + +/// Carry-less multiplication trait - allows field arithmetic to be generic +/// across both the `hard` and `soft` backends +pub trait Clmul: Copy { + /// Performs carry-less multiplication of two 64-bit polynomials over the + /// finite field GF(2^k). 
+ fn clmul>(self, other: Self, imm: I) -> Self; +} + +/// Pseudo-Op: selected by bits 4 and 0 of the immediate byte (`imm8`). +/// +/// PCLMULQDQ performs carry-less multiplication of two quadwords which are +/// selected from both operands according to the value of `imm8`. +/// +/// Bits 4 and 0 of `imm8` are used to select which 64-bit half of each operand +/// to use. Each of the possibilities has a named CLMUL Pseudo-Op, which is +/// represented by this enum. +#[repr(u8)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +pub enum PseudoOp { + /// Low-Low: `clmul(a[0..8], b[0..8])` + PCLMULLQLQDQ = 0x00, + + /// High-Low: `clmul(a[8..16], b[0..8])` + PCLMULHQLQDQ = 0x01, + + /// Low-High: `clmul(a[0..8], b[8..16])` + PCLMULLQHQDQ = 0x10, + + /// High-High: `clmul(a[8..16], b[8..16])` + PCLMULHQHQDQ = 0x11, +} + +impl From for PseudoOp { + fn from(imm8: u8) -> PseudoOp { + match imm8 { + 0x00 => PseudoOp::PCLMULLQLQDQ, + 0x01 => PseudoOp::PCLMULHQLQDQ, + 0x10 => PseudoOp::PCLMULLQHQDQ, + 0x11 => PseudoOp::PCLMULHQHQDQ, + _ => panic!("invalid imm8 value: 0x{:02x}", imm8), + } + } +} + +impl From for u8 { + fn from(op: PseudoOp) -> u8 { + op as u8 + } +} diff --git a/polyval/src/field/mod.rs b/polyval/src/field/mod.rs new file mode 100644 index 0000000..d08686c --- /dev/null +++ b/polyval/src/field/mod.rs @@ -0,0 +1,99 @@ +//! Implementation of POLYVAL's finite field. +//! +//! From [RFC 8452 Section 3] which defines POLYVAL for use in AES-GCM_SIV: +//! +//! > "POLYVAL, like GHASH (the authenticator in AES-GCM; ...), operates in a +//! > binary field of size 2^128. The field is defined by the irreducible +//! > polynomial x^128 + x^127 + x^126 + x^121 + 1." +//! +//! This implementation provides multiplication over GF(2^128) optimized using +//! Shay Gueron's PCLMULQDQ-based techniques. +//! +//! For more information on how these techniques work, see: +//! +//! +//! 
[RFC 8452 Section 3]: https://tools.ietf.org/html/rfc8452#section-3 + +pub mod backend; +pub mod clmul; + +use self::backend::Xmm; +use super::FIELD_SIZE; +use byteorder::{ByteOrder, LE}; +use core::ops::{BitXor, Mul}; + +/// POLYVAL field element. +#[derive(Copy, Clone)] +pub struct FieldElement(X); + +impl FieldElement { + /// Load a `FieldElement` from its bytestring representation. + pub fn from_bytes(bytes: [u8; FIELD_SIZE]) -> Self { + let mut u64x2 = [0u64; 2]; + LE::read_u64_into(&bytes, &mut u64x2); + u64x2.into() + } + + /// Serialize this `FieldElement` as a bytestring. + pub fn to_bytes(self) -> [u8; FIELD_SIZE] { + let u64x2: [u64; 2] = self.0.into(); + let mut result = [0u8; FIELD_SIZE]; + LE::write_u64_into(&u64x2, &mut result); + result + } +} + +impl Mul for FieldElement { + type Output = Self; + + /// Computes POLYVAL multiplication over GF(2^128). + /// + /// From [RFC 8452 Section 3]: + /// + /// > "The product of any two elements is calculated using standard + /// > (binary) polynomial multiplication followed by reduction modulo the + /// > irreducible polynomial." 
+ /// + /// [RFC 8452 Section 3]: https://tools.ietf.org/html/rfc8452#section-3 + fn mul(self, rhs: Self) -> Self { + let mut t1 = self.0.clmul(rhs.0, 0x00); + let mut t2 = self.0.clmul(rhs.0, 0x01); + let mut t3 = self.0.clmul(rhs.0, 0x10); + let mut t4 = self.0.clmul(rhs.0, 0x11); + + t2 ^= t3; + t3 = t2.shift_right(); + t2 = t2.shift_left(); + t1 ^= t3; + t4 ^= t2; + t4 ^= t1.reduce(); + + FieldElement(t4) + } +} + +impl From for FieldElement { + fn from(element: X) -> FieldElement { + FieldElement(element) + } +} + +impl From<[u64; 2]> for FieldElement { + fn from(array: [u64; 2]) -> FieldElement { + FieldElement(array.into()) + } +} + +impl From> for [u64; 2] { + fn from(fe: FieldElement) -> [u64; 2] { + fe.0.into() + } +} + +impl BitXor for FieldElement { + type Output = Self; + + fn bitxor(self, rhs: Self) -> Self::Output { + FieldElement(self.0 ^ rhs.0) + } +} diff --git a/polyval/src/lib.rs b/polyval/src/lib.rs index 1b3bed6..3fd3eb6 100644 --- a/polyval/src/lib.rs +++ b/polyval/src/lib.rs @@ -1 +1,103 @@ -//! POLYVAL +//! **POLYVAL** is a GHASH-like universal hash over GF(2^128) useful for +//! implementing [AES-GCM-SIV] or [AES-GCM/GMAC]. +//! +//! From [RFC 8452 Section 3] which defines POLYVAL for use in AES-GCM-SIV: +//! +//! > "POLYVAL, like GHASH (the authenticator in AES-GCM; ...), operates in a +//! > binary field of size 2^128. The field is defined by the irreducible +//! > polynomial x^128 + x^127 + x^126 + x^121 + 1." +//! +//! By multiplying (in the finite field sense) a sequence of 128-bit blocks of +//! input data by a field element `H`, POLYVAL can be used to authenticate +//! the message sequence as powers (in a finite field sense) of `H`. +//! +//! ## Relationship to GHASH +//! +//! POLYVAL can be thought of as the little endian equivalent of GHASH, which +//! affords it a small performance advantage over GHASH when used on little +//! endian architectures. +//! 
It has also been designed so it can also be used to compute GHASH and with +//! it GMAC, the Message Authentication Code (MAC) used by AES-GCM. +//! +//! From [RFC 8452 Appendix A]: +//! +//! > "GHASH and POLYVAL both operate in GF(2^128), although with different +//! > irreducible polynomials: POLYVAL works modulo x^128 + x^127 + x^126 + +//! > x^121 + 1 and GHASH works modulo x^128 + x^7 + x^2 + x + 1. Note +//! > that these irreducible polynomials are the 'reverse' of each other." +//! +//! [AES-GCM-SIV]: https://en.wikipedia.org/wiki/AES-GCM-SIV +//! [AES-GCM/GMAC]: https://en.wikipedia.org/wiki/Galois/Counter_Mode +//! [RFC 8452 Section 3]: https://tools.ietf.org/html/rfc8452#section-3 +//! [RFC 8452 Appendix A]: https://tools.ietf.org/html/rfc8452#appendix-A + +#![no_std] +#![doc(html_logo_url = "https://raw.githubusercontent.com/RustCrypto/meta/master/logo_small.png")] +#![deny(missing_docs)] + +extern crate byteorder; +#[cfg(feature = "zeroize")] +extern crate zeroize; + +pub mod field; + +use self::field::FieldElement; +#[cfg(feature = "zeroize")] +use zeroize::Zeroize; + +// TODO(tarcieri): selectable backends +use self::field::backend::soft::U64x2; + +/// Size of the GF(2^128) field modulus in bytes (16-bytes). +pub const FIELD_SIZE: usize = 16; + +/// **POLYVAL**: GHASH-like universal hash over GF(2^128). +#[repr(align(16))] +#[derive(Clone)] +#[allow(non_snake_case)] +pub struct Polyval { + /// GF(2^128) field element input blocks are multiplied by + H: FieldElement, + + /// Field element representing the computed universal hash + S: FieldElement, +} + +impl Polyval { + /// Initialize POLYVAL with the given `H` field element + pub fn new(h: [u8; FIELD_SIZE]) -> Self { + Self { + H: FieldElement::from_bytes(h), + S: [0u64; 2].into(), + } + } + + /// Input a field element `X` to be authenticated into POLYVAL. + pub fn input(&mut self, x: [u8; FIELD_SIZE]) { + // "The sum of any two elements in the field is the result of XORing them." 
+ // -- RFC 8452 Section 3 + let sum = self.S ^ FieldElement::from_bytes(x); + self.S = sum * self.H; + } + + /// Process input blocks in a chained manner + pub fn chain(mut self, x: [u8; FIELD_SIZE]) -> Self { + self.input(x); + self + } + + /// Get POLYVAL result (i.e. computed `S` field element) + pub fn result(self) -> [u8; FIELD_SIZE] { + self.S.to_bytes() + } +} + +#[cfg(feature = "zeroize")] +impl Drop for Polyval { + fn drop(&mut self) { + self.H.zeroize(); + self.S.zeroize(); + self.buffer.zeroize(); + } +} diff --git a/polyval/tests/lib.rs b/polyval/tests/lib.rs new file mode 100644 index 0000000..34254b9 --- /dev/null +++ b/polyval/tests/lib.rs @@ -0,0 +1,23 @@ +#[macro_use] +extern crate hex_literal; +extern crate polyval; + +use polyval::{Polyval, FIELD_SIZE}; + +// +// Test vectors for POLYVAL from RFC 8452 Appendix A +// +// + +const H: [u8; FIELD_SIZE] = hex!("25629347589242761d31f826ba4b757b"); +const X_1: [u8; FIELD_SIZE] = hex!("4f4f95668c83dfb6401762bb2d01a262"); +const X_2: [u8; FIELD_SIZE] = hex!("d1a24ddd2721d006bbe45f20d3c9f362"); + +/// POLYVAL(H, X_1, X_2) +const POLYVAL_RESULT: [u8; FIELD_SIZE] = hex!("f7a3b47b846119fae5b7866cf5e5b77e"); + +#[test] +fn rfc_8452_test_vector() { + let result = Polyval::new(H).chain(X_1).chain(X_2).result(); + assert_eq!(&result, &POLYVAL_RESULT); +}