Skip to content

Commit

Permalink
[WIP] polyval: Initial implementation
Browse files Browse the repository at this point in the history
Implements POLYVAL using Shay Gueron's techniques for efficient field
multiplication via the PCLMULQDQ instruction.

More information on these techniques here:

https://blog.quarkslab.com/reversing-a-finite-field-multiplication-optimization.html
  • Loading branch information
tarcieri committed Aug 21, 2019
1 parent a3e4cbc commit 16999f5
Show file tree
Hide file tree
Showing 10 changed files with 620 additions and 2 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,5 @@ members = [
"hmac",
"pmac",
"poly1305",
"polyval"
"polyval",
]
9 changes: 9 additions & 0 deletions polyval/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,15 @@ keywords = ["aes-gcm-siv", "crypto", "ghash", "gcm", "universal-hashing"]
categories = ["cryptography", "no-std"]

[dependencies]
byteorder = { version = "1", default-features = false }
zeroize = { version = "0.9", optional = true, default-features = false }

[dev-dependencies]
crypto-mac = { version = "0.7", features = ["dev"] }
hex-literal = "0.1"

[features]
nightly = []

[badges]
travis-ci = { repository = "RustCrypto/hashes" }
89 changes: 89 additions & 0 deletions polyval/benches/polyval.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#![feature(test)]
#[macro_use]
extern crate crypto_mac;
extern crate polyval;

use crypto_mac::generic_array::{typenum::U16, GenericArray};
use crypto_mac::{Mac, MacResult};
use polyval::{FieldElement, Polyval};
use std::{cmp::min, convert::TryInto};

bench!(PolyvalMac);

/// POLYVAL isn't a traditional MAC and for that reason doesn't impl the
/// `crypto_mac::Mac` trait.
///
/// This type is a newtype that impls a pseudo-MAC to leverage the benchmark
/// functionality.
///
/// This is just for benchmarking! Don't copy and paste this into your program
/// unless you really know what you're doing!!!
#[derive(Clone)]
struct PolyvalMac {
    // Underlying POLYVAL instance
    poly: Polyval,
    // Number of bytes currently buffered (always < 16 between `input` calls)
    leftover: usize,
    // Partial block awaiting more input before it can be absorbed
    buffer: FieldElement,
}

impl Mac for PolyvalMac {
    type OutputSize = U16;
    type KeySize = U16;

    /// Initialize POLYVAL with the given 16-byte key.
    fn new(key: &GenericArray<u8, Self::KeySize>) -> PolyvalMac {
        let poly = Polyval::new(key.as_slice().try_into().unwrap());

        PolyvalMac {
            poly,
            leftover: 0,
            buffer: FieldElement::default(),
        }
    }

    /// Buffer input into 16-byte blocks, feeding each complete block into
    /// the underlying POLYVAL instance.
    fn input(&mut self, data: &[u8]) {
        let mut m = data;

        // Top up a previously buffered partial block first.
        if self.leftover > 0 {
            let want = min(16 - self.leftover, m.len());

            for (i, byte) in m.iter().cloned().enumerate().take(want) {
                self.buffer[self.leftover + i] = byte;
            }

            m = &m[want..];
            self.leftover += want;

            // Still short of a full block: wait for more input.
            if self.leftover < 16 {
                return;
            }

            self.block();
            self.leftover = 0;
        }

        // Process all remaining complete 16-byte blocks.
        while m.len() >= 16 {
            // BUGFIX(review): previously the input bytes were never copied
            // into the buffer here, so `block()` kept re-absorbing the stale
            // buffer contents and whole blocks of `data` were silently
            // ignored. Copy each block in before absorbing it.
            self.buffer[..16].copy_from_slice(&m[..16]);
            self.block();
            m = &m[16..];
        }

        // Stash any trailing partial block for later.
        self.buffer[..m.len()].copy_from_slice(m);
        self.leftover = m.len();
    }

    fn reset(&mut self) {
        // Not needed by the benchmark harness.
        unimplemented!();
    }

    /// Finalize and return the 16-byte POLYVAL tag.
    fn result(self) -> MacResult<Self::OutputSize> {
        let mut mac = GenericArray::default();
        mac.copy_from_slice(&self.poly.result());
        MacResult::new(mac)
    }
}

impl PolyvalMac {
    /// Absorb the current internal buffer into the POLYVAL state as one block.
    fn block(&mut self) {
        let full_block = self.buffer;
        self.poly.input(&full_block)
    }
}
61 changes: 61 additions & 0 deletions polyval/src/field/backend/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
//! Field arithmetic backends

#[cfg(all(
target_feature = "pclmulqdq",
target_feature = "sse2",
target_feature = "sse4.1",
any(target_arch = "x86", target_arch = "x86_64")
))]
pub mod pclmulqdq;
pub mod soft;

use super::clmul::Clmul;
use core::{
mem,
ops::{BitXor, BitXorAssign},
};

/// Mask value to load into XMM register when performing Montgomery reduction.
/// See: <https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf>
const MASK: [u64; 2] = [0x1, 0xc200_0000_0000_0000];

/// Trait representing the arithmetic operations we expect on the XMM registers
/// Trait representing the arithmetic operations we expect on the XMM registers
pub trait Xmm:
    BitXor<Output = Self> + BitXorAssign + Clmul + Copy + From<[u64; 2]> + Into<[u64; 2]>
{
    /// Fast reduction modulo x^128 + x^127 + x^126 + x^121 + 1 (Gueron 2012)
    /// Algorithm 4: "Montgomery reduction"
    ///
    /// See: <https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf>
    fn reduce(self) -> Self {
        let mask = Self::from(MASK);
        let a = mask.clmul(self, 0x01);
        let b = self.rotate_left() ^ a;
        let c = mask.clmul(b, 0x01);
        b.rotate_left() ^ c
    }

    /// Rotate the contents of the register left by 64-bits (i.e. swap the
    /// two 64-bit words)
    fn rotate_left(self) -> Self {
        // Swapping the two `u64` words is exactly a 64-bit rotation of the
        // 128-bit register. The previous implementation round-tripped through
        // `[u32; 4]` with two `unsafe` `mem::transmute`s to compute the same
        // permutation; the safe word swap is byte-for-byte equivalent on any
        // endianness, since each 64-bit half is moved intact.
        // NOTE(review): the `core::mem` import above is now unused and can
        // be removed.
        let words: [u64; 2] = self.into();
        [words[1], words[0]].into()
    }

    /// Shift the contents of the register right by 64-bits: the word in
    /// lane 0 moves to lane 1, and lane 0 is zero-filled.
    // NOTE(review): with lane 0 as the low quadword this is numerically a
    // *left* shift by 64; confirm the intended direction convention.
    fn shift_right(self) -> Self {
        let mut u64x2: [u64; 2] = self.into();
        u64x2[1] = u64x2[0];
        u64x2[0] = 0;
        u64x2.into()
    }

    /// Shift the contents of the register left by 64-bits: the word in
    /// lane 1 moves to lane 0, and lane 1 is zero-filled.
    // NOTE(review): see direction note on `shift_right` above.
    fn shift_left(self) -> Self {
        let mut u64x2: [u64; 2] = self.into();
        u64x2[0] = u64x2[1];
        u64x2[1] = 0;
        u64x2.into()
    }
}
108 changes: 108 additions & 0 deletions polyval/src/field/backend/pclmulqdq.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
//! Support for the VPCLMULQDQ CPU intrinsic on `x86` and `x86_64` target
//! architectures.

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use super::Xmm;
use crate::field::clmul::{self, Clmul};
use core::ops::{BitXor, BitXorAssign};

/// 2 x `u64` values loaded into a `__m128i` register
#[repr(align(16))]
#[derive(Copy, Clone)]
pub struct U64x2(__m128i);

impl From<[u64; 2]> for U64x2 {
    // Load two 64-bit words into an XMM register (array[0] = low quadword).
    fn from(array: [u64; 2]) -> U64x2 {
        // SAFETY(review): `_mm_loadu_si128` is an *unaligned* load, so
        // reading 16 bytes from the valid `[u64; 2]` is sound; the `sse2`
        // target feature is statically enabled by this module's cfg gate
        // (see `backend/mod.rs`).
        unsafe { _mm_loadu_si128(array.as_ptr() as *const __m128i) }.into()
    }
}

impl From<U64x2> for [u64; 2] {
    // Store the register back out as two 64-bit words (array[0] = low quadword).
    fn from(u64x2: U64x2) -> [u64; 2] {
        let mut result = [0u64; 2];

        // SAFETY(review): unaligned 16-byte store into a valid, writable
        // `[u64; 2]`; `sse2` is statically enabled by this module's cfg gate.
        unsafe {
            _mm_storeu_si128(result.as_mut_ptr() as *mut __m128i, u64x2.0);
        }

        result
    }
}

// Wrap a raw `__m128i` register value.
impl From<__m128i> for U64x2 {
    fn from(mm: __m128i) -> U64x2 {
        U64x2(mm)
    }
}

// Unwrap to the raw `__m128i` register value.
impl From<U64x2> for __m128i {
    fn from(u64x2: U64x2) -> __m128i {
        u64x2.0
    }
}

impl BitXor for U64x2 {
    type Output = Self;

    fn bitxor(self, rhs: Self) -> Self::Output {
        // SAFETY(review): `xor` requires sse2/sse4.1, which are statically
        // enabled by this module's cfg gate (see `backend/mod.rs`).
        U64x2(unsafe { xor(self.0, rhs.0) })
    }
}

impl BitXorAssign for U64x2 {
    fn bitxor_assign(&mut self, rhs: Self) {
        // TODO(tarcieri): optimize
        // SAFETY(review): same target-feature argument as `bitxor` above.
        self.0 = unsafe { xor(self.0, rhs.0) };
    }
}

impl Clmul for U64x2 {
    // Carry-less multiply via the PCLMULQDQ instruction; `imm` selects which
    // quadword of each operand participates.
    fn clmul<I>(self, rhs: Self, imm: I) -> Self
    where
        I: Into<clmul::PseudoOp>,
    {
        // SAFETY(review): pclmulqdq/sse2/sse4.1 are statically enabled by
        // this module's cfg gate (see `backend/mod.rs`).
        unsafe { vpclmulqdq(self.0, rhs.0, imm.into()) }.into()
    }
}

// Use the portable default implementations of the `Xmm` helper methods.
// TODO(tarcieri): optimized `rotate_left`, `shift_right`, `shift_left`
impl Xmm for U64x2 {}

/// XOR two XMM registers.
///
/// # Safety
///
/// The caller must ensure the `sse2` and `sse4.1` target features are
/// enabled (guaranteed here by this module's cfg gate).
#[target_feature(enable = "sse2", enable = "sse4.1")]
unsafe fn xor(a: __m128i, b: __m128i) -> __m128i {
    _mm_xor_si128(a, b)
}

/// Dispatch a `PseudoOp` to the corresponding PCLMULQDQ immediate.
///
/// NOTE(review): despite the name, this uses the legacy `PCLMULQDQ`
/// encoding (`_mm_clmulepi64_si128`), not the VEX `VPCLMULQDQ` form —
/// consider renaming.
///
/// # Safety
///
/// The caller must ensure the `pclmulqdq`, `sse2` and `sse4.1` target
/// features are enabled (guaranteed here by this module's cfg gate).
#[target_feature(enable = "pclmulqdq", enable = "sse2", enable = "sse4.1")]
unsafe fn vpclmulqdq(a: __m128i, b: __m128i, op: clmul::PseudoOp) -> __m128i {
    match op {
        clmul::PseudoOp::PCLMULLQLQDQ => _mm_clmulepi64_si128(a, b, 0x00),
        clmul::PseudoOp::PCLMULHQLQDQ => _mm_clmulepi64_si128(a, b, 0x01),
        clmul::PseudoOp::PCLMULLQHQDQ => _mm_clmulepi64_si128(a, b, 0x10),
        clmul::PseudoOp::PCLMULHQHQDQ => _mm_clmulepi64_si128(a, b, 0x11),
    }
}

#[cfg(test)]
mod tests {
    use crate::field::{
        backend::soft,
        clmul::{self, Clmul},
    };

    /// Verify the software CLMUL emulation matches the hardware intrinsic
    /// for a low-quadword x low-quadword multiply.
    #[test]
    fn vclmul_emulation() {
        let a: [u64; 2] = [0x00000000ada5f29b, 0];
        let b: [u64; 2] = [0x000000002d978a49, 0];
        let op = clmul::PseudoOp::from(0x00); // PCLMULLQLQDQ

        let hard_result: [u64; 2] = super::U64x2::from(a).clmul(b.into(), op).into();
        let soft_result: [u64; 2] = soft::U64x2::from(a).clmul(b.into(), op).into();

        assert_eq!(&hard_result, &soft_result);
    }
}
72 changes: 72 additions & 0 deletions polyval/src/field/backend/soft.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
//! Software emulation support for CLMUL hardware intrinsics.
//!
//! WARNING: Not constant time! Should be made constant-time or disabled by default.

use super::Xmm;
use field::clmul::{self, Clmul};
use core::ops::{BitXor, BitXorAssign};

/// 2 x `u64` values
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct U64x2([u64; 2]);

impl From<[u64; 2]> for U64x2 {
    // Wrap two 64-bit words ([0] = low quadword, matching the hardware
    // backend's lane order).
    fn from(array: [u64; 2]) -> U64x2 {
        U64x2(array)
    }
}

impl From<U64x2> for [u64; 2] {
    // Unwrap back into the two 64-bit words.
    fn from(u64x2: U64x2) -> [u64; 2] {
        u64x2.0
    }
}

impl BitXor for U64x2 {
type Output = Self;

fn bitxor(self, rhs: Self) -> Self::Output {
U64x2([self.0[0] ^ rhs.0[0], self.0[1] ^ rhs.0[1]])
}
}

impl BitXorAssign for U64x2 {
fn bitxor_assign(&mut self, rhs: Self) {
self.0[0] ^= rhs.0[0];
self.0[1] ^= rhs.0[1];
}
}

impl Clmul for U64x2 {
    // Software emulation of the PCLMULQDQ instruction: carry-less
    // multiplication of one 64-bit word of `self` by one 64-bit word of
    // `other`, selected by `imm`, yielding a 128-bit product.
    //
    // WARNING: the data-dependent branches below are not constant-time
    // (see the module-level warning above).
    fn clmul<I>(self, other: Self, imm: I) -> Self
    where
        I: Into<clmul::PseudoOp>,
    {
        // Select which quadword of each operand participates, mirroring the
        // PCLMULQDQ immediate encoding ([0] = low quadword, [1] = high).
        let (a, b) = match imm.into() {
            clmul::PseudoOp::PCLMULLQLQDQ => (self.0[0], other.0[0]),
            clmul::PseudoOp::PCLMULHQLQDQ => (self.0[1], other.0[0]),
            clmul::PseudoOp::PCLMULLQHQDQ => (self.0[0], other.0[1]),
            clmul::PseudoOp::PCLMULHQHQDQ => (self.0[1], other.0[1]),
        };

        // Bit-serial carry-less multiply. During the loop the 128-bit
        // accumulator is held with `result[1]` as the HIGH word: when bit
        // `i` of `b` is set, `a` is XORed in at bit position 64, then the
        // whole 128-bit value is shifted right one bit (the bit leaving
        // `result[1]` enters the top of `result[0]`). The term added at
        // iteration `i` is thus shifted right `64 - i` times, contributing
        // `a * x^i` to the product. After 64 iterations `result[0]` holds
        // the low word and `result[1]` the high word — the same lane order
        // as the hardware PCLMULQDQ result.
        let mut result = [0u64; 2];

        for i in 0..64 {
            if b & (1 << i) != 0 {
                result[1] ^= a;
            }

            result[0] >>= 1;

            if result[1] & 1 != 0 {
                result[0] ^= 1 << 63;
            }

            result[1] >>= 1;
        }

        result.into()
    }
}

// Use the portable default implementations of the `Xmm` helper methods.
impl Xmm for U64x2 {}
Loading

0 comments on commit 16999f5

Please sign in to comment.