diff --git a/CHANGELOG.md b/CHANGELOG.md
index d853eb161..6e33106af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,8 +27,10 @@
 - #100 (ark-ff) Implement `batch_inverse_and_mul`
 - #101 (ark-ff) Add `element(i: usize)` on the `Domain` trait.
 - #107 (ark-serialize) Add an impl of `CanonicalSerialize/Deserialize` for `BTreeSet`.
+- #114 (ark-poly) Significantly speed up and reduce the memory usage of `DensePolynomial.evaluate`.
 - #115 (ark-poly) Add parallel implementation to operations on `Evaluations`.
 - #115 (ark-ff) Add parallel implementation of `batch_inversion`.
+
 ### Bug fixes
 - #36 (ark-ec) In Short-Weierstrass curves, include an infinity bit in `ToConstraintField`.
 - #107 (ark-serialize) Fix handling of `(de)serialize_uncompressed/unchecked` in various impls of `CanonicalSerialize/Deserialize`.
diff --git a/poly-benches/Cargo.toml b/poly-benches/Cargo.toml
index 1ba153e5e..1aa1197a1 100644
--- a/poly-benches/Cargo.toml
+++ b/poly-benches/Cargo.toml
@@ -8,14 +8,24 @@
 license = "MIT/Apache-2.0"
 edition = "2018"
 publish = false
-[dev-dependencies]
+[dependencies]
 ark-poly = { path = "../poly" }
 ark-ff = { path = "../ff" }
-ark-test-curves = { path = "../test-curves", default-features = false, features = [ "bls12_381_scalar_field" ] }
+ark-test-curves = { path = "../test-curves", default-features = false, features = [ "bls12_381_scalar_field", "mnt4_753_curve" ] }
 criterion = "0.3.1"
 rand = "0.7"
+rayon = { version = "1", optional = true }
+
+[features]
+default = []
+parallel = [ "ark-ff/parallel", "rayon", "ark-poly/parallel" ]
 
 [[bench]]
-name = "fft"
-path = "benches/fft.rs"
+name = "groth16_fft"
+path = "benches/groth16_fft.rs"
 harness = false
+
+[[bench]]
+name = "dense_polynomial"
+path = "benches/dense_polynomial.rs"
+harness = false
\ No newline at end of file
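With the `[features]` and `[[bench]]` tables above in place, the new benchmark and its parallel variant can be run with standard Cargo flags; a minimal invocation sketch (package and bench names taken from this manifest):

```sh
# Serial run of the new benchmark
cargo bench -p poly-benches --bench dense_polynomial

# Parallel run, enabling the rayon-backed code paths
cargo bench -p poly-benches --features parallel --bench dense_polynomial
```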
diff --git a/poly-benches/benches/dense_polynomial.rs b/poly-benches/benches/dense_polynomial.rs
new file mode 100644
index 000000000..c819864a0
--- /dev/null
+++ b/poly-benches/benches/dense_polynomial.rs
@@ -0,0 +1,41 @@
+use ark_ff::Field;
+use ark_poly::{polynomial::univariate::DensePolynomial, Polynomial, UVPolynomial};
+use ark_test_curves::bls12_381::Fr as bls12_381_fr;
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+
+const POLY_LOG_MIN_SIZE: usize = 15;
+const POLY_EVALUATE_MAX_DEGREE: usize = 1 << 17;
+
+// Returns vec![2^{POLY_LOG_MIN_SIZE}, ..., 2^n], where n = ceil(log_2(max_degree)).
+fn size_range(max_degree: usize) -> Vec<usize> {
+    let mut to_ret = vec![1 << POLY_LOG_MIN_SIZE];
+    while *to_ret.last().unwrap() < max_degree {
+        to_ret.push(to_ret.last().unwrap() * 2);
+    }
+    to_ret
+}
+
+fn bench_poly_evaluate<F: Field>(c: &mut Criterion, name: &'static str) {
+    let mut group = c.benchmark_group(format!("{:?} - evaluate_polynomial", name));
+    for degree in size_range(POLY_EVALUATE_MAX_DEGREE).iter() {
+        group.bench_with_input(BenchmarkId::from_parameter(degree), degree, |b, &degree| {
+            // Per-benchmark setup: sample a random polynomial of the given degree.
+            let mut rng = &mut rand::thread_rng();
+            let poly = DensePolynomial::<F>::rand(degree, &mut rng);
+            b.iter(|| {
+                // Per-iteration work: evaluate at a fresh random point.
+                let pt = F::rand(&mut rng);
+                poly.evaluate(&pt);
+            });
+        });
+    }
+    group.finish();
+}
+
+fn bench_bls12_381(c: &mut Criterion) {
+    let name = "bls12_381";
+    bench_poly_evaluate::<bls12_381_fr>(c, name);
+}
+
+criterion_group!(benches, bench_bls12_381);
+criterion_main!(benches);
diff --git a/poly-benches/benches/fft.rs b/poly-benches/benches/groth16_fft.rs
similarity index 100%
rename from poly-benches/benches/fft.rs
rename to poly-benches/benches/groth16_fft.rs
diff --git a/poly/src/polynomial/univariate/dense.rs b/poly/src/polynomial/univariate/dense.rs
index d807b6b94..0e4e880c9 100644
--- a/poly/src/polynomial/univariate/dense.rs
+++ b/poly/src/polynomial/univariate/dense.rs
@@ -12,6 +12,8 @@ use ark_std::{
 use ark_ff::{FftField, Field, Zero};
 use rand::Rng;
 
+#[cfg(feature = "parallel")]
+use ark_std::cmp::max;
 #[cfg(feature = "parallel")]
 use rayon::prelude::*;
 
@@ -39,17 +41,64 @@ impl<F: Field> Polynomial<F> for DensePolynomial<F> {
     fn evaluate(&self, point: &F) -> F {
         if self.is_zero() {
             return F::zero();
+        } else if point.is_zero() {
+            return self.coeffs[0];
         }
-        let mut powers_of_point = vec![F::one()];
-        let mut cur = *point;
-        for _ in 0..self.degree() {
-            powers_of_point.push(cur);
-            cur *= point;
+        self.internal_evaluate(point)
+    }
+}
+
+// Set some minimum number of field elements to be processed per thread
+// to avoid per-thread costs dominating the parallel execution time.
+#[cfg(feature = "parallel")]
+const MIN_ELEMENTS_PER_THREAD: usize = 16;
+
+impl<F: Field> DensePolynomial<F> {
+    // Horner's method for polynomial evaluation.
+    #[inline]
+    fn horner_evaluate(poly_coeffs: &[F], point: &F) -> F {
+        let mut result = F::zero();
+        let num_coeffs = poly_coeffs.len();
+        for i in (0..num_coeffs).rev() {
+            result *= point;
+            result += poly_coeffs[i];
         }
-        assert_eq!(powers_of_point.len(), self.coeffs.len());
-        ark_std::cfg_into_iter!(powers_of_point)
-            .zip(&self.coeffs)
-            .map(|(power, coeff)| power * coeff)
-            .sum()
+        result
     }
+
+    #[cfg(not(feature = "parallel"))]
+    fn internal_evaluate(&self, point: &F) -> F {
+        Self::horner_evaluate(&self.coeffs, point)
+    }
+
+    #[cfg(feature = "parallel")]
+    fn internal_evaluate(&self, point: &F) -> F {
+        // Parallel Horner's method: compute the number of available threads,
+        // then the number of coefficients each thread should handle.
+        let num_cpus_available = rayon::current_num_threads();
+        let num_coeffs = self.coeffs.len();
+        let num_elem_per_thread = max(num_coeffs / num_cpus_available, MIN_ELEMENTS_PER_THREAD);
+
+        // Run Horner's method on each thread as follows:
+        // 1) Split the coefficients into evenly-sized chunks, one per thread.
+        // 2) Evaluate each thread's chunk of coefficients via Horner's method.
+        // 3) Scale the result by point^{index of the chunk's first coefficient}.
+        // The final evaluation is then the sum of the per-thread results.
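+        //
+        // As a concrete illustration of step 3 (illustrative numbers, not part
+        // of the original patch): with chunk size k and three chunks,
+        //   p(x) = h_0(x) + x^k * h_1(x) + x^{2k} * h_2(x),
+        // where h_i is the plain Horner evaluation of chunk i.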
+        let result = self
+            .coeffs
+            .par_chunks(num_elem_per_thread)
+            .enumerate()
+            .map(|(i, chunk)| {
+                let mut thread_result = Self::horner_evaluate(chunk, point);
+                thread_result *= point.pow(&[(i * num_elem_per_thread) as u64]);
+                thread_result
+            })
+            .sum();
+        result
+    }
 }
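As a quick cross-check of the chunked evaluation above, here is a minimal sequential sketch of the identity the parallel path relies on. It is not part of the patch; the `horner` helper and the degree/chunk-size constants are illustrative, and it uses only APIs that already appear in this diff (`DensePolynomial::rand`, `evaluate`, `pow`, the public `coeffs` field):

```rust
// Sanity-check sketch: chunk-wise Horner evaluation with per-chunk scaling
// must agree with DensePolynomial::evaluate. Illustrative, not part of the patch.
use ark_ff::{Field, UniformRand, Zero};
use ark_poly::{polynomial::univariate::DensePolynomial, Polynomial, UVPolynomial};
use ark_test_curves::bls12_381::Fr;

// Plain sequential Horner, mirroring `horner_evaluate` in the patch.
fn horner(coeffs: &[Fr], x: &Fr) -> Fr {
    coeffs.iter().rfold(Fr::zero(), |acc, c| acc * x + c)
}

fn main() {
    let mut rng = rand::thread_rng();
    let poly = DensePolynomial::<Fr>::rand(100, &mut rng);
    let x = Fr::rand(&mut rng);
    let chunk_size = 16; // stands in for num_elem_per_thread
    // p(x) = sum_i x^{i * chunk_size} * horner(chunk_i, x)
    let chunked: Fr = poly
        .coeffs
        .chunks(chunk_size)
        .enumerate()
        .map(|(i, chunk)| horner(chunk, &x) * x.pow(&[(i * chunk_size) as u64]))
        .sum();
    assert_eq!(chunked, poly.evaluate(&x));
}
```

Because this identity holds for any chunk size of at least 1, `MIN_ELEMENTS_PER_THREAD` only affects performance, never correctness.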