
Add polynomial benchmark infra, switch poly eval to Horner's method #114

Merged
13 commits merged on Dec 7, 2020
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -27,8 +27,10 @@
- #100 (ark-ff) Implement `batch_inverse_and_mul`
- #101 (ark-ff) Add `element(i: usize)` on the `Domain` trait.
- #107 (ark-serialize) Add an impl of `CanonicalSerialize/Deserialize` for `BTreeSet`.
- #114 (ark-poly) Significantly speed up and reduce memory usage of `DensePolynomial.evaluate`.
- #115 (ark-poly) Add parallel implementation to operations on `Evaluations`.
- #115 (ark-ff) Add parallel implementation of `batch_inversion`.

### Bug fixes
- #36 (ark-ec) In Short-Weierstrass curves, include an infinity bit in `ToConstraintField`.
- #107 (ark-serialize) Fix handling of `(de)serialize_uncompressed/unchecked` in various impls of `CanonicalSerialize/Deserialize`.
18 changes: 14 additions & 4 deletions poly-benches/Cargo.toml
@@ -8,14 +8,24 @@ license = "MIT/Apache-2.0"
edition = "2018"
publish = false

-[dev-dependencies]
+[dependencies]
ark-poly = { path = "../poly" }
ark-ff = { path = "../ff" }
-ark-test-curves = { path = "../test-curves", default-features = false, features = [ "bls12_381_scalar_field" ] }
+ark-test-curves = { path = "../test-curves", default-features = false, features = [ "bls12_381_scalar_field", "mnt4_753_curve" ] }
criterion = "0.3.1"
rand = "0.7"
+rayon = { version = "1", optional = true }
+
+[features]
+default = []
+parallel = ["ark-ff/parallel", "rayon", "ark-poly/parallel" ]

[[bench]]
-name = "fft"
-path = "benches/fft.rs"
+name = "groth16_fft"
+path = "benches/groth16_fft.rs"
harness = false
+
+[[bench]]
+name = "dense_polynomial"
+path = "benches/dense_polynomial.rs"
+harness = false
47 changes: 47 additions & 0 deletions poly-benches/benches/dense_polynomial.rs
@@ -0,0 +1,47 @@
use rand;

extern crate criterion;

use ark_ff::Field;
use ark_poly::{polynomial::univariate::DensePolynomial, Polynomial, UVPolynomial};
use ark_test_curves::bls12_381::Fr as bls12_381_fr;
use criterion::BenchmarkId;
use criterion::Criterion;
use criterion::{criterion_group, criterion_main};

const POLY_LOG_MIN_SIZE: usize = 15;
const POLY_EVALUATE_MAX_DEGREE: usize = 1 << 17;

// returns vec![2^{POLY_LOG_MIN_SIZE}, ..., 2^n], where n = ceil(log_2(max_degree))
fn size_range(max_degree: usize) -> Vec<usize> {
    let mut to_ret = vec![1 << POLY_LOG_MIN_SIZE];
    while *to_ret.last().unwrap() < max_degree {
        to_ret.push(to_ret.last().unwrap() * 2);
    }
    to_ret
}

fn bench_poly_evaluate<F: Field>(c: &mut Criterion, name: &'static str) {
    let mut group = c.benchmark_group(format!("{:?} - evaluate_polynomial", name));
    for degree in size_range(POLY_EVALUATE_MAX_DEGREE).iter() {
        group.bench_with_input(BenchmarkId::from_parameter(degree), degree, |b, &degree| {
            // Per benchmark setup
            let mut rng = &mut rand::thread_rng();
            let poly = DensePolynomial::<F>::rand(degree, &mut rng);
            b.iter(|| {
                // Per benchmark iteration
                let pt = F::rand(&mut rng);
                poly.evaluate(&pt);
            });
        });
    }
    group.finish();
}

fn bench_bls12_381(c: &mut Criterion) {
    let name = "bls12_381";
    bench_poly_evaluate::<bls12_381_fr>(c, name);
}

criterion_group!(benches, bench_bls12_381);
criterion_main!(benches);
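As a quick cross-check of the switch to Horner's method, the benchmark above could be paired with a small test comparing `evaluate` against a naive power-sum evaluation. This is a minimal sketch, not part of the PR; it assumes `ark_ff` exposes `UniformRand` and that `DensePolynomial`'s `coeffs` field is public, as it is in the diff below:

#[cfg(test)]
mod sanity_check {
    use ark_ff::{Field, UniformRand};
    use ark_poly::{polynomial::univariate::DensePolynomial, Polynomial, UVPolynomial};
    use ark_test_curves::bls12_381::Fr;

    #[test]
    fn horner_matches_naive_evaluation() {
        let mut rng = rand::thread_rng();
        // Random degree-100 polynomial over the BLS12-381 scalar field.
        let poly = DensePolynomial::<Fr>::rand(100, &mut rng);
        let pt = Fr::rand(&mut rng);
        // Naive evaluation: sum_i coeffs[i] * pt^i.
        let naive: Fr = poly
            .coeffs
            .iter()
            .enumerate()
            .map(|(i, c)| *c * pt.pow(&[i as u64]))
            .sum();
        assert_eq!(poly.evaluate(&pt), naive);
    }
}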
poly-benches/benches/{fft.rs → groth16_fft.rs}: file renamed without changes.
64 changes: 54 additions & 10 deletions poly/src/polynomial/univariate/dense.rs
@@ -12,6 +12,8 @@ use ark_std::{
use ark_ff::{FftField, Field, Zero};
use rand::Rng;

+#[cfg(feature = "parallel")]
+use ark_std::cmp::max;
#[cfg(feature = "parallel")]
use rayon::prelude::*;

@@ -39,18 +41,60 @@ impl<F: Field> Polynomial<F> for DensePolynomial<F> {
    fn evaluate(&self, point: &F) -> F {
        if self.is_zero() {
            return F::zero();
+        } else if point.is_zero() {
+            return self.coeffs[0];
        }
-        let mut powers_of_point = vec![F::one()];
-        let mut cur = *point;
-        for _ in 0..self.degree() {
-            powers_of_point.push(cur);
-            cur *= point;
-        }
-        assert_eq!(powers_of_point.len(), self.coeffs.len());
-        ark_std::cfg_into_iter!(powers_of_point)
-            .zip(&self.coeffs)
-            .map(|(power, coeff)| power * coeff)
-            .sum()
+        self.internal_evaluate(point)
    }
}

+#[cfg(feature = "parallel")]
+// Set some minimum number of field elements to be worked on per thread
+// to avoid per-thread costs dominating parallel execution time.
+const MIN_ELEMENTS_PER_THREAD: usize = 16;
+
+impl<F: Field> DensePolynomial<F> {
+    #[inline]
+    // Horner's method for polynomial evaluation
+    fn horner_evaluate(poly_coeffs: &[F], point: &F) -> F {
+        let mut result = F::zero();
+        let num_coeffs = poly_coeffs.len();
+        for i in (0..num_coeffs).rev() {
+            result *= point;
+            result += poly_coeffs[i];
+        }
+        result
+    }
+
+    #[cfg(not(feature = "parallel"))]
+    fn internal_evaluate(&self, point: &F) -> F {
+        Self::horner_evaluate(&self.coeffs, point)
+    }
+
+    #[cfg(feature = "parallel")]
+    fn internal_evaluate(&self, point: &F) -> F {
+        // Horner's method - parallel version.
+        // Compute the number of threads we will be using.
+        let num_cpus_available = rayon::current_num_threads();
+        let num_coeffs = self.coeffs.len();
+        let num_elem_per_thread = max(num_coeffs / num_cpus_available, MIN_ELEMENTS_PER_THREAD);
+
+        // Run Horner's method on each thread as follows:
+        // 1) Split up the coefficients evenly across the threads.
+        // 2) Do polynomial evaluation via Horner's method on each thread's coefficients.
+        // 3) Scale each thread's result by point^{thread coefficient start index}.
+        // Then obtain the final polynomial evaluation by summing each thread's result.
+        let result = self
+            .coeffs
+            .par_chunks(num_elem_per_thread)
+            .enumerate()
+            .map(|(i, chunk)| {
+                let mut thread_result = Self::horner_evaluate(&chunk, point);
+                thread_result *= point.pow(&[(i * num_elem_per_thread) as u64]);
+                thread_result
+            })
+            .sum();
+        result
+    }
+}

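For intuition on the parallel path above: splitting the coefficients into chunks of size m and scaling each chunk's Horner evaluation by point^(chunk start index) relies on the identity p(x) = sum over chunks j of x^(j*m) * p_j(x), where p_j is the polynomial formed by the j-th chunk of coefficients. Below is a small self-contained sketch (illustrative only, using plain integers rather than field elements) checking that chunked Horner evaluation agrees with ordinary Horner evaluation; the same identity is why the serial and parallel `internal_evaluate` paths return identical results.

// Ordinary Horner evaluation: result = (...((c_{n-1})*x + c_{n-2})*x + ...)*x + c_0.
fn horner(coeffs: &[i128], x: i128) -> i128 {
    coeffs.iter().rev().fold(0, |acc, c| acc * x + c)
}

// Chunked evaluation mirroring the parallel path: evaluate each chunk with Horner,
// then scale by x^(chunk start index) and sum the per-chunk results.
fn chunked_horner(coeffs: &[i128], x: i128, chunk_size: usize) -> i128 {
    coeffs
        .chunks(chunk_size)
        .enumerate()
        .map(|(j, chunk)| horner(chunk, x) * x.pow((j * chunk_size) as u32))
        .sum()
}

fn main() {
    // p(x) = 1 + 2x + 3x^2 + 4x^3 + 5x^4, evaluated at x = 3.
    let coeffs = [1, 2, 3, 4, 5];
    assert_eq!(horner(&coeffs, 3), chunked_horner(&coeffs, 3, 2)); // both are 547
}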