diff --git a/CHANGELOG.md b/CHANGELOG.md
index d853eb161..6e33106af 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -27,8 +27,10 @@
 - #100 (ark-ff) Implement `batch_inverse_and_mul`
 - #101 (ark-ff) Add `element(i: usize)` on the `Domain` trait.
 - #107 (ark-serialize) Add an impl of `CanonicalSerialize/Deserialize` for `BTreeSet`.
+- #114 (ark-poly) Significantly speed up and reduce the memory usage of `DensePolynomial.evaluate`.
 - #115 (ark-poly) Add parallel implementation to operations on `Evaluations`.
 - #115 (ark-ff) Add parallel implementation of `batch_inversion`.
+
 ### Bug fixes
 - #36 (ark-ec) In Short-Weierstrass curves, include an infinity bit in `ToConstraintField`.
 - #107 (ark-serialize) Fix handling of `(de)serialize_uncompressed/unchecked` in various impls of `CanonicalSerialize/Deserialize`.
diff --git a/poly-benches/Cargo.toml b/poly-benches/Cargo.toml
index 1ba153e5e..1aa1197a1 100644
--- a/poly-benches/Cargo.toml
+++ b/poly-benches/Cargo.toml
@@ -8,14 +8,24 @@
 license = "MIT/Apache-2.0"
 edition = "2018"
 publish = false
-[dev-dependencies]
+[dependencies]
 ark-poly = { path = "../poly" }
 ark-ff = { path = "../ff" }
-ark-test-curves = { path = "../test-curves", default-features = false, features = [ "bls12_381_scalar_field" ] }
+ark-test-curves = { path = "../test-curves", default-features = false, features = [ "bls12_381_scalar_field", "mnt4_753_curve" ] }
 criterion = "0.3.1"
 rand = "0.7"
+rayon = { version = "1", optional = true }
+
+[features]
+default = []
+parallel = [ "ark-ff/parallel", "rayon", "ark-poly/parallel" ]
 
 [[bench]]
-name = "fft"
-path = "benches/fft.rs"
+name = "groth16_fft"
+path = "benches/groth16_fft.rs"
 harness = false
+
+[[bench]]
+name = "dense_polynomial"
+path = "benches/dense_polynomial.rs"
+harness = false
\ No newline at end of file
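With the `[features]` and `[[bench]]` tables above in place, the new benchmark and its parallel variant can be run with standard Cargo flags; a minimal invocation sketch (package and bench names taken from this manifest):

```sh
# Serial run of the new benchmark
cargo bench -p poly-benches --bench dense_polynomial

# Parallel run, enabling the rayon-backed code paths
cargo bench -p poly-benches --features parallel --bench dense_polynomial
```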
diff --git a/poly-benches/benches/dense_polynomial.rs b/poly-benches/benches/dense_polynomial.rs
new file mode 100644
index 000000000..c819864a0
--- /dev/null
+++ b/poly-benches/benches/dense_polynomial.rs
@@ -0,0 +1,41 @@
+use ark_ff::Field;
+use ark_poly::{polynomial::univariate::DensePolynomial, Polynomial, UVPolynomial};
+use ark_test_curves::bls12_381::Fr as bls12_381_fr;
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+
+const POLY_LOG_MIN_SIZE: usize = 15;
+const POLY_EVALUATE_MAX_DEGREE: usize = 1 << 17;
+
+// Returns vec![2^{POLY_LOG_MIN_SIZE}, ..., 2^n], where n = ceil(log_2(max_degree)).
+fn size_range(max_degree: usize) -> Vec<usize> {
+    let mut to_ret = vec![1 << POLY_LOG_MIN_SIZE];
+    while *to_ret.last().unwrap() < max_degree {
+        to_ret.push(to_ret.last().unwrap() * 2);
+    }
+    to_ret
+}
+
+fn bench_poly_evaluate<F: Field>(c: &mut Criterion, name: &'static str) {
+    let mut group = c.benchmark_group(format!("{:?} - evaluate_polynomial", name));
+    for degree in size_range(POLY_EVALUATE_MAX_DEGREE).iter() {
+        group.bench_with_input(BenchmarkId::from_parameter(degree), degree, |b, &degree| {
+            // Per-benchmark setup: sample a random polynomial of the given degree.
+            let mut rng = &mut rand::thread_rng();
+            let poly = DensePolynomial::<F>::rand(degree, &mut rng);
+            b.iter(|| {
+                // Per-iteration work: evaluate at a fresh random point.
+                let pt = F::rand(&mut rng);
+                poly.evaluate(&pt);
+            });
+        });
+    }
+    group.finish();
+}
+
+fn bench_bls12_381(c: &mut Criterion) {
+    let name = "bls12_381";
+    bench_poly_evaluate::<bls12_381_fr>(c, name);
+}
+
+criterion_group!(benches, bench_bls12_381);
+criterion_main!(benches);
diff --git a/poly-benches/benches/fft.rs b/poly-benches/benches/groth16_fft.rs
similarity index 100%
rename from poly-benches/benches/fft.rs
rename to poly-benches/benches/groth16_fft.rs
diff --git a/poly/src/polynomial/univariate/dense.rs b/poly/src/polynomial/univariate/dense.rs
index d807b6b94..0e4e880c9 100644
--- a/poly/src/polynomial/univariate/dense.rs
+++ b/poly/src/polynomial/univariate/dense.rs
@@ -12,6 +12,8 @@ use ark_std::{
 use ark_ff::{FftField, Field, Zero};
 use rand::Rng;
 
+#[cfg(feature = "parallel")]
+use ark_std::cmp::max;
 #[cfg(feature = "parallel")]
 use rayon::prelude::*;
 
@@ -39,17 +41,64 @@ impl<F: Field> Polynomial<F> for DensePolynomial<F> {
     fn evaluate(&self, point: &F) -> F {
         if self.is_zero() {
             return F::zero();
+        } else if point.is_zero() {
+            return self.coeffs[0];
         }
-        let mut powers_of_point = vec![F::one()];
-        let mut cur = *point;
-        for _ in 0..self.degree() {
-            powers_of_point.push(cur);
-            cur *= point;
+        self.internal_evaluate(point)
+    }
+}
+
+// Set some minimum number of field elements to be processed per thread
+// to avoid per-thread costs dominating the parallel execution time.
+#[cfg(feature = "parallel")]
+const MIN_ELEMENTS_PER_THREAD: usize = 16;
+
+impl<F: Field> DensePolynomial<F> {
+    // Horner's method for polynomial evaluation.
+    #[inline]
+    fn horner_evaluate(poly_coeffs: &[F], point: &F) -> F {
+        let mut result = F::zero();
+        let num_coeffs = poly_coeffs.len();
+        for i in (0..num_coeffs).rev() {
+            result *= point;
+            result += poly_coeffs[i];
         }
-        assert_eq!(powers_of_point.len(), self.coeffs.len());
-        ark_std::cfg_into_iter!(powers_of_point)
-            .zip(&self.coeffs)
-            .map(|(power, coeff)| power * coeff)
-            .sum()
+        result
     }
+
+    #[cfg(not(feature = "parallel"))]
+    fn internal_evaluate(&self, point: &F) -> F {
+        Self::horner_evaluate(&self.coeffs, point)
+    }
+
+    #[cfg(feature = "parallel")]
+    fn internal_evaluate(&self, point: &F) -> F {
+        // Parallel Horner's method: compute the number of available threads,
+        // then the number of coefficients each thread should handle.
+        let num_cpus_available = rayon::current_num_threads();
+        let num_coeffs = self.coeffs.len();
+        let num_elem_per_thread = max(num_coeffs / num_cpus_available, MIN_ELEMENTS_PER_THREAD);
+
+        // Run Horner's method on each thread as follows:
+        // 1) Split the coefficients into evenly-sized chunks, one per thread.
+        // 2) Evaluate each thread's chunk of coefficients via Horner's method.
+        // 3) Scale the result by point^{index of the chunk's first coefficient}.
+        // The final evaluation is then the sum of the per-thread results.
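+        //
+        // As a concrete illustration of step 3 (illustrative numbers, not part
+        // of the original patch): with chunk size k and three chunks,
+        //   p(x) = h_0(x) + x^k * h_1(x) + x^{2k} * h_2(x),
+        // where h_i is the plain Horner evaluation of chunk i.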
+        let result = self
+            .coeffs
+            .par_chunks(num_elem_per_thread)
+            .enumerate()
+            .map(|(i, chunk)| {
+                let mut thread_result = Self::horner_evaluate(chunk, point);
+                thread_result *= point.pow(&[(i * num_elem_per_thread) as u64]);
+                thread_result
+            })
+            .sum();
+        result
+    }
 }
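As a quick cross-check of the chunked evaluation above, here is a minimal sequential sketch of the identity the parallel path relies on. It is not part of the patch; the `horner` helper and the degree/chunk-size constants are illustrative, and it uses only APIs that already appear in this diff (`DensePolynomial::rand`, `evaluate`, `pow`, the public `coeffs` field):

```rust
// Sanity-check sketch: chunk-wise Horner evaluation with per-chunk scaling
// must agree with DensePolynomial::evaluate. Illustrative, not part of the patch.
use ark_ff::{Field, UniformRand, Zero};
use ark_poly::{polynomial::univariate::DensePolynomial, Polynomial, UVPolynomial};
use ark_test_curves::bls12_381::Fr;

// Plain sequential Horner, mirroring `horner_evaluate` in the patch.
fn horner(coeffs: &[Fr], x: &Fr) -> Fr {
    coeffs.iter().rfold(Fr::zero(), |acc, c| acc * x + c)
}

fn main() {
    let mut rng = rand::thread_rng();
    let poly = DensePolynomial::<Fr>::rand(100, &mut rng);
    let x = Fr::rand(&mut rng);
    let chunk_size = 16; // stands in for num_elem_per_thread
    // p(x) = sum_i x^{i * chunk_size} * horner(chunk_i, x)
    let chunked: Fr = poly
        .coeffs
        .chunks(chunk_size)
        .enumerate()
        .map(|(i, chunk)| horner(chunk, &x) * x.pow(&[(i * chunk_size) as u64]))
        .sum();
    assert_eq!(chunked, poly.evaluate(&x));
}
```

Because this identity holds for any chunk size of at least 1, `MIN_ELEMENTS_PER_THREAD` only affects performance, never correctness.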