From 854c0abece524a114b1e442213fce2af1f11aefe Mon Sep 17 00:00:00 2001
From: Michael Zhu <mchl.zhu.96@gmail.com>
Date: Thu, 7 Nov 2024 17:21:55 -0500
Subject: [PATCH] Add comments

---
 jolt-core/Cargo.toml                          |   2 +-
 .../benches/{polynomial.rs => binding.rs}     |   0
 jolt-core/src/jolt/instruction/div.rs         |   4 +-
 jolt-core/src/jolt/vm/instruction_lookups.rs  |  40 +++++-
 jolt-core/src/jolt/vm/mod.rs                  |   7 +
 jolt-core/src/lasso/memory_checking.rs        |  14 +-
 jolt-core/src/lib.rs                          |   2 -
 jolt-core/src/poly/dense_interleaved_poly.rs  |  67 +++++++---
 jolt-core/src/poly/sparse_interleaved_poly.rs | 121 ++++++++++++++----
 jolt-core/src/subprotocols/grand_product.rs   |  30 +++--
 .../src/subprotocols/grand_product_quarks.rs  |  64 ++++++---
 .../src/subprotocols/sparse_grand_product.rs  |  84 ++++++++++--
 jolt-core/src/utils/sol_types.rs              |   2 +-
 13 files changed, 347 insertions(+), 90 deletions(-)
 rename jolt-core/benches/{polynomial.rs => binding.rs} (100%)
diff --git a/jolt-core/Cargo.toml b/jolt-core/Cargo.toml
index 15aedbd09..a08331b22 100644
--- a/jolt-core/Cargo.toml
+++ b/jolt-core/Cargo.toml
@@ -87,7 +87,7 @@ name = "commit"
 harness = false
 
 [[bench]]
-name = "polynomial"
+name = "binding"
 harness = false
 
 [[bench]]
diff --git a/jolt-core/benches/polynomial.rs b/jolt-core/benches/binding.rs
similarity index 100%
rename from jolt-core/benches/polynomial.rs
rename to jolt-core/benches/binding.rs
diff --git a/jolt-core/src/jolt/instruction/div.rs b/jolt-core/src/jolt/instruction/div.rs
index 616f17d85..9b7e7e601 100644
--- a/jolt-core/src/jolt/instruction/div.rs
+++ b/jolt-core/src/jolt/instruction/div.rs
@@ -76,7 +76,7 @@ impl<const WORD_SIZE: usize> VirtualInstructionSequence for DIVInstruction<WORD_
                 rd_post_val: Some(q),
             },
             memory_state: None,
-            advice_value: Some(quotient), // What should advice value be here?
+            advice_value: Some(quotient),
         });
 
         let r = ADVICEInstruction::<WORD_SIZE>(remainder).lookup_entry();
@@ -96,7 +96,7 @@ impl<const WORD_SIZE: usize> VirtualInstructionSequence for DIVInstruction<WORD_
                 rd_post_val: Some(r),
             },
             memory_state: None,
-            advice_value: Some(remainder), // What should advice value be here?
+            advice_value: Some(remainder),
         });
 
         let is_valid: u64 = AssertValidSignedRemainderInstruction::<WORD_SIZE>(r, y).lookup_entry();
diff --git a/jolt-core/src/jolt/vm/instruction_lookups.rs b/jolt-core/src/jolt/vm/instruction_lookups.rs
index abb954dce..846929bc1 100644
--- a/jolt-core/src/jolt/vm/instruction_lookups.rs
+++ b/jolt-core/src/jolt/vm/instruction_lookups.rs
@@ -256,6 +256,7 @@ where
             (memory_flags, read_write_leaves),
             (
                 init_final_leaves,
+                // # init = # subtables; # final = # memories
                 Self::NUM_SUBTABLES + preprocessing.num_memories,
             ),
         )
@@ -447,8 +448,10 @@ where
             .collect()
     }
 
-    /// Checks that the claimed multiset hashes (output by grand product) are consistent with the
-    /// openings given by `read_write_openings` and `init_final_openings`.
+    /// Checks that the claims output by the grand products are consistent with the openings of
+    /// the polynomials comprising the input layers.
+    ///
+    /// The read-write grand product is toggled by flags; the init-final grand product is not.
     fn check_fingerprints(
         preprocessing: &Self::Preprocessing,
         read_write_claim: F,
@@ -486,22 +489,55 @@ where
             .iter()
             .map(|tuple| tuple.3.unwrap())
             .collect();
+        // For the toggled grand product, the flags in the input layer are padded with 1s,
+        // while the fingerprints are padded with 0s, so that all subsequent padding layers
+        // are all 0s.
+        // To see why this is the case, observe that the input layer's gates will output
+        // flag * fingerprint + 1 - flag = 1 * 0 + 1 - 1 = 0.
+        // Then all subsequent layers will output gate values 0 * 0 = 0.
         read_write_flags.resize(read_write_flags.len().next_power_of_two(), F::one());
+
+        // Let r' := r_read_write_batch_index
+        // and r'':= r_read_write_opening.
+        //
+        // Let k denote the batch size.
+        //
+        // The `read_write_flags` vector above contains the evaluations of the k individual
+        // flag MLEs at r''.
+        //
+        // What we want to compute is the evaluation of the MLE of ALL the flags, concatenated together,
+        // at (r', r''):
+        //
+        // flags(r', r'') = \sum_j eq(r', j) * flag_j(r'')
+        //
+        // where flag_j(r'') is what we already have in `read_write_flags`.
         let combined_flags: F = read_write_flags
             .iter()
             .zip(EqPolynomial::evals(r_read_write_batch_index).iter())
             .map(|(flag, eq_eval)| *flag * eq_eval)
             .sum();
+        // Similar thing for the fingerprints:
+        //
+        // fingerprints(r', r'') = \sum_j eq(r', j) * (t_j(r'') * \gamma^2 + v_j(r'') * \gamma + a_j(r'') - \tau)
         let combined_read_write_fingerprint: F = read_write_tuples
             .iter()
             .zip(EqPolynomial::evals(r_read_write_batch_index).iter())
             .map(|(tuple, eq_eval)| Self::fingerprint(tuple, gamma, tau) * eq_eval)
             .sum();
+
+        // Now we combine flags(r', r'') and fingerprints(r', r'') to obtain the evaluation of the
+        // multi-*quadratic* extension W of the input layer at (r', r'')
+        //
+        // W(r', r'') = flags(r', r'') * fingerprints(r', r'') + 1 - flags(r', r'')
+        //
+        // and this should equal the claim output by the read-write grand product.
         assert_eq!(
             combined_flags * combined_read_write_fingerprint + F::one() - combined_flags,
             read_write_claim
         );
 
+        // The init-final grand product isn't toggled using flags (it's just a "normal" grand product)
+        // so we combine the openings the normal way.
         let combined_init_final_fingerprint: F = init_final_tuples
             .iter()
             .zip(EqPolynomial::evals(r_init_final_batch_index).iter())
diff --git a/jolt-core/src/jolt/vm/mod.rs b/jolt-core/src/jolt/vm/mod.rs
index 11f224f80..b1aac8825 100644
--- a/jolt-core/src/jolt/vm/mod.rs
+++ b/jolt-core/src/jolt/vm/mod.rs
@@ -245,6 +245,12 @@ impl<F: JoltField> JoltPolynomials<F> {
             .zip(trace_comitments.into_iter())
             .for_each(|(dest, src)| *dest = src);
 
+        println!(
+            "# commitments: {} + {}",
+            commitments.read_write_values().len(),
+            commitments.init_final_values().len(),
+        );
+
         commitments.bytecode.t_final =
             PCS::commit(&self.bytecode.t_final, &preprocessing.generators);
         (
@@ -366,6 +372,7 @@ where
     ) {
         let trace_length = trace.len();
         let padded_trace_length = trace_length.next_power_of_two();
+        println!("Trace length: {}", trace_length);
 
         JoltTraceStep::pad(&mut trace);
 
diff --git a/jolt-core/src/lasso/memory_checking.rs b/jolt-core/src/lasso/memory_checking.rs
index 67cb07ecb..9e3fc121e 100644
--- a/jolt-core/src/lasso/memory_checking.rs
+++ b/jolt-core/src/lasso/memory_checking.rs
@@ -199,7 +199,7 @@ pub trait Initializable<T, Preprocessing>: StructuredPolynomialData<T> + Default
     }
 }
 
-// Empty struct to represent that no preprocessing data is used.
+/// Empty struct to represent that no preprocessing data is used.
 pub struct NoPreprocessing;
 
 pub trait MemoryCheckingProver<F, PCS, ProofTranscript>
@@ -254,6 +254,9 @@ where
         let init_final_batch_size =
             multiset_hashes.init_hashes.len() + multiset_hashes.final_hashes.len();
 
+        // For a batch size of k, the first log2(k) elements of `r_read_write`/`r_init_final`
+        // form the point at which the output layer's MLE is evaluated. The remaining elements
+        // then form the point at which the leaf layer's polynomials are evaluated.
         let (_, r_read_write_opening) =
             r_read_write.split_at(read_write_batch_size.next_power_of_two().log_2());
         let (_, r_init_final_opening) =
@@ -569,6 +572,9 @@ where
             transcript,
             Some(pcs_setup),
         );
+        // For a batch size of k, the first log2(k) elements of `r_read_write`/`r_init_final`
+        // form the point at which the output layer's MLE is evaluated. The remaining elements
+        // then form the point at which the leaf layer's polynomials are evaluated.
         let (r_read_write_batch_index, r_read_write_opening) =
             r_read_write.split_at(read_write_batch_size.next_power_of_two().log_2());
 
@@ -665,8 +671,8 @@ where
         exogenous_openings: &Self::ExogenousOpenings,
     ) -> Vec<Self::MemoryTuple>;
 
-    /// Checks that the claimed multiset hashes (output by grand product) are consistent with the
-    /// openings given by `read_write_openings` and `init_final_openings`.
+    /// Checks that the claims output by the grand products are consistent with the openings of
+    /// the polynomials comprising the input layers.
     fn check_fingerprints(
         preprocessing: &Self::Preprocessing,
         read_write_claim: F,
@@ -712,6 +718,8 @@ where
             r_init_final_batch_index.len().pow2()
         );
 
+        // `r_read_write_batch_index`/`r_init_final_batch_index` are used to
+        // combine the k claims in the batch into a single claim.
         let combined_read_write_hash: F = read_write_hashes
             .iter()
             .zip(EqPolynomial::evals(r_read_write_batch_index).iter())
diff --git a/jolt-core/src/lib.rs b/jolt-core/src/lib.rs
index 7c9091ab6..86559a07c 100644
--- a/jolt-core/src/lib.rs
+++ b/jolt-core/src/lib.rs
@@ -10,8 +10,6 @@
 #![allow(long_running_const_eval)]
 #![allow(clippy::len_without_is_empty)]
 #![allow(type_alias_bounds)]
-#![feature(coroutines)]
-#![feature(iter_from_coroutine)]
 
 #[cfg(feature = "host")]
 pub mod benches;
diff --git a/jolt-core/src/poly/dense_interleaved_poly.rs b/jolt-core/src/poly/dense_interleaved_poly.rs
index af5a5a48f..361fc5cfe 100644
--- a/jolt-core/src/poly/dense_interleaved_poly.rs
+++ b/jolt-core/src/poly/dense_interleaved_poly.rs
@@ -12,10 +12,26 @@ use rayon::{prelude::*, slice::Chunks};
 use super::dense_mlpoly::DensePolynomial;
 use super::{split_eq_poly::SplitEqPolynomial, unipoly::UniPoly};
 
+/// Represents a single layer of a grand product circuit.
+/// A layer is assumed to be arranged in "interleaved" order, i.e. the natural
+/// order in the visual representation of the circuit:
+///      Λ        Λ        Λ        Λ
+///     / \      / \      / \      /  \
+///   L0   R0  L1   R1  L2   R2  L3   R3   <- This layer would be represented as [L0, R0, L1, R1, L2, R2, L3, R3]
+///                                           (as opposed to e.g. [L0, L1, L2, L3, R0, R1, R2, R3])
 #[derive(Default, Debug, Clone)]
 pub struct DenseInterleavedPolynomial<F: JoltField> {
+    /// The coefficients for the "left" and "right" polynomials comprising a
+    /// dense grand product layer.
+    /// The coefficients are in interleaved order:
+    /// [L0, R0, L1, R1, L2, R2, L3, R3, ...]
     pub(crate) coeffs: Vec<F>,
+    /// The effective length of `coeffs`. When binding, we update this length
+    /// instead of truncating `coeffs`, which incurs the cost of dropping the
+    /// truncated values.
     len: usize,
+    /// A reused buffer where bound values are written to during `bind`.
+    /// With every bind, `coeffs` and `binding_scratch_space` are swapped.
     binding_scratch_space: Vec<F>,
 }
 
@@ -36,7 +52,7 @@ impl<F: JoltField> DenseInterleavedPolynomial<F> {
         Self {
             coeffs,
             len,
-            binding_scratch_space: unsafe_allocate_zero_vec(len),
+            binding_scratch_space: unsafe_allocate_zero_vec(len.next_multiple_of(4) / 2),
         }
     }
 
@@ -87,11 +103,8 @@ impl<F: JoltField> DenseInterleavedPolynomial<F> {
 }
 
 impl<F: JoltField> Bindable<F> for DenseInterleavedPolynomial<F> {
-    /// Incrementally binds a variable of this batched layer's polynomials.
-    /// Even though each layer is backed by a single Vec<F>, it represents two polynomials
-    /// one for the left nodes in the circuit, one for the right nodes in the circuit.
-    /// These two polynomials' coefficients are interleaved into one Vec<F>. To preserve
-    /// this interleaved order, we bind values like this:
+    /// Incrementally binds a variable of the interleaved left and right polynomials.
+    /// To preserve the interleaved order of coefficients, we bind values like this:
     ///   0'  1'     2'  3'
     ///   |\ |\      |\ |\
     ///   | \| \     | \| \
@@ -105,6 +118,9 @@ impl<F: JoltField> Bindable<F> for DenseInterleavedPolynomial<F> {
         let (mut left_before_binding, mut right_before_binding) = self.uninterleave();
 
         let padded_len = self.len.next_multiple_of(4);
+        // In order to parallelize binding while obeying Rust ownership rules, we
+        // must write to a different vector than we are reading from. `binding_scratch_space`
+        // serves this purpose.
         self.binding_scratch_space
             .par_chunks_mut(2)
             .zip(self.coeffs[..self.len].par_chunks(4))
@@ -121,6 +137,8 @@ impl<F: JoltField> Bindable<F> for DenseInterleavedPolynomial<F> {
             });
 
         self.len = padded_len / 2;
+        // Point `self.coeffs` to the bound coefficients; the buffer previously held by
+        // `self.coeffs` will serve as the binding scratch space in the next invocation of `bind`.
         std::mem::swap(&mut self.coeffs, &mut self.binding_scratch_space);
 
         #[cfg(test)]
@@ -155,13 +173,6 @@ pub fn bind_left_and_right<F: JoltField>(left: &mut Vec<F>, right: &mut Vec<F>,
     *right = right_poly.Z[..right.len() / 2].to_vec();
 }
 
-/// Represents a single layer of a batched grand product circuit.
-/// A layer is assumed to be arranged in "interleaved" order, i.e. the natural
-/// order in the visual representation of the circuit:
-///      Λ        Λ        Λ        Λ
-///     / \      / \      / \      / \
-///   L0   R0  L1   R1  L2   R2  L3   R3   <- This is layer would be represented as [L0, R0, L1, R1, L2, R2, L3, R3]
-///                                           (as opposed to e.g. [L0, L1, L2, L3, R0, R1, R2, R3])
 impl<F: JoltField, ProofTranscript: Transcript> BatchedGrandProductLayer<F, ProofTranscript>
     for DenseInterleavedPolynomial<F>
 {
@@ -190,15 +201,20 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
     ///
     /// Computing these evaluations requires processing pairs of adjacent coefficients of
     /// `eq`, `left`, and `right`.
-    /// Recall that the `left` and `right` polynomials are interleaved in each layer of `self.layers`,
-    /// so we process each layer 4 values at a time:
-    ///                  layer = [L, R, L, R, L, R, ...]
+    /// Recall that the `left` and `right` polynomials are interleaved in `self.coeffs`,
+    /// so we process 4 values at a time:
+    ///                 coeffs = [L, R, L, R, L, R, ...]
     ///                           |  |  |  |
     ///    left(0, 0, 0, ..., x_b=0) |  |  right(0, 0, 0, ..., x_b=1)
     ///     right(0, 0, 0, ..., x_b=0)  left(0, 0, 0, ..., x_b=1)
     #[tracing::instrument(skip_all, name = "DenseInterleavedPolynomial::compute_cubic")]
     fn compute_cubic(&self, eq_poly: &SplitEqPolynomial<F>, previous_round_claim: F) -> UniPoly<F> {
+        // We use the Dao-Thaler optimization for the EQ polynomial, so there are two cases we
+        // must handle. For details, refer to Section 2.2 of https://eprint.iacr.org/2024/1210.pdf
         let cubic_evals = if eq_poly.E1_len == 1 {
+            // If `eq_poly.E1` has been fully bound, we compute the cubic polynomial as we
+            // would without the Dao-Thaler optimization, using the standard linear-time
+            // sumcheck algorithm.
             self.par_chunks(4)
                 .zip(eq_poly.E2.par_chunks(2))
                 .map(|(layer_chunk, eq_chunk)| {
@@ -238,6 +254,22 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
                     |sum, evals| (sum.0 + evals.0, sum.1 + evals.1, sum.2 + evals.2),
                 )
         } else {
+            // If `eq_poly.E1` has NOT been fully bound, we compute the cubic polynomial
+            // using the nested summation approach described in Section 2.2 of https://eprint.iacr.org/2024/1210.pdf
+            //
+            // Note, however, that we reverse the inner/outer summation compared to the
+            // description in the paper. I.e. instead of:
+            //
+            // \sum_x1 ((1 - j) * E1[0, x1] + j * E1[1, x1]) * (\sum_x2 E2[x2] * \prod_k ((1 - j) * P_k(0 || x1 || x2) + j * P_k(1 || x1 || x2)))
+            //
+            // we do:
+            //
+            // \sum_x2 E2[x2] * (\sum_x1 ((1 - j) * E1[0, x1] + j * E1[1, x1]) * \prod_k ((1 - j) * P_k(0 || x1 || x2) + j * P_k(1 || x1 || x2)))
+            //
+            // because it has better memory locality.
+
+            // We start by computing the E1 evals:
+            // (1 - j) * E1[0, x1] + j * E1[1, x1]
             let E1_evals: Vec<_> = eq_poly.E1[..eq_poly.E1_len]
                 .par_chunks(2)
                 .map(|E1_chunk| {
@@ -254,6 +286,8 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
                 .par_iter()
                 .zip(self.par_chunks(chunk_size))
                 .map(|(E2_eval, P_x2)| {
+                    // The for-loop below corresponds to the inner sum:
+                    // \sum_x1 ((1 - j) * E1[0, x1] + j * E1[1, x1]) * \prod_k ((1 - j) * P_k(0 || x1 || x2) + j * P_k(1 || x1 || x2))
                     let mut inner_sum = (F::zero(), F::zero(), F::zero());
                     for (E1_evals, P_chunk) in E1_evals.iter().zip(P_x2.chunks(4)) {
                         let left = (
@@ -278,6 +312,7 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
                         inner_sum.2 += E1_evals.2 * left_eval_3 * right_eval_3;
                     }
 
+                    // Multiply the inner sum by E2[x2]
                     (
                         *E2_eval * inner_sum.0,
                         *E2_eval * inner_sum.1,
diff --git a/jolt-core/src/poly/sparse_interleaved_poly.rs b/jolt-core/src/poly/sparse_interleaved_poly.rs
index 674dc2c26..766b14b53 100644
--- a/jolt-core/src/poly/sparse_interleaved_poly.rs
+++ b/jolt-core/src/poly/sparse_interleaved_poly.rs
@@ -27,10 +27,41 @@ impl<F: JoltField> From<(usize, F)> for SparseCoefficient<F> {
     }
 }
 
+/// Represents a single layer of a sparse grand product circuit.
+/// A layer is assumed to be arranged in "interleaved" order, i.e. the natural
+/// order in the visual representation of the circuit:
+///      Λ        Λ        Λ        Λ
+///     / \      / \      / \      /  \
+///   L0   R0  L1   R1  L2   R2  L3   R3   <- This layer would be represented as [L0, R0, L1, R1, L2, R2, L3, R3]
+///                                           (as opposed to e.g. [L0, L1, L2, L3, R0, R1, R2, R3])
+///
+/// Where SparseInterleavedPolynomial differs from DenseInterleavedPolynomial
+/// is that many of the coefficients are expected to be 1s, so the circuit may
+/// look something like this:
+///      Λ        Λ        Λ        Λ
+///     / \      / \      / \      /  \
+///    1   R0   1   1   L2   1    1    1
+///
+/// Instead of materializing all the 1s, we use a sparse vector to represent the layer,
+/// where each element of the vector contains the index and value of a non-one coefficient.
+/// So the above layer would be represented by:
+///   vec![(1, R0), (4, L2)]        (except with `SparseCoefficient` structs, not tuples)
+///
+/// In the context of a batched grand product (see sparse_grand_product.rs), there
+/// are k of these sparse vectors, where k is the batch size.
+/// For the first log2(n) rounds of binding, these k vectors can be processed in parallel.
+/// After that, they are "coalesced" into a single DenseInterleavedPolynomial for the
+/// remaining rounds of binding.
 #[derive(Default, Debug, Clone)]
 pub struct SparseInterleavedPolynomial<F: JoltField> {
+    /// A vector of sparse vectors representing the coefficients in a batched grand product
+    /// layer, where batch size = coeffs.len().
     pub(crate) coeffs: Vec<Vec<SparseCoefficient<F>>>,
+    /// Once `coeffs` cannot be bound further (i.e. binding would require processing values
+    /// in different vectors), we switch to using `coalesced` to represent the grand product
+    /// layer. See `SparseInterleavedPolynomial::coalesce()`.
     pub(crate) coalesced: Option<DenseInterleavedPolynomial<F>>,
+    /// The length of the layer if it were represented by a single dense vector.
     pub(crate) dense_len: usize,
 }
 
@@ -64,6 +95,8 @@ impl<F: JoltField> SparseInterleavedPolynomial<F> {
                 .for_each(|sparse_coeff| coalesced[sparse_coeff.index] = sparse_coeff.value);
             Self {
                 dense_len,
+                // The batch size is implied by coeffs.len(), so we must initialize this
+                // vector:
                 coeffs: vec![vec![]; batch_size],
                 coalesced: Some(DenseInterleavedPolynomial::new(coalesced)),
             }
@@ -80,19 +113,17 @@ impl<F: JoltField> SparseInterleavedPolynomial<F> {
         self.coeffs.len()
     }
 
+    /// Converts a `SparseInterleavedPolynomial` into the equivalent `DensePolynomial`.
     pub fn to_dense(&self) -> DensePolynomial<F> {
         if let Some(coalesced) = &self.coalesced {
             DensePolynomial::new_padded(coalesced.coeffs[..coalesced.len()].to_vec())
         } else {
-            let mut dense_layer = vec![F::one(); self.dense_len];
-            for coeff in self.coeffs.iter().flatten() {
-                dense_layer[coeff.index] = coeff.value;
-            }
-            DensePolynomial::new_padded(dense_layer)
+            DensePolynomial::new_padded(self.coalesce())
         }
     }
 
     #[tracing::instrument(skip_all, name = "SparseInterleavedPolynomial::coalesce")]
+    /// Coalesces a `SparseInterleavedPolynomial` into a single dense vector of interleaved coefficients.
     pub fn coalesce(&self) -> Vec<F> {
         if let Some(coalesced) = &self.coalesced {
             coalesced.coeffs.clone()
@@ -152,6 +183,8 @@ impl<F: JoltField> SparseInterleavedPolynomial<F> {
         Self::new(coeffs, left.len() + right.len())
     }
 
+    /// Uninterleaves a `SparseInterleavedPolynomial` into two vectors
+    /// containing the left and right coefficients.
     pub fn uninterleave(&self) -> (Vec<F>, Vec<F>) {
         if let Some(coalesced) = &self.coalesced {
             coalesced.uninterleave()
@@ -170,12 +203,11 @@ impl<F: JoltField> SparseInterleavedPolynomial<F> {
         }
     }
 
-    pub fn par_blocks(&self) -> impl ParallelIterator<Item = &[SparseCoefficient<F>]> {
-        self.coeffs
-            .par_iter()
-            .flat_map(|segment| segment.par_chunk_by(|x, y| x.index / 4 == y.index / 4))
-    }
-
+    /// Computes the grand product layer output by this one.
+    ///     L0'      R0'      L1'      R1'     <- Output layer
+    ///      Λ        Λ        Λ        Λ
+    ///     / \      / \      / \      /  \
+    ///   L0   R0  L1   R1  L2   R2  L3   R3   <- This layer
     #[tracing::instrument(skip_all, name = "SparseInterleavedPolynomial::layer_output")]
     pub fn layer_output(&self) -> Self {
         if let Some(coalesced) = &self.coalesced {
@@ -211,8 +243,8 @@ impl<F: JoltField> SparseInterleavedPolynomial<F> {
 }
 
 impl<F: JoltField> Bindable<F> for SparseInterleavedPolynomial<F> {
-    /// Incrementally binds a variable of this batched layer's polynomials.
-    /// If `self` is dense, we bind as in `BatchedDenseGrandProductLayer`,
+    /// Incrementally binds a variable of the interleaved left and right polynomials.
+    /// If `self` is coalesced, we invoke `DenseInterleavedPolynomial::bind`,
     /// processing nodes 4 at a time to preserve the interleaved order:
     ///   0'  1'     2'  3'
     ///   |\ |\      |\ |\
@@ -221,8 +253,9 @@ impl<F: JoltField> Bindable<F> for SparseInterleavedPolynomial<F> {
     ///   |  |\  \   |  |\  \
     ///   0  1 2  3  4  5 6  7
     /// Left nodes have even indices, right nodes have odd indices.
-    /// If `self` is sparse, we basically do the same thing but with more
-    /// cases to check 😬
+    ///
+    /// If `self` is not coalesced, we basically do the same thing but with the
+    /// sparse vectors in `self.coeffs`, and many more cases to check 😬
     #[tracing::instrument(skip_all, name = "SparseInterleavedPolynomial::bind")]
     fn bind(&mut self, r: F) {
         #[cfg(test)]
@@ -390,13 +423,15 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
     ///
     /// Computing these evaluations requires processing pairs of adjacent coefficients of
     /// `eq`, `left`, and `right`.
-    /// If `self` is dense, we process each layer 4 values at a time:
-    ///                  layer = [L, R, L, R, L, R, ...]
+    /// If `self` is coalesced, we invoke `DenseInterleavedPolynomial::compute_cubic`, processing
+    /// 4 values at a time:
+    ///                 coeffs = [L, R, L, R, L, R, ...]
     ///                           |  |  |  |
     ///    left(0, 0, 0, ..., x_b=0) |  |  right(0, 0, 0, ..., x_b=1)
     ///     right(0, 0, 0, ..., x_b=0)  left(0, 0, 0, ..., x_b=1)
-    /// If `self` is sparse, we basically do the same thing but with some fancy optimizations and
-    /// more cases to check 😬
+    ///
+    /// If `self` is not coalesced, we basically do the same thing but with the
+    /// sparse vectors in `self.coeffs`, some fancy optimizations, and many more cases to check 😬
     #[tracing::instrument(skip_all, name = "SparseInterleavedPolynomial::compute_cubic")]
     fn compute_cubic(&self, eq_poly: &SplitEqPolynomial<F>, previous_round_claim: F) -> UniPoly<F> {
         if let Some(coalesced) = &self.coalesced {
@@ -407,7 +442,13 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
             );
         }
 
+        // We use the Dao-Thaler optimization for the EQ polynomial, so there are two cases we
+        // must handle. For details, refer to Section 2.2 of https://eprint.iacr.org/2024/1210.pdf
         let cubic_evals = if eq_poly.E1_len == 1 {
+            // If `eq_poly.E1` has been fully bound, we compute the cubic polynomial as we
+            // would without the Dao-Thaler optimization, using the standard linear-time
+            // sumcheck algorithm with optimizations for sparsity.
+
             let eq_evals: Vec<(F, F, F)> = eq_poly
                 .E2
                 .par_chunks(2)
@@ -420,7 +461,8 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
                     (eval_point_0, eval_point_2, eval_point_3)
                 })
                 .collect();
-            // TODO(moodlezoup): Can more efficiently compute these
+            // This is what Σ eq(r, x) * left(x) * right(x) would be if
+            // `left` and `right` were both all ones.
             let eq_eval_sums: (F, F, F) = eq_evals
                 .par_iter()
                 .fold(
@@ -431,7 +473,8 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
                     || (F::zero(), F::zero(), F::zero()),
                     |sum, evals| (sum.0 + evals.0, sum.1 + evals.1, sum.2 + evals.2),
                 );
-
+            // Now we compute the deltas, correcting `eq_eval_sums` for the
+            // elements of `left` and `right` that aren't ones.
             let deltas: (F, F, F) = self
                 .coeffs
                 .par_iter()
@@ -478,6 +521,11 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
                 eq_eval_sums.2 + deltas.2,
             )
         } else {
+            // This is a more complicated version of the `else` case in
+            // `DenseInterleavedPolynomial::compute_cubic`. Read that one first.
+
+            // We start by computing the E1 evals:
+            // (1 - j) * E1[0, x1] + j * E1[1, x1]
             let E1_evals: Vec<_> = eq_poly.E1[..eq_poly.E1_len]
                 .par_chunks(2)
                 .map(|E1_chunk| {
@@ -488,6 +536,7 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
                     (eval_point_0, eval_point_2, eval_point_3)
                 })
                 .collect();
+            // Now compute \sum_x1 ((1 - j) * E1[0, x1] + j * E1[1, x1])
             let E1_eval_sums: (F, F, F) = E1_evals
                 .par_iter()
                 .fold(
@@ -502,6 +551,8 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
             let num_x1_bits = eq_poly.E1_len.log_2() - 1;
             let x1_bitmask = (1 << num_x1_bits) - 1;
 
+            // Iterate over the non-one coefficients and compute the deltas (relative to
+            // what the cubic would be if all the coefficients were ones).
             let deltas = self
                 .coeffs
                 .par_iter()
@@ -560,21 +611,45 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
                     |sum, evals| (sum.0 + evals.0, sum.1 + evals.1, sum.2 + evals.2),
                 );
 
+            // The cubic evals assuming all the coefficients are ones are affected by
+            // `dense_len`, since we implicitly 0-pad the layer from `dense_len` to a power of 2.
+            //
+            // As a refresher, the cubic evals we're computing are:
+            //
+            // \sum_x2 E2[x2] * (\sum_x1 ((1 - j) * E1[0, x1] + j * E1[1, x1]) * \prod_k ((1 - j) * P_k(0 || x1 || x2) + j * P_k(1 || x1 || x2)))
             let evals_assuming_all_ones = if self.dense_len.is_power_of_two() {
+                // If `dense_len` is a power of 2, there is no 0-padding.
+                //
+                // So we have:
+                // \sum_x2 (E2[x2] * (\sum_x1 ((1 - j) * E1[0, x1] + j * E1[1, x1]) * 1))
+                //   = \sum_x2 (E2[x2] * \sum_x1 E1_evals[x1])
+                //   = (\sum_x2 E2[x2]) * (\sum_x1 E1_evals[x1])
+                //   = 1 * E1_eval_sums
                 E1_eval_sums
             } else {
                 let chunk_size = self.dense_len.next_power_of_two() / eq_poly.E2_len;
                 let num_all_one_chunks = self.dense_len / chunk_size;
                 let E2_sum: F = eq_poly.E2[..num_all_one_chunks].iter().sum();
                 if self.dense_len % chunk_size == 0 {
+                    // If `dense_len` isn't a power of 2 but is a multiple of `chunk_size`,
+                    // that means that for the last values of x2, we have:
+                    //   (1 - j) * P_k(0 || x1 || x2) + j * P_k(1 || x1 || x2) = 0
+                    // due to the 0-padding.
+                    //
+                    // This makes the entire inner sum 0 for those values of x2.
+                    // So we can simply sum over E2 for the _other_ values of x2, and
+                    // multiply by `E1_eval_sums`.
                     (
                         E2_sum * E1_eval_sums.0,
                         E2_sum * E1_eval_sums.1,
                         E2_sum * E1_eval_sums.2,
                     )
                 } else {
-                    // The last "chunk" will have (self.dense_len % chunk_size) ones,
-                    // followed by (chunk_size - self.dense_len % chunk_size) zeros.
+                    // If `dense_len` isn't a power of 2 and isn't a multiple of `chunk_size`,
+                    // the last nonzero "chunk" will have (self.dense_len % chunk_size) ones,
+                    // followed by (chunk_size - self.dense_len % chunk_size) zeros,
+                    // e.g. 1 1 1 1 1 1 1 1 0 0 0 0
+                    //
                     // This handles this last chunk:
                     let last_chunk_evals = E1_evals[..(self.dense_len % chunk_size) / 4]
                         .par_iter()
diff --git a/jolt-core/src/subprotocols/grand_product.rs b/jolt-core/src/subprotocols/grand_product.rs
index 3ce15c6a7..87b4028ac 100644
--- a/jolt-core/src/subprotocols/grand_product.rs
+++ b/jolt-core/src/subprotocols/grand_product.rs
@@ -40,7 +40,7 @@ where
     PCS: CommitmentScheme<ProofTranscript>,
     ProofTranscript: Transcript,
 {
-    pub layers: Vec<BatchedGrandProductLayerProof<PCS::Field, ProofTranscript>>,
+    pub gkr_layers: Vec<BatchedGrandProductLayerProof<PCS::Field, ProofTranscript>>,
     pub quark_proof: Option<QuarkGrandProductProof<PCS, ProofTranscript>>,
 }
 
@@ -81,6 +81,8 @@ where
     ) -> (BatchedGrandProductProof<PCS, ProofTranscript>, Vec<F>) {
         let mut proof_layers = Vec::with_capacity(self.num_layers());
 
+        // Evaluate the MLE of the output layer at a random point to reduce the outputs to
+        // a single claim.
         let outputs = self.claimed_outputs();
         transcript.append_scalars(&outputs);
         let output_mle = DensePolynomial::new_padded(outputs);
@@ -93,7 +95,7 @@ where
 
         (
             BatchedGrandProductProof {
-                layers: proof_layers,
+                gkr_layers: proof_layers,
                 quark_proof: None,
             },
             r,
@@ -101,7 +103,7 @@ where
     }
 
     /// Verifies that the `sumcheck_claim` output by sumcheck verification is consistent
-    /// with the `left_claims` and `right_claims` of corresponding `BatchedGrandProductLayerProof`.
+    /// with the `left_claim` and `right_claim` of corresponding `BatchedGrandProductLayerProof`.
     /// This function may be overridden if the layer isn't just multiplication gates, e.g. in the
     /// case of `ToggledBatchedGrandProduct`.
     fn verify_sumcheck_claim(
@@ -132,9 +134,9 @@ where
         transcript: &mut ProofTranscript,
         r_start: Vec<F>,
     ) -> (F, Vec<F>) {
-        // We allow a non empty start in this function call because the quark hybrid form provides prespecified random for
-        // most of the positions and then we proceed with GKR on the remaining layers using the preset random values.
-        // For default thaler '13 layered grand products this should be empty.
+        // `r_start` is the random point at which the MLE of the first layer of the grand product is evaluated.
+        // In the case of the Quarks hybrid grand product, this is obtained from the Quarks grand product sumcheck.
+        // In the case of Thaler'13 GKR-based grand products, this is from Fiat-Shamir.
         let mut r_grand_product = r_start.clone();
         let fixed_at_start = r_start.len();
 
@@ -175,12 +177,14 @@ where
         transcript: &mut ProofTranscript,
         _setup: Option<&PCS::Setup>,
     ) -> (F, Vec<F>) {
+        // Evaluate the MLE of the output layer at a random point to reduce the outputs to
+        // a single claim.
         transcript.append_scalars(claimed_outputs);
         let r: Vec<F> =
             transcript.challenge_vector(claimed_outputs.len().next_power_of_two().log_2());
         let claim = DensePolynomial::new_padded(claimed_outputs.to_vec()).evaluate(&r);
 
-        Self::verify_layers(&proof.layers, claim, transcript, r)
+        Self::verify_layers(&proof.gkr_layers, claim, transcript, r)
     }
 }
 
@@ -229,12 +233,12 @@ where
 
 /// A batched grand product circuit.
 /// Note that the circuit roots are not included in `self.layers`
-///        o
-///      /   \
-///     o     o  <- layers[layers.len() - 1]
-///    / \   / \
-///   o   o o   o  <- layers[layers.len() - 2]
-///       ...
+///        o            o
+///      /   \        /   \
+///     o     o      o     o  <- layers[layers.len() - 1]
+///    / \   / \    / \   / \
+///   o   o o   o  o   o o   o  <- layers[layers.len() - 2]
+///       ...          ...
 pub struct BatchedDenseGrandProduct<F: JoltField> {
     layers: Vec<DenseInterleavedPolynomial<F>>,
 }
diff --git a/jolt-core/src/subprotocols/grand_product_quarks.rs b/jolt-core/src/subprotocols/grand_product_quarks.rs
index 44491bbcf..6096b0734 100644
--- a/jolt-core/src/subprotocols/grand_product_quarks.rs
+++ b/jolt-core/src/subprotocols/grand_product_quarks.rs
@@ -47,7 +47,8 @@ pub enum QuarkHybridLayerDepth {
 }
 
 impl QuarkHybridLayerDepth {
-    // The depth in the product tree of the GKR grand product at which the hybrid scheme will switch to using quarks grand product proofs
+    /// The depth in the binary tree of the GKR grand product at which the hybrid scheme
+    /// will switch to using the grand product argument from Section 5 of the Quarks paper.
     pub fn get_crossover_depth(&self) -> usize {
         match self {
             QuarkHybridLayerDepth::Min => 0,
@@ -99,7 +100,7 @@ where
             crossover
         };
 
-        // Taken 1 to 1 from the code in the BatchedDenseGrandProductLayer implementation
+        // Taken 1 to 1 from the code in the BatchedDenseGrandProduct implementation
         let mut layers = Vec::<DenseInterleavedPolynomial<F>>::new();
         layers.push(DenseInterleavedPolynomial::new(leaves));
 
@@ -109,7 +110,7 @@ where
             layers.push(new_layer);
         }
 
-        // If the tree depth is too small we return no quark poly and all base layers
+        // If the tree depth is too small we just do the GKR grand product
         if tree_depth <= num_layers {
             return Self {
                 batch_size,
@@ -129,10 +130,9 @@ where
             _marker: PhantomData,
         }
     }
-    /// The number of layers in the grand product, in this case it is the log of the quark layer size plus the gkr layer depth.
+
     fn num_layers(&self) -> usize {
-        todo!()
-        // self.quark_poly[0].len().log_2()
+        unimplemented!("Unused");
     }
 
     /// The claimed outputs of the grand products.
@@ -151,7 +151,7 @@ where
     fn layers(
         &'_ mut self,
     ) -> impl Iterator<Item = &'_ mut dyn BatchedGrandProductLayer<F, ProofTranscript>> {
-        panic!("We don't use the default prover and so we don't need the generic iterator");
+        unimplemented!("We don't use the default prover and so we don't need the generic iterator");
         std::iter::empty()
     }
 
@@ -172,10 +172,10 @@ where
         let r_outputs: Vec<F> = transcript.challenge_vector(output_mle.get_num_vars());
         let claim = output_mle.evaluate(&r_outputs);
 
-        // For proofs of polynomials of size less than 16 we support these with no quark proof
-        let (quark_option, mut random, mut claim) = if !self.quark_poly.is_empty() {
+        // For polynomials of size less than 16 we just use the GKR grand product
+        let (quark_proof, mut random, mut claim) = if !self.quark_poly.is_empty() {
             // When doing the quark hybrid proof, we first prove the grand product of a layer of a polynomial which is 4 layers deep in the tree
-            // of a standard layered sumcheck grand product, then we use the sumcheck layers to prove via gkr layers that the random point opened
+            // of a standard layered sumcheck grand product, then we use the sumcheck layers to prove via GKR layers that the random point opened
             // by the quark proof is in fact the folded result of the base layer.
             let (quark, random, quark_claim) =
                 QuarkGrandProductProof::<PCS, ProofTranscript>::prove(
@@ -197,8 +197,8 @@ where
 
         (
             BatchedGrandProductProof {
-                layers: proof_layers,
-                quark_proof: quark_option,
+                gkr_layers: proof_layers,
+                quark_proof,
             },
             random,
         )
@@ -213,6 +213,8 @@ where
         transcript: &mut ProofTranscript,
         _setup: Option<&PCS::Setup>,
     ) -> (F, Vec<F>) {
+        // Evaluate the MLE of the output layer at a random point to reduce the outputs to
+        // a single claim.
         transcript.append_scalars(claimed_outputs);
         let r_outputs: Vec<F> =
             transcript.challenge_vector(claimed_outputs.len().next_power_of_two().log_2());
@@ -245,7 +247,7 @@ where
             PCS,
             ProofTranscript,
         >>::verify_layers(
-            &proof.layers, claim, transcript, rand
+            &proof.gkr_layers, claim, transcript, rand
         );
 
         (grand_product_claim, grand_product_r)
@@ -287,12 +289,13 @@ where
         let v_variables = v_length.log_2();
 
         let v_polynomial = DensePolynomial::<PCS::Field>::new(v.to_vec());
+        // Compute f(1, x), f(x, 0), and f(x, 1) from v(x)
         let (f_1x, f_x0, f_x1) = v_into_f::<PCS::Field>(&v_polynomial);
 
         let g_polynomial = f_1x.clone();
         let mut sumcheck_polys = vec![f_1x, f_x0, f_x1];
 
-        // We commit to f(1, x)
+        // We commit to g(x) = f(1, x)
         let g_commitment = PCS::commit(&g_polynomial, setup);
         g_commitment.append_to_transcript(transcript);
 
@@ -302,7 +305,33 @@ where
         // We add eq_tau as the second to last polynomial in the sumcheck
         sumcheck_polys.push(eq_tau);
 
-        // Next we calculate EQ(11...10 || r_outputs, x)
+        // This is where things start to deviate from the protocol described in
+        // Quarks Section 5.
+        //
+        // We batch our grand products by laying out the circuits side-by-side, and
+        // proving them together as one big circuit with k outputs, where k is the batch size.
+        // In `prove_grand_product`, we evaluate the MLE of these outputs at a random point,
+        //   claim := \tilde{outputs}(r_outputs)
+        //
+        // Quarks Section 5 assumes there's only one output, P = f(1, ..., 1, 0).
+        // But claim != f(1, ..., 1, 0), so we have to use a different sumcheck expression.
+        //
+        // If you closely examine `v_into_f` and work it out, you'll find that our k grand product
+        // outputs are contained in f(1, x) at x = (1, ..., 1, 0, b), where b \in {0, 1}^{log2(k)}.
+        // So we have:
+        //   claim = \tilde{outputs}(r_outputs)
+        //         = \sum_b EQ(r_outputs, b) * outputs(b)
+        //         = \sum_x EQ(1, ..., 1, 0, r_outputs, x) * f(1, x)        where r_outputs ∈ 𝔽^{log2(k)}, x ∈ {0, 1}^{log2(kn)}
+        //
+        // Modifying the sumcheck instance described in Section 5 of the Quarks paper, we will
+        // be proving:
+        //   claim = \sum_x (EQ(\tau, x) * (f(1, x) - f(x, 0) * f(x, 1)) + EQ(1, ..., 1, 0, r_outputs, x) * f(1, x))
+        //
+        // Note that the first half of the summand EQ(\tau, x) * (f(1, x) - f(x, 0) * f(x, 1))
+        // should equal 0 for all x ∈ {0, 1}^{log2(kn)}, ensuring that every output value f(1, x) is equal to the
+        // product of its input values f(x, 0) and f(x, 1).
+
+        // First we compute EQ(1, ..., 1, 0, r_outputs, x)
         let mut one_padded_r_outputs = vec![PCS::Field::one(); v_variables];
         let slice_index = one_padded_r_outputs.len() - r_outputs.len();
         one_padded_r_outputs[slice_index..].copy_from_slice(r_outputs.as_slice());
@@ -324,6 +353,8 @@ where
         // We add eq_output as the last polynomial in the sumcheck
         sumcheck_polys.push(eq_output);
 
+        // This is the sumcheck polynomial
+        //   EQ(\tau, x) * (f(1, x) - f(x, 0) * f(x, 1)) + EQ(1, ..., 1, 0, r_outputs, x) * f(1, x)
         let output_check_fn = |vals: &[PCS::Field]| -> PCS::Field {
             assert_eq!(vals.len(), 5);
             let f_1x = vals[0];
@@ -494,7 +525,8 @@ where
     }
 }
 
-// Computes slices of f for the sumcheck
+/// Computes the polynomials f(1, x), f(x, 0), and f(x, 1) from the v polynomial,
+/// as described in Lemma 5.1 of the Quarks paper.
 #[allow(clippy::type_complexity)]
 fn v_into_f<F: JoltField>(
     v: &DensePolynomial<F>,
diff --git a/jolt-core/src/subprotocols/sparse_grand_product.rs b/jolt-core/src/subprotocols/sparse_grand_product.rs
index 2e7b1740d..9122e9e43 100644
--- a/jolt-core/src/subprotocols/sparse_grand_product.rs
+++ b/jolt-core/src/subprotocols/sparse_grand_product.rs
@@ -26,18 +26,25 @@ use rayon::prelude::*;
 /// 🏴  o  🏳️ o  🏳️ o  🏴  o    toggle layer        ↓
 #[derive(Debug)]
 struct BatchedGrandProductToggleLayer<F: JoltField> {
-    /// The list of non-zero flag indices for each layer in the batch.
+    /// The list of non-zero flag indices for each circuit in the batch.
     flag_indices: Vec<Vec<usize>>,
-    /// The list of non-zero flag values for each layer in the batch.
+    /// The list of non-zero flag values for each circuit in the batch.
     /// Before the first binding iteration of sumcheck, this will be empty
     /// (we know that all non-zero, unbound flag values are 1).
     flag_values: Vec<Vec<F>>,
+    /// The Reed-Solomon fingerprints for each circuit in the batch.
     fingerprints: Vec<Vec<F>>,
-
+    /// Once the sparse flag/fingerprint vectors cannot be bound further
+    /// (i.e. binding would require processing values in different vectors),
+    /// we switch to using `coalesced_flags` to represent the flag values.
     coalesced_flags: Option<Vec<F>>,
+    /// Once the sparse flag/fingerprint vectors cannot be bound further
+    /// (i.e. binding would require processing values in different vectors),
+    /// we switch to using `coalesced_fingerprints` to represent the fingerprint values.
     coalesced_fingerprints: Option<Vec<F>>,
-
+    /// The length of a layer in one of the circuits in the batch.
     layer_len: usize,
+
     batched_layer_len: usize,
 }
 
@@ -60,6 +67,7 @@ impl<F: JoltField> BatchedGrandProductToggleLayer<F> {
                         F::one();
                 }
             }
+            // Fingerprints are padded with 0s, flags are padded with 1s
             flags.resize(flags.len().next_power_of_two(), F::one());
 
             (
@@ -86,6 +94,7 @@ impl<F: JoltField> BatchedGrandProductToggleLayer<F> {
                         *flag_value;
                 }
             }
+            // Fingerprints are padded with 0s, flags are padded with 1s
             flags.resize(flags.len().next_power_of_two(), F::one());
 
             (
@@ -112,6 +121,12 @@ impl<F: JoltField> BatchedGrandProductToggleLayer<F> {
         }
     }
 
+    /// Computes the grand product layer output by this one.
+    /// Since this is a toggle layer, most of the output values are 1s, so
+    /// the return type is a `SparseInterleavedPolynomial`.
+    ///   o      o     o      o    <-  output layer
+    ///  / \    / \   / \    / \
+    /// 🏴  o  🏳️ o  🏳️ o  🏴  o  <- toggle layer
     #[tracing::instrument(skip_all, name = "BatchedGrandProductToggleLayer::layer_output")]
     fn layer_output(&self) -> SparseInterleavedPolynomial<F> {
         let values: Vec<_> = self
@@ -132,6 +147,9 @@ impl<F: JoltField> BatchedGrandProductToggleLayer<F> {
         SparseInterleavedPolynomial::new(values, self.batched_layer_len / 2)
     }
 
+    /// Coalesces flags and fingerprints into one (dense) vector each.
+    /// After a certain number of bindings, we can no longer process the k
+    /// circuits in the batch independently, at which point we coalesce.
     #[tracing::instrument(skip_all, name = "BatchedGrandProductToggleLayer::coalesce")]
     fn coalesce(&mut self) {
         let mut coalesced_fingerprints: Vec<F> =
@@ -153,6 +171,7 @@ impl<F: JoltField> BatchedGrandProductToggleLayer<F> {
                 coalesced
             })
             .collect();
+        // Fingerprints are padded with 0s, flags are padded with 1s
         coalesced_flags.resize(coalesced_flags.len().next_power_of_two(), F::one());
 
         self.coalesced_fingerprints = Some(coalesced_fingerprints);
@@ -161,8 +180,8 @@ impl<F: JoltField> BatchedGrandProductToggleLayer<F> {
 }
 
 impl<F: JoltField> Bindable<F> for BatchedGrandProductToggleLayer<F> {
-    /// Incrementally binds a variable of this batched layer's polynomials.
-    /// Similar to `BatchedSparseGrandProductLayer::bind`, in that fingerprints use
+    /// Incrementally binds a variable of the flag and fingerprint polynomials.
+    /// Similar to `SparseInterleavedPolynomial::bind`, in that flags use
     /// a sparse representation, but different in a couple of key ways:
     /// - flags use two separate vectors (for indices and values) rather than
     ///   a single vector of (index, value) pairs
@@ -179,6 +198,7 @@ impl<F: JoltField> Bindable<F> for BatchedGrandProductToggleLayer<F> {
         let (mut flags_before_binding, mut fingerprints_before_binding) = self.to_dense();
 
         if let Some(coalesced_flags) = &mut self.coalesced_flags {
+            // Polynomials have already been coalesced, so bind the coalesced vectors.
             let mut bound_flags = vec![F::one(); coalesced_flags.len() / 2];
             for i in 0..bound_flags.len() {
                 bound_flags[i] = coalesced_flags[2 * i]
@@ -215,6 +235,7 @@ impl<F: JoltField> Bindable<F> for BatchedGrandProductToggleLayer<F> {
 
         debug_assert!(self.layer_len % 4 == 0);
 
+        // Bind the fingerprints
         self.fingerprints
             .par_iter_mut()
             .for_each(|layer: &mut Vec<F>| {
@@ -229,6 +250,7 @@ impl<F: JoltField> Bindable<F> for BatchedGrandProductToggleLayer<F> {
             self.flag_values = vec![vec![]; self.flag_indices.len()];
         }
 
+        // Bind the flags
         self.flag_indices
             .par_iter_mut()
             .zip(self.flag_values.par_iter_mut())
@@ -322,6 +344,7 @@ impl<F: JoltField> Bindable<F> for BatchedGrandProductToggleLayer<F> {
         }
 
         if self.layer_len == 2 {
+            // Time to coalesce
             assert!(self.coalesced_fingerprints.is_none());
             assert!(self.coalesced_flags.is_none());
             self.coalesce();
@@ -359,16 +382,24 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
         assert_eq!(expected, round_claim);
     }
 
-    /// Similar to `BatchedSparseGrandProductLayer::compute_cubic`, but with changes to
-    /// accomodate the differences between `BatchedSparseGrandProductLayer` and
+    /// Similar to `SparseInterleavedPolynomial::compute_cubic`, but with changes to
+    /// accommodate the differences between `SparseInterleavedPolynomial` and
     /// `BatchedGrandProductToggleLayer`. These differences are described in the doc comments
     /// for `BatchedGrandProductToggleLayer::bind`.
+    ///
+    /// Since we are using the Dao-Thaler EQ optimization, there are four cases to handle:
+    /// 1. Flags/fingerprints are coalesced, and E1 is fully bound
+    /// 2. Flags/fingerprints are coalesced, and E1 isn't fully bound
+    /// 3. Flags/fingerprints aren't coalesced, and E1 is fully bound
+    /// 4. Flags/fingerprints aren't coalesced, and E1 isn't fully bound
     #[tracing::instrument(skip_all, name = "BatchedGrandProductToggleLayer::compute_cubic")]
     fn compute_cubic(&self, eq_poly: &SplitEqPolynomial<F>, previous_round_claim: F) -> UniPoly<F> {
         if let Some(coalesced_flags) = &self.coalesced_flags {
             let coalesced_fingerpints = self.coalesced_fingerprints.as_ref().unwrap();
 
             let cubic_evals = if eq_poly.E1_len == 1 {
+                // 1. Flags/fingerprints are coalesced, and E1 is fully bound
+                // This is similar to the if case of `DenseInterleavedPolynomial::compute_cubic`
                 coalesced_flags
                     .par_chunks(2)
                     .zip(coalesced_fingerpints.par_chunks(2))
@@ -403,6 +434,8 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
                         |sum, evals| (sum.0 + evals.0, sum.1 + evals.1, sum.2 + evals.2),
                     )
             } else {
+                // 2. Flags/fingerprints are coalesced, and E1 isn't fully bound
+                // This is similar to the else case of `DenseInterleavedPolynomial::compute_cubic`
                 let E1_evals: Vec<_> = eq_poly.E1[..eq_poly.E1_len]
                     .par_chunks(2)
                     .map(|E1_chunk| {
@@ -467,8 +500,9 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
             return UniPoly::from_evals(&cubic_evals);
         }
 
-        // Non-coalesced case
         let cubic_evals = if eq_poly.E1_len == 1 {
+            // 3. Flags/fingerprints aren't coalesced, and E1 is fully bound
+            // This is similar to the if case of `SparseInterleavedPolynomial::compute_cubic`
             let eq_evals: Vec<(F, F, F)> = eq_poly.E2[..eq_poly.E2_len]
                 .par_chunks(2)
                 .take(self.batched_layer_len / 4)
@@ -588,6 +622,8 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
                 eq_eval_sums.2 + deltas.2,
             )
         } else {
+            // 4. Flags/fingerprints aren't coalesced, and E1 isn't fully bound
+            // This is similar to the else case of `SparseInterleavedPolynomial::compute_cubic`
             let E1_evals: Vec<_> = eq_poly.E1[..eq_poly.E1_len]
                 .par_chunks(2)
                 .map(|E1_chunk| {
@@ -716,21 +752,47 @@ impl<F: JoltField, ProofTranscript: Transcript> BatchedCubicSumcheck<F, ProofTra
                     |sum, evals| (sum.0 + evals.0, sum.1 + evals.1, sum.2 + evals.2),
                 );
 
+            // The cubic evals assuming all the coefficients are ones are affected by the
+            // `batched_layer_len`, since we implicitly pad the `batched_layer_len` to a power of 2.
+            // By pad here we mean that flags are padded with 1s, and fingerprints are
+            // padded with 0s.
+            //
+            // As a refresher, the cubic evals we're computing are:
+            //
+            // \sum_x2 E2[x2] * (\sum_x1 ((1 - j) * E1[0, x1] + j * E1[1, x1]) * \prod_k ((1 - j) * P_k(0 || x1 || x2) + j * P_k(1 || x1 || x2)))
             let evals_assuming_all_ones = if self.batched_layer_len.is_power_of_two() {
+                // If `batched_layer_len` is a power of 2, there is no 0-padding.
+                //
+                // So we have:
+                // \sum_x2 (E2[x2] * (\sum_x1 ((1 - j) * E1[0, x1] + j * E1[1, x1]) * 1))
+                //   = \sum_x2 (E2[x2] * \sum_x1 E1_evals[x1])
+                //   = (\sum_x2 E2[x2]) * (\sum_x1 E1_evals[x1])
+                //   = 1 * E1_eval_sums
                 E1_eval_sums
             } else {
                 let chunk_size = self.batched_layer_len.next_power_of_two() / eq_poly.E2_len;
                 let num_all_one_chunks = self.batched_layer_len / chunk_size;
                 let E2_sum: F = eq_poly.E2[..num_all_one_chunks].iter().sum();
                 if self.batched_layer_len % chunk_size == 0 {
+                    // If `batched_layer_len` isn't a power of 2 but is evenly divisible by `chunk_size`,
+                    // that means that for the last values of x2, we have:
+                    //   (1 - j) * P_k(0 || x1 || x2) + j * P_k(1 || x1 || x2) = 0
+                    // due to the 0-padding.
+                    //
+                    // This makes the entire inner sum 0 for those values of x2.
+                    // So we can simply sum over E2 for the _other_ values of x2, and
+                    // multiply by `E1_eval_sums`.
                     (
                         E2_sum * E1_eval_sums.0,
                         E2_sum * E1_eval_sums.1,
                         E2_sum * E1_eval_sums.2,
                     )
                 } else {
-                    // The last "chunk" will have (self.dense_len % chunk_size) ones,
-                    // followed by (chunk_size - self.dense_len % chunk_size) zeros.
+                    // If `batched_layer_len` isn't a power of 2 and isn't divisible by `chunk_size`,
+                    // the last nonzero "chunk" will have (self.batched_layer_len % chunk_size) ones,
+                    // followed by (chunk_size - self.batched_layer_len % chunk_size) zeros,
+                    // e.g. 1 1 1 1 1 1 1 1 0 0 0 0
+                    //
                     // This handles this last chunk:
                     let last_chunk_evals = E1_evals[..(self.batched_layer_len % chunk_size) / 4]
                         .par_iter()
diff --git a/jolt-core/src/utils/sol_types.rs b/jolt-core/src/utils/sol_types.rs
index 2bb9f26a7..eb05a7f77 100644
--- a/jolt-core/src/utils/sol_types.rs
+++ b/jolt-core/src/utils/sol_types.rs
@@ -196,7 +196,7 @@ impl<ProofTranscript: Transcript> Into<GrandProductProof>
     for BatchedGrandProductProof<HyperKZG<Bn254, ProofTranscript>, ProofTranscript>
 {
     fn into(self) -> GrandProductProof {
-        let layers: Vec<GKRLayer> = self.layers.into_iter().map(|i| i.into()).collect();
+        let layers: Vec<GKRLayer> = self.gkr_layers.into_iter().map(|i| i.into()).collect();
         assert!(self.quark_proof.is_none(), "Quarks are unsupported");
         GrandProductProof { layers }
     }