Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: parallelise inverse polynomial construction for lookup relations #10413

Merged
merged 2 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <tuple>

#include "barretenberg/common/constexpr_utils.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/polynomials/univariate.hpp"
#include "barretenberg/relations/relation_types.hpp"

Expand Down Expand Up @@ -231,32 +232,42 @@ template <typename FF_> class DatabusLookupRelationImpl {
const size_t circuit_size)
{
auto& inverse_polynomial = BusData<bus_idx, Polynomials>::inverses(polynomials);
bool is_read = false;
bool nonzero_read_count = false;
for (size_t i = 0; i < circuit_size; ++i) {
// Determine if the present row contains a databus operation
auto q_busread = polynomials.q_busread[i];
if constexpr (bus_idx == 0) { // calldata
is_read = q_busread == 1 && polynomials.q_l[i] == 1;
nonzero_read_count = polynomials.calldata_read_counts[i] > 0;
}
if constexpr (bus_idx == 1) { // secondary_calldata
is_read = q_busread == 1 && polynomials.q_r[i] == 1;
nonzero_read_count = polynomials.secondary_calldata_read_counts[i] > 0;
}
if constexpr (bus_idx == 2) { // return data
is_read = q_busread == 1 && polynomials.q_o[i] == 1;
nonzero_read_count = polynomials.return_data_read_counts[i] > 0;
}
// We only compute the inverse if this row contains a read gate or data that has been read
if (is_read || nonzero_read_count) {
// TODO(https://github.com/AztecProtocol/barretenberg/issues/940): avoid get_row if possible.
auto row = polynomials.get_row(i); // Note: this is a copy. use sparingly!
auto value = compute_read_term<FF>(row, relation_parameters) *
compute_write_term<FF, bus_idx>(row, relation_parameters);
inverse_polynomial.at(i) = value;

size_t min_iterations_per_thread = 1 << 6; // min number of iterations for which we'll spin up a unique thread
size_t num_threads = bb::calculate_num_threads_pow2(circuit_size, min_iterations_per_thread);
size_t iterations_per_thread = circuit_size / num_threads; // actual iterations per thread

parallel_for(num_threads, [&](size_t thread_idx) {
size_t start = thread_idx * iterations_per_thread;
size_t end = (thread_idx + 1) * iterations_per_thread;
bool is_read = false;
bool nonzero_read_count = false;
for (size_t i = start; i < end; ++i) {
// Determine if the present row contains a databus operation
auto q_busread = polynomials.q_busread[i];
if constexpr (bus_idx == 0) { // calldata
is_read = q_busread == 1 && polynomials.q_l[i] == 1;
nonzero_read_count = polynomials.calldata_read_counts[i] > 0;
}
if constexpr (bus_idx == 1) { // secondary_calldata
is_read = q_busread == 1 && polynomials.q_r[i] == 1;
nonzero_read_count = polynomials.secondary_calldata_read_counts[i] > 0;
}
if constexpr (bus_idx == 2) { // return data
is_read = q_busread == 1 && polynomials.q_o[i] == 1;
nonzero_read_count = polynomials.return_data_read_counts[i] > 0;
}
// We only compute the inverse if this row contains a read gate or data that has been read
if (is_read || nonzero_read_count) {
// TODO(https://github.com/AztecProtocol/barretenberg/issues/940): avoid get_row if possible.
auto row = polynomials.get_row(i); // Note: this is a copy. use sparingly!
auto value = compute_read_term<FF>(row, relation_parameters) *
compute_write_term<FF, bus_idx>(row, relation_parameters);
inverse_polynomial.at(i) = value;
}
}
}
});

// Compute inverse polynomial I in place by inverting the product at each row
// Note: zeroes are ignored as they are not used anyway
FF::batch_invert(inverse_polynomial.coeffs());
Expand Down Expand Up @@ -299,8 +310,8 @@ template <typename FF_> class DatabusLookupRelationImpl {
constexpr size_t subrel_idx_1 = 2 * bus_idx;
constexpr size_t subrel_idx_2 = 2 * bus_idx + 1;

// Establish the correctness of the polynomial of inverses I. Note: inverses is computed so that the value is 0
// if !inverse_exists. Degree 3 (5)
// Establish the correctness of the polynomial of inverses I. Note: inverses is computed so that the value
// is 0 if !inverse_exists. Degree 3 (5)
std::get<subrel_idx_1>(accumulator) += (read_term * write_term * inverses - inverse_exists) * scaling_factor;

// Establish validity of the read. Note: no scaling factor here since this constraint is enforced across the
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <tuple>

#include "barretenberg/common/constexpr_utils.hpp"
#include "barretenberg/common/thread.hpp"
#include "barretenberg/honk/proof_system/logderivative_library.hpp"
#include "barretenberg/polynomials/univariate.hpp"
#include "barretenberg/relations/relation_types.hpp"
Expand Down Expand Up @@ -157,16 +158,25 @@ template <typename FF_> class LogDerivLookupRelationImpl {
{
auto& inverse_polynomial = get_inverse_polynomial(polynomials);

for (size_t i = 0; i < circuit_size; ++i) {
// We only compute the inverse if this row contains a lookup gate or data that has been looked up
if (polynomials.q_lookup.get(i) == 1 || polynomials.lookup_read_tags.get(i) == 1) {
// TODO(https://github.com/AztecProtocol/barretenberg/issues/940): avoid get_row if possible.
auto row = polynomials.get_row(i); // Note: this is a copy. use sparingly!
auto value = compute_read_term<FF, 0>(row, relation_parameters) *
compute_write_term<FF, 0>(row, relation_parameters);
inverse_polynomial.at(i) = value;
size_t min_iterations_per_thread = 1 << 6; // min number of iterations for which we'll spin up a unique thread
size_t num_threads = bb::calculate_num_threads_pow2(circuit_size, min_iterations_per_thread);
size_t iterations_per_thread = circuit_size / num_threads; // actual iterations per thread

parallel_for(num_threads, [&](size_t thread_idx) {
size_t start = thread_idx * iterations_per_thread;
size_t end = (thread_idx + 1) * iterations_per_thread;
for (size_t i = start; i < end; ++i) {
// We only compute the inverse if this row contains a lookup gate or data that has been looked up
if (polynomials.q_lookup.get(i) == 1 || polynomials.lookup_read_tags.get(i) == 1) {
// TODO(https://github.com/AztecProtocol/barretenberg/issues/940): avoid get_row if possible.
auto row = polynomials.get_row(i); // Note: this is a copy. use sparingly!
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a comment because I see a lot of microoptimisation going on: have you guys profiled where the time is going?

IIRC when I did some profiling for the AVM (which luckily does not use this relation, since we already parallelize) a lot of time was spent in

  • Using get_row which (according to some comments) "was only meant for debugging"
  • (related to the above) Visiting all rows of the trace and finding out if you actually need to compute the inverse or not.

If this is still true, you'd benefit far more if you could save which rows need an inverse, and only act on those (therefore O(needs_inverse) vs O(rows)). I think this is true for the AVM (where the cost of get_row is really big) and I have some ideas on how to implement this. However, for general bb, the problem is that the whole proving system is stateless in terms of relations. That is, everything uses Relation::method, and not relation_instance.method.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually looked into the issue here AztecProtocol/barretenberg#940 and we indeed only use get_row for those that need an inverse (achieved by checking q_lookup and lookup_read_target polynomial). I'm optimising with the scope of reducing the impact of using a larger trace structure when keeping the same circuit content (so introducing more sparsity in the trace). I did indeed profiled get_row and the time spent doing it is not getting worse because we avoid calling it when not needed. We still are spending around 3.5s calling get_row but as Luke noted in the issue moving to a different pattern requires more involved work and we want to eliminate the existing low hanging fruits first.

auto value = compute_read_term<FF, 0>(row, relation_parameters) *
compute_write_term<FF, 0>(row, relation_parameters);
inverse_polynomial.at(i) = value;
}
}
}
});

// Compute inverse polynomial I in place by inverting the product at each row
FF::batch_invert(inverse_polynomial.coeffs());
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ typename TraceToPolynomials<Flavor>::TraceData TraceToPolynomials<Flavor>::const
auto block_size = static_cast<uint32_t>(block.size());

// Save ranges over which the blocks are "active" for use in structured commitments
if constexpr (IsUltraFlavor<Flavor>) {
if constexpr (IsUltraFlavor<Flavor>) { // Mega and Ultra
if (block.size() > 0) {
proving_key.active_block_ranges.emplace_back(offset, offset + block.size());
}
Expand Down
4 changes: 2 additions & 2 deletions barretenberg/cpp/src/barretenberg/ultra_honk/oink_prover.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,8 @@ template <IsUltraFlavor Flavor> void OinkProver<Flavor>::execute_log_derivative_

{
PROFILE_THIS_NAME("COMMIT::lookup_inverses");
witness_commitments.lookup_inverses =
proving_key->proving_key.commitment_key->commit(proving_key->proving_key.polynomials.lookup_inverses);
witness_commitments.lookup_inverses = proving_key->proving_key.commitment_key->commit_sparse(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are they really (going to be) sparse in practice?

proving_key->proving_key.polynomials.lookup_inverses);
}
transcript->send_to_verifier(domain_separator + commitment_labels.lookup_inverses,
witness_commitments.lookup_inverses);
Expand Down
Loading