Skip to content

Commit

Permalink
feat: Tracy time with instrumentation (#9170)
Browse files Browse the repository at this point in the history
Add scripts for profiling locally with tracy and samply, add
instrumentation so that the tracy profile is pretty complete, and combine
the BB_OP_COUNT macros with the tracy macros.
  • Loading branch information
codygunton authored Oct 11, 2024
1 parent 2592e50 commit 1c008d9
Show file tree
Hide file tree
Showing 51 changed files with 337 additions and 251 deletions.
2 changes: 1 addition & 1 deletion barretenberg/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -477,4 +477,4 @@ See Tracy manual linked here <https://github.com/wolfpld/tracy> for in-depth Tra
The basic use of Tracy is to run a benchmark with the `cmake --preset tracy` build type, create a capture file, then
transfer it to a local machine for interactive UI introspection.

All the steps to do this effectively are included in cpp/scripts/benchmark_tracy.sh
All the steps to do this effectively are included in various scripts in cpp/scripts/.
13 changes: 7 additions & 6 deletions barretenberg/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,21 +54,22 @@ if(CHECK_CIRCUIT_STACKTRACES)
add_compile_options(-DCHECK_CIRCUIT_STACKTRACES)
endif()

if(ENABLE_TRACY)
if(ENABLE_TRACY OR ENABLE_TRACY_TIME_INSTRUMENTED)
add_compile_options(-DTRACY_ENABLE)
SET(TRACY_LIBS Tracy::TracyClient)
else()
SET(TRACY_LIBS)
endif()

if(TRACY_PROFILE_MEMORY)
add_compile_options(-DTRACY_MEMORY)
endif()

if(TRACY_PROFILE_TIME)
add_compile_options(-DTRACY_TIME)
if(ENABLE_TRACY_TIME_INSTRUMENTED)
add_compile_options(-DTRACY_INSTRUMENTED)
endif()

if(TRACY_PROFILE_MEMORY)
add_compile_options(-DTRACY_MEMORY)
add_compile_options(-DTRACY_INSTRUMENTED)
endif()

if(ENABLE_ASAN)
add_compile_options(-fsanitize=address)
Expand Down
34 changes: 24 additions & 10 deletions barretenberg/cpp/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,8 @@
},
{
"name": "tracy-memory",
"displayName": "Release build with tracy, optimized for memory tracking",
"description": "Release build with tracy, optimized for memory tracking",
"displayName": "Release build with tracy with memory tracking",
"description": "Release build with tracy with memory tracking",
"inherits": "clang16",
"binaryDir": "build-tracy-memory",
"cacheVariables": {
Expand All @@ -122,20 +122,29 @@
}
},
{
"name": "tracy-time",
"displayName": "Build for tracy time profiling",
"description": "Build for tracy time profiling",
"binaryDir": "build-tracy-time",
"name": "tracy-time-instrumented",
"displayName": "Build for tracy time profiling via instrumentation",
"description": "Build for tracy time profiling via instrumentation",
"binaryDir": "build-tracy-time-instrumented",
"inherits": "clang16",
"cacheVariables": {
"ENABLE_TRACY_TIME_INSTRUMENTED": "ON"
}
},
{
"name": "tracy-time-sampled",
"displayName": "Build for tracy time profiling via sampling",
"description": "Build for tracy time profiling via sampling",
"binaryDir": "build-tracy-time-sampled",
"inherits": "default",
"environment": {
"CMAKE_BUILD_TYPE": "RelWithDebInfo",
"CFLAGS": "-g -fno-omit-frame-pointer",
"CXXFLAGS": "-g -fno-omit-frame-pointer",
"LDFLAGS": "-g -fno-omit-frame-pointer -rdynamic"
},
"cacheVariables": {
"ENABLE_TRACY": "ON",
"TRACY_PROFILE_TIME": "ON"
"ENABLE_TRACY": "ON"
}
},
{
Expand Down Expand Up @@ -497,9 +506,14 @@
"configurePreset": "tracy-memory"
},
{
"name": "tracy-time",
"name": "tracy-time-instrumented",
"inherits": "default",
"configurePreset": "tracy-time-instrumented"
},
{
"name": "tracy-time-sampled",
"inherits": "default",
"configurePreset": "tracy-time"
"configurePreset": "tracy-time-sampled"
},
{
"name": "clang16-pic",
Expand Down
4 changes: 2 additions & 2 deletions barretenberg/cpp/scripts/benchmark_wasm.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#!/usr/bin/env bash
set -eu

BENCHMARK=${1:-goblin_bench}
COMMAND=${2:-./bin/$BENCHMARK}
BENCHMARK=${1:-client_ivc_bench}
COMMAND=${2:-./bin/$BENCHMARK --benchmark_filter=ClientIVCBench/Full/6}
HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}

# Move above script dir.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,30 +13,31 @@ USER=${1:-$USER}
BOX=$USER-box
BENCHMARK=${2:-client_ivc_bench}
COMMAND=${3:-./bin/$BENCHMARK --benchmark_filter=ClientIVCBench/Full/6"\$"}

HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}
# Can also set PRESET=tracy-gates env variable
PRESET=${PRESET:-tracy-memory}

ssh $BOX "
set -eux ;
cd ~/aztec-packages/barretenberg/cpp/ ;
cmake --preset $PRESET && cmake --build --preset $PRESET --target $BENCHMARK ;
! [ -d ~/tracy ] && git clone https://github.com/wolfpld/tracy ~/tracy ;
cd ~/tracy/capture ;
git checkout 075395620a504c0cdcaf9bab3d196db16a043de7 ;
sudo apt-get install -y libdbus-1-dev libdbus-glib-1-dev ;
mkdir -p build && cd build && cmake .. && make -j ;
sudo apt-get install -y libdbus-1-dev libdbus-glib-1-dev libtbb-dev libfreetype-dev ;
mkdir -p build && cd build && cmake -DCMAKE_MESSAGE_LOG_LEVEL=Warning .. && make -j ;
cd ~/aztec-packages/barretenberg/cpp/ ;
cmake -DCMAKE_MESSAGE_LOG_LEVEL=Warning --preset $PRESET && cmake --build --preset $PRESET --target $BENCHMARK ;
./tracy-capture -a 127.0.0.1 -f -o trace-$BENCHMARK & ;
sleep 0.1 ;
cd ~/aztec-packages/barretenberg/cpp/build-$PRESET ;
ninja $BENCHMARK ;
$COMMAND ;
HARDWARE_CONCURRENCY=$HARDWARE_CONCURRENCY $COMMAND ;
" &

wait # TODO(AD) hack - not sure why needed
! [ -d ~/tracy ] && git clone https://github.com/wolfpld/tracy ~/tracy
cd ~/tracy
git checkout 075395620a504c0cdcaf9bab3d196db16a043de7 # release 0.11.0
cmake -B profiler/build -S profiler -DCMAKE_BUILD_TYPE=Release
cmake -DCMAKE_MESSAGE_LOG_LEVEL=Warning -B profiler/build -S profiler -DCMAKE_BUILD_TYPE=Release
cmake --build profiler/build --parallel
scp $BOX:/mnt/user-data/$USER/tracy/capture/build/trace-$BENCHMARK .
~/tracy/profiler/build/tracy-profiler trace-$BENCHMARK
36 changes: 36 additions & 0 deletions barretenberg/cpp/scripts/profile_tracy_local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/usr/bin/env bash
# Collect a profile completely locally, i.e., without using any remote machine for building or capturing.
#
# Usage: profile_tracy_local.sh [USER] [BENCHMARK] [COMMAND]
#   USER       first positional argument; kept so BENCHMARK and COMMAND stay in positions 2 and 3.
#   BENCHMARK  build target to profile (default: client_ivc_bench).
#   COMMAND    command line run from the build directory (default: the benchmark with a ClientIVC filter).
# Environment:
#   HARDWARE_CONCURRENCY  passed through to COMMAND (default: 16).
#   PRESET                cmake preset to configure/build with (default: tracy-time-sampled).

set -eux
USER=${1:-$USER}
BENCHMARK=${2:-client_ivc_bench}
COMMAND=${3:-./bin/$BENCHMARK --benchmark_filter=ClientIVCBench/Full/6}
HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}
PRESET=${PRESET:-tracy-time-sampled}

# Fetch tracy once, pin it to a known commit, and build the profiler UI.
! [ -d ~/tracy ] && git clone https://github.com/wolfpld/tracy ~/tracy
cd ~/tracy
git checkout 075395620a504c0cdcaf9bab3d196db16a043de7 # release 0.11.0
cmake -B profiler/build -S profiler -DCMAKE_BUILD_TYPE=Release
cmake --build profiler/build --parallel

# Configure and build the benchmark with the requested preset.
cd ~/aztec-packages/barretenberg/cpp/
cmake --preset "$PRESET" -DCMAKE_MESSAGE_LOG_LEVEL=Warning && cmake --build --preset "$PRESET" --target "$BENCHMARK"

# Build the capture tool from the checkout prepared above (already cloned and pinned).
cd ~/tracy/capture
mkdir -p build && cd build && cmake .. -DCMAKE_MESSAGE_LOG_LEVEL=Warning && make -j

# Start the capture in the background and remember its PID so we can wait for the trace to be written.
./tracy-capture -a 127.0.0.1 -f -o "../trace-$BENCHMARK" &
capture_pid=$!
sleep 0.1
cd ~/aztec-packages/barretenberg/cpp/build-"$PRESET"/

# Run the COMMAND with sudo if PRESET is 'tracy-time-sampled'.
# $COMMAND is intentionally unquoted: it holds a program plus its arguments.
if [ "$PRESET" = "tracy-time-sampled" ]; then
    sudo HARDWARE_CONCURRENCY=$HARDWARE_CONCURRENCY $COMMAND
else
    HARDWARE_CONCURRENCY=$HARDWARE_CONCURRENCY $COMMAND
fi

# Let the capture process finish before opening the trace, so the profiler does not
# read a file that is still being written.
wait "$capture_pid" || echo "tracy-capture exited with a non-zero status" >&2

~/tracy/profiler/build/tracy-profiler ~/tracy/capture/"trace-$BENCHMARK"
19 changes: 19 additions & 0 deletions barretenberg/cpp/scripts/profile_wasm_samply.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# This is to be run locally not in a container, so the user must handle samply installation.
#
# Usage: profile_wasm_samply.sh [BENCHMARK] [COMMAND]
#   BENCHMARK  build target to profile (default: client_ivc_bench).
#   COMMAND    command line handed to wasmtime (default: the benchmark with a ClientIVC filter).
# Environment:
#   HARDWARE_CONCURRENCY  forwarded into the wasmtime environment (default: 16).
set -eu

BENCHMARK=${1:-client_ivc_bench}
COMMAND=${2:-./bin/$BENCHMARK --benchmark_filter=ClientIVCBench/Full/6}
HARDWARE_CONCURRENCY=${HARDWARE_CONCURRENCY:-16}

# Fail before the build if a required tool is missing, rather than after it.
for tool in samply wasmtime; do
    if ! command -v "$tool" > /dev/null 2>&1; then
        echo "error: '$tool' not found in PATH; install it before running this script" >&2
        exit 1
    fi
done

# Move above script dir.
cd "$(dirname "$0")/.."

# Configure and build.
cmake --preset wasm-threads -DCMAKE_MESSAGE_LOG_LEVEL=Warning
cmake --build --preset wasm-threads --target "$BENCHMARK"

cd build-wasm-threads
# Consistency with _wasm.sh targets / shorter $COMMAND.
cp "./bin/$BENCHMARK" .
# $COMMAND is intentionally unquoted: it holds a program plus its arguments.
samply record wasmtime run --profile=perfmap --env HARDWARE_CONCURRENCY="$HARDWARE_CONCURRENCY" -Wthreads=y -Sthreads=y --dir=.. $COMMAND
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,8 @@ Prover get_prover(void (*test_circuit_function)(typename Prover::Flavor::Circuit
Composer composer;
return composer.create_prover(builder);
} else {
#ifdef TRACY_MEMORY
ZoneScopedN("creating prover");
#endif
PROFILE_THIS_NAME("creating prover");

return Prover(builder);
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ void perform_ivc_accumulation_rounds(size_t NUM_CIRCUITS,
for (size_t circuit_idx = 0; circuit_idx < NUM_CIRCUITS; ++circuit_idx) {
MegaCircuitBuilder circuit;
{
BB_OP_COUNT_TIME_NAME("construct_circuits");
PROFILE_THIS_NAME("construct_circuits");
circuit = circuit_producer.create_next_circuit(ivc);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ template <class Curve> class CommitmentKey {
*/
Commitment commit(PolynomialSpan<const Fr> polynomial)
{
BB_OP_COUNT_TIME();
PROFILE_THIS();
// We must have a power-of-2 SRS points *after* subtracting by start_index.
const size_t consumed_srs = numeric::round_up_power_2(polynomial.size()) + polynomial.start_index;
auto srs = srs::get_crs_factory<Curve>()->get_prover_crs(consumed_srs);
Expand Down Expand Up @@ -120,7 +120,7 @@ template <class Curve> class CommitmentKey {
*/
Commitment commit_sparse(PolynomialSpan<const Fr> polynomial)
{
BB_OP_COUNT_TIME();
PROFILE_THIS();
const size_t poly_size = polynomial.size();
ASSERT(polynomial.end_index() <= srs->get_monomial_size());

Expand Down
2 changes: 1 addition & 1 deletion barretenberg/cpp/src/barretenberg/common/mem.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "barretenberg/common/mem.hpp"

#ifdef TRACY_ENABLE
#ifdef TRACY_MEMORY
void* operator new(std::size_t count)
{
// NOLINTBEGIN(cppcoreguidelines-no-malloc)
Expand Down
23 changes: 14 additions & 9 deletions barretenberg/cpp/src/barretenberg/common/op_count.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,18 @@

#include <memory>
#include <tracy/Tracy.hpp>

// PROFILE_THIS() / PROFILE_THIS_NAME(name): profiling markers whose expansion is chosen at
// compile time from the three cases below.
#ifdef BB_USE_OP_COUNT_TIME_ONLY
// BB_USE_OP_COUNT_TIME_ONLY is defined: forward to BB_OP_COUNT_TIME_NAME, using the enclosing
// function's name (__func__) for PROFILE_THIS() and the supplied name for PROFILE_THIS_NAME().
#define PROFILE_THIS() BB_OP_COUNT_TIME_NAME(__func__)
#define PROFILE_THIS_NAME(name) BB_OP_COUNT_TIME_NAME(name)
#elif defined TRACY_INSTRUMENTED
// TRACY_INSTRUMENTED is defined: expand to Tracy's ZoneScopedN with the same choice of name.
#define PROFILE_THIS() ZoneScopedN(__func__)
#define PROFILE_THIS_NAME(name) ZoneScopedN(name)
#else
// Neither is defined: expand to a no-op expression, so call sites still end in a semicolon.
#define PROFILE_THIS() (void)0
#define PROFILE_THIS_NAME(name) (void)0
#endif

#ifndef BB_USE_OP_COUNT
// require a semicolon to appease formatters
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
Expand All @@ -12,18 +24,11 @@
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_CYCLES_NAME(name) (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_CYCLES() (void)0
#ifndef TRACY_TIME
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME_NAME(name) (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME() (void)0
#else
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME_NAME(name) ZoneScopedN(name)
#define BB_OP_COUNT_CYCLES() (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME() BB_OP_COUNT_TIME_NAME(__func__)
#endif
#define BB_OP_COUNT_TIME() (void)0
#else
/**
* Provides an abstraction that counts operations based on function names.
Expand Down
9 changes: 6 additions & 3 deletions barretenberg/cpp/src/barretenberg/common/slab_allocator.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#include "slab_allocator.hpp"
#include <barretenberg/common/assert.hpp>
#include <barretenberg/common/log.hpp>
#include <barretenberg/common/mem.hpp>
#include "barretenberg/common/assert.hpp"
#include "barretenberg/common/log.hpp"
#include "barretenberg/common/mem.hpp"
#include "barretenberg/common/op_count.hpp"
#include <cstddef>
#include <numeric>
#include <unordered_map>
Expand Down Expand Up @@ -211,6 +212,8 @@ void init_slab_allocator(size_t circuit_subgroup_size)

std::shared_ptr<void> get_mem_slab(size_t size)
{
PROFILE_THIS();

return allocator.get(size);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ template <class Fq, class Fr, class T>
std::vector<affine_element<Fq, Fr, T>> element<Fq, Fr, T>::batch_mul_with_endomorphism(
const std::span<const affine_element<Fq, Fr, T>>& points, const Fr& scalar) noexcept
{
BB_OP_COUNT_TIME();
PROFILE_THIS();
typedef affine_element<Fq, Fr, T> affine_element;
const size_t num_points = points.size();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ pippenger_runtime_state<Curve>::pippenger_runtime_state(const size_t num_initial
, bucket_empty_status(reinterpret_cast<bool*>(aligned_alloc(64, num_threads * num_buckets * sizeof(bool))))
, round_counts(reinterpret_cast<uint64_t*>(aligned_alloc(32, MAX_NUM_ROUNDS * sizeof(uint64_t))))
{
PROFILE_THIS();

using Fq = typename Curve::BaseField;
using AffineElement = typename Curve::AffineElement;

Expand All @@ -51,6 +53,7 @@ pippenger_runtime_state<Curve>::pippenger_runtime_state(const size_t num_initial

const size_t points_per_thread = static_cast<size_t>(num_points) / num_threads;
parallel_for(num_threads, [&](size_t i) {
PROFILE_THIS_NAME("memset in Pippenger runtime state creation");
const size_t thread_offset = i * points_per_thread;
memset(reinterpret_cast<void*>(point_pairs_1 + thread_offset + (i * 16)),
0,
Expand Down Expand Up @@ -96,6 +99,8 @@ pippenger_runtime_state<Curve>::pippenger_runtime_state(pippenger_runtime_state&
, round_counts(other.round_counts)

{
PROFILE_THIS();

other.point_schedule = nullptr;
other.skew_table = nullptr;
other.point_pairs_1 = nullptr;
Expand All @@ -111,6 +116,8 @@ template <typename Curve>
pippenger_runtime_state<Curve>& pippenger_runtime_state<Curve>::operator=(
pippenger_runtime_state<Curve>&& other) noexcept
{
PROFILE_THIS();

if (skew_table != nullptr) {
aligned_free(skew_table);
}
Expand Down Expand Up @@ -164,6 +171,8 @@ template <typename Curve>
affine_product_runtime_state<Curve> pippenger_runtime_state<Curve>::get_affine_product_runtime_state(
const size_t num_threads, const size_t thread_index)
{
PROFILE_THIS();

const auto points_per_thread = static_cast<size_t>(num_points / num_threads);
const auto num_buckets =
static_cast<size_t>(1U << scalar_multiplication::get_optimal_bucket_width(static_cast<size_t>(num_points) / 2));
Expand All @@ -181,6 +190,8 @@ affine_product_runtime_state<Curve> pippenger_runtime_state<Curve>::get_affine_p

template <typename Curve> pippenger_runtime_state<Curve>::~pippenger_runtime_state() noexcept
{
PROFILE_THIS();

if (skew_table != nullptr) {
aligned_free(skew_table);
}
Expand Down
Loading

0 comments on commit 1c008d9

Please sign in to comment.