Skip to content

Commit

Permalink
Add fork/join latency benchmark
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 725126942
  • Loading branch information
jan-wassenberg authored and copybara-github committed Feb 10, 2025
1 parent b0fe9a4 commit 7187c95
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 0 deletions.
1 change: 1 addition & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ cc_test(
"@googletest//:gtest_main",
"@highway//:hwy",
"@highway//:hwy_test_util",
"@highway//:nanobenchmark",
"@highway//:thread_pool",
],
)
Expand Down
66 changes: 66 additions & 0 deletions util/threading_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@

#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "hwy/aligned_allocator.h"
#include "hwy/base.h" // HWY_ASSERT
#include "hwy/contrib/thread_pool/thread_pool.h"
#include "hwy/nanobenchmark.h"

namespace gcpp {
namespace {
Expand Down Expand Up @@ -251,5 +253,69 @@ TEST(ThreadingTest, TestParallelizeTwoRanges) {
}
}

// Value assigned to hwy::Params::max_evals in the BenchJoin test below.
// Governs duration of test; the smaller value avoids timeouts in debug builds.
#if HWY_IS_DEBUG_BUILD
constexpr size_t kMaxEvals = 4;
#else
constexpr size_t kMaxEvals = 8;
#endif

// Number of uint64_t elements in each per-thread slot of `outputs`, chosen so
// that one slot spans HWY_ALIGNMENT bytes (presumably to keep the elements
// written by different threads on separate cache lines - TODO confirm).
// The divisor is the element type of `outputs` (uint64_t) rather than size_t,
// so a slot is exactly HWY_ALIGNMENT bytes even where size_t is 32-bit.
static constexpr size_t kU64PerThread = HWY_ALIGNMENT / sizeof(uint64_t);

// One slot per possible logical processor. ForkJoin writes only the first
// element of a slot; BenchJoin checks that the remaining elements stay zero.
static uint64_t outputs[hwy::kMaxLogicalProcessors * kU64PerThread];

// Callback handed to nanobenchmark's Measure() by the BenchJoin test.
// `opaque` is the address of the hwy::ThreadPool to exercise and `in` is the
// upper bound passed to pool.Run(0, in, ...). Every invocation of the closure
// stores `in` into the first element of the calling thread's slot in
// `outputs`, which lets the test verify afterwards that work really happened.
// Returns `in` unchanged.
hwy::FuncOutput ForkJoin(const void* opaque, hwy::FuncInput in) {
  // The benchmark interface only passes a const pointer, but Run() is
  // invoked on a mutable pool, so constness has to be removed here.
  const auto* const_pool = static_cast<const hwy::ThreadPool*>(opaque);
  auto* mutable_pool = const_cast<hwy::ThreadPool*>(const_pool);

  // The task index is not needed; only the worker/thread index selects a slot.
  mutable_pool->Run(0, in, [&](uint64_t /*task*/, size_t thread) {
    outputs[thread * kU64PerThread] = in;
  });

  return in;
}

// Measures the latency of ForkJoin on the pools exposed by NestedPools
// (all packages, all clusters of package 0, and cluster 0 of package 0),
// first before and then after calling MaybeStartSpinning. Prints one line per
// result and checks the `outputs` array after each measurement.
TEST(ThreadingTest, BenchJoin) {
  constexpr size_t kInputs = 1;
  static hwy::FuncInput inputs[kInputs];

  // Benchmarks ForkJoin on `pool`, prints the timings prefixed by `caption`,
  // then verifies what the workers wrote.
  const auto measure = [&](hwy::ThreadPool& pool, const char* caption) {
    // Unpredictable1() keeps the input from being a compile-time constant.
    inputs[0] =
        static_cast<hwy::FuncInput>(hwy::Unpredictable1() * pool.NumWorkers());

    hwy::Params params;
    params.max_evals = kMaxEvals;
    hwy::Result results[kInputs];
    const uint8_t* const pool_arg = reinterpret_cast<const uint8_t*>(&pool);
    const size_t num_results =
        Measure(&ForkJoin, pool_arg, inputs, kInputs, results, params);

    for (size_t idx = 0; idx < num_results; ++idx) {
      const hwy::Result& result = results[idx];
      const int input_as_int = static_cast<int>(result.input);
      // Convert timer ticks to microseconds.
      const double microseconds =
          result.ticks / hwy::platform::InvariantTicksPerSecond() * 1E6;
      const double variability_percent = result.variability * 100.0;
      printf("%s: %5d: %6.2f us; MAD=%4.2f%%\n", caption, input_as_int,
             microseconds, variability_percent);
    }

    // Verify outputs to ensure the measured code is not a no-op: the first
    // element of each worker's slot holds the input, the rest stay zero.
    for (size_t lp = 0; lp < pool.NumWorkers(); ++lp) {
      HWY_ASSERT(outputs[lp * kU64PerThread] == pool.NumWorkers());
      for (size_t i = 1; i < kU64PerThread; ++i) {
        HWY_ASSERT(outputs[lp * kU64PerThread + i] == 0);
      }
    }
  };

  NestedPools pools(0);

  // Runs `measure` on each nesting level; the cluster-level pool is skipped
  // when it has only a single worker.
  const auto measure_all_levels = [&](const char* packages_caption,
                                      const char* clusters_caption,
                                      const char* in_cluster_caption) {
    measure(pools.AllPackages(), packages_caption);
    if (pools.AllClusters(0).NumWorkers() > 1) {
      measure(pools.AllClusters(0), clusters_caption);
    }
    measure(pools.Cluster(0, 0), in_cluster_caption);
  };

  measure_all_levels("\nblock packages", "\nblock clusters",
                     "\nblock in_cluster");

  Tristate use_spinning = Tristate::kTrue;
  pools.MaybeStartSpinning(use_spinning);

  measure_all_levels("\nspin packages", "\nspin clusters", "\nspin in_cluster");
}

} // namespace
} // namespace gcpp

0 comments on commit 7187c95

Please sign in to comment.