Skip to content
This repository has been archived by the owner on Nov 25, 2024. It is now read-only.

add benchmark #65

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ option(BUILD_SHARED_LIBS "Build libwholegraph shared libraries" ON)
option(CMAKE_CUDA_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF)
option(BUILD_TESTS "Configure CMake to build tests" ON)
option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF)
option(BUILD_BENCHMARKS "Configure CMake to build benchmark" ON)

##############################################################################
# - Set options based on user defined one -----------------------------------
Expand Down Expand Up @@ -203,6 +204,11 @@ if(BUILD_TESTS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
add_subdirectory(tests)
endif()

# Build the micro-benchmarks only when enabled and when this project is the
# top-level CMake project (skipped when consumed via add_subdirectory()).
if(BUILD_BENCHMARKS AND CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
  add_subdirectory(bench)
endif()

##############################################################################
# - code checker -------------------------------------------------------------

Expand Down
70 changes: 70 additions & 0 deletions cpp/bench/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#=============================================================================
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#=============================================================================
# BUILD_BENCHMARKS is declared as an option() in the parent cpp/CMakeLists.txt.
message(VERBOSE "WHOLEGRAPH: Building wholegraph C++ benchmarks: ${BUILD_BENCHMARKS}")

# ConfigureBench(NAME <target> PATH <source>... [TARGETS ...] [CONFIGURATIONS ...])
#
# Creates one benchmark executable and its install rule.
#   NAME - name of the benchmark target to create (required)
#   PATH - source files compiled into the benchmark (required)
# TARGETS and CONFIGURATIONS are accepted for call-site compatibility but are
# currently unused.
function(ConfigureBench)

  set(options OPTIONAL)
  set(oneValueArgs NAME)
  set(multiValueArgs PATH TARGETS CONFIGURATIONS)
  # PARSE_ARGV handles empty and semicolon-containing arguments correctly,
  # unlike the legacy ${ARGN} form.
  cmake_parse_arguments(PARSE_ARGV 0 ConfigureBench
                        "${options}" "${oneValueArgs}" "${multiValueArgs}")

  if(NOT ConfigureBench_NAME)
    message(FATAL_ERROR "ConfigureBench: NAME is required")
  endif()
  set(BENCH_NAME ${ConfigureBench_NAME})

  add_executable(${BENCH_NAME} ${ConfigureBench_PATH})

  target_include_directories(${BENCH_NAME} PRIVATE "$<BUILD_INTERFACE:${WHOLEGRAPH_SOURCE_DIR}>/src")

  # Link the imported Threads target instead of the raw 'pthread' string so the
  # correct thread flags/library are chosen portably.
  find_package(Threads REQUIRED)
  target_link_libraries(
    ${BENCH_NAME}
    PRIVATE wholegraph
            raft::raft
            rmm::rmm
            Threads::Threads
  )

  set_target_properties(
    ${BENCH_NAME}
    PROPERTIES # set target compile options
               INSTALL_RPATH "\$ORIGIN/../../../lib"
               CXX_STANDARD 17
               CXX_STANDARD_REQUIRED ON
               CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}"
               POSITION_INDEPENDENT_CODE ON
               RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${WHOLEGRAPH_BINARY_DIR}/gbench>"
               INTERFACE_POSITION_INDEPENDENT_CODE ON
  )
  # Warnings are errors for benchmark code, except deprecation warnings.
  target_compile_options(${BENCH_NAME} PUBLIC $<$<COMPILE_LANG_AND_ID:CXX,GNU,Clang>:-Wall -Werror
                                              -Wno-error=deprecated-declarations>)

  install(
    TARGETS ${BENCH_NAME}
    COMPONENT testing
    DESTINATION bin/gbench/libwholegraph
    EXCLUDE_FROM_ALL
  )

endfunction()

# Register each benchmark executable here.
if(BUILD_BENCHMARKS)
  ConfigureBench(
    NAME GATHER_SCATTER_BENCH
    PATH wholememory_ops/gather_scatter_bench.cu
         common/wholegraph_benchmark.cpp
  )
endif()
132 changes: 132 additions & 0 deletions cpp/bench/common/wholegraph_benchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
 * Copyright (c) 2023, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "wholegraph_benchmark.hpp"

#include "wholememory/communicator.hpp"

#include <wholememory/tensor_description.h>
#include <wholememory/wholememory.h>

#include <cstdint>
#include <experimental/functional>
#include <experimental/random>
#include <functional>
#include <random>
#include <string>
#include <vector>

namespace wholegraph::bench {

// Fill the host array `indices` with uniformly random values in
// [0, max_indices). Writes indice_desc.size elements starting at
// indice_desc.storage_offset. Each call is freshly seeded from
// std::random_device, so consecutive calls produce different sequences
// (matching the previous std::experimental::reseed() behavior).
//
// Uses the standard <random> facilities instead of the non-portable
// Library Fundamentals TS <experimental/random>.
template <typename IndexT>
void host_get_random_integer_indices(void* indices,
                                     wholememory_array_description_t indice_desc,
                                     int64_t max_indices)
{
  IndexT* indices_ptr = static_cast<IndexT*>(indices);
  std::random_device rd;
  std::mt19937_64 gen(rd());
  std::uniform_int_distribution<IndexT> dist(0, static_cast<IndexT>(max_indices - 1));
  for (int64_t i = 0; i < indice_desc.size; i++) {
    indices_ptr[i + indice_desc.storage_offset] = dist(gen);
  }
}

// Randomly initialize an index array on the host, dispatching on the element
// type recorded in indices_desc: WHOLEMEMORY_DT_INT uses 32-bit int, every
// other dtype falls through to int64_t.
void host_random_init_integer_indices(void* indices,
                                      wholememory_array_description_t indices_desc,
                                      int64_t max_indices)
{
  if (indices_desc.dtype == WHOLEMEMORY_DT_INT) {
    host_get_random_integer_indices<int>(indices, indices_desc, max_indices);
    return;
  }
  host_get_random_integer_indices<int64_t>(indices, indices_desc, max_indices);
}

// Measures the average per-call latency of run_fn across all ranks of wm_comm
// and prints min/max/avg for every metric in `meter` (rank 0 only).
//
// Phases, each separated by barrier_fn() so ranks stay in lock-step:
//   1. warm-up: run_fn is repeated until meter.warmup_seconds has elapsed,
//      with a cudaDeviceSynchronize after every call.
//   2. timed runs: run_fn is called up to meter.run_count times, capped at
//      meter.max_run_seconds of wall-clock time.
//   3. reduction: each rank's scaled metric is allgather'ed; rank 0 reports.
void MultiProcessMeasurePerformance(std::function<void()> run_fn,
                                    wholememory_comm_t& wm_comm,
                                    const PerformanceMeter& meter,
                                    const std::function<void()>& barrier_fn)
{
  barrier_fn();
  // warm up: repeat run_fn until warmup_seconds of wall-clock time has passed
  struct timeval tv_warmup_s;
  gettimeofday(&tv_warmup_s, nullptr);
  int64_t target_warmup_time = 1000LL * 1000LL * meter.warmup_seconds;
  while (true) {
    struct timeval tv_warmup_c;
    gettimeofday(&tv_warmup_c, nullptr);
    int64_t time_warmup = TIME_DIFF_US(tv_warmup_s, tv_warmup_c);
    if (time_warmup >= target_warmup_time) break;
    run_fn();
    WHOLEMEMORY_CHECK_NOTHROW(cudaDeviceSynchronize() == cudaSuccess);
  }
  WHOLEMEMORY_CHECK_NOTHROW(cudaDeviceSynchronize() == cudaSuccess);
  barrier_fn();

  // timed runs
  struct timeval tv_run_s, tv_run_e;
  int64_t max_run_us = 1000LL * 1000LL * meter.max_run_seconds;
  gettimeofday(&tv_run_s, nullptr);
  int real_run_count = 0;
  for (int i = 0; i < meter.run_count; i++) {
    run_fn();
    real_run_count++;
    struct timeval tv_run_c;
    gettimeofday(&tv_run_c, nullptr);
    int64_t time_run_used = TIME_DIFF_US(tv_run_s, tv_run_c);
    // NOTE(review): the `real_run_count >= meter.run_count` clause is
    // redundant with the loop bound, and the optional per-iteration sync
    // below runs AFTER this elapsed-time check, so an interval measured with
    // meter.sync set may still include queued-but-unfinished device work
    // until the unconditional sync after the loop — confirm intended.
    if (time_run_used >= max_run_us || real_run_count >= meter.run_count) break;
    if (meter.sync) { WHOLEMEMORY_CHECK_NOTHROW(cudaDeviceSynchronize() == cudaSuccess); }
  }
  WHOLEMEMORY_CHECK_NOTHROW(cudaDeviceSynchronize() == cudaSuccess);
  gettimeofday(&tv_run_e, nullptr);
  int64_t real_time_used_us = TIME_DIFF_US(tv_run_s, tv_run_e);
  // average wall-clock time of one run_fn call, in microseconds
  double single_run_time_us = real_time_used_us;
  single_run_time_us /= real_run_count;
  barrier_fn();

  for (size_t i = 0; i < meter.metrics_.size(); i++) {
    // Scale the raw metric by the measured time:
    //   invert == true  : value * seconds-per-run  (e.g. time per unit of work)
    //   invert == false : value / seconds-per-run  (e.g. throughput per second)
    double metric_value = meter.metrics_[i].value;
    if (meter.metrics_[i].invert) {
      metric_value *= single_run_time_us;
      metric_value /= 1e6;
    } else {
      metric_value /= single_run_time_us;
      metric_value *= 1e6;
    }

    // Gather this rank's scaled metric from every rank, then reduce.
    std::vector<double> recv_vec(wm_comm->world_size);
    wm_comm->host_allgather(&metric_value, recv_vec.data(), 1, WHOLEMEMORY_DT_DOUBLE);
    double min_metric, max_metric, avg_metric;
    min_metric = max_metric = recv_vec[0];
    avg_metric = 0.0;
    for (int j = 0; j < wm_comm->world_size; j++) {
      min_metric = std::min(min_metric, recv_vec[j]);
      max_metric = std::max(max_metric, recv_vec[j]);
      avg_metric += recv_vec[j];
    }
    avg_metric /= wm_comm->world_size;
    // Only rank 0 prints, so the report appears once per job.
    // NOTE(review): the ",," in the format string looks like a typo.
    if (wm_comm->world_rank == 0) {
      fprintf(stderr,
              "== Metric: %20s: min=%.2lf %s,, max=%.2lf %s,, avg=%.2lf %s\n",
              meter.metrics_[i].name.c_str(),
              min_metric,
              meter.metrics_[i].unit.c_str(),
              max_metric,
              meter.metrics_[i].unit.c_str(),
              avg_metric,
              meter.metrics_[i].unit.c_str());
    }
  }
}

} // namespace wholegraph::bench
108 changes: 108 additions & 0 deletions cpp/bench/common/wholegraph_benchmark.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cstdint>
#include <cuda_runtime.h>
#include <sys/time.h>

#include <functional>
#include <string>
#include <vector>

#include "error.hpp"

#include <wholememory/tensor_description.h>
#include <wholememory/wholememory.h>
namespace wholegraph::bench {

// Elapsed microseconds from timeval TVS to TVE.
// Computed in signed 64-bit: with the previous 1000ULL factors, a TVE earlier
// than TVS wrapped to a huge unsigned value instead of going negative.
// Macro arguments are parenthesized so expression arguments expand safely.
#define TIME_DIFF_US(TVS, TVE)                                          \
  ((int64_t)((TVE).tv_sec - (TVS).tv_sec) * 1000LL * 1000LL +           \
   (int64_t)((TVE).tv_usec - (TVS).tv_usec))

void host_random_init_integer_indices(void* indices,
wholememory_array_description_t indices_desc,
int64_t max_indices);

// One named benchmark metric, e.g. ("bandwidth", "GB/s", bytes_moved, false).
// `invert` selects how MultiProcessMeasurePerformance scales `value` by the
// measured run time: true multiplies by seconds, false divides by seconds.
struct Metric {
  // Members are set via the initializer list (idiomatic; avoids
  // default-construct-then-assign of the std::string members).
  Metric(const std::string& metrics_name,
         const std::string& metrics_unit,
         const double metrics_value,
         bool inv)
    : name(metrics_name), unit(metrics_unit), value(metrics_value), invert(inv)
  {
  }
  std::string name;   // human-readable metric name used in the report
  std::string unit;   // unit string printed after each value
  double value;       // raw per-run quantity before time scaling
  bool invert;        // true: scale as value*time; false: value/time
};

// Fluent configuration object consumed by MultiProcessMeasurePerformance.
// Every setter returns *this so calls can be chained:
//   PerformanceMeter().SetName("gather").SetRunCount(50).SetSync();
struct PerformanceMeter {
  // --- configuration state (declaration order preserved; defaults shown) ---
  bool sync = false;             // sync the device after each timed run
  float warmup_seconds = 0.05f;  // minimum warm-up duration in seconds
  std::vector<Metric> metrics_;  // metrics reported for each measurement
  int run_count = 100;           // maximum number of timed runs
  float max_run_seconds = 10;    // wall-clock cap on the timed phase
  std::string name;              // label for this measurement

  // --- fluent setters ---
  PerformanceMeter& SetSync()
  {
    sync = true;
    return *this;
  }

  PerformanceMeter& SetWarmupTime(float w)
  {
    warmup_seconds = w;
    return *this;
  }

  PerformanceMeter& AddMetrics(const std::string& metrics_name,
                               const std::string& unit,
                               double value,
                               bool inv = false)
  {
    metrics_.emplace_back(metrics_name, unit, value, inv);
    return *this;
  }

  PerformanceMeter& SetRunCount(int count)
  {
    run_count = count;
    return *this;
  }

  PerformanceMeter& SetMaxRunSeconds(float sec)
  {
    max_run_seconds = sec;
    return *this;
  }

  PerformanceMeter& SetName(const std::string& n)
  {
    name = n;
    return *this;
  }
};

void MultiProcessMeasurePerformance(std::function<void()> run_fn,
wholememory_comm_t& wm_comm,
const PerformanceMeter& meter,
const std::function<void()>& barrier_fn);

} // namespace wholegraph::bench
Loading
Loading