-
Notifications
You must be signed in to change notification settings - Fork 38
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a one workgroup argmax benchmark (#49)
This PR is based on #47. I opened a new one because the old one got stale.
- Loading branch information
Showing
6 changed files
with
414 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Copyright 2024 Advanced Micro Devices Inc. | ||
# Copyright 2020-2024 Google LLC | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# https://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
uvkc_glsl_shader_instance( | ||
NAME | ||
one_workgroup_argmax_loop_shader | ||
SRC | ||
"one_workgroup_argmax_loop.glsl" | ||
GLSLC_ARGS | ||
"--target-env=vulkan1.1" | ||
) | ||
|
||
uvkc_glsl_shader_instance( | ||
NAME | ||
one_workgroup_argmax_subgroup_shader | ||
SRC | ||
"one_workgroup_argmax_subgroup.glsl" | ||
GLSLC_ARGS | ||
"--target-env=vulkan1.1" | ||
) | ||
|
||
uvkc_cc_binary( | ||
NAME | ||
one_workgrop_argmax | ||
SRCS | ||
"one_workgroup_argmax_main.cc" | ||
DEPS | ||
::one_workgroup_argmax_loop_shader | ||
::one_workgroup_argmax_subgroup_shader | ||
benchmark::benchmark | ||
uvkc::benchmark::core | ||
uvkc::benchmark::main | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Argmax Benchmarks | ||
|
||
This directory contains microbenchmarks for evaluating different strategy to | ||
implement argmax. | ||
|
||
### `one_workgroup_argmax` | ||
|
||
Performs argmax using just one workgroup. The workgroup just contains one | ||
subgroup. This approach does not use any synchronization mechanisms. | ||
|
||
A subgroup uses either a single thread to loop over all elements or subgroup | ||
reduction operations involving all invocations. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# version 450 core | ||
#extension GL_EXT_control_flow_attributes : enable | ||
|
||
layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; | ||
|
||
layout(set=0, binding=0) buffer InputBuffer { float data[]; } Input; | ||
layout(set=0, binding=1) buffer OutputBuffer { uint data; } Output; | ||
|
||
layout(constant_id = 0) const uint totalCount = 1; // Total number of scalars | ||
|
||
// Each workgroup contains just one subgroup. | ||
|
||
void main() { | ||
uint laneID = gl_LocalInvocationID.x; | ||
|
||
if (laneID != 0) return; | ||
|
||
uint wgResult = 0; | ||
float wgMax = Input.data[0]; | ||
|
||
for (uint i = 1; i < totalCount; ++i) { | ||
float elem = Input.data[i]; | ||
if (elem > wgMax) { | ||
wgResult = i; | ||
wgMax = elem; | ||
} | ||
} | ||
|
||
Output.data = wgResult; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,282 @@ | ||
// Copyright 2024 Advanced Micro Devices Inc. | ||
// Copyright 2020-2024 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#include <chrono> | ||
#include <memory> | ||
#include <numeric> | ||
|
||
#include "absl/strings/str_cat.h" | ||
#include "absl/types/span.h" | ||
#include "benchmark/benchmark.h" | ||
#include "uvkc/benchmark/data_type_util.h" | ||
#include "uvkc/benchmark/main.h" | ||
#include "uvkc/benchmark/status_util.h" | ||
#include "uvkc/benchmark/vulkan_buffer_util.h" | ||
#include "uvkc/benchmark/vulkan_context.h" | ||
#include "uvkc/vulkan/device.h" | ||
#include "uvkc/vulkan/pipeline.h" | ||
|
||
using ::uvkc::benchmark::LatencyMeasureMode; | ||
using ::uvkc::vulkan::Pipeline; | ||
|
||
static const char kBenchmarkName[] = "one_workgroup_argmax"; | ||
|
||
static const uint32_t kLoopShader[] = { | ||
#include "one_workgroup_argmax_loop_shader_spirv_instance.inc" | ||
}; | ||
|
||
static const uint32_t kSubgroupShader[] = { | ||
#include "one_workgroup_argmax_subgroup_shader_spirv_instance.inc" | ||
}; | ||
|
||
struct ShaderCode { | ||
const char *name; // Test case name | ||
const uint32_t *code; // SPIR-V code | ||
size_t code_num_bytes; // Number of bytes for SPIR-V code | ||
int workgroup_size; // Number of invocations per workgroup | ||
}; | ||
|
||
ShaderCode kShaders[] = { | ||
{"loop", kLoopShader, sizeof(kLoopShader), 32}, | ||
{"subgroup", kSubgroupShader, sizeof(kSubgroupShader), 32}, | ||
}; | ||
|
||
static void Argmax(::benchmark::State &state, ::uvkc::vulkan::Device *device, | ||
const ::uvkc::benchmark::LatencyMeasure *latency_measure, | ||
const uint32_t *code, size_t code_num_words, | ||
size_t total_elements, int workgroup_size) { | ||
//===-------------------------------------------------------------------===/ | ||
// Create shader module, pipeline, and descriptor sets | ||
//===-------------------------------------------------------------------===/ | ||
|
||
BM_CHECK_OK_AND_ASSIGN(auto shader_module, | ||
device->CreateShaderModule(code, code_num_words)); | ||
BM_CHECK_OK_AND_ASSIGN(auto descriptor_pool, | ||
device->CreateDescriptorPool(*shader_module)); | ||
BM_CHECK_OK_AND_ASSIGN(auto layout_set_map, | ||
descriptor_pool->AllocateDescriptorSets( | ||
shader_module->descriptor_set_layouts())); | ||
|
||
Pipeline::SpecConstant spec_constants[] = { | ||
{/*id=*/0, Pipeline::SpecConstant::Type::u32, | ||
static_cast<int32_t>(total_elements)}, | ||
{/*id=*/1, Pipeline::SpecConstant::Type::u32, workgroup_size}, | ||
}; | ||
BM_CHECK_OK_AND_ASSIGN( | ||
auto pipeline, device->CreatePipeline(*shader_module, "main", | ||
absl::MakeSpan(spec_constants, 2))); | ||
|
||
//===-------------------------------------------------------------------===/ | ||
// Create buffers | ||
//===-------------------------------------------------------------------===/ | ||
|
||
const size_t src_buffer_size = total_elements * sizeof(float); | ||
const size_t dst_buffer_size = sizeof(int); | ||
|
||
BM_CHECK_OK_AND_ASSIGN( | ||
auto src_buffer, | ||
device->CreateBuffer( | ||
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, | ||
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, src_buffer_size)); | ||
BM_CHECK_OK_AND_ASSIGN( | ||
auto dst_buffer, | ||
device->CreateBuffer( | ||
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | | ||
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, | ||
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, dst_buffer_size)); | ||
|
||
// Create a buffer for zeroing the destination buffer. | ||
BM_CHECK_OK_AND_ASSIGN( | ||
auto data_buffer, | ||
device->CreateBuffer( | ||
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | | ||
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, | ||
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, dst_buffer_size)); | ||
|
||
//===-------------------------------------------------------------------===/ | ||
// Set source buffer data | ||
//===-------------------------------------------------------------------===/ | ||
|
||
auto generate_float_data = [](size_t i) { | ||
return i == 550 ? float(i) : 1.0f; | ||
}; | ||
|
||
BM_CHECK_OK(::uvkc::benchmark::SetDeviceBufferViaStagingBuffer( | ||
device, src_buffer.get(), src_buffer_size, | ||
[&](void *ptr, size_t num_bytes) { | ||
float *src_float_buffer = reinterpret_cast<float *>(ptr); | ||
for (size_t i = 0; i < num_bytes / sizeof(float); i++) { | ||
src_float_buffer[i] = generate_float_data(i); | ||
} | ||
})); | ||
|
||
//===-------------------------------------------------------------------===/ | ||
// Dispatch | ||
//===-------------------------------------------------------------------===/ | ||
|
||
std::vector<::uvkc::vulkan::Device::BoundBuffer> bound_buffers = { | ||
{src_buffer.get(), /*set=*/0, /*binding=*/0}, | ||
{dst_buffer.get(), /*set=*/0, /*binding=*/1}, | ||
}; | ||
BM_CHECK_OK(device->AttachBufferToDescriptor( | ||
*shader_module, layout_set_map, | ||
{bound_buffers.data(), bound_buffers.size()})); | ||
|
||
BM_CHECK_EQ(shader_module->descriptor_set_layouts().size(), 1) | ||
<< "unexpected number of descriptor sets"; | ||
auto descriptor_set_layout = shader_module->descriptor_set_layouts().front(); | ||
|
||
std::vector<::uvkc::vulkan::CommandBuffer::BoundDescriptorSet> | ||
bound_descriptor_sets(1); | ||
bound_descriptor_sets[0].index = 0; | ||
bound_descriptor_sets[0].set = layout_set_map.at(descriptor_set_layout); | ||
BM_CHECK_OK_AND_ASSIGN(auto dispatch_cmdbuf, device->AllocateCommandBuffer()); | ||
|
||
BM_CHECK_OK(dispatch_cmdbuf->Begin()); | ||
dispatch_cmdbuf->BindPipelineAndDescriptorSets( | ||
*pipeline, {bound_descriptor_sets.data(), bound_descriptor_sets.size()}); | ||
dispatch_cmdbuf->Dispatch(1, 1, 1); | ||
BM_CHECK_OK(dispatch_cmdbuf->End()); | ||
BM_CHECK_OK(device->QueueSubmitAndWait(*dispatch_cmdbuf)); | ||
|
||
//===-------------------------------------------------------------------===/ | ||
// Verify destination buffer data | ||
//===-------------------------------------------------------------------===/ | ||
|
||
BM_CHECK_OK(::uvkc::benchmark::GetDeviceBufferViaStagingBuffer( | ||
device, dst_buffer.get(), dst_buffer_size, | ||
[&](void *ptr, size_t num_bytes) { | ||
int *dst_int_buffer = reinterpret_cast<int *>(ptr); | ||
float max = -10000.f; | ||
int idx = -1; | ||
for (size_t i = 0; i < total_elements; i++) { | ||
float data = generate_float_data(i); | ||
if (data > max) { | ||
idx = i; | ||
max = data; | ||
} | ||
}; | ||
BM_CHECK_EQ(dst_int_buffer[0], idx) | ||
<< "destination buffer element #0 has incorrect value: " | ||
"expected to be " | ||
<< idx << " but found " << dst_int_buffer[0]; | ||
})); | ||
|
||
//===-------------------------------------------------------------------===/ | ||
// Benchmarking | ||
//===-------------------------------------------------------------------===/ | ||
|
||
std::unique_ptr<::uvkc::vulkan::TimestampQueryPool> query_pool; | ||
bool use_timestamp = | ||
latency_measure->mode == LatencyMeasureMode::kGpuTimestamp; | ||
if (use_timestamp) { | ||
BM_CHECK_OK_AND_ASSIGN(query_pool, device->CreateTimestampQueryPool(2)); | ||
} | ||
|
||
BM_CHECK_OK_AND_ASSIGN(auto cmdbuf, device->AllocateCommandBuffer()); | ||
for (auto _ : state) { | ||
BM_CHECK_OK(cmdbuf->Begin()); | ||
if (use_timestamp) cmdbuf->ResetQueryPool(*query_pool); | ||
|
||
cmdbuf->BindPipelineAndDescriptorSets( | ||
*pipeline, | ||
{bound_descriptor_sets.data(), bound_descriptor_sets.size()}); | ||
|
||
if (use_timestamp) { | ||
cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0); | ||
} | ||
|
||
cmdbuf->Dispatch(1, 1, 1); | ||
|
||
if (use_timestamp) { | ||
cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, | ||
1); | ||
} | ||
|
||
BM_CHECK_OK(cmdbuf->End()); | ||
|
||
auto start_time = std::chrono::high_resolution_clock::now(); | ||
BM_CHECK_OK(device->QueueSubmitAndWait(*cmdbuf)); | ||
auto end_time = std::chrono::high_resolution_clock::now(); | ||
auto elapsed_seconds = | ||
std::chrono::duration_cast<std::chrono::duration<double>>(end_time - | ||
start_time); | ||
|
||
switch (latency_measure->mode) { | ||
case LatencyMeasureMode::kSystemDispatch: { | ||
state.SetIterationTime(elapsed_seconds.count() - | ||
latency_measure->overhead_seconds); | ||
} break; | ||
case LatencyMeasureMode::kSystemSubmit: { | ||
state.SetIterationTime(elapsed_seconds.count()); | ||
} break; | ||
case LatencyMeasureMode::kGpuTimestamp: { | ||
BM_CHECK_OK_AND_ASSIGN( | ||
double timestamp_seconds, | ||
query_pool->CalculateElapsedSecondsBetween(0, 1)); | ||
state.SetIterationTime(timestamp_seconds); | ||
} break; | ||
} | ||
|
||
BM_CHECK_OK(cmdbuf->Reset()); | ||
} | ||
|
||
state.SetBytesProcessed(state.iterations() * src_buffer_size); | ||
state.counters["FLOps"] = | ||
::benchmark::Counter(total_elements, | ||
::benchmark::Counter::kIsIterationInvariant | | ||
::benchmark::Counter::kIsRate, | ||
::benchmark::Counter::kIs1000); | ||
|
||
// Reset the command pool to release all command buffers in the benchmarking | ||
// loop to avoid draining GPU resources. | ||
BM_CHECK_OK(device->ResetCommandPool()); | ||
} | ||
|
||
namespace uvkc { | ||
namespace benchmark { | ||
|
||
absl::StatusOr<std::unique_ptr<VulkanContext>> CreateVulkanContext() { | ||
return CreateDefaultVulkanContext(kBenchmarkName); | ||
} | ||
|
||
bool RegisterVulkanOverheadBenchmark( | ||
const vulkan::Driver::PhysicalDeviceInfo &physical_device, | ||
vulkan::Device *device, double *overhead_seconds) { | ||
return false; | ||
} | ||
|
||
void RegisterVulkanBenchmarks( | ||
const vulkan::Driver::PhysicalDeviceInfo &physical_device, | ||
vulkan::Device *device, const LatencyMeasure *latency_measure) { | ||
const char *gpu_name = physical_device.v10_properties.deviceName; | ||
|
||
for (const auto &shader : kShaders) { | ||
for (size_t total_elements : {1 << 10, 1 << 12, 1 << 14, 1 << 16}) { | ||
std::string test_name = absl::StrCat( | ||
gpu_name, "/#elements=", total_elements, | ||
"/workgroup_size=", shader.workgroup_size, "/", shader.name); | ||
::benchmark::RegisterBenchmark(test_name.c_str(), Argmax, device, | ||
latency_measure, shader.code, | ||
shader.code_num_bytes / sizeof(uint32_t), | ||
total_elements, shader.workgroup_size) | ||
->UseManualTime() | ||
->Unit(::benchmark::kMicrosecond); | ||
} | ||
} | ||
} | ||
|
||
} // namespace benchmark | ||
} // namespace uvkc |
Oops, something went wrong.