-
Notifications
You must be signed in to change notification settings - Fork 41
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
5b983a5
commit 8652754
Showing
12 changed files
with
796 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
name: Benchmarks | ||
# Token scopes granted to this workflow. | ||
# NOTE(review): `statuses: read` and `pull-requests: write` carry no explanation here; presumably they serve the Buildkite artifact lookup and the benchmark comment respectively — confirm. | ||
permissions: | ||
contents: write # contents permission to update benchmark contents in gh-pages branch | ||
statuses: read | ||
deployments: write # deployments permission to deploy GitHub pages website | ||
pull-requests: write | ||
|
||
# Run for pull requests targeting main and for pushes to main, but only when
# a file that can influence the benchmark results has changed.
on:
  pull_request:
    branches:
      - main
    paths:
      - "src/**/*"
      - "ext/**/*"
      - "perf/**/*"
      - ".buildkite/**/*"
      - "Project.toml"
      - ".github/workflows/Benchmark.yml"
  push:
    branches:
      - main
    paths:
      - "src/**/*"
      - "ext/**/*"
      # Same directory the pull_request filter watches, so a change that
      # triggered the workflow on a PR also triggers it once merged.
      - "perf/**/*"
      # NOTE(review): kept for backward compatibility; drop it if the
      # repository has no benchmarks/ directory.
      - "benchmarks/**/*"
      - ".buildkite/**/*"
      - "Project.toml"
      - ".github/workflows/Benchmark.yml"
|
||
jobs:
  benchmark:
    # `&&` binds tighter than `||`, so the job runs when the head commit
    # message contains "[only benchmarks]", OR when it contains no other
    # "[only ..." tag, no "[skip benchmarks]" tag, and the pull request is not
    # a draft.
    # NOTE(review): on push events there is no pull_request object and on
    # pull_request events there is no head_commit; the expression relies on
    # GitHub's coercion of missing values — confirm against the expressions docs.
    if: ${{ contains(github.event.head_commit.message, '[only benchmarks]') || !contains(github.event.head_commit.message, '[only') && !contains(github.event.head_commit.message, '[skip benchmarks]') && github.event.pull_request.draft == false }}
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Fetch the artifacts produced by the Buildkite build; builds and jobs in
      # the listed states are ignored.
      - name: Download Buildkite Artifacts
        id: download
        uses: EnricoMi/download-buildkite-artifact-action@v1
        with:
          buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
          ignore_build_states: blocked,canceled,skipped,not_run,failed
          ignore_job_states: timed_out,failed
          output_path: artifacts

      - name: Locate Benchmarks Artifact
        id: locate
        if: ${{ steps.download.outputs.download-state == 'success' }}
        # Stop at the first match: `name=value` entries in $GITHUB_OUTPUT must
        # be a single line, so several matches would make the step fail.
        run: echo "path=$(find artifacts -type f -name benchmarkresults.json -print -quit 2>/dev/null)" >> "$GITHUB_OUTPUT"

      # Skipped when no benchmarkresults.json was found (empty or unset path).
      - name: Upload Benchmark Results
        if: ${{ steps.locate.outputs.path != '' }}
        uses: benchmark-action/github-action-benchmark@v1
        with:
          name: Metal Benchmarks
          tool: "julia"
          output-file-path: ${{ steps.locate.outputs.path }}
          benchmark-data-dir-path: ""
          github-token: ${{ secrets.GITHUB_TOKEN }}
          comment-always: true
          summary-always: true
          alert-threshold: "150%"
          fail-on-alert: false
          # Only store results for non-PR runs (i.e. pushes to main).
          auto-push: ${{ github.event_name != 'pull_request' }}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
results.json | ||
reference.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[deps] | ||
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" | ||
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" | ||
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" | ||
Metal = "dde4c033-4e86-420c-a63e-0dd931031962" | ||
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" | ||
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
# Array benchmarks: everything below is registered under SUITE["array"]. | ||
# NOTE(review): SUITE, rng and @async_benchmarkable are not defined in this file — presumably supplied by the script that includes it; confirm. | ||
group = addgroup!(SUITE, "array") | ||
|
||
# dimensions of the benchmark matrix (m*n = 512_000 elements) | ||
const m = 512 | ||
const n = 1000 | ||
|
||
# generate some arrays | ||
# gpu_mat is allocated with `undef`, so its contents are uninitialized. | ||
# gpu_vec, gpu_arr_3d and gpu_arr_4d are reshapes of gpu_mat with the same element count. | ||
cpu_mat = rand(rng, Float32, m, n) | ||
gpu_mat = MtlArray{Float32}(undef, size(cpu_mat)) | ||
gpu_vec = reshape(gpu_mat, length(gpu_mat)) | ||
gpu_arr_3d = reshape(gpu_mat, (m, 40, 25)) | ||
gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10)) | ||
gpu_mat_ints = MtlArray(rand(rng, Int, m, n)) | ||
gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) | ||
gpu_mat_bools = MtlArray(rand(rng, Bool, m, n)) | ||
gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) | ||
|
||
# allocate a one-element device array | ||
group["construct"] = @benchmarkable MtlArray{Int}(undef, 1) | ||
|
||
# out-of-place copy of the device matrix | ||
group["copy"] = @async_benchmarkable copy($gpu_mat) | ||
|
||
# second device matrix, used as the destination of the gpu_to_gpu copy | ||
gpu_mat2 = copy(gpu_mat) | ||
# in-place copies between cpu_mat and gpu_mat, one benchmark per direction | ||
let group = addgroup!(group, "copyto!") | ||
group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat) | ||
group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat) | ||
group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat) | ||
end | ||
|
||
# Indexing and search benchmarks, registered under the "iteration" subgroup. | ||
let group = addgroup!(group, "iteration") | ||
# scalar indexing of the first 10 elements, explicitly permitted via @allowscalar | ||
group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10] | ||
|
||
# indexing with a Bool mask of the same length | ||
group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] | ||
|
||
# findall on a Bool vector, and with a predicate on an Int vector | ||
let group = addgroup!(group, "findall") | ||
group["bool"] = @benchmarkable findall($gpu_vec_bools) | ||
group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints) | ||
end | ||
|
||
# findfirst on the same two inputs | ||
let group = addgroup!(group, "findfirst") | ||
group["bool"] = @benchmarkable findfirst($gpu_vec_bools) | ||
group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints) | ||
end | ||
|
||
# NOTE(review): only findmin is benchmarked; the trailing "findmax" remark presumably means findmax is covered by the same code path — confirm. | ||
let group = addgroup!(group, "findmin") # findmax | ||
group["1d"] = @async_benchmarkable findmin($gpu_vec) | ||
group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1) | ||
end | ||
end | ||
|
||
# The reverse benchmarks below are disabled (commented out); the reason is not stated in this file. | ||
# let group = addgroup!(group, "reverse") | ||
# group["1d"] = @async_benchmarkable reverse($gpu_vec) | ||
# group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1) | ||
# group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec) | ||
# group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1) | ||
# end | ||
|
||
# in-place broadcast fill; note that this overwrites gpu_mat with zeros when the benchmark runs | ||
group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 | ||
|
||
# no need to test inplace version, which performs the same operation (but with an alloc) | ||
let group = addgroup!(group, "accumulate") | ||
group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec) | ||
group["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1) | ||
end | ||
|
||
# Reduction benchmarks: full reduction of the vector ("1d") and reduction of the matrix along dims=1 ("2d"). | ||
let group = addgroup!(group, "reductions") | ||
let group = addgroup!(group, "reduce") | ||
group["1d"] = @async_benchmarkable reduce(+, $gpu_vec) | ||
group["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1) | ||
end | ||
|
||
# same reductions with a mapping function applied to each element first | ||
let group = addgroup!(group, "mapreduce") | ||
group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec) | ||
group["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1) | ||
end | ||
|
||
# used by sum, prod, minimum, maximum, all, any, count | ||
end | ||
|
||
# Random-number benchmarks: allocating (rand, randn) and in-place (rand!, randn!) variants on m*n elements. | ||
let group = addgroup!(group, "random") | ||
let group = addgroup!(group, "rand") | ||
group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n) | ||
group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n) | ||
end | ||
|
||
# in-place variants overwrite gpu_vec / gpu_vec_ints | ||
let group = addgroup!(group, "rand!") | ||
group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec) | ||
group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints) | ||
end | ||
|
||
# only Float32 is benchmarked for randn; the Int64 case is commented out | ||
let group = addgroup!(group, "randn") | ||
group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n) | ||
# group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n) | ||
end | ||
|
||
let group = addgroup!(group, "randn!") | ||
group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec) | ||
# group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints) | ||
end | ||
end | ||
|
||
# The sorting benchmarks below are disabled (commented out); the reason is not stated in this file. | ||
# let group = addgroup!(group, "sorting") | ||
# group["1d"] = @async_benchmarkable sort($gpu_vec) | ||
# group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) | ||
# group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) | ||
# end | ||
|
||
# permutedims on the 2-D, 3-D and 4-D shapes of the same data | ||
let group = addgroup!(group, "permutedims") | ||
group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1)) | ||
group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2)) | ||
group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3)) | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
module ByVal | ||
|
||
using Metal, BenchmarkTools, Random | ||
|
||
const threads = 256 | ||
|
||
# Element-wise addition kernel: the thread at linear position `idx` writes
# y[idx] = x1[idx] + x2[idx]; threads positioned past `n` do nothing.
function kernel_add_mat(n, x1, x2, y)
    idx = thread_position_in_grid_1d()
    idx > n && return
    @inbounds y[idx] = x1[idx] + x2[idx]
    return
end
|
||
# Pick the (a, b, c) triple that belongs to slice `indx_y` out of a flattened
# argument list holding one, two or three slices per operand.
# Any index other than the explicitly tested ones selects the last slice.
@inline function get_inputs3(indx_y, a, b, c)
    return (a, b, c)
end
@inline function get_inputs3(indx_y, a1, a2, b1, b2, c1, c2)
    indx_y == 1 && return (a1, b1, c1)
    return (a2, b2, c2)
end
@inline function get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3)
    if indx_y == 1
        return (a1, b1, c1)
    elseif indx_y == 2
        return (a2, b2, c2)
    else
        return (a3, b3, c3)
    end
end
|
||
# Addition kernel for several matrices at once. The operands arrive flattened
# in `vararg` (all x1 slices, then all x2 slices, then all y slices); the
# threadgroup's y position in the grid selects which slice this thread works on.
function kernel_add_mat_z_slices(n, vararg...)
    slice = threadgroup_position_in_grid_2d().y
    x1, x2, y = get_inputs3(slice, vararg...)
    idx = thread_position_in_grid_1d()
    idx > n && return
    @inbounds y[idx] = x1[idx] + x2[idx]
    return
end
|
||
# Add every slice of `x1` and `x2` into the matching slice of `y` with a single
# kernel launch. The slice arrays are splatted so each one is passed to the
# kernel as a separate argument.
function add_z_slices!(y, x1, x2)
    rows, cols = size(x1[1])  # element count is taken from the first slice
    len = rows * cols
    groups_per_slice = cld(len, threads)
    # 2-D grid: the first dimension covers one slice, the second has one entry per slice
    @metal groups = (groups_per_slice, length(x1)) threads = threads kernel_add_mat_z_slices(len, x1..., x2..., y...)
end
|
||
# Launch kernel_add_mat so that y = x1 + x2 element-wise for one matrix.
function add!(y, x1, x2)
    rows, cols = size(x1)
    len = rows * cols
    # enough threadgroups of `threads` threads to cover every element
    ngroups = cld(len, threads)
    @metal groups = (ngroups, 1) threads = threads kernel_add_mat(len, x1, x2, y)
end
|
||
# Build and run the by-value argument benchmarks. | ||
# Returns a BenchmarkGroup with a "reference" entry (single-matrix add) and one "slices=k" entry for each k in 1:num_z_slices. | ||
function main() | ||
results = BenchmarkGroup() | ||
|
||
num_z_slices = 3 | ||
Random.seed!(1) | ||
|
||
# candidate problem sizes; only one line is active at a time | ||
#m, n = 7, 5 # tiny to measure overhead | ||
#m, n = 521, 111 | ||
#m, n = 1521, 1111 | ||
#m, n = 3001, 1511 # prime numbers to test memory access correctness | ||
m, n = 3072, 1536 # 256 multiplier | ||
#m, n = 6007, 3001 # prime numbers to test memory access correctness | ||
|
||
# inputs: num_z_slices random Float32 matrices per operand, moved to the device with mtl; outputs have the same shape | ||
x1 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices] | ||
x2 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices] | ||
y1 = [similar(x1[1]) for i = 1:num_z_slices] | ||
|
||
# bare-bones reference: add a single matrix on the GPU | ||
results["reference"] = @benchmark Metal.@sync add!($y1[1], $x1[1], $x2[1]) | ||
|
||
# add 1, 2 and then 3 matrices with a single launch of the slices kernel | ||
for slices = 1:num_z_slices | ||
results["slices=$slices"] = @benchmark Metal.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices]) | ||
end | ||
|
||
# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them | ||
Metal.unsafe_free!.(x1) | ||
Metal.unsafe_free!.(x2) | ||
Metal.unsafe_free!.(y1) | ||
|
||
return results | ||
end | ||
|
||
end | ||
|
||
# Run the benchmarks right away; as the last expression, the returned BenchmarkGroup is the value of this file when it is included. | ||
ByVal.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# using GPUArrays | ||
|
||
# Kernel benchmarks: everything below is registered under SUITE["kernel"]. | ||
# NOTE(review): SUITE and @async_benchmarkable are not defined in this file — presumably supplied by the script that includes it; confirm. | ||
group = addgroup!(SUITE, "kernel") | ||
|
||
# launching a kernel that does no work (identity applied to nothing) | ||
group["launch"] = @benchmarkable @metal identity(nothing) | ||
|
||
# The occupancy benchmark is disabled (commented out), together with the GPUArrays import it needs. | ||
# group["occupancy"] = @benchmarkable begin | ||
# kernel = @metal launch=false identity(nothing) | ||
# GPUArrays.launch_heuristic(Metal.mtlArrayBackend(), kernel.f; elements=1, elements_per_thread=1) | ||
# return | ||
# end | ||
|
||
# 512x1000 source and destination arrays shared by the two indexing benchmarks | ||
src = Metal.rand(Float32, 512, 1000) | ||
dest = similar(src) | ||
# Copy kernel with bounds checks disabled: each thread copies the element at its linear grid position. | ||
function indexing_kernel(dest, src) | ||
i = thread_position_in_grid_1d() | ||
@inbounds dest[i] = src[i] | ||
return | ||
end | ||
# launched with size(src,1) threads per group and size(src,2) groups, i.e. one thread per element | ||
# NOTE(review): `src` inside the size(...) calls is a non-interpolated global, unlike $dest/$src in the kernel call — confirm this is intended. | ||
group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src) | ||
|
||
# Same copy kernel as indexing_kernel but without @inbounds, so the indexing is left bounds-checked. | ||
function checked_indexing_kernel(dest, src) | ||
i = thread_position_in_grid_1d() | ||
dest[i] = src[i] | ||
return | ||
end | ||
# same launch configuration as the "indexing" benchmark | ||
group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src) | ||
|
||
## DELETE | ||
# function rand_kernel(dest::AbstractArray{T}) where {T} | ||
# i = thread_position_in_grid_1d() | ||
# dest[i] = Metal.rand(T) | ||
# return | ||
# end | ||
# group["rand"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $rand_kernel($dest) |
Oops, something went wrong.