From da2455f755df6e1d55aae15dc7a02a05a79fdb7a Mon Sep 17 00:00:00 2001 From: Matthew Schlegel Date: Sun, 13 Feb 2022 10:27:40 -0700 Subject: [PATCH 1/7] Adding recurrent perf benchmarks for RNN. --- perf/bench_utils.jl | 20 ++++++++++++++++++-- perf/recurrent.jl | 13 +++++++++++++ perf/runbenchmarks.jl | 3 +++ 3 files changed, 34 insertions(+), 2 deletions(-) create mode 100644 perf/recurrent.jl diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl index 923851ee91..ec93ecf894 100644 --- a/perf/bench_utils.jl +++ b/perf/bench_utils.jl @@ -1,12 +1,24 @@ using BenchmarkTools using Flux using CUDA -using Zygote: pullback +using Zygote: pullback, ignore fw(m, x) = m(x) bw(back) = back(1f0) fwbw(m, ps, x) = gradient(() -> sum(m(x)), ps) + +# Need to specialize for flux.recur. +fw(m::Flux.Recur, X::Vector{<:AbstractArray}) = begin + ignore() do + Flux.reset!(m) + end + [m(x) for x in X] +end +fwbw(m::Flux.Recur, ps, X::Vector{<:AbstractArray}) = gradient(ps) do + y = fw(m, X) + sum(sum(y)) +end function run_benchmark(model, x; cuda=true) @@ -16,7 +28,11 @@ function run_benchmark(model, x; cuda=true) end ps = Flux.params(model) - y, back = pullback(() -> sum(model(x)), ps) + y, back = if model isa Flux.Recur + pullback(() -> sum(sum([model(x_t) for x_t in x])), ps) + else + pullback(() -> sum(model(x)), ps) + end if cuda diff --git a/perf/recurrent.jl b/perf/recurrent.jl new file mode 100644 index 0000000000..bf3309be52 --- /dev/null +++ b/perf/recurrent.jl @@ -0,0 +1,13 @@ + +println("RNN") +for n in [2, 20, 200, 2000], T in [1, 8, 16, 64] + x = [randn(Float32, n, n) for t in 1:T] + model = RNN(n, n) + println("CPU n=$n, t=$T") + run_benchmark(model, x, cuda=false) + println("CUDA n=$n, t=$T") + run_benchmark(model, x, cuda=true) +end + + + diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl index b458095ce8..7927699894 100644 --- a/perf/runbenchmarks.jl +++ b/perf/runbenchmarks.jl @@ -11,3 +11,6 @@ include("conv.jl") @info "Benchmark VGG" include("vgg.jl") + 
+@info "Benchmark Recurrent" +include("recurrent.jl") From 3a32ae6b22dfac13622f72b39e116bf578a4f335 Mon Sep 17 00:00:00 2001 From: Matthew Schlegel Date: Sun, 13 Feb 2022 10:57:51 -0700 Subject: [PATCH 2/7] Added benchmarks for 3d rnn api. --- perf/bench_utils.jl | 2 +- perf/recurrent.jl | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl index ec93ecf894..4d80c59267 100644 --- a/perf/bench_utils.jl +++ b/perf/bench_utils.jl @@ -28,7 +28,7 @@ function run_benchmark(model, x; cuda=true) end ps = Flux.params(model) - y, back = if model isa Flux.Recur + y, back = if model isa Flux.Recur && eltype(x) isa AbstractVector pullback(() -> sum(sum([model(x_t) for x_t in x])), ps) else pullback(() -> sum(model(x)), ps) diff --git a/perf/recurrent.jl b/perf/recurrent.jl index bf3309be52..7a12612881 100644 --- a/perf/recurrent.jl +++ b/perf/recurrent.jl @@ -9,5 +9,15 @@ for n in [2, 20, 200, 2000], T in [1, 8, 16, 64] run_benchmark(model, x, cuda=true) end +println("RNN-3d") +for n in [2, 20, 200, 2000], T in [1, 8, 16, 64] + x = randn(Float32, n, n, T) + model = RNN(n, n) + println("CPU n=$n, t=$T") + run_benchmark(model, x, cuda=false) + println("CUDA n=$n, t=$T") + run_benchmark(model, x, cuda=true) +end + From db4d358e7217841bbf1258db11018d34f58d7c67 Mon Sep 17 00:00:00 2001 From: Matthew Schlegel Date: Sun, 13 Feb 2022 11:09:36 -0700 Subject: [PATCH 3/7] Fixed dumb error. 
--- perf/bench_utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl index 4d80c59267..d5b50c3001 100644 --- a/perf/bench_utils.jl +++ b/perf/bench_utils.jl @@ -28,7 +28,7 @@ function run_benchmark(model, x; cuda=true) end ps = Flux.params(model) - y, back = if model isa Flux.Recur && eltype(x) isa AbstractVector + y, back = if model isa Flux.Recur && eltype(x) isa AbstractArray pullback(() -> sum(sum([model(x_t) for x_t in x])), ps) else pullback(() -> sum(model(x)), ps) From 42fee275f9b3cb5aeb92f5389ac132de6acb1b97 Mon Sep 17 00:00:00 2001 From: Matthew Schlegel Date: Sun, 13 Feb 2022 11:12:15 -0700 Subject: [PATCH 4/7] Fixed issue w/ array type check. --- perf/bench_utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl index d5b50c3001..e01d086b05 100644 --- a/perf/bench_utils.jl +++ b/perf/bench_utils.jl @@ -28,7 +28,7 @@ function run_benchmark(model, x; cuda=true) end ps = Flux.params(model) - y, back = if model isa Flux.Recur && eltype(x) isa AbstractArray + y, back = if model isa Flux.Recur && eltype(x) <: AbstractArray pullback(() -> sum(sum([model(x_t) for x_t in x])), ps) else pullback(() -> sum(model(x)), ps) From 1525b303339c6f6a4ed1fb4a4280660d5f10e84b Mon Sep 17 00:00:00 2001 From: Matthew Schlegel Date: Sun, 13 Feb 2022 13:30:40 -0700 Subject: [PATCH 5/7] Lower max n from 2k to 1k to fit in 8GB. 
Try-catch for gpumem --- perf/recurrent.jl | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/perf/recurrent.jl b/perf/recurrent.jl index 7a12612881..bac0b92f0d 100644 --- a/perf/recurrent.jl +++ b/perf/recurrent.jl @@ -1,22 +1,40 @@ println("RNN") -for n in [2, 20, 200, 2000], T in [1, 8, 16, 64] +for n in [2, 20, 200, 1000], T in [1, 8, 16, 64] x = [randn(Float32, n, n) for t in 1:T] model = RNN(n, n) println("CPU n=$n, t=$T") run_benchmark(model, x, cuda=false) println("CUDA n=$n, t=$T") - run_benchmark(model, x, cuda=true) + try + run_benchmark(model, x, cuda=true) + catch ex + @show typeof(ex) + if ex isa OutOfGPUMemoryError + @warn "Not enough GPU memory to run test" + else + rethrow(ex) + end + end end println("RNN-3d") -for n in [2, 20, 200, 2000], T in [1, 8, 16, 64] +for n in [2, 20, 200, 1000], T in [1, 8, 16, 64] x = randn(Float32, n, n, T) model = RNN(n, n) println("CPU n=$n, t=$T") run_benchmark(model, x, cuda=false) println("CUDA n=$n, t=$T") - run_benchmark(model, x, cuda=true) + try + run_benchmark(model, x, cuda=true) + catch ex + @show typeof(ex) + if ex isa OutOfGPUMemoryError + @warn "Not enough GPU memory to run test" + else + rethrow(ex) + end + end end From add28e3e527cfeb211183bda121019668fa62f3b Mon Sep 17 00:00:00 2001 From: Matthew Schlegel Date: Thu, 3 Mar 2022 13:24:15 -0700 Subject: [PATCH 6/7] Updated recurrent benchmarks from suggestions. Modified benchutils to be easier to overload behaviour. 
--- perf/bench_utils.jl | 21 ++---------- perf/recurrent.jl | 79 ++++++++++++++++++++++++++++----------------- 2 files changed, 53 insertions(+), 47 deletions(-) diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl index e01d086b05..548b9c2c02 100644 --- a/perf/bench_utils.jl +++ b/perf/bench_utils.jl @@ -6,20 +6,9 @@ using Zygote: pullback, ignore fw(m, x) = m(x) bw(back) = back(1f0) -fwbw(m, ps, x) = gradient(() -> sum(m(x)), ps) +fwbw(m, ps, x) = gradient(() -> sum(fw(m, x)), ps) +pb(m, ps, x) = pullback(()->sum(fw(m, x)), ps) -# Need to specialize for flux.recur. -fw(m::Flux.Recur, X::Vector{<:AbstractArray}) = begin - ignore() do - Flux.reset!(m) - end - [m(x) for x in X] -end -fwbw(m::Flux.Recur, ps, X::Vector{<:AbstractArray}) = gradient(ps) do - y = fw(m, X) - sum(sum(y)) -end - function run_benchmark(model, x; cuda=true) if cuda @@ -28,11 +17,7 @@ function run_benchmark(model, x; cuda=true) end ps = Flux.params(model) - y, back = if model isa Flux.Recur && eltype(x) <: AbstractArray - pullback(() -> sum(sum([model(x_t) for x_t in x])), ps) - else - pullback(() -> sum(model(x)), ps) - end + y, back = pb(model, ps, x) if cuda diff --git a/perf/recurrent.jl b/perf/recurrent.jl index bac0b92f0d..ef00a8d9a5 100644 --- a/perf/recurrent.jl +++ b/perf/recurrent.jl @@ -1,41 +1,62 @@ -println("RNN") -for n in [2, 20, 200, 1000], T in [1, 8, 16, 64] - x = [randn(Float32, n, n) for t in 1:T] - model = RNN(n, n) - println("CPU n=$n, t=$T") - run_benchmark(model, x, cuda=false) - println("CUDA n=$n, t=$T") - try - run_benchmark(model, x, cuda=true) - catch ex - @show typeof(ex) - if ex isa OutOfGPUMemoryError - @warn "Not enough GPU memory to run test" - else - rethrow(ex) - end - end + +struct RNNWrapper{T} + rnn::T +end +Flux.@functor RNNWrapper + +# Need to specialize for RNNWrapper. 
+fw(r::RNNWrapper, X::Vector{<:AbstractArray}) = begin + Flux.reset!(r.rnn) + [r.rnn(x) for x in X] +end + +fw(r::RNNWrapper, X) = begin + Flux.reset!(r.rnn) + r.rnn(X) +end + +fwbw(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = gradient(ps) do + y = fw(r, X) + sum(sum(y)) +end + +pb(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = pullback(ps) do + y = fw(r, X) + sum(sum(y)) end -println("RNN-3d") -for n in [2, 20, 200, 1000], T in [1, 8, 16, 64] - x = randn(Float32, n, n, T) - model = RNN(n, n) - println("CPU n=$n, t=$T") - run_benchmark(model, x, cuda=false) - println("CUDA n=$n, t=$T") - try +function rnn_benchmark_sweep(data_creator::Function, rnn_type) + for n in [2, 20, 200, 1000], ts in [1, 4, 16, 64] + x, x_n = data_creator(n, ts) + model = RNNWrapper(rnn_type(n, n)) + + println("$rnn_type $x_n CPU n=$n, ts=$ts") + run_benchmark(model, x, cuda=false) + + println("$rnn_type $x_n CUDA n=$n, ts=$ts") + try run_benchmark(model, x, cuda=true) - catch ex + catch ex @show typeof(ex) if ex isa OutOfGPUMemoryError - @warn "Not enough GPU memory to run test" + @warn "Not enough GPU memory to run test" else - rethrow(ex) + rethrow(ex) end - end + end + end end +for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM] + rnn_benchmark_sweep(rnn_type) do n, ts + [randn(Float32, n, n) for _ in 1:ts], "Vec" + end +end +for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM] + rnn_benchmark_sweep(rnn_type) do n, ts + randn(Float32, n, n, ts), "Block" + end +end From 9d1eb8cdbe2acb2b4b3b8576d4cd8fec0b74ced6 Mon Sep 17 00:00:00 2001 From: Matthew Schlegel Date: Fri, 25 Mar 2022 09:05:27 -0600 Subject: [PATCH 7/7] Apply suggestions from code review Co-authored-by: Brian Chen --- perf/bench_utils.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl index 548b9c2c02..525184f773 100644 --- a/perf/bench_utils.jl +++ b/perf/bench_utils.jl @@ -7,7 +7,7 @@ using Zygote: pullback, ignore fw(m, x) = m(x) bw(back) = back(1f0) fwbw(m, ps, x) = 
gradient(() -> sum(fw(m, x)), ps) -pb(m, ps, x) = pullback(()->sum(fw(m, x)), ps) +pb(m, ps, x) = pullback(() -> sum(fw(m, x)), ps) function run_benchmark(model, x; cuda=true)