From da2455f755df6e1d55aae15dc7a02a05a79fdb7a Mon Sep 17 00:00:00 2001
From: Matthew Schlegel <mkschleg@gmail.com>
Date: Sun, 13 Feb 2022 10:27:40 -0700
Subject: [PATCH 1/7] Adding recurrent perf benchmarks for RNN.

---
 perf/bench_utils.jl   | 20 ++++++++++++++++++--
 perf/recurrent.jl     | 13 +++++++++++++
 perf/runbenchmarks.jl |  3 +++
 3 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 perf/recurrent.jl

diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl
index 923851ee91..ec93ecf894 100644
--- a/perf/bench_utils.jl
+++ b/perf/bench_utils.jl
@@ -1,12 +1,24 @@
 using BenchmarkTools
 using Flux
 using CUDA
-using Zygote: pullback
+using Zygote: pullback, ignore
 
 
 fw(m, x) = m(x)
 bw(back) = back(1f0)
 fwbw(m, ps, x) = gradient(() -> sum(m(x)), ps)
+
+# Need to specialize for flux.recur.
+fw(m::Flux.Recur, X::Vector{<:AbstractArray}) = begin
+    ignore() do
+      Flux.reset!(m)
+    end
+    [m(x) for x in X]
+end
+fwbw(m::Flux.Recur, ps, X::Vector{<:AbstractArray}) = gradient(ps) do
+    y = fw(m, X)
+    sum(sum(y))
+end
   
 function run_benchmark(model, x; cuda=true)
     
@@ -16,7 +28,11 @@ function run_benchmark(model, x; cuda=true)
     end
 
     ps = Flux.params(model)
-    y, back = pullback(() -> sum(model(x)), ps)
+    y, back = if model isa Flux.Recur
+        pullback(() -> sum(sum([model(x_t) for x_t in x])), ps)
+    else
+        pullback(() -> sum(model(x)), ps)
+    end
 
 
     if cuda
diff --git a/perf/recurrent.jl b/perf/recurrent.jl
new file mode 100644
index 0000000000..bf3309be52
--- /dev/null
+++ b/perf/recurrent.jl
@@ -0,0 +1,13 @@
+
+println("RNN")
+for n in [2, 20, 200, 2000], T in [1, 8, 16, 64]
+  x = [randn(Float32, n, n) for t in 1:T]
+  model = RNN(n, n)
+  println("CPU n=$n, t=$T")
+  run_benchmark(model, x, cuda=false)
+  println("CUDA n=$n, t=$T")
+  run_benchmark(model, x, cuda=true)    
+end
+
+
+
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index b458095ce8..7927699894 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -11,3 +11,6 @@ include("conv.jl")
 
 @info "Benchmark VGG"
 include("vgg.jl")
+
+@info "Benchmark Recurrent"
+include("recurrent.jl")

From 3a32ae6b22dfac13622f72b39e116bf578a4f335 Mon Sep 17 00:00:00 2001
From: Matthew Schlegel <mkschleg@gmail.com>
Date: Sun, 13 Feb 2022 10:57:51 -0700
Subject: [PATCH 2/7] Added benchmarks for 3d rnn api.

---
 perf/bench_utils.jl |  2 +-
 perf/recurrent.jl   | 10 ++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl
index ec93ecf894..4d80c59267 100644
--- a/perf/bench_utils.jl
+++ b/perf/bench_utils.jl
@@ -28,7 +28,7 @@ function run_benchmark(model, x; cuda=true)
     end
 
     ps = Flux.params(model)
-    y, back = if model isa Flux.Recur
+    y, back = if model isa Flux.Recur && eltype(x) isa AbstractVector
         pullback(() -> sum(sum([model(x_t) for x_t in x])), ps)
     else
         pullback(() -> sum(model(x)), ps)
diff --git a/perf/recurrent.jl b/perf/recurrent.jl
index bf3309be52..7a12612881 100644
--- a/perf/recurrent.jl
+++ b/perf/recurrent.jl
@@ -9,5 +9,15 @@ for n in [2, 20, 200, 2000], T in [1, 8, 16, 64]
   run_benchmark(model, x, cuda=true)    
 end
 
+println("RNN-3d")
+for n in [2, 20, 200, 2000], T in [1, 8, 16, 64]
+  x = randn(Float32, n, n, T)
+  model = RNN(n, n)
+  println("CPU n=$n, t=$T")
+  run_benchmark(model, x, cuda=false)
+  println("CUDA n=$n, t=$T")
+  run_benchmark(model, x, cuda=true)    
+end
+
 
 

From db4d358e7217841bbf1258db11018d34f58d7c67 Mon Sep 17 00:00:00 2001
From: Matthew Schlegel <mkschleg@gmail.com>
Date: Sun, 13 Feb 2022 11:09:36 -0700
Subject: [PATCH 3/7] Fixed eltype check to use AbstractArray.

---
 perf/bench_utils.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl
index 4d80c59267..d5b50c3001 100644
--- a/perf/bench_utils.jl
+++ b/perf/bench_utils.jl
@@ -28,7 +28,7 @@ function run_benchmark(model, x; cuda=true)
     end
 
     ps = Flux.params(model)
-    y, back = if model isa Flux.Recur && eltype(x) isa AbstractVector
+    y, back = if model isa Flux.Recur && eltype(x) isa AbstractArray
         pullback(() -> sum(sum([model(x_t) for x_t in x])), ps)
     else
         pullback(() -> sum(model(x)), ps)

From 42fee275f9b3cb5aeb92f5389ac132de6acb1b97 Mon Sep 17 00:00:00 2001
From: Matthew Schlegel <mkschleg@gmail.com>
Date: Sun, 13 Feb 2022 11:12:15 -0700
Subject: [PATCH 4/7] Fixed array type check to use <: instead of isa.

---
 perf/bench_utils.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl
index d5b50c3001..e01d086b05 100644
--- a/perf/bench_utils.jl
+++ b/perf/bench_utils.jl
@@ -28,7 +28,7 @@ function run_benchmark(model, x; cuda=true)
     end
 
     ps = Flux.params(model)
-    y, back = if model isa Flux.Recur && eltype(x) isa AbstractArray
+    y, back = if model isa Flux.Recur && eltype(x) <: AbstractArray
         pullback(() -> sum(sum([model(x_t) for x_t in x])), ps)
     else
         pullback(() -> sum(model(x)), ps)

From 1525b303339c6f6a4ed1fb4a4280660d5f10e84b Mon Sep 17 00:00:00 2001
From: Matthew Schlegel <mkschleg@gmail.com>
Date: Sun, 13 Feb 2022 13:30:40 -0700
Subject: [PATCH 5/7] Lower the largest n from 2k to 1k to fit in 8GB. Add
 try-catch for GPU out-of-memory errors.

---
 perf/recurrent.jl | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/perf/recurrent.jl b/perf/recurrent.jl
index 7a12612881..bac0b92f0d 100644
--- a/perf/recurrent.jl
+++ b/perf/recurrent.jl
@@ -1,22 +1,40 @@
 
 println("RNN")
-for n in [2, 20, 200, 2000], T in [1, 8, 16, 64]
+for n in [2, 20, 200, 1000], T in [1, 8, 16, 64]
   x = [randn(Float32, n, n) for t in 1:T]
   model = RNN(n, n)
   println("CPU n=$n, t=$T")
   run_benchmark(model, x, cuda=false)
   println("CUDA n=$n, t=$T")
-  run_benchmark(model, x, cuda=true)    
+  try
+      run_benchmark(model, x, cuda=true)
+  catch ex
+      @show typeof(ex)
+      if ex isa OutOfGPUMemoryError
+          @warn "Not enough GPU memory to run test"
+      else
+          rethrow(ex)
+      end
+  end
 end
 
 println("RNN-3d")
-for n in [2, 20, 200, 2000], T in [1, 8, 16, 64]
+for n in [2, 20, 200, 1000], T in [1, 8, 16, 64]
   x = randn(Float32, n, n, T)
   model = RNN(n, n)
   println("CPU n=$n, t=$T")
   run_benchmark(model, x, cuda=false)
   println("CUDA n=$n, t=$T")
-  run_benchmark(model, x, cuda=true)    
+  try
+      run_benchmark(model, x, cuda=true)
+  catch ex
+      @show typeof(ex)
+      if ex isa OutOfGPUMemoryError
+          @warn "Not enough GPU memory to run test"
+      else
+          rethrow(ex)
+      end
+  end
 end
 
 

From add28e3e527cfeb211183bda121019668fa62f3b Mon Sep 17 00:00:00 2001
From: Matthew Schlegel <mkschleg@gmail.com>
Date: Thu, 3 Mar 2022 13:24:15 -0700
Subject: [PATCH 6/7] Updated recurrent benchmarks based on suggestions. Modified
 bench_utils to make its behaviour easier to overload.

---
 perf/bench_utils.jl | 21 ++----------
 perf/recurrent.jl   | 79 ++++++++++++++++++++++++++++-----------------
 2 files changed, 53 insertions(+), 47 deletions(-)

diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl
index e01d086b05..548b9c2c02 100644
--- a/perf/bench_utils.jl
+++ b/perf/bench_utils.jl
@@ -6,20 +6,9 @@ using Zygote: pullback, ignore
 
 fw(m, x) = m(x)
 bw(back) = back(1f0)
-fwbw(m, ps, x) = gradient(() -> sum(m(x)), ps)
+fwbw(m, ps, x) = gradient(() -> sum(fw(m, x)), ps)
+pb(m, ps, x) = pullback(()->sum(fw(m, x)), ps)
 
-# Need to specialize for flux.recur.
-fw(m::Flux.Recur, X::Vector{<:AbstractArray}) = begin
-    ignore() do
-      Flux.reset!(m)
-    end
-    [m(x) for x in X]
-end
-fwbw(m::Flux.Recur, ps, X::Vector{<:AbstractArray}) = gradient(ps) do
-    y = fw(m, X)
-    sum(sum(y))
-end
-  
 function run_benchmark(model, x; cuda=true)
     
     if cuda 
@@ -28,11 +17,7 @@ function run_benchmark(model, x; cuda=true)
     end
 
     ps = Flux.params(model)
-    y, back = if model isa Flux.Recur && eltype(x) <: AbstractArray
-        pullback(() -> sum(sum([model(x_t) for x_t in x])), ps)
-    else
-        pullback(() -> sum(model(x)), ps)
-    end
+    y, back =  pb(model, ps, x)
 
 
     if cuda
diff --git a/perf/recurrent.jl b/perf/recurrent.jl
index bac0b92f0d..ef00a8d9a5 100644
--- a/perf/recurrent.jl
+++ b/perf/recurrent.jl
@@ -1,41 +1,62 @@
 
-println("RNN")
-for n in [2, 20, 200, 1000], T in [1, 8, 16, 64]
-  x = [randn(Float32, n, n) for t in 1:T]
-  model = RNN(n, n)
-  println("CPU n=$n, t=$T")
-  run_benchmark(model, x, cuda=false)
-  println("CUDA n=$n, t=$T")
-  try
-      run_benchmark(model, x, cuda=true)
-  catch ex
-      @show typeof(ex)
-      if ex isa OutOfGPUMemoryError
-          @warn "Not enough GPU memory to run test"
-      else
-          rethrow(ex)
-      end
-  end
+
+struct RNNWrapper{T}
+  rnn::T
+end
+Flux.@functor RNNWrapper
+
+# Specialize fw (and, for vector inputs, fwbw/pb) for RNNWrapper: call Flux.reset! on the wrapped model first.
+fw(r::RNNWrapper, X::Vector{<:AbstractArray}) = begin
+  Flux.reset!(r.rnn)
+  [r.rnn(x) for x in X]
+end
+
+fw(r::RNNWrapper, X) = begin
+  Flux.reset!(r.rnn)
+  r.rnn(X)
+end
+
+fwbw(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = gradient(ps) do
+  y = fw(r, X)
+  sum(sum(y))
+end
+
+pb(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = pullback(ps) do
+  y = fw(r, X)
+  sum(sum(y))
 end
 
-println("RNN-3d")
-for n in [2, 20, 200, 1000], T in [1, 8, 16, 64]
-  x = randn(Float32, n, n, T)
-  model = RNN(n, n)
-  println("CPU n=$n, t=$T")
-  run_benchmark(model, x, cuda=false)
-  println("CUDA n=$n, t=$T")
-  try
+function rnn_benchmark_sweep(data_creator::Function, rnn_type)
+  for n in [2, 20, 200, 1000], ts in [1, 4, 16, 64]
+    x, x_n = data_creator(n, ts)
+    model = RNNWrapper(rnn_type(n, n))
+    
+    println("$rnn_type $x_n CPU n=$n, ts=$ts")
+    run_benchmark(model, x, cuda=false)
+    
+    println("$rnn_type $x_n CUDA n=$n, ts=$ts")
+    try
       run_benchmark(model, x, cuda=true)
-  catch ex
+    catch ex
       @show typeof(ex)
       if ex isa OutOfGPUMemoryError
-          @warn "Not enough GPU memory to run test"
+        @warn "Not enough GPU memory to run test"
       else
-          rethrow(ex)
+        rethrow(ex)
       end
-  end
+    end
+  end  
 end
 
+for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM]
+  rnn_benchmark_sweep(rnn_type) do n, ts
+    [randn(Float32, n, n) for _ in 1:ts], "Vec"
+  end
+end
 
+for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM]
+  rnn_benchmark_sweep(rnn_type) do n, ts
+    randn(Float32, n, n, ts), "Block"
+  end
+end
 

From 9d1eb8cdbe2acb2b4b3b8576d4cd8fec0b74ced6 Mon Sep 17 00:00:00 2001
From: Matthew Schlegel <mkschleg@users.noreply.github.com>
Date: Fri, 25 Mar 2022 09:05:27 -0600
Subject: [PATCH 7/7] Apply suggestions from code review

Co-authored-by: Brian Chen <ToucheSir@users.noreply.github.com>
---
 perf/bench_utils.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf/bench_utils.jl b/perf/bench_utils.jl
index 548b9c2c02..525184f773 100644
--- a/perf/bench_utils.jl
+++ b/perf/bench_utils.jl
@@ -7,7 +7,7 @@ using Zygote: pullback, ignore
 fw(m, x) = m(x)
 bw(back) = back(1f0)
 fwbw(m, ps, x) = gradient(() -> sum(fw(m, x)), ps)
-pb(m, ps, x) = pullback(()->sum(fw(m, x)), ps)
+pb(m, ps, x) = pullback(() -> sum(fw(m, x)), ps)
 
 function run_benchmark(model, x; cuda=true)