Add RNG to Dropout #1618
Changes from 7 commits
@@ -1,3 +1,5 @@
+using Random: default_rng
+
 istraining() = false

 @adjoint istraining() = true, _ -> nothing

@@ -10,7 +12,7 @@ _dropout_shape(s, dims) = tuple((i ∉ dims ? 1 : si for (i, si) ∈ enumerate(s
 _dropout_kernel(y::T, p, q) where {T} = y > p ? T(1 / q) : T(0)

 """
-    dropout(x, p; dims=:, active=true)
+    dropout([rng = default_rng()], x, p; dims=:, active=true)

 The dropout function. If `active` is `true`,
 for each input, either sets that input to `0` (with probability

@@ -28,26 +30,36 @@ automatically managed using the [`Dropout`](@ref) layer instead of the

 The [`Dropout`](@ref) layer is what you should use in most scenarios.
 """
-function dropout(x, p; dims=:, active::Bool=true)
+function dropout(rng::AbstractRNG, x, p; dims = :, active::Bool = true)
   active || return x
-  y = dropout_mask(x, p, dims=dims)
+  y = dropout_mask(rng, x, p, dims=dims)
   return x .* y
 end

-@adjoint function dropout(x, p; dims=:, active::Bool=true)
+function dropout(x, p; dims = :, active::Bool = true)
+  dropout(default_rng(), x, p, dims = dims, active = active)
+end
+
+# CUDA currently needs a manual dispatch to avoid
+# calling a non-GPU RNG with a CuArray
+function dropout(x::CUDA.CuArray, p; dims = :, active::Bool = true)
+  dropout(CUDA.CURAND.default_rng(), x, p, dims = dims, active = active)
+end
+
+@adjoint function dropout(rng, x, p; dims = :, active::Bool = true)
   active || return x, Δ -> (Δ, nothing)
-  y = dropout_mask(x, p, dims=dims)
-  return x .* y, Δ -> (Δ .* y, nothing)
+  y = dropout_mask(rng, x, p, dims = dims)
+  return x .* y, Δ -> (nothing, Δ .* y, nothing)
 end

-function dropout_mask(x, p; dims=:)
-  y = rand!(similar(x, _dropout_shape(x, dims)))
+function dropout_mask(rng::AbstractRNG, x, p; dims=:)
+  y = rand!(rng, similar(x, _dropout_shape(x, dims)))
   y .= _dropout_kernel.(y, p, 1 - p)
   return y
 end

 """
-    Dropout(p; dims=:)
+    Dropout([rng = default_rng()], p; dims=:)
Flux.jl/src/data/dataloader.jl Line 70 in 335286a

We may want to do the same here for consistency. For the functional form `dropout` instead, it is ok to take `rng` as the first positional argument.
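For concreteness, a minimal sketch of the two call styles under discussion; the keyword form is only the suggestion made above and is not implemented in this diff:

```julia
using Flux, Random

# Keyword style (the consistency suggestion above; hypothetical, not in this diff):
# Dropout(0.5; rng = MersenneTwister(1))

# Positional style, as implemented by this diff's Dropout(rng, p; dims = :) constructor:
Dropout(MersenneTwister(1), 0.5)
```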
The PR seems easier to parse and cleaner API-wise. Good point on the data loaders. We should revisit them separately.
I don't agree with any of those. Actually, having `rng` as the first positional argument forced you to duplicate the constructor. And I generally prefer keyword arguments in these cases since user code becomes more self-explanatory.

I don't think the constructor was duplicated without reason (it's not duplication at all - it's forwarded to the other constructor). Flux hasn't really been kwarg-forward where it doesn't add value, so we can keep this for now.

In general, I think kwargs add clarity over positional arguments precisely because their position is irrelevant; you can assume the user is not going to remember the placement of arguments > 3. That being said, having the RNG first is a standard pattern throughout Julia. I could see …

Right, we wouldn't want that. More generally, not everything inside that `do` block may run on the GPU, and it's more annoying to write out one part of the problem inside such a block and the rest outside it, and not just for RNGs. It would make sense for offloading everything to the GPU, for example. I don't think we have the seed API for CUDA? Even then, that's only relevant for global effects; it may be desirable to have different RNGs for Dropout vs (say) initialisation. Personally I would prefer it if we could handle it outside too.
There are limits to this, but if all someone wanted is to pass a specific RNG to certain types of layers, then that can be done with … Personally, I prefer the solution where layer-specific arguments are possible. It would match how RNGs are passed into most Julia functions, and conceptually, the RNG/RNG state is very much an "input" to the model. I am struggling to see how we make that happen though. How does Flax handle it? Do you have to write a custom forward pass to utilize a non-default RNG?

Wait, so you are advocating for storing the RNG in the layer struct? My intent was to figure out what scenarios would require doing so, and thus far it doesn't seem like there are many. Using different RNGs for the forward pass vs initialization isn't one of them IMO, because those don't happen at the same time (and thus you can seed/scope the RNG for one and not the other).

No, I also prefer not to store the RNG in the struct, but I don't see an alternative. If seeding is the only goal, then this PR is not really needed. If different RNG types are required, then I feel that you need scoping, but that can get tricky with different devices like Dhairya mentioned. The only option left seems to be the ability to pass through layer-specific arguments on the forward pass. IMO this would be a nice addition if we figured out a good way to do it. It can be used for more than RNGs. Re …

What I was trying to get at (and probably muddled, apologies) is that putting the RNG in the struct isn't necessary to fix #1372. If we instead provided, say, a top-level function to seed all RNGs like PyTorch does, then all this discussion about device compatibility and conversion would be moot. This is to say nothing of edge cases that haven't been discussed: I can already foresee saving/loading behaviour and how it interacts with RNG "tying" across different layers being a headache. TL;DR: adding an RNG struct parameter feels like YAGNI in context and isn't worth the implementation complexity/headaches it'll bring IMO. If we really want to look into this, a more principled investigation (e.g. can we co-opt something like JAX's RNG key design) would be prudent.
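For comparison, a minimal sketch of the seeding-only alternative raised above, assuming `Random.seed!` on the CPU side and a `CUDA.seed!` counterpart on the GPU side (the GPU seeding API is flagged as an open question earlier in this thread); whether this alone is enough to close #1372 is exactly what is being debated:

```julia
using Random, CUDA

# Seed the default RNGs once, up front, instead of storing an RNG in each layer.
Random.seed!(42)   # default CPU RNG
CUDA.seed!(42)     # default CUDA RNG (assumed API)
```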

 Dropout layer. In the forward pass, apply the [`Flux.dropout`](@ref) function on the input.

@@ -60,20 +72,25 @@ Does nothing to the input once [`Flux.testmode!`](@ref) is `true`.
 mutable struct Dropout{F,D}
   p::F
   dims::D
+  rng::AbstractRNG
   active::Union{Bool, Nothing}
 end

-function Dropout(p; dims=:)
+function Dropout(p; dims = :)
+  Dropout(default_rng(), p; dims)
+end
+
+function Dropout(rng, p; dims = :)
   @assert 0 ≤ p ≤ 1
-  Dropout(p, dims, nothing)
+  Dropout(p, dims, rng, nothing)
 end

 function (a::Dropout)(x)
   _isactive(a) || return x
-  return dropout(x, a.p; dims=a.dims, active=true)
+  return dropout(x, a.p; dims = a.dims, active = true)
This ignores the RNG right now.

Right - I did it to check out what we can do to ensure the kernel can actually run with a correct rng - I don't intend to ignore the rng here when merging.

This is more intended for having concrete code to weigh out the different approaches.
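A minimal sketch of the fix implied by this thread, i.e. forwarding the layer's stored `rng` to the functional form added earlier in the diff; this is not what the current diff does:

```julia
# Inside Flux: forward a.rng instead of falling back to default_rng().
function (a::Dropout)(x)
  _isactive(a) || return x
  return dropout(a.rng, x, a.p; dims = a.dims, active = true)
end
```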
 end

-testmode!(m::Dropout, mode=true) =
+testmode!(m::Dropout, mode = true) =
   (m.active = (isnothing(mode) || mode == :auto) ? nothing : !mode; m)

 function Base.show(io::IO, d::Dropout)
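A usage note on the functional form added in this diff: passing an explicitly seeded RNG makes the dropout mask reproducible, which is the kind of control this PR is after. A small sketch, assuming the `dropout(rng, x, p)` signature from the diff above:

```julia
using Flux, Random

x = rand(Float32, 3, 4)

# Same seed, same mask: the two calls produce identical outputs.
y1 = Flux.dropout(MersenneTwister(0), x, 0.5)
y2 = Flux.dropout(MersenneTwister(0), x, 0.5)
@assert y1 == y2
```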
I don't think this dispatches to the GPU intrinsic when `rng` is specified. @maleadt?

We only override `default_rng`; I haven't looked into getting `GLOBAL_RNG` to work (maybe it does).

We can switch it for `default_rng` too, of course.

But you have to call it on the GPU. You can't call `default_rng` in the Dropout ctor and forward that RNG object to the GPU (unless Dropout objects are created in device code?).

I guess it depends on who we give the responsibility of providing a valid RNG to (the user or Flux) - a proverbial `gpu(rng)`.

I don't really follow the relevance of the linked comment. AFAIK there are no GPU compatible RNGs. There are simply the RNGs provided by CUDA.jl/GPUArrays.jl. `to_gpu` is similar to `trainable` — it allows us to define how to map a leaf in the structure to the GPU without defining `CUDA.cu` on types that it shouldn't be defined on. It's out of necessity, but if you can think of an alternative, go for it (I'm not tied to `to_gpu` at all).

I was referring to exactly the ones provided by CUDA/CURAND. Sorry if that wasn't clear.

So I don't understand your comment then. This provides a mapping from `GLOBAL_RNG` to `CUDA.CURAND.default_rng()`. And when a user does `Dropout |> gpu` with any other CPU RNG, it throws an error instead of silently ignoring it.

I think the most sensible place to error is when the user tries `m |> gpu` on a model that contains `Dropout` with CPU RNGs that have no GPU equivalent. It would be weird for `m |> gpu` to run successfully when the model isn't executable as-is on the GPU. Either way, I don't care how we do it, but as long as we don't silently use a CUDA RNG when `x isa CuArray` and throw an error instead, then it's fine by me.

Right - it will error anyway; it's clearer when we can say that CUDA needs a way to dispatch to a CUDA RNG. I agree on that.
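To illustrate the behaviour agreed on above, a hypothetical sketch; the helper name `rng_to_gpu` is made up, and the PR's actual `gpu`/`to_gpu` wiring is not shown in this excerpt. The default CPU RNGs map to the CURAND default RNG, and anything else errors rather than being silently swapped:

```julia
using Random, CUDA

# Hypothetical helper: map a CPU RNG to its CUDA counterpart when moving a
# Dropout layer to the GPU, erroring when there is no GPU equivalent.
function rng_to_gpu(rng)
  if rng === Random.GLOBAL_RNG || rng === Random.default_rng()
    return CUDA.CURAND.default_rng()
  else
    error("no CUDA-compatible equivalent for $(typeof(rng)); construct the layer with a GPU-compatible RNG")
  end
end
```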