FluxML · ToucheSir · Apr 23, 2023 · Apr 21, 2023 · Apr 23, 2023 · Apr 23, 2023
diff --git a/Project.toml b/Project.toml
@@ -26,7 +26,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [compat]
-AMDGPU = "0.4.8"
+AMDGPU = "0.4.13"
 Adapt = "3.0"
 CUDA = "3, 4"
 ChainRulesCore = "1.12"

diff --git a/ext/AMDGPUExt/functor.jl b/ext/AMDGPUExt/functor.jl
@@ -26,70 +26,51 @@ function ChainRulesCore.rrule(
         adapt_storage(FluxAMDAdaptor(), unthunk(dx)))
 end
 
-function _amd(x)
-    check_use_amdgpu()
-    USE_AMDGPU[] || return x
-    fmap(x -> Adapt.adapt(FluxAMDAdaptor(), x), x; exclude=_isleaf)
-end
-
 # Since MIOpen supports only cross-correlation as convolution,
 # for the actual convolution, we flip horizontally and vertically the weights.
 # Same for CPU -> GPU & GPU -> CPU movements.
 # Note, that gradients are also flipped.
 
-# CPU -> GPU
+const FLUX_CONV{M} = Union{  # Conv or ConvTranspose whose 4th type parameter is a subtype of M
+    Flux.Conv{<:Any, <:Any, <:Any, <:M, <:Any},  # NOTE(review): 4th parameter is presumably the weight array type - confirm against Flux
+    Flux.ConvTranspose{<:Any, <:Any, <:Any, <:M, <:Any}}
+const CPU_CONV = FLUX_CONV{Array}  # Array-parameterized layers: input of the CPU -> GPU conversion
+const AMD_CONV = FLUX_CONV{ROCArray}  # ROCArray-parameterized layers: input of the GPU -> CPU conversion
+
+_conv_basetype(::Conv) = Conv  # unparameterized type, called as a constructor when a layer is rebuilt
+_conv_basetype(::ConvTranspose) = ConvTranspose  # same, for transposed convolutions
 
-_conv_basetype(c::Type{C}) where C <: Conv = Conv
-_conv_basetype(c::Type{C}) where C <: ConvTranspose = ConvTranspose
+Flux._isleaf(::AMD_CONV) = true  # ROCArray-backed conv layers count as leaves; `_exclude` falls back to this predicate
 
-function adapt_storage(to::FluxAMDAdaptor, m::C) where C <: Union{Conv, ConvTranspose}
+_exclude(x) = _isleaf(x)  # default: defer to the leaf predicate
+_exclude(::CPU_CONV) = true  # Array-backed conv layers are excluded too, so `_amd` hands the layer itself to `Adapt.adapt`
+
+function _amd(x)  # adapt `x` with FluxAMDAdaptor, or return it untouched when USE_AMDGPU[] is false
+    check_use_amdgpu()  # NOTE(review): presumably refreshes USE_AMDGPU before it is read - confirm
+    USE_AMDGPU[] || return x  # flag is false: hand the input back unchanged
+    fmap(x -> Adapt.adapt(FluxAMDAdaptor(), x), x; exclude=_exclude)  # adapt each node for which `_exclude` returns true
+end
+
+# CPU -> GPU
+
+function Adapt.adapt_structure(to::FluxAMDAdaptor, m::CPU_CONV)  # rebuild an Array-backed conv layer from adapted fields, with weights flipped
     flipped_weight = reverse(m.weight; dims=ntuple(i -> i, ndims(m.weight) - 2))
-    _conv_basetype(C)(
+    _conv_basetype(m)(  # dispatches on the layer value to pick Conv or ConvTranspose
         Adapt.adapt(to, m.σ),
         Adapt.adapt(to, flipped_weight),
         Adapt.adapt(to, m.bias),
         m.stride, m.pad, m.dilation, m.groups)
 end
 
 # Don't adapt again.
-function adapt_storage(
-    to::FluxAMDAdaptor, m::Conv{N, M, F, A, V},
-) where {N, M, F, A <: ROCArray, V}
-    return m
-end
-
-function adapt_storage(
-    to::FluxAMDAdaptor, m::ConvTranspose{N, M, F, A, V},
-) where {N, M, F, A <: ROCArray, V}
-    return m
-end
 
-_amd(m::Union{Conv, ConvTranspose}) = adapt_storage(FluxAMDAdaptor(), m)
+Adapt.adapt_structure(to::FluxAMDAdaptor, m::AMD_CONV) = m  # already ROCArray-backed: return as-is so the weights are not flipped a second time
 
 # GPU -> CPU
 
-function Flux.cpu(m::Conv{N, M, F, A, V}) where {N, M, F, A <: ROCArray, V}
-    adapt_storage(FluxCPUAdaptor(), m)
-end
-
-function Flux.cpu(m::ConvTranspose{N, M, F, A, V}) where {N, M, F, A <: ROCArray, V}
-    adapt_storage(FluxCPUAdaptor(), m)
-end
-
-function adapt_storage(
-    to::FluxCPUAdaptor, m::Conv{N, M, F, A, V},
-) where {N, M, F, A <: ROCArray, V}
-    dims = ntuple(i -> i, ndims(m.weight) - 2)
-    Conv(
-        Adapt.adapt(to, m.σ), reverse(Adapt.adapt(to, m.weight); dims),
-        Adapt.adapt(to, m.bias), m.stride, m.pad, m.dilation, m.groups)
-end
-
-function adapt_storage(
-    to::FluxCPUAdaptor, m::ConvTranspose{N, M, F, A, V},
-) where {N, M, F, A <: ROCArray, V}
+function Adapt.adapt_structure(to::FluxCPUAdaptor, m::AMD_CONV)  # rebuild a ROCArray-backed conv layer from `to`-adapted fields, reversing the weights over all but their last two dims
     dims = ntuple(i -> i, ndims(m.weight) - 2)
-    ConvTranspose(
+    _conv_basetype(m)(  # Conv or ConvTranspose, matching the input layer
         Adapt.adapt(to, m.σ), reverse(Adapt.adapt(to, m.weight); dims),
         Adapt.adapt(to, m.bias), m.stride, m.pad, m.dilation, m.groups)
 end
diff --git a/test/amd/basic.jl b/test/amd/basic.jl
@@ -46,10 +46,26 @@ end
     end
 end
 
+@testset "Chain(Conv)" begin  # conv layers nested in a Chain: autodiff check plus gpu -> cpu round trip
+    m = Chain(Conv((3, 3), 3 => 3)) |> f32
+    x = rand(Float32, 10, 10, 3, 2)
+    gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
+
+    md = m |> gpu |> cpu  # send the model to the GPU and bring it back
+    @test md[1].weight ≈ m[1].weight atol=1f-3  # round trip must reproduce the original weights
+
+    m = Chain(ConvTranspose((3, 3), 3 => 3)) |> f32  # repeat the same checks for ConvTranspose
+    x = rand(Float32, 10, 10, 3, 2)
+    gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
+
+    md = m |> gpu |> cpu
+    @test md[1].weight ≈ m[1].weight atol=1f-3
+end
+
 @testset "Cross-correlation" begin
     m = CrossCor((2, 2), 3 => 4) |> f32
     x = rand(Float32, 10, 10, 3, 2)
-    gpu_autodiff_test(m, x)
+    gpu_autodiff_test(m, x; atol=1f-3)  # explicit absolute tolerance of 1f-3 for this comparison
 end
 
 @testset "Restructure" begin

diff --git a/test/amd/runtests.jl b/test/amd/runtests.jl
@@ -1,19 +1,13 @@
-Flux.gpu_backend!("AMD")
-
-AMDGPU.allowscalar(false)
-
-# Extend test utils to AMDGPU.
-
 function check_grad(
-    g_gpu::ROCArray{Float32}, g_cpu::Array{Float32}, atol, rtol;
-    allow_nothing::Bool,
+    g_gpu::ROCArray{Float32}, g_cpu::Array{Float32};  # ROCArray gradient checked against an Array gradient
+    atol, rtol, allow_nothing::Bool,  # tolerances are required keywords; allow_nothing is accepted but unused in this method
 )
     @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol
 end
 
 function check_grad(
-    g_gpu::ROCArray{Float32}, g_cpu::Zygote.FillArrays.AbstractFill,
-    atol, rtol; allow_nothing::Bool,
+    g_gpu::ROCArray{Float32}, g_cpu::Zygote.FillArrays.AbstractFill;  # ROCArray gradient checked against an AbstractFill gradient
+    atol, rtol, allow_nothing::Bool,  # tolerances are required keywords; allow_nothing is accepted but unused in this method
 )
     @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol
 end

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -66,9 +66,10 @@ Random.seed!(0)
 
   if get(ENV, "FLUX_TEST_AMDGPU", "false") == "true"
     using AMDGPU
-    AMDGPU.versioninfo()
+    Flux.gpu_backend!("AMD")
+    AMDGPU.allowscalar(false)
+
     if AMDGPU.functional() && AMDGPU.functional(:MIOpen)
-      @show AMDGPU.MIOpen.version()
       @testset "AMDGPU" begin
         include("amd/runtests.jl")
       end

diff --git a/test/test_utils.jl b/test/test_utils.jl
@@ -2,6 +2,7 @@ function check_grad(g_gpu, g_cpu;
             rtol=1e-4, atol=1e-4,
             allow_nothing::Bool=false)
     allow_nothing && return
+    @warn "Unsupported types in `check_grad`: $(typeof(g_gpu)), $(typeof(g_cpu))"
     @show g_gpu g_cpu
     @test false
 end