diff --git a/Project.toml b/Project.toml
index 3d1cd0ca6a..381521f9b7 100644
--- a/Project.toml
+++ b/Project.toml
@@ -26,7 +26,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

 [compat]
-AMDGPU = "0.4.8"
+AMDGPU = "0.4.13"
 Adapt = "3.0"
 CUDA = "3, 4"
 ChainRulesCore = "1.12"
diff --git a/ext/AMDGPUExt/functor.jl b/ext/AMDGPUExt/functor.jl
index 797a55c110..27327bfebb 100644
--- a/ext/AMDGPUExt/functor.jl
+++ b/ext/AMDGPUExt/functor.jl
@@ -26,25 +26,36 @@ function ChainRulesCore.rrule(
         adapt_storage(FluxAMDAdaptor(), unthunk(dx)))
 end

-function _amd(x)
-    check_use_amdgpu()
-    USE_AMDGPU[] || return x
-    fmap(x -> Adapt.adapt(FluxAMDAdaptor(), x), x; exclude=_isleaf)
-end
-
 # Since MIOpen supports only cross-correlation as convolution,
 # for the actual convolution, we flip horizontally and vertically the weights.
 # Same for CPU -> GPU & GPU -> CPU movements.
 # Note, that gradients are also flipped.

-# CPU -> GPU
+const FLUX_CONV{M} = Union{
+    Flux.Conv{<:Any, <:Any, <:Any, <:M, <:Any},
+    Flux.ConvTranspose{<:Any, <:Any, <:Any, <:M, <:Any}}
+const CPU_CONV = FLUX_CONV{Array}
+const AMD_CONV = FLUX_CONV{ROCArray}
+
+_conv_basetype(::Conv) = Conv
+_conv_basetype(::ConvTranspose) = ConvTranspose

-_conv_basetype(c::Type{C}) where C <: Conv = Conv
-_conv_basetype(c::Type{C}) where C <: ConvTranspose = ConvTranspose
+Flux._isleaf(::AMD_CONV) = true

-function adapt_storage(to::FluxAMDAdaptor, m::C) where C <: Union{Conv, ConvTranspose}
+_exclude(x) = _isleaf(x)
+_exclude(::CPU_CONV) = true
+
+function _amd(x)
+    check_use_amdgpu()
+    USE_AMDGPU[] || return x
+    fmap(x -> Adapt.adapt(FluxAMDAdaptor(), x), x; exclude=_exclude)
+end
+
+# CPU -> GPU
+
+function Adapt.adapt_structure(to::FluxAMDAdaptor, m::CPU_CONV)
     flipped_weight = reverse(m.weight; dims=ntuple(i -> i, ndims(m.weight) - 2))
-    _conv_basetype(C)(
+    _conv_basetype(m)(
         Adapt.adapt(to, m.σ),
         Adapt.adapt(to, flipped_weight),
         Adapt.adapt(to, m.bias),
@@ -52,44 +63,14 @@ function adapt_storage(to::FluxAMDAdaptor, m::C) where C <: Union{Conv, ConvTran
 end

 # Don't adapt again.
-function adapt_storage(
-    to::FluxAMDAdaptor, m::Conv{N, M, F, A, V},
-) where {N, M, F, A <: ROCArray, V}
-    return m
-end
-
-function adapt_storage(
-    to::FluxAMDAdaptor, m::ConvTranspose{N, M, F, A, V},
-) where {N, M, F, A <: ROCArray, V}
-    return m
-end

-_amd(m::Union{Conv, ConvTranspose}) = adapt_storage(FluxAMDAdaptor(), m)
+Adapt.adapt_structure(to::FluxAMDAdaptor, m::AMD_CONV) = m

 # GPU -> CPU

-function Flux.cpu(m::Conv{N, M, F, A, V}) where {N, M, F, A <: ROCArray, V}
-    adapt_storage(FluxCPUAdaptor(), m)
-end
-
-function Flux.cpu(m::ConvTranspose{N, M, F, A, V}) where {N, M, F, A <: ROCArray, V}
-    adapt_storage(FluxCPUAdaptor(), m)
-end
-
-function adapt_storage(
-    to::FluxCPUAdaptor, m::Conv{N, M, F, A, V},
-) where {N, M, F, A <: ROCArray, V}
-    dims = ntuple(i -> i, ndims(m.weight) - 2)
-    Conv(
-        Adapt.adapt(to, m.σ), reverse(Adapt.adapt(to, m.weight); dims),
-        Adapt.adapt(to, m.bias), m.stride, m.pad, m.dilation, m.groups)
-end
-
-function adapt_storage(
-    to::FluxCPUAdaptor, m::ConvTranspose{N, M, F, A, V},
-) where {N, M, F, A <: ROCArray, V}
+function Adapt.adapt_structure(to::FluxCPUAdaptor, m::AMD_CONV)
     dims = ntuple(i -> i, ndims(m.weight) - 2)
-    ConvTranspose(
+    _conv_basetype(m)(
         Adapt.adapt(to, m.σ), reverse(Adapt.adapt(to, m.weight); dims),
         Adapt.adapt(to, m.bias), m.stride, m.pad, m.dilation, m.groups)
 end
diff --git a/test/amd/basic.jl b/test/amd/basic.jl
index d053337381..d9b8db104a 100644
--- a/test/amd/basic.jl
+++ b/test/amd/basic.jl
@@ -46,10 +46,26 @@ end
     end
 end

+@testset "Chain(Conv)" begin
+    m = Chain(Conv((3, 3), 3 => 3)) |> f32
+    x = rand(Float32, 10, 10, 3, 2)
+    gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
+
+    md = m |> gpu |> cpu
+    @test md[1].weight ≈ m[1].weight atol=1f-3
+
+    m = Chain(ConvTranspose((3, 3), 3 => 3)) |> f32
+    x = rand(Float32, 10, 10, 3, 2)
+    gpu_autodiff_test(m, x; atol=1f-3, checkgrad=false)
+
+    md = m |> gpu |> cpu
+    @test md[1].weight ≈ m[1].weight atol=1f-3
+end
+
 @testset "Cross-correlation" begin
     m = CrossCor((2, 2), 3 => 4) |> f32
     x = rand(Float32, 10, 10, 3, 2)
-    gpu_autodiff_test(m, x)
+    gpu_autodiff_test(m, x; atol=1f-3)
 end

 @testset "Restructure" begin
diff --git a/test/amd/runtests.jl b/test/amd/runtests.jl
index 70c1876487..fa3f22d2ec 100644
--- a/test/amd/runtests.jl
+++ b/test/amd/runtests.jl
@@ -1,19 +1,13 @@
-Flux.gpu_backend!("AMD")
-
-AMDGPU.allowscalar(false)
-
-# Extend test utils to AMDGPU.
-
 function check_grad(
-    g_gpu::ROCArray{Float32}, g_cpu::Array{Float32}, atol, rtol;
-    allow_nothing::Bool,
+    g_gpu::ROCArray{Float32}, g_cpu::Array{Float32};
+    atol, rtol, allow_nothing::Bool,
 )
     @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol
 end

 function check_grad(
-    g_gpu::ROCArray{Float32}, g_cpu::Zygote.FillArrays.AbstractFill,
-    atol, rtol; allow_nothing::Bool,
+    g_gpu::ROCArray{Float32}, g_cpu::Zygote.FillArrays.AbstractFill;
+    atol, rtol, allow_nothing::Bool,
 )
     @test g_cpu ≈ collect(g_gpu) atol=atol rtol=rtol
 end
diff --git a/test/runtests.jl b/test/runtests.jl
index a14372317c..09a65bb046 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -66,9 +66,10 @@ Random.seed!(0)

 if get(ENV, "FLUX_TEST_AMDGPU", "false") == "true"
     using AMDGPU
-    AMDGPU.versioninfo()
+    Flux.gpu_backend!("AMD")
+    AMDGPU.allowscalar(false)
+
     if AMDGPU.functional() && AMDGPU.functional(:MIOpen)
-        @show AMDGPU.MIOpen.version()
         @testset "AMDGPU" begin
             include("amd/runtests.jl")
         end
diff --git a/test/test_utils.jl b/test/test_utils.jl
index f07fb1c721..14cfd61774 100644
--- a/test/test_utils.jl
+++ b/test/test_utils.jl
@@ -2,6 +2,7 @@

 function check_grad(g_gpu, g_cpu; rtol=1e-4, atol=1e-4, allow_nothing::Bool=false)
     allow_nothing && return
+    @warn "Unsupported types in `check_grad`: $(typeof(g_gpu)), $(typeof(g_cpu))"
     @show g_gpu g_cpu
     @test false
 end
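
Note (reviewer commentary, not part of the patch): the functor.jl changes build on the convention stated
in the comment above — MIOpen implements only cross-correlation, so weights are reverse()d over all
spatial dimensions whenever a Conv/ConvTranspose crosses the CPU/GPU boundary. That relation can be
sanity-checked on the CPU with plain NNlib; the sketch below is illustrative only, with made-up array
names rather than code from this PR.

    using NNlib

    x = rand(Float32, 8, 8, 1, 1)    # WHCN input
    w = rand(Float32, 3, 3, 1, 1)    # convolution kernel

    # Flip every spatial dimension, as the adaptor does for m.weight.
    flipped = reverse(w; dims=ntuple(i -> i, ndims(w) - 2))

    # True convolution with `w` (NNlib's default) matches cross-correlation
    # (`flipped=true`, which is all MIOpen provides) applied to the flipped kernel.
    @assert conv(x, w) ≈ conv(x, flipped; flipped=true)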
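
Note (reviewer commentary, not part of the patch): the new `_exclude` predicate together with
`Flux._isleaf(::AMD_CONV) = true` works because `Functors.fmap` stops recursing wherever its
`exclude` predicate returns true and passes that whole node to the mapped function. This lets `_amd`
hand an entire CPU-side conv layer to the adaptor (so its weights can be flipped structurally) and
leave layers that already hold ROCArrays untouched, per the "# Don't adapt again." rule. A minimal
sketch of that mechanism with a made-up container type, not a Flux type:

    using Functors

    struct Line          # hypothetical container for illustration
        slope
        offset
    end
    @functor Line

    l = Line([1.0, 2.0], [3.0, 4.0])

    # Default behaviour: fmap recurses into the struct and maps over the arrays.
    fmap(x -> 2x, l)    # Line([2.0, 4.0], [6.0, 8.0])

    # With `exclude`, the whole struct is treated as a leaf and passed to f intact,
    # analogous to how _amd hands a complete Conv to the FluxAMDAdaptor.
    fmap(x -> x isa Line ? "got a whole $(typeof(x))" : x, l; exclude = x -> x isa Line)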