Skip to content
This repository has been archived by the owner on Nov 4, 2024. It is now read-only.

[Enzyme] Mark certain operations as Enzyme inactive #69

Merged
merged 4 commits on May 12, 2024
8 changes: 4 additions & 4 deletions Project.toml
@@ -1,15 +1,15 @@
name = "LuxLib"
uuid = "82251201-b29d-42c6-8e01-566dec8acb11"
authors = ["Avik Pal <avikpal@mit.edu> and contributors"]
version = "0.3.22"
version = "0.3.23"

[deps]
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898"
FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LuxCore = "bb33d45b-7691-41d6-9220-0943567d0623"
Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
@@ -44,19 +44,19 @@ ArrayInterface = "7.9"
CUDA = "5.3.2"
ChainRulesCore = "1.23"
ComponentArrays = "0.15.8"
EnzymeCore = "0.7"
ExplicitImports = "1.4.1"
FastBroadcast = "0.2.8"
FastClosures = "0.3.2"
ForwardDiff = "0.10.36"
GPUArraysCore = "0.1.6"
KernelAbstractions = "0.9.15"
LinearAlgebra = "1.10"
LuxAMDGPU = "0.2.1"
LuxCUDA = "0.3.1"
LuxCore = "0.1.13"
LuxTestUtils = "0.1.15"
Markdown = "1.10"
NNlib = "0.9.10"
NNlib = "0.9.13"
PrecompileTools = "1.2"
Random = "1.10"
ReTestItems = "1.23.1"
2 changes: 0 additions & 2 deletions ext/LuxLibReverseDiffExt.jl
@@ -21,8 +21,6 @@ end
@grad_from_chainrules LuxLib._copy_autodiff_barrier(x::TrackedArray)
@grad_from_chainrules LuxLib._copy_autodiff_barrier(x::TrackedReal)

LuxLib._get_backend(x::TrackedArray) = LuxLib._get_backend(ReverseDiff.value(x))

# api/dropout.jl
LuxLib._dropout_fptype(x::TrackedArray) = LuxLib._dropout_fptype(ReverseDiff.value(x))

13 changes: 0 additions & 13 deletions ext/LuxLibTrackerExt.jl
@@ -41,20 +41,7 @@ function LuxLib._copy_autodiff_barrier(x::Union{TrackedArray, TrackedReal})
return LuxLib._copy_autodiff_barrier(Tracker.data(x))
end

LuxLib._get_backend(x::TrackedArray) = LuxLib._get_backend(Tracker.data(x))

# api/dropout.jl
LuxLib._dropout_fptype(x::TrackedArray) = LuxLib._dropout_fptype(Tracker.data(x))

# api/groupnorm.jl
for T1 in (:TrackedArray, :AbstractArray),
T2 in (:TrackedVector, :AbstractVector),
T3 in (:TrackedVector, :AbstractVector)

LuxLib.__is_tracked(T1, T2, T3) || continue

@eval Tracker.@grad_from_chainrules LuxLib.__fast_groupnorm(
x::$T1, groups, scale::$T2, bias::$T3, epsilon::Real)
end

end
6 changes: 2 additions & 4 deletions src/LuxLib.jl
@@ -5,28 +5,26 @@ using PrecompileTools: @recompile_invalidations
@recompile_invalidations begin
using ArrayInterface: ArrayInterface
using ChainRulesCore: ChainRulesCore, NoTangent
using EnzymeCore: EnzymeCore, EnzymeRules
using FastBroadcast: @..
using FastClosures: @closure
using GPUArraysCore: GPUArraysCore, AnyGPUArray
using KernelAbstractions: KernelAbstractions, @Const, @index, @kernel
using LinearAlgebra: LinearAlgebra, BLAS, mul!
using LuxCore: LuxCore
using Markdown: @doc_str
using NNlib: NNlib
using Random: Random, AbstractRNG, rand!
using Reexport: @reexport
using Statistics: Statistics, mean, std, var
using Statistics: Statistics, mean, var
end

@reexport using NNlib

const CRC = ChainRulesCore
const KA = KernelAbstractions

include("utils.jl")

# Low-Level Implementations
include("impl/groupnorm.jl")
include("impl/normalization.jl")
include("impl/fused_dense.jl")
include("impl/fused_conv.jl")
4 changes: 4 additions & 0 deletions src/api/dropout.jl
@@ -130,6 +130,7 @@ end
@inline _dropout_fptype(x) = float(real(eltype(x)))

CRC.@non_differentiable _dropout_fptype(::Any...)
EnzymeRules.inactive_noinl(::typeof(_dropout_fptype), ::Any...) = nothing

Reviewer:

can you see if it still works if you change to inactive_noinl for all of these?

It will be performant if so

Member Author:

I think some of these are not needed at all. Is there a cost of defining these functions?

Reviewer:

inactive_noinl is strictly beneficial: it adds inactive metadata to the call/body, which can improve performance or, in the worst case, do nothing if Enzyme can already prove the call inactive on its own.

inactive itself will stop inlining, so there is a potential performance penalty. It will, however, guarantee that the activity information is propagated (whereas with inactive_noinl that is presently a best-effort procedure).
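For context, here is a minimal standalone sketch of the pattern this PR applies. The helper name and the surrounding function are made up for illustration, and it assumes Enzyme and EnzymeCore are available in the environment:

```julia
using Enzyme                   # provides autodiff, Reverse, Active, Duplicated
using EnzymeCore: EnzymeRules  # same import style as the PR

# A helper whose result carries no derivative information -- it only
# inspects the element type of its argument (an illustrative stand-in
# for helpers like _dropout_fptype).
_eltype_helper(x) = float(real(eltype(x)))

# Mark the helper as Enzyme-inactive without blocking inlining.
EnzymeRules.inactive_noinl(::typeof(_eltype_helper), ::Any...) = nothing

# The helper appears inside a differentiable function; Enzyme can treat
# the marked call as constant and differentiate the rest as usual.
f(x) = sum(abs2, x) * one(_eltype_helper(x))

x = [1.0, 2.0, 3.0]
dx = zero(x)
autodiff(Reverse, f, Active, Duplicated(x, dx))
# dx now holds the gradient of f, i.e. 2 .* x
```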

@inline function _alpha_dropout_noise(rng, x)
rng = LuxCore.replicate(rng)
@@ -139,6 +140,7 @@ CRC.@non_differentiable _dropout_fptype(::Any...)
end

CRC.@non_differentiable _alpha_dropout_noise(::Any...)
EnzymeRules.inactive_noinl(::typeof(_alpha_dropout_noise), ::Any...) = nothing

@inline function _generate_dropout_mask(rng::AbstractRNG, x, p, invp; dims)
realfptype = _dropout_fptype(x)
@@ -148,4 +150,6 @@ CRC.@non_differentiable _alpha_dropout_noise(::Any...)
end

CRC.@non_differentiable _generate_dropout_mask(::Any...)
EnzymeRules.inactive_noinl(::typeof(_generate_dropout_mask), ::Any...) = nothing
CRC.@non_differentiable _dropout_shape(::Any...)
EnzymeRules.inactive_noinl(::typeof(_dropout_shape), ::Any...) = nothing
40 changes: 1 addition & 39 deletions src/api/groupnorm.jl
@@ -21,39 +21,11 @@ statistics.

The normalized array is returned.

## Performance Considerations

The most common case of this Op -- `x` is a 4D array -- is optimized using
KernelAbstractions and has a fast custom backwards pass implemented. All other cases have a
fallback implementation which is not especially optimized.

We have tested the code path for `Float16` and it works, but gradient accumulation is
extremely fragile. Hence, for `Float16` inputs, it uses the fallback implementation.

If the batch size is small (< 16), then the fallback implementation will be faster than the
KA version. However, this customization is not possible using the direct `groupnorm`
interface.

## References

[1] Wu, Yuxin, and Kaiming He. "Group normalization." Proceedings of the European conference
on computer vision (ECCV). 2018.
"""
function groupnorm(x::AbstractArray{<:Union{Float32, Float64}, 4},
scale::AbstractVector{<:Union{Float32, Float64}},
bias::AbstractVector{<:Union{Float32, Float64}},
groups::Int, σ::F=identity, epsilon::Real=1.0f-5) where {F}
_test_valid_groupnorm_arguments(x, scale, bias, groups)
# FIXME: We need to fuse the activation function into the kernel for optimal performance
return fast_activation!!(σ, __fast_groupnorm(x, groups, scale, bias, epsilon))
end

# Separate this out for a cleaner rrule later on
@inline function __fast_groupnorm(x, groups, scale, bias, epsilon)
return first(_groupnorm(x, groups, scale, bias, epsilon))
end

# Slow Fallback (without custom Pullback Implementation)
function groupnorm(x::AbstractArray{<:Real, N}, scale::Union{Nothing, <:AbstractVector},
bias::Union{Nothing, <:AbstractVector}, groups::Int,
σ::F=identity, epsilon::Real=1.0f-5) where {F, N}
@@ -71,19 +43,8 @@ end
return :($(Val(Tuple(collect(1:(N - 1))))))
end

# Custom Pullbacks
function CRC.rrule(::typeof(__fast_groupnorm), x, groups, scale, bias, epsilon)
y, μ, σ⁻¹ = _groupnorm(x, groups, scale, bias, epsilon)
∇groupnorm = @closure Δ -> begin
∂x, ∂scale, ∂bias = _∇groupnorm(Δ, y, x, groups, scale, bias, μ, σ⁻¹)
return NoTangent(), ∂x, NoTangent(), ∂scale, ∂bias, NoTangent()
end
return y, ∇groupnorm
end

function _test_valid_groupnorm_arguments(
x::AbstractArray{T, N}, scale, bias, groups) where {T, N}
_assert_same_backend(x, scale, bias)
if scale !== nothing && bias !== nothing && length(scale) != length(bias) != size(x, 3)
throw(ArgumentError("Length of `scale` and `bias` must be equal to the number of \
channels (N - 1 dim of the input array)."))
@@ -95,3 +56,4 @@ end
end

CRC.@non_differentiable _test_valid_groupnorm_arguments(::Any...)
EnzymeRules.inactive_noinl(::typeof(_test_valid_groupnorm_arguments), ::Any...) = nothing
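For reference, a hypothetical usage sketch of the generic groupnorm entry point kept by this PR, based on the signature shown in the diff above (the array sizes and activation choice are illustrative, not taken from the PR):

```julia
using LuxLib  # reexports NNlib, so relu is in scope

# 4D input in (W, H, C, N) layout: 8x8 spatial, 6 channels, batch of 4.
x = randn(Float32, 8, 8, 6, 4)
scale = ones(Float32, 6)  # one scale per channel
bias = zeros(Float32, 6)  # one bias per channel

# Normalize over 3 groups of 2 channels each (identity activation by default).
y = groupnorm(x, scale, bias, 3)

# With an explicit activation and epsilon.
y_relu = groupnorm(x, scale, bias, 3, relu, 1.0f-5)
```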
1 change: 1 addition & 0 deletions src/api/instancenorm.jl
@@ -47,3 +47,4 @@ function _test_valid_instancenorm_arguments(::AbstractArray{T, N}) where {T, N}
end

CRC.@non_differentiable _test_valid_instancenorm_arguments(::Any...)
EnzymeRules.inactive_noinl(::typeof(_test_valid_instancenorm_arguments), ::Any...) = nothing
113 changes: 0 additions & 113 deletions src/impl/groupnorm.jl

This file was deleted.

1 change: 1 addition & 0 deletions src/impl/normalization.jl
@@ -20,6 +20,7 @@ end
@inline __accum_size(x, ::Val{dims}) where {dims} = prod(Base.Fix1(size, x), dims)

CRC.@non_differentiable __accum_size(::Any...)
EnzymeRules.inactive_noinl(::typeof(__accum_size), ::Any...) = nothing

@inline function _get_batch_statistics(x::AbstractArray, ::Nothing, ::Nothing,
::Val{rdims}, ::Val{false}, momentum) where {rdims}