diff --git a/src/ForwardDiff.jl b/src/ForwardDiff.jl
index 65190c1c..1c7f19cf 100644
--- a/src/ForwardDiff.jl
+++ b/src/ForwardDiff.jl
@@ -6,11 +6,22 @@ import Calculus
 import NaNMath
 
 const IS_MULTITHREADED_JULIA = VERSION >= v"0.5.0-dev+923" && Base.Threads.nthreads() > 1
-const NTHREADS = IS_MULTITHREADED_JULIA ? Base.Threads.nthreads() : 1
+
+if IS_MULTITHREADED_JULIA
+    const NTHREADS = Base.Threads.nthreads()
+    @inline compat_threadid() = Base.Threads.threadid()
+else
+    const NTHREADS = 1
+    @inline compat_threadid() = 1
+end
+
 const AUTO_DEFINED_UNARY_FUNCS = map(first, Calculus.symbolic_derivatives_1arg())
 const NANMATH_FUNCS = (:sin, :cos, :tan, :asin, :acos, :acosh,
                        :atanh, :log, :log2, :log10, :lgamma, :log1p)
 
+@inline value{x}(::Type{Val{x}}) = x
+@inline value{x}(::Type{Type{Val{x}}}) = x
+
 include("Partials.jl")
 include("DiffNumber.jl")
 include("cache.jl")
diff --git a/src/gradient.jl b/src/gradient.jl
index 04aa7fd4..37a8247d 100644
--- a/src/gradient.jl
+++ b/src/gradient.jl
@@ -25,184 +25,179 @@ end
 
 ######################
 # gradient!/gradient #
 ######################
 
-@generated function gradient!{S,ALL,CHUNK,LEN,MULTITHREAD}(f, out::AbstractVector{S}, x::AbstractVector,
-                                                           ::Type{Val{ALL}}, ::Type{Val{CHUNK}},
-                                                           ::Type{Val{LEN}}, ::Type{Val{MULTITHREAD}})
+@generated function gradient!(f, out, x, all::DataType, chunk::DataType,
+                              input_length::DataType, multithread::DataType)
+    input_length_value = value(input_length) == nothing ? :(length(x)) : value(input_length)
+    return_statement = value(all) ? :(val::$(eltype(out)), out::$(out)) : :(out::$(out))
     return quote
-        val, grad = call_gradient!(f, out, x, Val{CHUNK}, Val{$(LEN == nothing ? :(length(x)) : LEN)}, Val{MULTITHREAD})
-        return $(ALL ? :(val::S, out::$(out)) : :(out::$(out)))
+        val, _ = call_gradient!(f, out, x, chunk, Val{$(input_length_value)}, multithread)
+        return $(return_statement)
     end
 end
 
-function gradient{ALL,CHUNK,LEN,MULTITHREAD}(f, x::AbstractVector, ::Type{Val{ALL}}, ::Type{Val{CHUNK}},
-                                             ::Type{Val{LEN}}, ::Type{Val{MULTITHREAD}})
-    return gradient!(f, similar(x), x, Val{ALL}, Val{CHUNK}, Val{LEN}, Val{MULTITHREAD})
+function gradient(f, x, all::DataType, chunk::DataType, input_length::DataType, multithread::DataType)
+    return gradient!(f, similar(x), x, all, chunk, input_length, multithread)
 end
 
-@generated function gradient{ALL,CHUNK,LEN,MULTITHREAD,MUTATES}(f, ::Type{Val{ALL}}, ::Type{Val{CHUNK}},
-                                                                ::Type{Val{LEN}}, ::Type{Val{MULTITHREAD}},
-                                                                ::Type{Val{MUTATES}})
-    if MUTATES
-        R = ALL ? :(Tuple{S,typeof(out)}) : :(typeof(out))
+@generated function gradient(f, all::DataType, chunk::DataType, input_length::DataType,
+                             multithread::DataType, output_mutates::DataType)
+    if value(output_mutates)
+        R = value(all) ? :(Tuple{eltype(out),typeof(out)}) : :(typeof(out))
         return quote
-            g!{S}(out::AbstractVector{S}, x::AbstractVector) = gradient!(f, out, x, Val{ALL}, Val{CHUNK}, Val{LEN}, Val{MULTITHREAD})::$(R)
+            g!(out, x) = gradient!(f, out, x, all, chunk, input_length, multithread)::$(R)
             return g!
         end
     else
-        R = ALL ? :(Tuple{S,typeof(x)}) : :(typeof(x))
+        R = value(all) ? :(Tuple{eltype(x),typeof(x)}) : :(typeof(x))
         return quote
-            g{S}(x::AbstractVector{S}) = gradient(f, x, Val{ALL}, Val{CHUNK}, Val{LEN}, Val{MULTITHREAD})::$(R)
+            g(x) = gradient(f, x, all, chunk, input_length, multithread)::$(R)
             return g
         end
     end
 end
 
-##################
-# calc_gradient! #
-##################
-# The below code is pretty ugly, so here's an overview:
-#
-# `call_gradient!` is the entry point that is called by the API functions. If a chunk size
+######################################
+# call_gradient!/workhorse functions #
+######################################
+
+# `call_gradient!` is the entry point that is called by the API functions. It decides which
+# workhorse function to call based on the provided parameters. Note that if a chunk size
 # isn't given by an upstream caller, `call_gradient!` picks one based on the input length.
-#
-# `calc_gradient!` is the workhorse function - it generates code for calculating the
-# gradient in chunk-mode or vector-mode, depending on the input length and chunk size.
-#
-# `multi_calc_gradient!` is just like `_calc_gradient!`, but uses Julia's multithreading
-# capabilities when performing calculations in chunk-mode.
-#
-# `VEC_MODE_EXPR` is a constant expression for vector-mode that provides the function body
-# for `calc_gradient!` and `multi_calc_gradient!` when chunk size equals input length.
-#
-# `calc_gradient_expr` takes in a vector-mode expression body or chunk-mode expression body
-# and returns a completed function body with the input body injected in the correct place.
-
-@generated function call_gradient!{S,CHUNK,LEN,MULTITHREAD}(f, out::AbstractVector{S}, x::AbstractVector,
-                                                            ::Type{Val{CHUNK}}, ::Type{Val{LEN}},
-                                                            ::Type{Val{MULTITHREAD}})
-    gradf! = MULTITHREAD && IS_MULTITHREADED_JULIA ? :multi_calc_gradient! : :calc_gradient!
-    return :($(gradf!)(f, out, x, Val{$(CHUNK == nothing ? pick_chunk(LEN) : CHUNK)}, Val{LEN})::Tuple{S, $(out)})
+@generated function call_gradient!(f, out, x, chunk, input_length, multithread)
+    input_length_value = value(input_length)
+    chunk_value = value(chunk) == nothing ? pick_chunk(input_length_value) : value(chunk)
+    use_chunk_mode = chunk_value != input_length_value
+    if use_chunk_mode
+        gradfunc! = value(multithread) && IS_MULTITHREADED_JULIA ? :multi_gradient_chunk_mode! : :gradient_chunk_mode!
+        return :($(gradfunc!)(f, out, x, Val{$(chunk_value)}, Val{$(input_length_value)}))
+    else
+        return :(gradient_vector_mode!(f, out, x, Val{$(input_length_value)}))
+    end
 end
 
-@generated function calc_gradient!{S,T,CHUNK,LEN}(f, out::AbstractVector{S}, x::AbstractVector{T},
-                                                  ::Type{Val{CHUNK}}, ::Type{Val{LEN}})
-    if CHUNK == LEN
-        body = VEC_MODE_EXPR
-    else
-        remainder = LEN % CHUNK == 0 ? CHUNK : LEN % CHUNK
-        fill_length = LEN - remainder
-        reseed_partials = remainder == CHUNK ? :() : :(seed_partials = cachefetch!(tid, Partials{CHUNK,T}, Val{$(remainder)}))
-        body = quote
-            workvec::Vector{DiffNumber{CHUNK,T}} = cachefetch!(tid, DiffNumber{CHUNK,T}, Val{LEN})
-            pzeros = zero(Partials{CHUNK,T})
-
-            @simd for i in 1:LEN
-                @inbounds workvec[i] = DiffNumber{CHUNK,T}(x[i], pzeros)
+function gradient_vector_mode!{input_length}(f, out, x, ::Type{Val{input_length}})
+    @assert input_length == length(x) == length(out)
+    S, T = eltype(out), eltype(x)
+    tid = compat_threadid()
+    seed_partials::Vector{Partials{input_length,T}} = cachefetch!(tid, Partials{input_length,T})
+    workvec::Vector{DiffNumber{input_length,T}} = cachefetch!(tid, DiffNumber{input_length,T}, Val{input_length})
+
+    @simd for i in 1:input_length
+        @inbounds workvec[i] = DiffNumber{input_length,T}(x[i], seed_partials[i])
+    end
+
+    result::DiffNumber{input_length,S} = f(workvec)
+
+    @simd for i in 1:input_length
+        @inbounds out[i] = partials(result, i)
+    end
+
+    return value(result)::S, out
+end
+
+@generated function gradient_chunk_mode!{chunk,input_length}(f, out, x, ::Type{Val{chunk}}, ::Type{Val{input_length}})
+    remainder = input_length % chunk == 0 ? chunk : input_length % chunk
+    fill_length = input_length - remainder
+    reseed_partials = remainder == chunk ? :() : :(seed_partials = cachefetch!(tid, Partials{chunk,T}, Val{$(remainder)}))
+    return quote
+        @assert input_length == length(x) == length(out)
+        S, T = eltype(out), eltype(x)
+        tid = compat_threadid()
+        seed_partials::Vector{Partials{chunk,T}} = cachefetch!(tid, Partials{chunk,T})
+        workvec::Vector{DiffNumber{chunk,T}} = cachefetch!(tid, DiffNumber{chunk,T}, Val{input_length})
+        pzeros = zero(Partials{chunk,T})
+
+        @simd for i in 1:input_length
+            @inbounds workvec[i] = DiffNumber{chunk,T}(x[i], pzeros)
+        end
+
+        for c in 1:$(chunk):$(fill_length)
+            @simd for i in 1:chunk
+                j = i + c - 1
+                @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
+            end
+            local result::DiffNumber{chunk,S} = f(workvec)
+            @simd for i in 1:chunk
+                j = i + c - 1
+                @inbounds out[j] = partials(result, i)
+                @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
+            end
+        end
+
+        # Performing the final chunk manually seems to trigger some additional
+        # optimization heuristics, which results in more efficient memory allocation
+        $(reseed_partials)
+
+        @simd for i in 1:$(remainder)
+            j = $(fill_length) + i
+            @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
+        end
+        result::DiffNumber{chunk,S} = f(workvec)
+        @simd for i in 1:$(remainder)
+            j = $(fill_length) + i
+            @inbounds out[j] = partials(result, i)
+            @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
+        end
+
+        return value(result)::S, out
+    end
+end
+
+if IS_MULTITHREADED_JULIA
+    @generated function multi_gradient_chunk_mode!{chunk,input_length}(f, out, x, ::Type{Val{chunk}}, ::Type{Val{input_length}})
+        remainder = input_length % chunk == 0 ? chunk : input_length % chunk
+        fill_length = input_length - remainder
+        reseed_partials = remainder == chunk ? :() : :(seed_partials = cachefetch!(tid, Partials{chunk,T}, Val{$(remainder)}))
+        return quote
+            @assert input_length == length(x) == length(out)
+            S, T = eltype(out), eltype(x)
+            tid = compat_threadid()
+            seed_partials::Vector{Partials{chunk,T}} = cachefetch!(tid, Partials{chunk,T})
+            workvecs::NTuple{NTHREADS, Vector{DiffNumber{chunk,T}}} = cachefetch!(DiffNumber{chunk,T}, Val{input_length})
+            pzeros = zero(Partials{chunk,T})
+
+            Base.Threads.@threads for t in 1:NTHREADS
+                # see https://github.com/JuliaLang/julia/issues/14948
+                local workvec = workvecs[t]
+                @simd for i in 1:input_length
+                    @inbounds workvec[i] = DiffNumber{chunk,T}(x[i], pzeros)
+                end
             end
 
-            for c in 1:$(CHUNK):$(fill_length)
-                @simd for i in 1:CHUNK
+            Base.Threads.@threads for c in 1:$(chunk):$(fill_length)
+                local workvec = workvecs[compat_threadid()]
+                @simd for i in 1:chunk
                     j = i + c - 1
-                    @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
+                    @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
                 end
-                local result::DiffNumber{CHUNK,S} = f(workvec)
-                @simd for i in 1:CHUNK
+                local result::DiffNumber{chunk,S} = f(workvec)
+                @simd for i in 1:chunk
                     j = i + c - 1
                     @inbounds out[j] = partials(result, i)
-                    @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
+                    @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
                 end
             end
             # Performing the final chunk manually seems to triggers some additional
             # optimization heuristics, which results in more efficient memory allocation
             $(reseed_partials)
+
+            workvec = workvecs[tid]
+
             @simd for i in 1:$(remainder)
                 j = $(fill_length) + i
-                @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
+                @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
             end
-            result::DiffNumber{CHUNK,S} = f(workvec)
+
+            result::DiffNumber{chunk,S} = f(workvec)
+
             @simd for i in 1:$(remainder)
                 j = $(fill_length) + i
                 @inbounds out[j] = partials(result, i)
-                @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
+                @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
             end
-        end
-    end
-    return calc_gradient_expr(body)
-end
-
-if IS_MULTITHREADED_JULIA
-    @generated function multi_calc_gradient!{S,T,CHUNK,LEN}(f, out::AbstractVector{S}, x::AbstractVector{T},
-                                                            ::Type{Val{CHUNK}}, ::Type{Val{LEN}})
-        if CHUNK == LEN
-            body = VEC_MODE_EXPR
-        else
-            remainder = LEN % CHUNK == 0 ? CHUNK : LEN % CHUNK
-            fill_length = LEN - remainder
-            reseed_partials = remainder == CHUNK ? :() : :(seed_partials = cachefetch!(tid, Partials{CHUNK,T}, Val{$(remainder)}))
-            body = quote
-                workvecs::NTuple{NTHREADS, Vector{DiffNumber{CHUNK,T}}} = cachefetch!(DiffNumber{CHUNK,T}, Val{LEN})
-                pzeros = zero(Partials{CHUNK,T})
-
-                Base.Threads.@threads for t in 1:NTHREADS
-                    # must be local, see https://github.com/JuliaLang/julia/issues/14948
-                    local workvec = workvecs[t]
-                    @simd for i in 1:LEN
-                        @inbounds workvec[i] = DiffNumber{CHUNK,T}(x[i], pzeros)
-                    end
-                end
-
-                Base.Threads.@threads for c in 1:$(CHUNK):$(fill_length)
-                    local workvec = workvecs[Base.Threads.threadid()]
-                    @simd for i in 1:CHUNK
-                        j = i + c - 1
-                        @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
-                    end
-                    local result::DiffNumber{CHUNK,S} = f(workvec)
-                    @simd for i in 1:CHUNK
-                        j = i + c - 1
-                        @inbounds out[j] = partials(result, i)
-                        @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
-                    end
-                end
-                # Performing the final chunk manually seems to triggers some additional
-                # optimization heuristics, which results in more efficient memory allocation
-                $(reseed_partials)
-                workvec = workvecs[tid]
-                @simd for i in 1:$(remainder)
-                    j = $(fill_length) + i
-                    @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
-                end
-                result::DiffNumber{CHUNK,S} = f(workvec)
-                @simd for i in 1:$(remainder)
-                    j = $(fill_length) + i
-                    @inbounds out[j] = partials(result, i)
-                    @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
-                end
-            end
+
+            return value(result)::S, out
         end
-        return calc_gradient_expr(body)
     end
-end
-
-const VEC_MODE_EXPR = quote
-    workvec::Vector{DiffNumber{CHUNK,T}} = cachefetch!(tid, DiffNumber{CHUNK,T}, Val{LEN})
-    @simd for i in 1:LEN
-        @inbounds workvec[i] = DiffNumber{CHUNK,T}(x[i], seed_partials[i])
-    end
-    result::DiffNumber{CHUNK,S} = f(workvec)
-    @simd for i in 1:LEN
-        @inbounds out[i] = partials(result, i)
-    end
-end
-
-function calc_gradient_expr(body)
-    return quote
-        @assert LEN == length(x) == length(out)
-        tid = $(IS_MULTITHREADED_JULIA ? Base.Threads.threadid() : 1)
-        seed_partials::Vector{Partials{CHUNK,T}} = cachefetch!(tid, Partials{CHUNK,T})
-        $(body)
-        return (value(result)::S, out)
     end
 end
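A minimal usage sketch (not part of the patch) of how the reworked entry points above might be driven directly. The target function, the input size, and the direct calls to ForwardDiff.gradient!/ForwardDiff.gradient with bare Val-typed positional arguments are illustrative assumptions; in normal use these arguments would be filled in by the package's public API wrappers rather than written out by hand.

    using ForwardDiff

    f(x) = sum(sin, x)   # hypothetical target function
    x = rand(10)
    out = similar(x)

    # all = Val{false} (return only the gradient), chunk = Val{nothing} (let
    # call_gradient! pick a chunk size), input_length = Val{nothing} (taken from
    # length(x) at runtime), multithread = Val{false} (single-threaded workhorse).
    ForwardDiff.gradient!(f, out, x, Val{false}, Val{nothing}, Val{nothing}, Val{false})

    # Closure-returning form: output_mutates = Val{false}, so the returned `g`
    # allocates and returns a fresh gradient vector on each call.
    g = ForwardDiff.gradient(f, Val{false}, Val{nothing}, Val{nothing}, Val{false}, Val{false})
    g(x)

    # Passing Val{true} for multithread only routes chunk-mode work through
    # multi_gradient_chunk_mode! when IS_MULTITHREADED_JULIA is true.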