
Commit

clean up messy type parameter handling and fix segfaults
jrevels committed Feb 21, 2016
1 parent cf1024d commit d1effac
Showing 2 changed files with 144 additions and 140 deletions.
13 changes: 12 additions & 1 deletion src/ForwardDiff.jl
@@ -6,11 +6,22 @@ import Calculus
import NaNMath

const IS_MULTITHREADED_JULIA = VERSION >= v"0.5.0-dev+923" && Base.Threads.nthreads() > 1
const NTHREADS = IS_MULTITHREADED_JULIA ? Base.Threads.nthreads() : 1

if IS_MULTITHREADED_JULIA
const NTHREADS = Base.Threads.nthreads()
@inline compat_threadid() = Base.Threads.threadid()
else
const NTHREADS = 1
@inline compat_threadid() = 1
end

const AUTO_DEFINED_UNARY_FUNCS = map(first, Calculus.symbolic_derivatives_1arg())
const NANMATH_FUNCS = (:sin, :cos, :tan, :asin, :acos, :acosh,
:atanh, :log, :log2, :log10, :lgamma, :log1p)

@inline value{x}(::Type{Val{x}}) = x
@inline value{x}(::Type{Type{Val{x}}}) = x
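# For illustration, both methods recover the wrapped parameter:
#
#     value(Val{4})          # --> 4
#     value(Val{nothing})    # --> nothing, used below as the "not specified" sentinel
#
# The second method exists because, inside a @generated function body, an argument that
# was passed as Val{4} is bound to the type Type{Val{4}} rather than to Val{4} itself.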

include("Partials.jl")
include("DiffNumber.jl")
include("cache.jl")
271 changes: 132 additions & 139 deletions src/gradient.jl
@@ -25,184 +25,177 @@ end
# gradient!/gradient #
######################

@generated function gradient!{S,ALL,CHUNK,LEN,MULTITHREAD}(f, out::AbstractVector{S}, x::AbstractVector,
::Type{Val{ALL}}, ::Type{Val{CHUNK}},
::Type{Val{LEN}}, ::Type{Val{MULTITHREAD}})
@generated function gradient!(f, out, x, all::DataType, chunk::DataType, input_length::DataType, multithread::DataType)
input_length_value = value(input_length) == nothing ? :(length(x)) : value(input_length)
return_statement = value(all) ? :(val::$(eltype(out)), out::$(out)) : :(out::$(out))
return quote
val, grad = call_gradient!(f, out, x, Val{CHUNK}, Val{$(LEN == nothing ? :(length(x)) : LEN)}, Val{MULTITHREAD})
return $(ALL ? :(val::S, out::$(out)) : :(out::$(out)))
val, _ = call_gradient!(f, out, x, chunk, Val{$(input_length_value)}, multithread)
return $(return_statement)
end
end

function gradient{ALL,CHUNK,LEN,MULTITHREAD}(f, x::AbstractVector, ::Type{Val{ALL}}, ::Type{Val{CHUNK}},
::Type{Val{LEN}}, ::Type{Val{MULTITHREAD}})
return gradient!(f, similar(x), x, Val{ALL}, Val{CHUNK}, Val{LEN}, Val{MULTITHREAD})
function gradient(f, x, all::DataType, chunk::DataType, input_length::DataType, multithread::DataType)
return gradient!(f, similar(x), x, all, chunk, input_length, multithread)
end

@generated function gradient{ALL,CHUNK,LEN,MULTITHREAD,MUTATES}(f, ::Type{Val{ALL}}, ::Type{Val{CHUNK}},
::Type{Val{LEN}}, ::Type{Val{MULTITHREAD}},
::Type{Val{MUTATES}})
if MUTATES
R = ALL ? :(Tuple{S,typeof(out)}) : :(typeof(out))
@generated function gradient(f, all::DataType, chunk::DataType, input_length::DataType, multithread::DataType, mutates::DataType)
if value(mutates)
R = value(all) ? :(Tuple{eltype(out),typeof(out)}) : :(typeof(out))
return quote
g!{S}(out::AbstractVector{S}, x::AbstractVector) = gradient!(f, out, x, Val{ALL}, Val{CHUNK}, Val{LEN}, Val{MULTITHREAD})::$(R)
g!(out, x) = gradient!(f, out, x, all, chunk, input_length, multithread)::$(R)
return g!
end
else
R = ALL ? :(Tuple{S,typeof(x)}) : :(typeof(x))
R = value(all) ? :(Tuple{eltype(x),typeof(x)}) : :(typeof(x))
return quote
g{S}(x::AbstractVector{S}) = gradient(f, x, Val{ALL}, Val{CHUNK}, Val{LEN}, Val{MULTITHREAD})::$(R)
g(x) = gradient(f, x, all, chunk, input_length, multithread)::$(R)
return g
end
end
end

##################
# calc_gradient! #
##################
# The below code is pretty ugly, so here's an overview:
#
# `call_gradient!` is the entry point that is called by the API functions. If a chunk size
######################################
# call_gradient!/workhorse functions #
######################################

# `call_gradient!` is the entry point that is called by the API functions. It decides which
# workhorse function to call based on the provided parameters. Note that if a chunk size
# isn't given by an upstream caller, `call_gradient!` picks one based on the input length.
#
# `calc_gradient!` is the workhorse function - it generates code for calculating the
# gradient in chunk-mode or vector-mode, depending on the input length and chunk size.
#
# `multi_calc_gradient!` is just like `calc_gradient!`, but uses Julia's multithreading
# capabilities when performing calculations in chunk-mode.
#
# `VEC_MODE_EXPR` is a constant expression for vector-mode that provides the function body
# for `calc_gradient!` and `multi_calc_gradient!` when chunk size equals input length.
#
# `calc_gradient_expr` takes in a vector-mode expression body or chunk-mode expression body
# and returns a completed function body with the input body injected in the correct place.
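#
# As a rough usage sketch (the user-facing wrappers that normally assemble these calls
# live elsewhere, so the exact invocation below is illustrative rather than canonical):
#
#     out = similar(x)
#     # all = Val{true} also returns the primal value; chunk and input length are left
#     # as Val{nothing} so they are derived automatically; multithreading is disabled
#     val, grad = gradient!(f, out, x, Val{true}, Val{nothing}, Val{nothing}, Val{false})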

@generated function call_gradient!{S,CHUNK,LEN,MULTITHREAD}(f, out::AbstractVector{S}, x::AbstractVector,
::Type{Val{CHUNK}}, ::Type{Val{LEN}},
::Type{Val{MULTITHREAD}})
gradf! = MULTITHREAD && IS_MULTITHREADED_JULIA ? :multi_calc_gradient! : :calc_gradient!
return :($(gradf!)(f, out, x, Val{$(CHUNK == nothing ? pick_chunk(LEN) : CHUNK)}, Val{LEN})::Tuple{S, $(out)})
@generated function call_gradient!(f, out, x, chunk, input_length, multithread)
input_length_value = value(input_length)
chunk_value = value(chunk) == nothing ? pick_chunk(input_length_value) : value(chunk)
use_chunk_mode = chunk_value != input_length_value
if use_chunk_mode
gradfunc! = value(multithread) && IS_MULTITHREADED_JULIA ? :multi_gradient_chunk_mode! : :gradient_chunk_mode!
return :($(gradfunc!)(f, out, x, Val{$(chunk_value)}, Val{$(input_length_value)}))
else
return :(gradient_vector_mode!(f, out, x, Val{$(input_length_value)}))
end
end

@generated function calc_gradient!{S,T,CHUNK,LEN}(f, out::AbstractVector{S}, x::AbstractVector{T},
::Type{Val{CHUNK}}, ::Type{Val{LEN}})
if CHUNK == LEN
body = VEC_MODE_EXPR
else
remainder = LEN % CHUNK == 0 ? CHUNK : LEN % CHUNK
fill_length = LEN - remainder
reseed_partials = remainder == CHUNK ? :() : :(seed_partials = cachefetch!(tid, Partials{CHUNK,T}, Val{$(remainder)}))
body = quote
workvec::Vector{DiffNumber{CHUNK,T}} = cachefetch!(tid, DiffNumber{CHUNK,T}, Val{LEN})
pzeros = zero(Partials{CHUNK,T})

@simd for i in 1:LEN
@inbounds workvec[i] = DiffNumber{CHUNK,T}(x[i], pzeros)
function gradient_vector_mode!{input_length}(f, out, x, ::Type{Val{input_length}})
@assert input_length == length(x) == length(out)
S, T = eltype(out), eltype(x)
tid = compat_threadid()
seed_partials::Vector{Partials{input_length,T}} = cachefetch!(tid, Partials{input_length,T})
workvec::Vector{DiffNumber{input_length,T}} = cachefetch!(tid, DiffNumber{input_length,T}, Val{input_length})

@simd for i in 1:input_length
@inbounds workvec[i] = DiffNumber{input_length,T}(x[i], seed_partials[i])
end

result::DiffNumber{input_length,S} = f(workvec)

@simd for i in 1:input_length
@inbounds out[i] = partials(result, i)
end

return value(result)::S, out
end
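
# Vector mode in a nutshell: with one seed per input slot, a single evaluation of f on
# the seeded dual numbers yields every partial at once. Assuming cachefetch! hands back
# the standard basis seeds (as its use above implies), for input_length = 3 the work
# vector is effectively
#
#     [DiffNumber(x[1], (1,0,0)), DiffNumber(x[2], (0,1,0)), DiffNumber(x[3], (0,0,1))]
#
# and partials(f(workvec), i) is the i-th component of the gradient.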

@generated function gradient_chunk_mode!{chunk,input_length}(f, out, x, ::Type{Val{chunk}}, ::Type{Val{input_length}})
remainder = input_length % chunk == 0 ? chunk : input_length % chunk
fill_length = input_length - remainder
reseed_partials = remainder == chunk ? :() : :(seed_partials = cachefetch!(tid, Partials{chunk,T}, Val{$(remainder)}))
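# Worked example of the chunking arithmetic above: for input_length = 10 and chunk = 4,
# remainder = 2 and fill_length = 8, so the generated loop sweeps two full chunks
# (indices 1:4 and 5:8) and the trailing block below handles indices 9:10 with a freshly
# fetched 2-element seed vector.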
return quote
@assert input_length == length(x) == length(out)
S, T = eltype(out), eltype(x)
tid = compat_threadid()
seed_partials::Vector{Partials{chunk,T}} = cachefetch!(tid, Partials{chunk,T})
workvec::Vector{DiffNumber{chunk,T}} = cachefetch!(tid, DiffNumber{chunk,T}, Val{input_length})
pzeros = zero(Partials{chunk,T})

@simd for i in 1:input_length
@inbounds workvec[i] = DiffNumber{chunk,T}(x[i], pzeros)
end

for c in 1:$(chunk):$(fill_length)
@simd for i in 1:chunk
j = i + c - 1
@inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
end
local result::DiffNumber{chunk,S} = f(workvec)
@simd for i in 1:chunk
j = i + c - 1
@inbounds out[j] = partials(result, i)
@inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
end
end

# Performing the final chunk manually seems to trigger some additional
# optimization heuristics, which results in more efficient memory allocation
$(reseed_partials)

@simd for i in 1:$(remainder)
j = $(fill_length) + i
@inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
end
result::DiffNumber{chunk,S} = f(workvec)
@simd for i in 1:$(remainder)
j = $(fill_length) + i
@inbounds out[j] = partials(result, i)
@inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
end

return value(result)::S, out
end
end

if IS_MULTITHREADED_JULIA
@generated function multi_gradient_chunk_mode!{chunk,input_length}(f, out, x, ::Type{Val{chunk}}, ::Type{Val{input_length}})
remainder = input_length % chunk == 0 ? chunk : input_length % chunk
fill_length = input_length - remainder
reseed_partials = remainder == chunk ? :() : :(seed_partials = cachefetch!(tid, Partials{chunk,T}, Val{$(remainder)}))
return quote
@assert input_length == length(x) == length(out)
S, T = eltype(out), eltype(x)
tid = compat_threadid()
seed_partials::Vector{Partials{chunk,T}} = cachefetch!(tid, Partials{chunk,T})
workvecs::NTuple{NTHREADS, Vector{DiffNumber{chunk,T}}} = cachefetch!(DiffNumber{chunk,T}, Val{input_length})
pzeros = zero(Partials{chunk,T})

Base.Threads.@threads for t in 1:NTHREADS
# see https://github.com/JuliaLang/julia/issues/14948
local workvec = workvecs[t]
@simd for i in 1:input_length
@inbounds workvec[i] = DiffNumber{chunk,T}(x[i], pzeros)
end
end

for c in 1:$(CHUNK):$(fill_length)
@simd for i in 1:CHUNK
Base.Threads.@threads for c in 1:$(chunk):$(fill_length)
local workvec = workvecs[compat_threadid()]
@simd for i in 1:chunk
j = i + c - 1
@inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
@inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
end
local result::DiffNumber{CHUNK,S} = f(workvec)
@simd for i in 1:CHUNK
local result::DiffNumber{chunk,S} = f(workvec)
@simd for i in 1:chunk
j = i + c - 1
@inbounds out[j] = partials(result, i)
@inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
@inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
end
end

# Performing the final chunk manually seems to trigger some additional
# optimization heuristics, which results in more efficient memory allocation
$(reseed_partials)

workvec = workvecs[tid]

@simd for i in 1:$(remainder)
j = $(fill_length) + i
@inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
@inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
end
result::DiffNumber{CHUNK,S} = f(workvec)

result::DiffNumber{chunk,S} = f(workvec)

@simd for i in 1:$(remainder)
j = $(fill_length) + i
@inbounds out[j] = partials(result, i)
@inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
@inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
end
end
end
return calc_gradient_expr(body)
end

if IS_MULTITHREADED_JULIA
@generated function multi_calc_gradient!{S,T,CHUNK,LEN}(f, out::AbstractVector{S}, x::AbstractVector{T},
::Type{Val{CHUNK}}, ::Type{Val{LEN}})
if CHUNK == LEN
body = VEC_MODE_EXPR
else
remainder = LEN % CHUNK == 0 ? CHUNK : LEN % CHUNK
fill_length = LEN - remainder
reseed_partials = remainder == CHUNK ? :() : :(seed_partials = cachefetch!(tid, Partials{CHUNK,T}, Val{$(remainder)}))
body = quote
workvecs::NTuple{NTHREADS, Vector{DiffNumber{CHUNK,T}}} = cachefetch!(DiffNumber{CHUNK,T}, Val{LEN})
pzeros = zero(Partials{CHUNK,T})

Base.Threads.@threads for t in 1:NTHREADS
# must be local, see https://github.com/JuliaLang/julia/issues/14948
local workvec = workvecs[t]
@simd for i in 1:LEN
@inbounds workvec[i] = DiffNumber{CHUNK,T}(x[i], pzeros)
end
end

Base.Threads.@threads for c in 1:$(CHUNK):$(fill_length)
local workvec = workvecs[Base.Threads.threadid()]
@simd for i in 1:CHUNK
j = i + c - 1
@inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
end
local result::DiffNumber{CHUNK,S} = f(workvec)
@simd for i in 1:CHUNK
j = i + c - 1
@inbounds out[j] = partials(result, i)
@inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
end
end

# Performing the final chunk manually seems to trigger some additional
# optimization heuristics, which results in more efficient memory allocation
$(reseed_partials)
workvec = workvecs[tid]
@simd for i in 1:$(remainder)
j = $(fill_length) + i
@inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
end
result::DiffNumber{CHUNK,S} = f(workvec)
@simd for i in 1:$(remainder)
j = $(fill_length) + i
@inbounds out[j] = partials(result, i)
@inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
end
end
return value(result)::S, out
end
return calc_gradient_expr(body)
end
end

const VEC_MODE_EXPR = quote
workvec::Vector{DiffNumber{CHUNK,T}} = cachefetch!(tid, DiffNumber{CHUNK,T}, Val{LEN})
@simd for i in 1:LEN
@inbounds workvec[i] = DiffNumber{CHUNK,T}(x[i], seed_partials[i])
end
result::DiffNumber{CHUNK,S} = f(workvec)
@simd for i in 1:LEN
@inbounds out[i] = partials(result, i)
end
end

function calc_gradient_expr(body)
return quote
@assert LEN == length(x) == length(out)
tid = $(IS_MULTITHREADED_JULIA ? Base.Threads.threadid() : 1)
seed_partials::Vector{Partials{CHUNK,T}} = cachefetch!(tid, Partials{CHUNK,T})
$(body)
return (value(result)::S, out)
end
end
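
A quick way to sanity-check the new entry points, sketched under the assumption that the
internal methods above are called directly (normal use goes through ForwardDiff's public
API): chunk mode and vector mode should produce the same gradient for any chunk size.

using ForwardDiff
f(x) = sum(sin, x)
x = rand(10)
out_chunk = similar(x)
out_vec = similar(x)
# Val{2} forces chunk mode; a chunk equal to the input length forces vector mode
ForwardDiff.gradient!(f, out_chunk, x, Val{false}, Val{2}, Val{10}, Val{false})
ForwardDiff.gradient!(f, out_vec, x, Val{false}, Val{10}, Val{10}, Val{false})
isapprox(out_chunk, out_vec) && isapprox(out_vec, map(cos, x))   # expected to be true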
