diff --git a/src/ForwardDiff.jl b/src/ForwardDiff.jl
index 65190c1c..1c7f19cf 100644
--- a/src/ForwardDiff.jl
+++ b/src/ForwardDiff.jl
@@ -6,11 +6,22 @@ import Calculus
 import NaNMath
 
 const IS_MULTITHREADED_JULIA = VERSION >= v"0.5.0-dev+923" && Base.Threads.nthreads() > 1
-const NTHREADS = IS_MULTITHREADED_JULIA ? Base.Threads.nthreads() : 1
+
+if IS_MULTITHREADED_JULIA
+    const NTHREADS = Base.Threads.nthreads()
+    @inline compat_threadid() = Base.Threads.threadid()
+else
+    const NTHREADS = 1
+    @inline compat_threadid() = 1
+end
+
 const AUTO_DEFINED_UNARY_FUNCS = map(first, Calculus.symbolic_derivatives_1arg())
 const NANMATH_FUNCS = (:sin, :cos, :tan, :asin, :acos, :acosh,
                        :atanh, :log, :log2, :log10, :lgamma, :log1p)
 
+@inline value{x}(::Type{Val{x}}) = x
+@inline value{x}(::Type{Type{Val{x}}}) = x
+
 include("Partials.jl")
 include("DiffNumber.jl")
 include("cache.jl")
diff --git a/src/gradient.jl b/src/gradient.jl
index 04aa7fd4..37a8247d 100644
--- a/src/gradient.jl
+++ b/src/gradient.jl
@@ -25,184 +25,179 @@ end
 
 ######################
 # gradient!/gradient #
 ######################
 
-@generated function gradient!{S,ALL,CHUNK,LEN,MULTITHREAD}(f, out::AbstractVector{S}, x::AbstractVector,
-                                                           ::Type{Val{ALL}}, ::Type{Val{CHUNK}},
-                                                           ::Type{Val{LEN}}, ::Type{Val{MULTITHREAD}})
+@generated function gradient!(f, out, x, all::DataType, chunk::DataType,
+                              input_length::DataType, multithread::DataType)
+    input_length_value = value(input_length) == nothing ? :(length(x)) : value(input_length)
+    return_statement = value(all) ? :(val::$(eltype(out)), out::$(out)) : :(out::$(out))
     return quote
-        val, grad = call_gradient!(f, out, x, Val{CHUNK}, Val{$(LEN == nothing ? :(length(x)) : LEN)}, Val{MULTITHREAD})
-        return $(ALL ? :(val::S, out::$(out)) : :(out::$(out)))
+        val, _ = call_gradient!(f, out, x, chunk, Val{$(input_length_value)}, multithread)
+        return $(return_statement)
     end
 end
 
-function gradient{ALL,CHUNK,LEN,MULTITHREAD}(f, x::AbstractVector, ::Type{Val{ALL}}, ::Type{Val{CHUNK}},
-                                             ::Type{Val{LEN}}, ::Type{Val{MULTITHREAD}})
-    return gradient!(f, similar(x), x, Val{ALL}, Val{CHUNK}, Val{LEN}, Val{MULTITHREAD})
+function gradient(f, x, all::DataType, chunk::DataType, input_length::DataType, multithread::DataType)
+    return gradient!(f, similar(x), x, all, chunk, input_length, multithread)
 end
 
-@generated function gradient{ALL,CHUNK,LEN,MULTITHREAD,MUTATES}(f, ::Type{Val{ALL}}, ::Type{Val{CHUNK}},
-                                                                ::Type{Val{LEN}}, ::Type{Val{MULTITHREAD}},
-                                                                ::Type{Val{MUTATES}})
-    if MUTATES
-        R = ALL ? :(Tuple{S,typeof(out)}) : :(typeof(out))
+@generated function gradient(f, all::DataType, chunk::DataType, input_length::DataType,
+                             multithread::DataType, output_mutates::DataType)
+    if value(output_mutates)
+        R = value(all) ? :(Tuple{eltype(out),typeof(out)}) : :(typeof(out))
         return quote
-            g!{S}(out::AbstractVector{S}, x::AbstractVector) = gradient!(f, out, x, Val{ALL}, Val{CHUNK}, Val{LEN}, Val{MULTITHREAD})::$(R)
+            g!(out, x) = gradient!(f, out, x, all, chunk, input_length, multithread)::$(R)
             return g!
         end
     else
-        R = ALL ? :(Tuple{S,typeof(x)}) : :(typeof(x))
+        R = value(all) ? :(Tuple{eltype(x),typeof(x)}) : :(typeof(x))
         return quote
-            g{S}(x::AbstractVector{S}) = gradient(f, x, Val{ALL}, Val{CHUNK}, Val{LEN}, Val{MULTITHREAD})::$(R)
+            g(x) = gradient(f, x, all, chunk, input_length, multithread)::$(R)
             return g
         end
     end
 end
 
-##################
-# calc_gradient! #
-##################
-# The below code is pretty ugly, so here's an overview:
-#
-# `call_gradient!` is the entry point that is called by the API functions. If a chunk size
+######################################
+# call_gradient!/workhorse functions #
+######################################
+
+# `call_gradient!` is the entry point that is called by the API functions. It decides which
+# workhorse function to call based on the provided parameters. Note that if a chunk size
 # isn't given by an upstream caller, `call_gradient!` picks one based on the input length.
-#
-# `calc_gradient!` is the workhorse function - it generates code for calculating the
-# gradient in chunk-mode or vector-mode, depending on the input length and chunk size.
-#
-# `multi_calc_gradient!` is just like `_calc_gradient!`, but uses Julia's multithreading
-# capabilities when performing calculations in chunk-mode.
-#
-# `VEC_MODE_EXPR` is a constant expression for vector-mode that provides the function body
-# for `calc_gradient!` and `multi_calc_gradient!` when chunk size equals input length.
-#
-# `calc_gradient_expr` takes in a vector-mode expression body or chunk-mode expression body
-# and returns a completed function body with the input body injected in the correct place.
-
-@generated function call_gradient!{S,CHUNK,LEN,MULTITHREAD}(f, out::AbstractVector{S}, x::AbstractVector,
-                                                            ::Type{Val{CHUNK}}, ::Type{Val{LEN}},
-                                                            ::Type{Val{MULTITHREAD}})
-    gradf! = MULTITHREAD && IS_MULTITHREADED_JULIA ? :multi_calc_gradient! : :calc_gradient!
-    return :($(gradf!)(f, out, x, Val{$(CHUNK == nothing ? pick_chunk(LEN) : CHUNK)}, Val{LEN})::Tuple{S, $(out)})
+@generated function call_gradient!(f, out, x, chunk, input_length, multithread)
+    input_length_value = value(input_length)
+    chunk_value = value(chunk) == nothing ? pick_chunk(input_length_value) : value(chunk)
+    use_chunk_mode = chunk_value != input_length_value
+    if use_chunk_mode
+        gradfunc! = value(multithread) && IS_MULTITHREADED_JULIA ? :multi_gradient_chunk_mode! : :gradient_chunk_mode!
+        return :($(gradfunc!)(f, out, x, Val{$(chunk_value)}, Val{$(input_length_value)}))
+    else
+        return :(gradient_vector_mode!(f, out, x, Val{$(input_length_value)}))
+    end
 end
 
-@generated function calc_gradient!{S,T,CHUNK,LEN}(f, out::AbstractVector{S}, x::AbstractVector{T},
-                                                  ::Type{Val{CHUNK}}, ::Type{Val{LEN}})
-    if CHUNK == LEN
-        body = VEC_MODE_EXPR
-    else
-        remainder = LEN % CHUNK == 0 ? CHUNK : LEN % CHUNK
-        fill_length = LEN - remainder
-        reseed_partials = remainder == CHUNK ? :() : :(seed_partials = cachefetch!(tid, Partials{CHUNK,T}, Val{$(remainder)}))
-        body = quote
-            workvec::Vector{DiffNumber{CHUNK,T}} = cachefetch!(tid, DiffNumber{CHUNK,T}, Val{LEN})
-            pzeros = zero(Partials{CHUNK,T})
-
-            @simd for i in 1:LEN
-                @inbounds workvec[i] = DiffNumber{CHUNK,T}(x[i], pzeros)
+function gradient_vector_mode!{input_length}(f, out, x, ::Type{Val{input_length}})
+    @assert input_length == length(x) == length(out)
+    S, T = eltype(out), eltype(x)
+    tid = compat_threadid()
+    seed_partials::Vector{Partials{input_length,T}} = cachefetch!(tid, Partials{input_length,T})
+    workvec::Vector{DiffNumber{input_length,T}} = cachefetch!(tid, DiffNumber{input_length,T}, Val{input_length})
+
+    @simd for i in 1:input_length
+        @inbounds workvec[i] = DiffNumber{input_length,T}(x[i], seed_partials[i])
+    end
+
+    result::DiffNumber{input_length,S} = f(workvec)
+
+    @simd for i in 1:input_length
+        @inbounds out[i] = partials(result, i)
+    end
+
+    return value(result)::S, out
+end
+
+@generated function gradient_chunk_mode!{chunk,input_length}(f, out, x, ::Type{Val{chunk}}, ::Type{Val{input_length}})
+    remainder = input_length % chunk == 0 ? chunk : input_length % chunk
+    fill_length = input_length - remainder
+    reseed_partials = remainder == chunk ? :() : :(seed_partials = cachefetch!(tid, Partials{chunk,T}, Val{$(remainder)}))
+    return quote
+        @assert input_length == length(x) == length(out)
+        S, T = eltype(out), eltype(x)
+        tid = compat_threadid()
+        seed_partials::Vector{Partials{chunk,T}} = cachefetch!(tid, Partials{chunk,T})
+        workvec::Vector{DiffNumber{chunk,T}} = cachefetch!(tid, DiffNumber{chunk,T}, Val{input_length})
+        pzeros = zero(Partials{chunk,T})
+
+        @simd for i in 1:input_length
+            @inbounds workvec[i] = DiffNumber{chunk,T}(x[i], pzeros)
+        end
+
+        for c in 1:$(chunk):$(fill_length)
+            @simd for i in 1:chunk
+                j = i + c - 1
+                @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
+            end
+            local result::DiffNumber{chunk,S} = f(workvec)
+            @simd for i in 1:chunk
+                j = i + c - 1
+                @inbounds out[j] = partials(result, i)
+                @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
+            end
+        end
+
+        # Performing the final chunk manually seems to trigger some additional
+        # optimization heuristics, which results in more efficient memory allocation
+        $(reseed_partials)
+
+        @simd for i in 1:$(remainder)
+            j = $(fill_length) + i
+            @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
+        end
+        result::DiffNumber{chunk,S} = f(workvec)
+        @simd for i in 1:$(remainder)
+            j = $(fill_length) + i
+            @inbounds out[j] = partials(result, i)
+            @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
+        end
+
+        return value(result)::S, out
+    end
+end
+
+if IS_MULTITHREADED_JULIA
+    @generated function multi_gradient_chunk_mode!{chunk,input_length}(f, out, x, ::Type{Val{chunk}}, ::Type{Val{input_length}})
+        remainder = input_length % chunk == 0 ? chunk : input_length % chunk
+        fill_length = input_length - remainder
+        reseed_partials = remainder == chunk ? :() : :(seed_partials = cachefetch!(tid, Partials{chunk,T}, Val{$(remainder)}))
+        return quote
+            @assert input_length == length(x) == length(out)
+            S, T = eltype(out), eltype(x)
+            tid = compat_threadid()
+            seed_partials::Vector{Partials{chunk,T}} = cachefetch!(tid, Partials{chunk,T})
+            workvecs::NTuple{NTHREADS, Vector{DiffNumber{chunk,T}}} = cachefetch!(DiffNumber{chunk,T}, Val{input_length})
+            pzeros = zero(Partials{chunk,T})
+
+            Base.Threads.@threads for t in 1:NTHREADS
+                # see https://github.com/JuliaLang/julia/issues/14948
+                local workvec = workvecs[t]
+                @simd for i in 1:input_length
+                    @inbounds workvec[i] = DiffNumber{chunk,T}(x[i], pzeros)
+                end
             end
 
-            for c in 1:$(CHUNK):$(fill_length)
-                @simd for i in 1:CHUNK
+            Base.Threads.@threads for c in 1:$(chunk):$(fill_length)
+                local workvec = workvecs[compat_threadid()]
+                @simd for i in 1:chunk
                     j = i + c - 1
-                    @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
+                    @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
                 end
-                local result::DiffNumber{CHUNK,S} = f(workvec)
-                @simd for i in 1:CHUNK
+                local result::DiffNumber{chunk,S} = f(workvec)
+                @simd for i in 1:chunk
                     j = i + c - 1
                     @inbounds out[j] = partials(result, i)
-                    @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
+                    @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
                 end
             end
             # Performing the final chunk manually seems to triggers some additional
             # optimization heuristics, which results in more efficient memory allocation
             $(reseed_partials)
+
+            workvec = workvecs[tid]
+
             @simd for i in 1:$(remainder)
                 j = $(fill_length) + i
-                @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
+                @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], seed_partials[i])
             end
-            result::DiffNumber{CHUNK,S} = f(workvec)
+
+            result::DiffNumber{chunk,S} = f(workvec)
+
             @simd for i in 1:$(remainder)
                 j = $(fill_length) + i
                 @inbounds out[j] = partials(result, i)
-                @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
+                @inbounds workvec[j] = DiffNumber{chunk,T}(x[j], pzeros)
             end
-        end
-    end
-    return calc_gradient_expr(body)
-end
-
-if IS_MULTITHREADED_JULIA
-    @generated function multi_calc_gradient!{S,T,CHUNK,LEN}(f, out::AbstractVector{S}, x::AbstractVector{T},
-                                                            ::Type{Val{CHUNK}}, ::Type{Val{LEN}})
-        if CHUNK == LEN
-            body = VEC_MODE_EXPR
-        else
-            remainder = LEN % CHUNK == 0 ? CHUNK : LEN % CHUNK
-            fill_length = LEN - remainder
-            reseed_partials = remainder == CHUNK ? :() : :(seed_partials = cachefetch!(tid, Partials{CHUNK,T}, Val{$(remainder)}))
-            body = quote
-                workvecs::NTuple{NTHREADS, Vector{DiffNumber{CHUNK,T}}} = cachefetch!(DiffNumber{CHUNK,T}, Val{LEN})
-                pzeros = zero(Partials{CHUNK,T})
-
-                Base.Threads.@threads for t in 1:NTHREADS
-                    # must be local, see https://github.com/JuliaLang/julia/issues/14948
-                    local workvec = workvecs[t]
-                    @simd for i in 1:LEN
-                        @inbounds workvec[i] = DiffNumber{CHUNK,T}(x[i], pzeros)
-                    end
-                end
-
-                Base.Threads.@threads for c in 1:$(CHUNK):$(fill_length)
-                    local workvec = workvecs[Base.Threads.threadid()]
-                    @simd for i in 1:CHUNK
-                        j = i + c - 1
-                        @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
-                    end
-                    local result::DiffNumber{CHUNK,S} = f(workvec)
-                    @simd for i in 1:CHUNK
-                        j = i + c - 1
-                        @inbounds out[j] = partials(result, i)
-                        @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
-                    end
-                end
-                # Performing the final chunk manually seems to triggers some additional
-                # optimization heuristics, which results in more efficient memory allocation
-                $(reseed_partials)
-                workvec = workvecs[tid]
-                @simd for i in 1:$(remainder)
-                    j = $(fill_length) + i
-                    @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], seed_partials[i])
-                end
-                result::DiffNumber{CHUNK,S} = f(workvec)
-                @simd for i in 1:$(remainder)
-                    j = $(fill_length) + i
-                    @inbounds out[j] = partials(result, i)
-                    @inbounds workvec[j] = DiffNumber{CHUNK,T}(x[j], pzeros)
-                end
-            end
+
+            return value(result)::S, out
         end
-        return calc_gradient_expr(body)
     end
-end
-
-const VEC_MODE_EXPR = quote
-    workvec::Vector{DiffNumber{CHUNK,T}} = cachefetch!(tid, DiffNumber{CHUNK,T}, Val{LEN})
-    @simd for i in 1:LEN
-        @inbounds workvec[i] = DiffNumber{CHUNK,T}(x[i], seed_partials[i])
-    end
-    result::DiffNumber{CHUNK,S} = f(workvec)
-    @simd for i in 1:LEN
-        @inbounds out[i] = partials(result, i)
-    end
-end
-
-function calc_gradient_expr(body)
-    return quote
-        @assert LEN == length(x) == length(out)
-        tid = $(IS_MULTITHREADED_JULIA ? Base.Threads.threadid() : 1)
-        seed_partials::Vector{Partials{CHUNK,T}} = cachefetch!(tid, Partials{CHUNK,T})
-        $(body)
-        return (value(result)::S, out)
     end
 end
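A minimal usage sketch (not part of the patch) of how the reworked entry points above might be driven directly. The target function, the input size, and the direct calls to ForwardDiff.gradient!/ForwardDiff.gradient with bare Val-typed positional arguments are illustrative assumptions; in normal use these arguments would be filled in by the package's public API wrappers rather than written out by hand.

    using ForwardDiff

    f(x) = sum(sin, x)   # hypothetical target function
    x = rand(10)
    out = similar(x)

    # all = Val{false} (return only the gradient), chunk = Val{nothing} (let
    # call_gradient! pick a chunk size), input_length = Val{nothing} (taken from
    # length(x) at runtime), multithread = Val{false} (single-threaded workhorse).
    ForwardDiff.gradient!(f, out, x, Val{false}, Val{nothing}, Val{nothing}, Val{false})

    # Closure-returning form: output_mutates = Val{false}, so the returned `g`
    # allocates and returns a fresh gradient vector on each call.
    g = ForwardDiff.gradient(f, Val{false}, Val{nothing}, Val{nothing}, Val{false}, Val{false})
    g(x)

    # Passing Val{true} for multithread only routes chunk-mode work through
    # multi_gradient_chunk_mode! when IS_MULTITHREADED_JULIA is true.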