From 6f311f01e9671509656e5f74562e663b5d1fe854 Mon Sep 17 00:00:00 2001
From: Jarrett Revels
Date: Mon, 8 Feb 2016 18:30:46 -0500
Subject: [PATCH] Wrap multithreading code in VERSION conditionals in order to
 support v0.4

---
 .travis.yml        |  2 +-
 src/ForwardDiff.jl |  3 +-
 src/cache.jl       |  4 +--
 src/gradient.jl    | 82 ++++++++++++++++++++++++----------------------
 4 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index c0904ee6..47ffb0d8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -9,4 +9,4 @@ script:
   - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
   - julia -e 'Pkg.clone(pwd()); Pkg.build("ForwardDiff"); Pkg.test("ForwardDiff"; coverage=true)';
 after_success:
-  - julia -e 'cd(Pkg.dir("ForwardDiff")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
\ No newline at end of file
+  - julia -e 'cd(Pkg.dir("ForwardDiff")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())'
diff --git a/src/ForwardDiff.jl b/src/ForwardDiff.jl
index 0a2256ab..e4975efb 100644
--- a/src/ForwardDiff.jl
+++ b/src/ForwardDiff.jl
@@ -2,10 +2,11 @@ isdefined(Base, :__precompile__) && __precompile__()
 
 module ForwardDiff
 
-import Base.Threads
 import Calculus
 import NaNMath
 
+const THREAD_VERSION = v"0.5.0-dev+923"
+const NTHREADS = VERSION >= THREAD_VERSION ? Base.Threads.nthreads() : 1
 const AUTO_DEFINED_UNARY_FUNCS = map(first, Calculus.symbolic_derivatives_1arg())
 const NANMATH_FUNCS = (:sin, :cos, :tan, :asin, :acos, :acosh,
                        :atanh, :log, :log2, :log10, :lgamma, :log1p)
diff --git a/src/cache.jl b/src/cache.jl
index 303328b1..c989118d 100644
--- a/src/cache.jl
+++ b/src/cache.jl
@@ -1,4 +1,4 @@
-const CACHE = ntuple(n -> Dict{DataType,Any}(), Threads.nthreads())
+const CACHE = ntuple(n -> Dict{DataType,Any}(), NTHREADS)
 
 function clearcache!()
     for d in CACHE
@@ -6,7 +6,7 @@ function clearcache!()
     end
 end
 
-@eval cachefetch!(D::DataType, L::DataType) = $(Expr(:tuple, [:(cachefetch!($i, D, L)) for i in 1:Threads.nthreads()]...))
+@eval cachefetch!(D::DataType, L::DataType) = $(Expr(:tuple, [:(cachefetch!($i, D, L)) for i in 1:NTHREADS]...))
 
 function cachefetch!{N,T,L}(tid::Integer, ::Type{DiffNumber{N,T}}, ::Type{Val{L}})
     K = Tuple{DiffNumber{N,T},L}
diff --git a/src/gradient.jl b/src/gradient.jl
index 43a6fb82..14d8d236 100644
--- a/src/gradient.jl
+++ b/src/gradient.jl
@@ -122,57 +122,59 @@ end
     return calc_gradient_expr(body)
 end
 
-@generated function multi_calc_gradient!{S,T,N,L}(f, output::Vector{S}, x::Vector{T}, ::Type{Val{N}}, ::Type{Val{L}})
-    if N == L
-        body = VEC_MODE_EXPR
-    else
-        nthreads = Threads.nthreads()
-        remainder = L % N == 0 ? N : L % N
-        fill_length = L - remainder
-        reseed_partials = remainder == N ? :() : :(seed_partials = cachefetch!(tid, Partials{N,T}, Val{$(remainder)}))
-        body = quote
-            workvecs::NTuple{$(nthreads), Vector{DiffNumber{N,T}}} = cachefetch!(DiffNumber{N,T}, Val{L})
-            pzeros = zero(Partials{N,T})
+if VERSION >= THREAD_VERSION
+    @generated function multi_calc_gradient!{S,T,N,L}(f, output::Vector{S}, x::Vector{T}, ::Type{Val{N}}, ::Type{Val{L}})
+        if N == L
+            body = VEC_MODE_EXPR
+        else
+            nthreads = Threads.nthreads()
+            remainder = L % N == 0 ? N : L % N
+            fill_length = L - remainder
+            reseed_partials = remainder == N ? :() : :(seed_partials = cachefetch!(tid, Partials{N,T}, Val{$(remainder)}))
+            body = quote
+                workvecs::NTuple{$(nthreads), Vector{DiffNumber{N,T}}} = cachefetch!(DiffNumber{N,T}, Val{L})
+                pzeros = zero(Partials{N,T})
+
+                Threads.@threads for t in 1:$(nthreads)
+                    # must be local, see https://github.com/JuliaLang/julia/issues/14948
+                    local workvec = workvecs[t]
+                    @simd for i in 1:L
+                        @inbounds workvec[i] = DiffNumber{N,T}(x[i], pzeros)
+                    end
+                end
 
-            Threads.@threads for t in 1:$(nthreads)
-                # must be local, see https://github.com/JuliaLang/julia/issues/14948
-                local workvec = workvecs[t]
-                @simd for i in 1:L
-                    @inbounds workvec[i] = DiffNumber{N,T}(x[i], pzeros)
+                Threads.@threads for c in 1:$(N):$(fill_length)
+                    local workvec = workvecs[Threads.threadid()]
+                    @simd for i in 1:N
+                        j = i + c - 1
+                        @inbounds workvec[j] = DiffNumber{N,T}(x[j], seed_partials[i])
+                    end
+                    local result::DiffNumber{N,S} = f(workvec)
+                    @simd for i in 1:N
+                        j = i + c - 1
+                        @inbounds output[j] = partials(result, i)
+                        @inbounds workvec[j] = DiffNumber{N,T}(x[j], pzeros)
+                    end
+                end
-                end
-            Threads.@threads for c in 1:$(N):$(fill_length)
-                local workvec = workvecs[Threads.threadid()]
-                @simd for i in 1:N
-                    j = i + c - 1
+                # Performing the final chunk manually seems to triggers some additional
+                # optimization heuristics, which results in more efficient memory allocation
+                $(reseed_partials)
+                workvec = workvecs[tid]
+                @simd for i in 1:$(remainder)
+                    j = $(fill_length) + i
                     @inbounds workvec[j] = DiffNumber{N,T}(x[j], seed_partials[i])
                 end
-                local result::DiffNumber{N,S} = f(workvec)
-                @simd for i in 1:N
-                    j = i + c - 1
+                result::DiffNumber{N,S} = f(workvec)
+                @simd for i in 1:$(remainder)
+                    j = $(fill_length) + i
                     @inbounds output[j] = partials(result, i)
                     @inbounds workvec[j] = DiffNumber{N,T}(x[j], pzeros)
                 end
             end
-
-            # Performing the final chunk manually seems to triggers some additional
-            # optimization heuristics, which results in more efficient memory allocation
-            $(reseed_partials)
-            workvec = workvecs[tid]
-            @simd for i in 1:$(remainder)
-                j = $(fill_length) + i
-                @inbounds workvec[j] = DiffNumber{N,T}(x[j], seed_partials[i])
-            end
-            result::DiffNumber{N,S} = f(workvec)
-            @simd for i in 1:$(remainder)
-                j = $(fill_length) + i
-                @inbounds output[j] = partials(result, i)
-                @inbounds workvec[j] = DiffNumber{N,T}(x[j], pzeros)
-            end
         end
+        return calc_gradient_expr(body)
     end
-    return calc_gradient_expr(body)
 end
 
 const VEC_MODE_EXPR = quote