
General updates
General updates after changing example networks to use map instead of dot broadcasting.
maetshju committed Oct 13, 2020
1 parent 37efaa0 commit f471337
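
For reference, a minimal sketch of the map-versus-broadcasting change the commit message describes. The layer `m` and sequence `xs` below are hypothetical stand-ins, not code from the example networks in this commit:

using Flux

# Toy model and a 3-step sequence of input vectors (illustrative only)
m = Dense(4, 2)
xs = [rand(Float32, 4) for _ in 1:3]

ŷ_dot = m.(xs)       # applying the layer with dot broadcasting
ŷ_map = map(m, xs)   # the same computation written with map

Both forms produce a vector of per-timestep outputs; the examples were switched to the map spelling.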
Showing 3 changed files with 92 additions and 127 deletions.
src/losses/ctc-gpu.jl (143 changes: 60 additions & 83 deletions)
@@ -6,8 +6,9 @@

using Flux
using Statistics
using CuArrays
using CUDAnative
using CUDA

const MAX_THREADS = 256

function log_plus_f(p1, p2)

@@ -18,7 +19,7 @@ function log_plus_f(p1, p2)
p1, p2 = p2, p1
end

return p1 + CUDAnative.log(1+CUDAnative.exp(p2 - p1))
return p1 + CUDA.log(1+CUDA.exp(p2 - p1))
end

function countRepeats(A)
@@ -51,24 +52,21 @@ function computeAlphaKernel(probs, labelSize, uttLength, repeats, labelsWithoutB
# Fill in first column (time step)
i = tid
while i <= last - start
alpha[start + i] = probs[labels[start + i]]
alpha[start+i, 1] = probs[labels[start+i], 1]
i += blockDim().x
end

sync_threads()

# Fill in coefficients for each time step
for t=2:T
startCurCol = (t-1) * S
startPrevCol = (t-2) * S
startProbCol = (t-1) * div(length(probs), T)

# Corner-case checking
if tid == 1 && !(1 < S - 2*(T-t) - 1)
if start == 0
alpha[startCurCol + 1] = probs[startProbCol + blankLabel] + alpha[startPrevCol + 1]
alpha[1, t] = probs[blankLabel, t] + alpha[1, t-1]
elseif start == 1
alpha[startCurCol + 1] = alpha[startPrevCol + 1]
alpha[1, t] = alpha[1, t-1]
end
end

@@ -79,16 +77,16 @@ function computeAlphaKernel(probs, labelSize, uttLength, repeats, labelsWithoutB
idx = tid+1
while idx <= S

prevSum = log_plus_f(alpha[startPrevCol + idx], alpha[startPrevCol + idx-1])
prevSum = log_plus_f(alpha[idx, t-1], alpha[idx-1, t-1])

if labels[idx] != blankLabel && idx != 2 && labels[idx] != labels[idx-2]
prevSum = log_plus_f(prevSum, alpha[startPrevCol + idx-2])
prevSum = log_plus_f(prevSum, alpha[idx-2, t-1])
end

if idx < S - 2*(T-t) - 1
alpha[idx + startCurCol] = -Inf32
alpha[idx, t] = -Inf32
else
alpha[startCurCol + idx] = prevSum + probs[startProbCol + labels[idx]]
alpha[idx, t] = prevSum + probs[labels[idx], t]
end

idx += blockDim().x
@@ -122,50 +120,40 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,

sync_threads()


startCurCol = (T-1)*S
startProbCol = (T-1) * div(length(probs), T)

i = tid

# Calculate coefficients for last column (time step)
# then determine alpha and beta product
while i <= last - start + 1

beta[startCurCol + i + start] = 0
output[startCurCol + i + start] = beta[startCurCol + i + start] + alphas[startCurCol + i + start]
beta[i+start, T] = 0
output[i+start, T] = beta[i+start, T] + alphas[i+start, T]
i += blockDim().x
end

sync_threads()

# Fill in `accum` for last column (time step)
if tid == 1
startAccCol = startProbCol
startOutputCol = startCurCol

if tid == 1
for i=1:S
labelIdx = labels[i]
accum[startAccCol + labelIdx] = log_plus_f(accum[startAccCol + labelIdx], output[startOutputCol + i])
accum[labelIdx, T] = log_plus_f(accum[labelIdx, T], output[i, T])
end
end

sync_threads()

# Fill in `grad` for last column (time step)
idx = tid
while idx <= CUDAnative.div_fast(Float32(length(grad)), Float32(T))
#
startProbCol = (T - 1) * div(length(probs), T)
startOutputCol = (T - 1) * S
while idx <= size(grad, 1)

s = -Inf32

for i=1:S
s = log_plus_f(s, output[startOutputCol + i])
s = log_plus_f(s, output[i, T])
end

# ∂L/∂a (where a is activation before logsoftmax)
grad[startProbCol + idx] = CUDAnative.exp(probs[startProbCol + idx]) - CUDAnative.exp(accum[startProbCol + idx] - s)
grad[idx, T] = CUDA.exp(probs[idx, T]) - CUDA.exp(accum[idx, T] - s)
idx += blockDim().x
end

@@ -174,28 +162,29 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
# Fill in the rest of the coefficients
t = T-1
while t >= 1

startCurCol = (t-1)*S
startNextCol = t*S
startProbCol = t * div(length(probs), T)

if t < T

idx = tid
while idx <= S-1
# while idx <= S-1
while idx <= S

nextSum = log_plus_f(beta[startNextCol + idx] + probs[startProbCol + labels[idx]],
beta[startNextCol + idx+1] + probs[startProbCol + labels[idx+1]])
nextSum = beta[idx, t+1] + probs[labels[idx], t+1]

if idx < S

nextSum = log_plus_f(nextSum,
beta[idx+1, t+1] + probs[labels[idx+1], t+1])
end

if labels[idx] != blankLabel && idx != S-1 && labels[idx] != labels[idx+2]
nextSum = log_plus_f(nextSum,
beta[startNextCol + idx + 2] + probs[startProbCol + labels[idx+2]])
beta[idx + 2, t+1] + probs[labels[idx+2], t+1])
end

if idx > 2*t
beta[idx + startCurCol] = -Inf32
beta[idx, t] = -Inf32
else
beta[idx + startCurCol] = nextSum
beta[idx, t] = nextSum

end

@@ -205,14 +194,14 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
sync_threads()

if tid == 1 && last == S
beta[startCurCol + S] = beta[startNextCol + S] + probs[startProbCol + blankLabel]
beta[S, t] = beta[S, t] + probs[blankLabel, t+1]
end

sync_threads()

idx = tid
while idx <= S
output[startCurCol + idx] = alphas[idx+startCurCol] + beta[startCurCol + idx]
output[idx, t] = alphas[idx, t] + beta[idx, t]
idx += blockDim().x
end

@@ -224,14 +213,10 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,

# Calculate accumulated alpha-beta products for each label class for
# each time step; used in calculating gradients
if tid == 1

startAccCol = (t-1) * div(length(accum), T)
startOutputCol = (t-1) * S

if tid == 1
for i=1:S
labelIdx = labels[i]
accum[startAccCol + labelIdx] = log_plus_f(accum[startAccCol + labelIdx], output[startOutputCol + i])
accum[labelIdx, t] = log_plus_f(accum[labelIdx, t], output[i, t])
end
end

@@ -240,75 +225,67 @@ function computeBetasAndGradKernel(probs, labelSize, uttLength,
idx = tid

# Calculate gradients
while idx <= CUDAnative.div_fast(Float32(length(grad)), Float32(T))
#
startProbCol = (t - 1) * div(length(probs), T)
startOutputCol = (t - 1) * S
while idx <= size(grad, 1)

s = -Inf32

for i=1:S
s = log_plus_f(s, output[startOutputCol + i])
s = log_plus_f(s, output[i, t])
end

# ∂L/∂a (where a is activation before logsoftmax)
grad[startProbCol + idx] = CUDAnative.exp(probs[startProbCol + idx]) - CUDAnative.exp(accum[startProbCol + idx] - s)
grad[idx, t] = CUDA.exp(probs[idx, t]) - CUDA.exp(accum[idx, t] - s)
idx += blockDim().x
end

sync_threads()

t -= 1
sync_threads()
# because of course, it wouldn't work without this earlier return statement
# otherwise, some of the gradient values become 0
t == 0 && return
end

return nothing
end

ctc(ŷ::CuArrays.CuArray, y::Array) = ctc_(ŷ, y)[1] |> mean

ctc(ŷ::Array, y::CuArrays.CuArray) = ctc_(CuArray(ŷ), y)[1] |> mean

ctc(ŷ::CuArrays.CuArray, y::CuArrays.CuArray) = ctc_(ŷ, y)[1] |> mean

# methods for `ctc_` helper function
ctc_(ŷ::Array, y::CuArrays.CuArray) = ctc_(CuArray(ŷ), y)
ctc(ŷ::CuArray, y::Array) = ctc_(ŷ, y)[1] |> mean
ctc(ŷ::Array, y::CuArray) = ctc_(CuArray(ŷ), collect(y))[1] |> mean
ctc(ŷ::CuArray, y::CuArray) = ctc_(ŷ, collect(y))[1] |> mean
ctc_(ŷ::Array, y::CuArray) = ctc_(CuArray(ŷ), collect(y))

function ctc_(ŷ::CuArrays.CuArray, y)
function ctc_(ŷ::CuArray, y)

ŷ = logsoftmax(ŷ)

blank = size(ŷ, 1)
labels = vec(mapslices(Base.argmax, y, dims=1))
labels = [Base.argmax(y[:,i]) for i in 1:size(y, 2)]
z = F(labels, blank)
z′ = [blank]
for label in z
push!(z′, label)
push!(z′, blank)
end

T = size(ŷ, 2)
U′ = 2*length(z) + 1
alphas = CuArrays.fill(log(zero(ŷ[1])), T * U′)
betas = copy(alphas)
output = copy(alphas)

alphas = CUDA.fill(log(zero(ŷ[1])), U′, T)
betas = CUDA.fill(log(zero(ŷ[1])), U′, T)
output = CUDA.fill(log(zero(ŷ[1])), U′, T)

nRepeats = countRepeats(labels)
nThreads = min(U′, MAX_THREADS)

# 1 block with `U′` threads
@cuda blocks=1 threads=U′ computeAlphaKernel(ŷ, length(z), size(ŷ,2), nRepeats, CuArray(z), CuArray(z′), alphas, blank)
@cuda blocks=1 threads=nThreads computeAlphaKernel(ŷ, length(z), size(ŷ,2), nRepeats, CuArray(z), CuArray(z′), alphas, blank)

grads = CuArrays.fill(log(zero(ŷ[1])), length(ŷ))
accum = copy(grads)
grads = CUDA.fill(log(zero(ŷ[1])), size(ŷ))
accum = CUDA.fill(log(zero(ŷ[1])), size(ŷ))

@cuda blocks=1 threads=U′ computeBetasAndGradKernel(ŷ, length(z), size(ŷ,2), nRepeats, CuArray(z′), alphas, betas, output, accum, grads, blank)
@cuda blocks=1 threads=nThreads computeBetasAndGradKernel(ŷ, length(z), size(ŷ,2), nRepeats, CuArray(z′), alphas, betas, output, accum, grads, blank)

ls = reshape(collect(output), U′, T)
ls = -1 .* mapslices(logsum, ls, dims=1) |> vec

gs = reshape(grads, size(ŷ,1), size(ŷ,2))

ŷ = alphas = betas = output = accum = grads = nothing
return ls, gs
ls = collect(output)
ls = vec(-1 .* [logsum(ls[:,i]) for i in 1:size(ls, 2)])

ŷ = alphas = betas = output = accum = nothing
return ls, grads
end
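
For context, a hedged usage sketch of the GPU code path after these changes. The shapes, class count, and label sequence are made up, and `ctc` (along with the helpers `F` and `logsum` defined elsewhere in the package) is assumed to be in scope from this file:

using Flux, CUDA

# Hypothetical example: 5 output classes (the 5th row acts as the blank) over 10 time steps.
ŷ = CUDA.rand(Float32, 5, 10)        # raw network activations; logsoftmax is applied inside ctc_

# Framewise one-hot targets, one column per time step (labels chosen arbitrarily).
labels = [1, 1, 2, 2, 3, 3, 3, 4, 4, 4]
y = zeros(Float32, 5, 10)
for (t, l) in enumerate(labels)
    y[l, t] = 1f0
end

loss = ctc(ŷ, y)                     # mean CTC loss over the utterance

With the 2-D indexing above, `alphas`, `betas`, `output`, and `accum` are U′ × T matrices (U′ = 2·length(z) + 1), so column t of each matrix corresponds directly to time step t of `ŷ`.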
