[NDTensors] Remove and test for scalar indexing on GPU (#1245)
kmp5VT authored Nov 15, 2023
1 parent 4d3372d commit 61caa1b
Showing 23 changed files with 296 additions and 108 deletions.
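For context, "scalar indexing" means reading or writing a GPU array one element at a time from the CPU, which forces a device round trip per element. A minimal sketch of the behavior this commit removes and tests for, using only the public CUDA.jl and GPUArraysCore APIs (illustrative, not code from the commit):

using CUDA
using GPUArraysCore: @allowscalar

x = CUDA.rand(Float32, 4)

# With scalar indexing disallowed (CUDA.allowscalar(false), typical in
# test environments), single-element access throws instead of silently
# round-tripping through the CPU:
# x[1]  # ERROR: Scalar indexing is disallowed

# @allowscalar marks an explicit, intentional opt-in to the slow path:
first_element = @allowscalar x[1]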
2 changes: 2 additions & 0 deletions NDTensors/Project.toml
@@ -11,6 +11,7 @@ Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
Folds = "41a02a25-b8f0-4f67-bc48-60067656b558"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -44,6 +45,7 @@ Dictionaries = "0.3.5"
FLoops = "0.2.1"
Folds = "0.2.8"
Functors = "0.2, 0.3, 0.4"
GPUArraysCore = "0.1"
HDF5 = "0.14, 0.15, 0.16, 0.17"
InlineStrings = "1"
LinearAlgebra = "1.6"
5 changes: 4 additions & 1 deletion NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl
@@ -5,16 +5,19 @@ using NDTensors.SetParameters
using NDTensors.Unwrap
using Adapt
using Functors
using LinearAlgebra
using LinearAlgebra: LinearAlgebra, Adjoint, Transpose, mul!, svd
using CUDA
using CUDA.CUBLAS
using CUDA.CUSOLVER

include("imports.jl")
include("default_kwargs.jl")
include("copyto.jl")
include("set_types.jl")
include("iscu.jl")
include("adapt.jl")
include("indexing.jl")
include("linearalgebra.jl")
include("mul.jl")
include("permutedims.jl")
end
26 changes: 26 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/copyto.jl
@@ -0,0 +1,26 @@
# Same definition as `MtlArray`.
function Base.copy(src::Exposed{<:CuArray,<:Base.ReshapedArray})
return reshape(copy(parent(src)), size(unexpose(src)))
end

function Base.copy(
src::Exposed{
<:CuArray,<:SubArray{<:Any,<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}}
},
)
return copy(@view copy(expose(parent(src)))[parentindices(unexpose(src))...])
end

# Catches a bug in `copyto!` in CUDA backend.
function Base.copyto!(dest::Exposed{<:CuArray}, src::Exposed{<:CuArray,<:SubArray})
copyto!(dest, expose(copy(src)))
return unexpose(dest)
end

# Catches a bug in `copyto!` in CUDA backend.
function Base.copyto!(
dest::Exposed{<:CuArray}, src::Exposed{<:CuArray,<:Base.ReshapedArray}
)
copyto!(dest, expose(parent(src)))
return unexpose(dest)
end
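A brief usage sketch of how the copyto! workarounds above are reached (assuming NDTensors' Unwrap API, i.e. expose; sizes mirror the test added in Unwrap/test/runtests.jl below). Reshaping an Adjoint-wrapped CuArray yields a Base.ReshapedArray, the wrapper these methods special-case:

using CUDA
using NDTensors.Unwrap: expose

x = reshape(CUDA.randn(Float32, 4, 4)', 16)  # Base.ReshapedArray over an Adjoint
y = CUDA.zeros(Float32, 16)

# Dispatches to the specialized method above instead of the buggy
# CUDA-backend copyto! path.
copyto!(expose(y), expose(x))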
2 changes: 0 additions & 2 deletions NDTensors/ext/NDTensorsCUDAExt/imports.jl
@@ -2,5 +2,3 @@ import NDTensors: cu, set_ndims, set_eltype, set_eltype_if_unspecified, similart
import NDTensors:
ContractionProperties, _contract!, GemmBackend, auto_select_backend, _gemm!, iscu
import NDTensors.SetParameters: nparameters, get_parameter, set_parameter, default_parameter

import .CUDA: CuArrayAdaptor
6 changes: 1 addition & 5 deletions NDTensors/ext/NDTensorsCUDAExt/indexing.jl
@@ -2,7 +2,7 @@ function Base.getindex(E::Exposed{<:CuArray})
return CUDA.@allowscalar unexpose(E)[]
end

function setindex!(E::Exposed{<:CuArray}, x::Number)
function Base.setindex!(E::Exposed{<:CuArray}, x::Number)
CUDA.@allowscalar unexpose(E)[] = x
return unexpose(E)
end
@@ -11,10 +11,6 @@ function Base.getindex(E::Exposed{<:CuArray,<:Adjoint}, i, j)
return (expose(parent(E))[j, i])'
end

function Base.copy(E::Exposed{<:CuArray,<:Base.ReshapedArray})
return reshape(copy(expose(parent(E))), size(unexpose(E)))
end

Base.any(f, E::Exposed{<:CuArray,<:NDTensors.Tensor}) = any(f, data(unexpose(E)))

function Base.print_array(io::IO, E::Exposed{<:CuArray})
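A short sketch of the zero-dimensional accessors kept above (again assuming the Unwrap expose API): these are the one place scalar access remains intentional, so they wrap it in CUDA.@allowscalar:

using CUDA
using NDTensors.Unwrap: expose

x = CUDA.zeros(Float32)  # zero-dimensional CuArray
expose(x)[] = 3          # hits the setindex! method above
value = expose(x)[]      # hits the getindex method above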
43 changes: 43 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/mul.jl
@@ -0,0 +1,43 @@
# This was calling generic matrix multiplication.
# TODO: Raise an issue with `CUDA.jl`.
function LinearAlgebra.mul!(
CM::Exposed{<:CuArray,<:LinearAlgebra.Transpose},
AM::Exposed{<:CuArray},
BM::Exposed{<:CuArray},
α,
β,
)
mul!(transpose(CM), transpose(BM), transpose(AM), α, β)
return unexpose(CM)
end

# This was calling generic matrix multiplication.
# TODO: Raise an issue with `CUDA.jl`.
function LinearAlgebra.mul!(
CM::Exposed{<:CuArray,<:LinearAlgebra.Adjoint},
AM::Exposed{<:CuArray},
BM::Exposed{<:CuArray},
α,
β,
)
mul!(CM', BM', AM', α, β)
return unexpose(CM)
end

## Fix issue in CUDA.jl where it cannot distinguish Transpose{Reshape{Adjoint{CuArray}}}
## as a CuArray and calls generic matmul
function LinearAlgebra.mul!(
CM::Exposed{<:CuArray},
AM::Exposed{<:CuArray},
BM::Exposed{
<:CuArray,
<:LinearAlgebra.Transpose{
<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:LinearAlgebra.Adjoint}
},
},
α,
β,
)
mul!(CM, AM, expose(transpose(copy(expose(parent(BM))))), α, β)
return unexpose(CM)
end
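A sketch of the failure mode these methods patch (sizes mirror the test added in Unwrap/test/runtests.jl below; assumes the expose API): a Transpose- or Adjoint-wrapped destination sent CUDA.jl to its generic, scalar-indexing matmul, so the methods re-express the product in unwrapped form:

using CUDA
using LinearAlgebra: mul!
using NDTensors.Unwrap: expose

A = CUDA.randn(Float32, 3, 2)
B = CUDA.randn(Float32, 3, 4)
C = CUDA.randn(Float32, 4, 2)

# Plain mul!(transpose(C), transpose(A), B, true, false) scalar-indexes;
# the Exposed method above rewrites it as mul!(C, transpose(B), A) and
# stays on the CUBLAS fast path.
mul!(expose(transpose(C)), expose(transpose(A)), expose(B), true, false)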
7 changes: 7 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/permutedims.jl
@@ -0,0 +1,7 @@
function Base.permutedims!(
Edest::Exposed{<:CuArray,<:Base.ReshapedArray}, Esrc::Exposed{<:CuArray}, perm
)
Aperm = permutedims(Esrc, perm)
copyto!(expose(parent(Edest)), expose(Aperm))
return unexpose(Edest)
end
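A matching usage sketch (mirroring the permutedims! test added below): when the destination is a Base.ReshapedArray, the method above permutes on-device and writes the result through to the destination's parent buffer:

using CUDA
using NDTensors.Unwrap: expose

x = CUDA.randn(Float32, 4, 4)
y = Base.ReshapedArray(CUDA.randn(Float32, 16), (4, 4), ())

# Permutes x and copies the result into parent(y) without scalar indexing.
permutedims!(expose(y), expose(x), (2, 1))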
6 changes: 4 additions & 2 deletions NDTensors/ext/NDTensorsMetalExt/copyto.jl
@@ -12,12 +12,14 @@ end

# Catches a bug in `copyto!` in Metal backend.
function Base.copyto!(dest::Exposed{<:MtlArray}, src::Exposed{<:MtlArray,<:SubArray})
return copyto!(dest, expose(copy(src)))
copyto!(dest, expose(copy(src)))
return unexpose(dest)
end

# Catches a bug in `copyto!` in Metal backend.
function Base.copyto!(
dest::Exposed{<:MtlArray}, src::Exposed{<:MtlArray,<:Base.ReshapedArray}
)
return copyto!(dest, expose(parent(src)))
copyto!(dest, expose(parent(src)))
return unexpose(dest)
end
9 changes: 9 additions & 0 deletions NDTensors/ext/NDTensorsMetalExt/mul.jl
@@ -10,3 +10,12 @@ function LinearAlgebra.mul!(
mul!(transpose(CM), transpose(BM), transpose(AM), α, β)
return unexpose(CM)
end

# This was calling generic matrix multiplication.
# TODO: Raise an issue with `Metal.jl`.
function LinearAlgebra.mul!(
CM::Exposed{<:MtlArray,<:Adjoint}, AM::Exposed{<:MtlArray}, BM::Exposed{<:MtlArray}, α, β
)
mul!(CM', BM', AM', α, β)
return unexpose(CM)
end
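This mirrors the Transpose case already handled for Metal above and the Adjoint case added for CUDA in this commit: in both backends, a wrapped destination otherwise routes mul! to Julia's generic, scalar-indexing matmul.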
6 changes: 3 additions & 3 deletions NDTensors/ext/examples/NDTensorCUDA.jl
@@ -61,9 +61,9 @@ function main()
# Currently this code fails with CUDA.allowscalar(false)
# because `outer` calls the `_gemm!` function, which calls a
# generic implementation
grad = gradient(f, cA, cB, cC, cD)
@test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1])
@test (cB * cC * cD) ≈ grad[1]
@allowscalar grad = gradient(f, cA, cB, cC, cD)
@allowscalar @test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1])
@allowscalar @test (cB * cC * cD) ≈ grad[1]
# Create a tuple of indices
decomp = (
dim(NDTensors.ind(grad[1], 1)),
1 change: 1 addition & 0 deletions NDTensors/src/NDTensors.jl
@@ -6,6 +6,7 @@ using Compat
using Dictionaries
using FLoops
using Folds
using GPUArraysCore
using InlineStrings
using Random
using LinearAlgebra
2 changes: 2 additions & 0 deletions NDTensors/src/Unwrap/src/functions/abstractarray.jl
@@ -2,6 +2,8 @@ parent(E::Exposed) = parent(unexpose(E))

transpose(E::Exposed) = transpose(unexpose(E))

adjoint(E::Exposed) = adjoint(unexpose(E))

cpu(E::Exposed) = cpu(unexpose(E))

getindex(E::Exposed) = unexpose(E)[]
1 change: 1 addition & 0 deletions NDTensors/src/Unwrap/src/import.jl
@@ -1,4 +1,5 @@
import Base:
adjoint,
permutedims,
permutedims!,
copy,
64 changes: 58 additions & 6 deletions NDTensors/src/Unwrap/test/runtests.jl
@@ -2,6 +2,7 @@ using Test
using NDTensors.Unwrap
using NDTensors
using LinearAlgebra
using GPUArraysCore: @allowscalar

include("../../../test/device_list.jl")
@testset "Testing Unwrap $dev, $elt" for dev in devices_list(ARGS),
@@ -24,8 +25,8 @@
@test parent(Et) == v
@test parent(Ea) == v
@test transpose(E) == vt
@test cpu(E) == v
@test cpu(Et) == vt
@test cpu(E) == cpu(v)
@test cpu(Et) == cpu(vt)

m = reshape(v, (5, 2))
mt = transpose(m)
@@ -125,17 +126,68 @@
y = dev(randn(elt, 16))
x = reshape(dev(randn(elt, 4, 4))', 16)
copyto!(expose(y), expose(x))
@test y == x
@test copy(x) == x
@allowscalar begin
@test y == x
@test copy(x) == x
end

y = dev(randn(elt, 8))
x = @view reshape(dev(randn(elt, 8, 8))', 64)[1:8]
copyto!(expose(y), expose(x))
@test y == x
@test copy(x) == x
@allowscalar begin
@test y == x
@test copy(x) == x
end

y = Base.ReshapedArray(dev(randn(elt, 16)), (4, 4), ())
x = dev(randn(elt, 4, 4))
permutedims!(expose(y), expose(x), (2, 1))
@test NDTensors.cpu(y) == transpose(NDTensors.cpu(x))

##########################################
### Testing an issue with CUDA&Metal transpose/adjoint mul
A = dev(randn(elt, (3, 2)))
B = dev(randn(elt, (3, 4)))
C = dev(randn(elt, (4, 2)))
Cp = copy(C)

## This fails with scalar indexing
if dev != NDTensors.cpu
@test_broken mul!(transpose(C), transpose(A), B, true, false)
end
mul!(C, transpose(B), A, true, false)
mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false)
@test C ≈ Cp
Cp = zero(C)
## Try calling mul!! with transposes to verify that code works
Cpt = NDTensors.mul!!(transpose(Cp), transpose(A), B, true, false)
@test transpose(Cpt) ≈ C

Cp = zero(C)
## This fails with scalar indexing
if dev != NDTensors.cpu
@test_broken mul!(C', A', B, true, false)
end
mul!(C, B', A, true, false)
mul!(expose(Cp'), expose(A'), expose(B), true, false)
@test C ≈ Cp
Cp = zero(C)
Cpt = NDTensors.mul!!(Cp', A', B, true, false)
@test Cpt' ≈ C

##################################
### Add test for transpose(reshape(adjoint)) failure in CUDA

A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2))))
B = dev(randn(elt, 2, 2))
C = dev(zeros(elt, 2, 12))
NDTensors.mul!(expose(C), expose(B), expose(A), true, false)
Cp = NDTensors.cpu(similar(C))
NDTensors.mul!(
expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false
)
@test NDTensors.cpu(C) ≈ Cp
NDTensors.zero(C)
NDTensors.mul!!(C, B, A, true, false)
@test NDTensors.cpu(C) ≈ Cp
end
8 changes: 6 additions & 2 deletions NDTensors/src/blocksparse/blocksparsetensor.jl
@@ -335,11 +335,15 @@ view(T::BlockSparseTensor, b::Block) = blockview(T, b)
# convert to Dense
function dense(T::TensorT) where {TensorT<:BlockSparseTensor}
R = zeros(dense(TensorT), inds(T))
## Here this failed with scalar indexing (R[blockindices] = blockview)
## We can fix this by using `copyto!` on the underlying arrays
r = array(R)
for block in keys(blockoffsets(T))
# TODO: make sure this assignment is efficient
R[blockindices(T, block)] = blockview(T, block)
rview = @view r[blockindices(T, block)]
copyto!(expose(rview), expose(array(blockview(T, block))))
end
return R
return tensor(Dense(r), inds(T))
end
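The pattern used in dense above (indexed assignment replaced by copyto! into a view of the raw data array) in isolation, as a sketch with hypothetical sizes, assuming the expose API:

using CUDA
using NDTensors.Unwrap: expose

r = CUDA.zeros(Float32, 10)
block = CUDA.randn(Float32, 3)

# In dense above, R[blockindices(T, block)] = blockview(T, block) hit
# scalar indexing; copying into a view of the raw buffer instead keeps
# the copy on-device.
rview = @view r[2:4]
copyto!(expose(rview), expose(block))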

#
2 changes: 1 addition & 1 deletion NDTensors/src/blocksparse/linearalgebra.jl
@@ -200,7 +200,7 @@ function svd(
if (sV * sVP) == -1
Vb *= -1
end
copyto!(expose(blockview(V, blockV)), expose(Vb))
copyto!(blockview(V, blockV), Vb)
end
return U, S, V, Spectrum(d, truncerr)
end
1 change: 1 addition & 0 deletions NDTensors/test/Project.toml
@@ -2,6 +2,7 @@
BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
ITensors = "9136182c-28ba-11e9-034c-db9fb085ebd5"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"