[NDTensors] Remove and test for scalar indexing on GPU (#1245)
kmp5VT authored Nov 15, 2023
1 parent 4d3372d commit 61caa1b
Showing 23 changed files with 296 additions and 108 deletions.
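For context, "scalar indexing" means reading or writing a GPU array one element at a time from the CPU, which forces a device round trip per element. A minimal sketch of the behavior this commit removes and tests for, using only the public CUDA.jl and GPUArraysCore APIs (illustrative, not code from the commit):

using CUDA
using GPUArraysCore: @allowscalar

x = CUDA.rand(Float32, 4)

# With scalar indexing disallowed (CUDA.allowscalar(false), typical in
# test environments), single-element access throws instead of silently
# round-tripping through the CPU:
# x[1]  # ERROR: Scalar indexing is disallowed

# @allowscalar marks an explicit, intentional opt-in to the slow path:
first_element = @allowscalar x[1]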
2 changes: 2 additions & 0 deletions NDTensors/Project.toml
@@ -11,6 +11,7 @@ Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4"
FLoops = "cc61a311-1640-44b5-9fba-1b764f453329"
Folds = "41a02a25-b8f0-4f67-bc48-60067656b558"
Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
InlineStrings = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -44,6 +45,7 @@ Dictionaries = "0.3.5"
FLoops = "0.2.1"
Folds = "0.2.8"
Functors = "0.2, 0.3, 0.4"
GPUArraysCore = "0.1"
HDF5 = "0.14, 0.15, 0.16, 0.17"
InlineStrings = "1"
LinearAlgebra = "1.6"
5 changes: 4 additions & 1 deletion NDTensors/ext/NDTensorsCUDAExt/NDTensorsCUDAExt.jl
@@ -5,16 +5,19 @@ using NDTensors.SetParameters
using NDTensors.Unwrap
using Adapt
using Functors
using LinearAlgebra
using LinearAlgebra: LinearAlgebra, Adjoint, Transpose, mul!, svd
using CUDA
using CUDA.CUBLAS
using CUDA.CUSOLVER

include("imports.jl")
include("default_kwargs.jl")
include("copyto.jl")
include("set_types.jl")
include("iscu.jl")
include("adapt.jl")
include("indexing.jl")
include("linearalgebra.jl")
include("mul.jl")
include("permutedims.jl")
end
26 changes: 26 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/copyto.jl
@@ -0,0 +1,26 @@
# Same definition as `MtlArray`.
function Base.copy(src::Exposed{<:CuArray,<:Base.ReshapedArray})
return reshape(copy(parent(src)), size(unexpose(src)))
end

function Base.copy(
src::Exposed{
<:CuArray,<:SubArray{<:Any,<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:Adjoint}}
},
)
return copy(@view copy(expose(parent(src)))[parentindices(unexpose(src))...])
end

# Catches a bug in `copyto!` in CUDA backend.
function Base.copyto!(dest::Exposed{<:CuArray}, src::Exposed{<:CuArray,<:SubArray})
copyto!(dest, expose(copy(src)))
return unexpose(dest)
end

# Catches a bug in `copyto!` in CUDA backend.
function Base.copyto!(
dest::Exposed{<:CuArray}, src::Exposed{<:CuArray,<:Base.ReshapedArray}
)
copyto!(dest, expose(parent(src)))
return unexpose(dest)
end
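A brief usage sketch of how the copyto! workarounds above are reached (assuming NDTensors' Unwrap API, i.e. expose; sizes mirror the test added in Unwrap/test/runtests.jl below). Reshaping an Adjoint-wrapped CuArray yields a Base.ReshapedArray, the wrapper these methods special-case:

using CUDA
using NDTensors.Unwrap: expose

x = reshape(CUDA.randn(Float32, 4, 4)', 16)  # Base.ReshapedArray over an Adjoint
y = CUDA.zeros(Float32, 16)

# Dispatches to the specialized method above instead of the buggy
# CUDA-backend copyto! path.
copyto!(expose(y), expose(x))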
2 changes: 0 additions & 2 deletions NDTensors/ext/NDTensorsCUDAExt/imports.jl
@@ -2,5 +2,3 @@ import NDTensors: cu, set_ndims, set_eltype, set_eltype_if_unspecified, similart
import NDTensors:
ContractionProperties, _contract!, GemmBackend, auto_select_backend, _gemm!, iscu
import NDTensors.SetParameters: nparameters, get_parameter, set_parameter, default_parameter

import .CUDA: CuArrayAdaptor
6 changes: 1 addition & 5 deletions NDTensors/ext/NDTensorsCUDAExt/indexing.jl
@@ -2,7 +2,7 @@ function Base.getindex(E::Exposed{<:CuArray})
return CUDA.@allowscalar unexpose(E)[]
end

function setindex!(E::Exposed{<:CuArray}, x::Number)
function Base.setindex!(E::Exposed{<:CuArray}, x::Number)
CUDA.@allowscalar unexpose(E)[] = x
return unexpose(E)
end
@@ -11,10 +11,6 @@ function Base.getindex(E::Exposed{<:CuArray,<:Adjoint}, i, j)
return (expose(parent(E))[j, i])'
end

function Base.copy(E::Exposed{<:CuArray,<:Base.ReshapedArray})
return reshape(copy(expose(parent(E))), size(unexpose(E)))
end

Base.any(f, E::Exposed{<:CuArray,<:NDTensors.Tensor}) = any(f, data(unexpose(E)))

function Base.print_array(io::IO, E::Exposed{<:CuArray})
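A short sketch of the zero-dimensional accessors kept above (again assuming the Unwrap expose API): these are the one place scalar access remains intentional, so they wrap it in CUDA.@allowscalar:

using CUDA
using NDTensors.Unwrap: expose

x = CUDA.zeros(Float32)  # zero-dimensional CuArray
expose(x)[] = 3          # hits the setindex! method above
value = expose(x)[]      # hits the getindex method above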
43 changes: 43 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/mul.jl
@@ -0,0 +1,43 @@
# This was calling generic matrix multiplication.
# TODO: Raise an issue with `CUDA.jl`.
function LinearAlgebra.mul!(
CM::Exposed{<:CuArray,<:LinearAlgebra.Transpose},
AM::Exposed{<:CuArray},
BM::Exposed{<:CuArray},
α,
β,
)
mul!(transpose(CM), transpose(BM), transpose(AM), α, β)
return unexpose(CM)
end

# This was calling generic matrix multiplication.
# TODO: Raise an issue with `CUDA.jl`.
function LinearAlgebra.mul!(
CM::Exposed{<:CuArray,<:LinearAlgebra.Adjoint},
AM::Exposed{<:CuArray},
BM::Exposed{<:CuArray},
α,
β,
)
mul!(CM', BM', AM', α, β)
return unexpose(CM)
end

## Fix issue in CUDA.jl where it cannot distinguish Transpose{Reshape{Adjoint{CuArray}}}
## as a CuArray and calls generic matmul
function LinearAlgebra.mul!(
CM::Exposed{<:CuArray},
AM::Exposed{<:CuArray},
BM::Exposed{
<:CuArray,
<:LinearAlgebra.Transpose{
<:Any,<:Base.ReshapedArray{<:Any,<:Any,<:LinearAlgebra.Adjoint}
},
},
α,
β,
)
mul!(CM, AM, expose(transpose(copy(expose(parent(BM))))), α, β)
return unexpose(CM)
end
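A sketch of the failure mode these methods patch (sizes mirror the test added in Unwrap/test/runtests.jl below; assumes the expose API): a Transpose- or Adjoint-wrapped destination sent CUDA.jl to its generic, scalar-indexing matmul, so the methods re-express the product in unwrapped form:

using CUDA
using LinearAlgebra: mul!
using NDTensors.Unwrap: expose

A = CUDA.randn(Float32, 3, 2)
B = CUDA.randn(Float32, 3, 4)
C = CUDA.randn(Float32, 4, 2)

# Plain mul!(transpose(C), transpose(A), B, true, false) scalar-indexes;
# the Exposed method above rewrites it as mul!(C, transpose(B), A) and
# stays on the CUBLAS fast path.
mul!(expose(transpose(C)), expose(transpose(A)), expose(B), true, false)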
7 changes: 7 additions & 0 deletions NDTensors/ext/NDTensorsCUDAExt/permutedims.jl
@@ -0,0 +1,7 @@
function Base.permutedims!(
Edest::Exposed{<:CuArray,<:Base.ReshapedArray}, Esrc::Exposed{<:CuArray}, perm
)
Aperm = permutedims(Esrc, perm)
copyto!(expose(parent(Edest)), expose(Aperm))
return unexpose(Edest)
end
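A matching usage sketch (mirroring the permutedims! test added below): when the destination is a Base.ReshapedArray, the method above permutes on-device and writes the result through to the destination's parent buffer:

using CUDA
using NDTensors.Unwrap: expose

x = CUDA.randn(Float32, 4, 4)
y = Base.ReshapedArray(CUDA.randn(Float32, 16), (4, 4), ())

# Permutes x and copies the result into parent(y) without scalar indexing.
permutedims!(expose(y), expose(x), (2, 1))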
6 changes: 4 additions & 2 deletions NDTensors/ext/NDTensorsMetalExt/copyto.jl
@@ -12,12 +12,14 @@ end

# Catches a bug in `copyto!` in Metal backend.
function Base.copyto!(dest::Exposed{<:MtlArray}, src::Exposed{<:MtlArray,<:SubArray})
return copyto!(dest, expose(copy(src)))
copyto!(dest, expose(copy(src)))
return unexpose(dest)
end

# Catches a bug in `copyto!` in Metal backend.
function Base.copyto!(
dest::Exposed{<:MtlArray}, src::Exposed{<:MtlArray,<:Base.ReshapedArray}
)
return copyto!(dest, expose(parent(src)))
copyto!(dest, expose(parent(src)))
return unexpose(dest)
end
9 changes: 9 additions & 0 deletions NDTensors/ext/NDTensorsMetalExt/mul.jl
@@ -10,3 +10,12 @@ function LinearAlgebra.mul!(
mul!(transpose(CM), transpose(BM), transpose(AM), α, β)
return unexpose(CM)
end

# This was calling generic matrix multiplication.
# TODO: Raise an issue with `Metal.jl`.
function LinearAlgebra.mul!(
CM::Exposed{<:MtlArray,<:Adjoint}, AM::Exposed{<:MtlArray}, BM::Exposed{<:MtlArray}, α, β
)
mul!(CM', BM', AM', α, β)
return unexpose(CM)
end
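This mirrors the Transpose case already handled for Metal above and the Adjoint case added for CUDA in this commit: in both backends, a wrapped destination otherwise routes mul! to Julia's generic, scalar-indexing matmul.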
6 changes: 3 additions & 3 deletions NDTensors/ext/examples/NDTensorCUDA.jl
@@ -61,9 +61,9 @@ function main()
# Currently this code fails with CUDA.allowscalar(false)
# because `outer` calls the `_gemm!` function, which calls a
# generic implementation
grad = gradient(f, cA, cB, cC, cD)
@test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1])
@test (cB * cC * cD) ≈ grad[1]
@allowscalar grad = gradient(f, cA, cB, cC, cD)
@allowscalar @test NDTensors.cpu(cB * cC * cD) ≈ NDTensors.cpu(grad[1])
@allowscalar @test (cB * cC * cD) ≈ grad[1]
# Create a tuple of indices
decomp = (
dim(NDTensors.ind(grad[1], 1)),
1 change: 1 addition & 0 deletions NDTensors/src/NDTensors.jl
@@ -6,6 +6,7 @@ using Compat
using Dictionaries
using FLoops
using Folds
using GPUArraysCore
using InlineStrings
using Random
using LinearAlgebra
2 changes: 2 additions & 0 deletions NDTensors/src/Unwrap/src/functions/abstractarray.jl
@@ -2,6 +2,8 @@ parent(E::Exposed) = parent(unexpose(E))

transpose(E::Exposed) = transpose(unexpose(E))

adjoint(E::Exposed) = adjoint(unexpose(E))

cpu(E::Exposed) = cpu(unexpose(E))

getindex(E::Exposed) = unexpose(E)[]
1 change: 1 addition & 0 deletions NDTensors/src/Unwrap/src/import.jl
@@ -1,4 +1,5 @@
import Base:
adjoint,
permutedims,
permutedims!,
copy,
64 changes: 58 additions & 6 deletions NDTensors/src/Unwrap/test/runtests.jl
@@ -2,6 +2,7 @@ using Test
using NDTensors.Unwrap
using NDTensors
using LinearAlgebra
using GPUArraysCore: @allowscalar

include("../../../test/device_list.jl")
@testset "Testing Unwrap $dev, $elt" for dev in devices_list(ARGS),
@@ -24,8 +25,8 @@
@test parent(Et) == v
@test parent(Ea) == v
@test transpose(E) == vt
@test cpu(E) == v
@test cpu(Et) == vt
@test cpu(E) == cpu(v)
@test cpu(Et) == cpu(vt)

m = reshape(v, (5, 2))
mt = transpose(m)
@@ -125,17 +126,68 @@
y = dev(randn(elt, 16))
x = reshape(dev(randn(elt, 4, 4))', 16)
copyto!(expose(y), expose(x))
@test y == x
@test copy(x) == x
@allowscalar begin
@test y == x
@test copy(x) == x
end

y = dev(randn(elt, 8))
x = @view reshape(dev(randn(elt, 8, 8))', 64)[1:8]
copyto!(expose(y), expose(x))
@test y == x
@test copy(x) == x
@allowscalar begin
@test y == x
@test copy(x) == x
end

y = Base.ReshapedArray(dev(randn(elt, 16)), (4, 4), ())
x = dev(randn(elt, 4, 4))
permutedims!(expose(y), expose(x), (2, 1))
@test NDTensors.cpu(y) == transpose(NDTensors.cpu(x))

##########################################
### Testing an issue with CUDA&Metal transpose/adjoint mul
A = dev(randn(elt, (3, 2)))
B = dev(randn(elt, (3, 4)))
C = dev(randn(elt, (4, 2)))
Cp = copy(C)

## This fails with scalar indexing
if dev != NDTensors.cpu
@test_broken mul!(transpose(C), transpose(A), B, true, false)
end
mul!(C, transpose(B), A, true, false)
mul!(expose(transpose(Cp)), expose(transpose(A)), expose(B), true, false)
@test C ≈ Cp
Cp = zero(C)
## Try calling mul!! with transposes to verify that code works
Cpt = NDTensors.mul!!(transpose(Cp), transpose(A), B, true, false)
@test transpose(Cpt) ≈ C

Cp = zero(C)
## This fails with scalar indexing
if dev != NDTensors.cpu
@test_broken mul!(C', A', B, true, false)
end
mul!(C, B', A, true, false)
mul!(expose(Cp'), expose(A'), expose(B), true, false)
@test C ≈ Cp
Cp = zero(C)
Cpt = NDTensors.mul!!(Cp', A', B, true, false)
@test Cpt' ≈ C

##################################
### Add test for transpose(reshape(adjoint)) failure in CUDA

A = dev(transpose(reshape(randn(elt, 2, 12)', (12, 2))))
B = dev(randn(elt, 2, 2))
C = dev(zeros(elt, 2, 12))
NDTensors.mul!(expose(C), expose(B), expose(A), true, false)
Cp = NDTensors.cpu(similar(C))
NDTensors.mul!(
expose(Cp), expose(NDTensors.cpu(B)), expose(NDTensors.cpu(A)), true, false
)
@test NDTensors.cpu(C) ≈ Cp
NDTensors.zero(C)
NDTensors.mul!!(C, B, A, true, false)
@test NDTensors.cpu(C) ≈ Cp
end
8 changes: 6 additions & 2 deletions NDTensors/src/blocksparse/blocksparsetensor.jl
@@ -335,11 +335,15 @@ view(T::BlockSparseTensor, b::Block) = blockview(T, b)
# convert to Dense
function dense(T::TensorT) where {TensorT<:BlockSparseTensor}
R = zeros(dense(TensorT), inds(T))
## Here this failed with scalar indexing (R[blockindices] = blockview)
## We can fix this by using `copyto!` on the underlying arrays
r = array(R)
for block in keys(blockoffsets(T))
# TODO: make sure this assignment is efficient
R[blockindices(T, block)] = blockview(T, block)
rview = @view r[blockindices(T, block)]
copyto!(expose(rview), expose(array(blockview(T, block))))
end
return R
return tensor(Dense(r), inds(T))
end
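The pattern used in dense above (indexed assignment replaced by copyto! into a view of the raw data array) in isolation, as a sketch with hypothetical sizes, assuming the expose API:

using CUDA
using NDTensors.Unwrap: expose

r = CUDA.zeros(Float32, 10)
block = CUDA.randn(Float32, 3)

# In dense above, R[blockindices(T, block)] = blockview(T, block) hit
# scalar indexing; copying into a view of the raw buffer instead keeps
# the copy on-device.
rview = @view r[2:4]
copyto!(expose(rview), expose(block))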

#
2 changes: 1 addition & 1 deletion NDTensors/src/blocksparse/linearalgebra.jl
@@ -200,7 +200,7 @@ function svd(
if (sV * sVP) == -1
Vb *= -1
end
copyto!(expose(blockview(V, blockV)), expose(Vb))
copyto!(blockview(V, blockV), Vb)
end
return U, S, V, Spectrum(d, truncerr)
end
1 change: 1 addition & 0 deletions NDTensors/test/Project.toml
@@ -2,6 +2,7 @@
BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Dictionaries = "85a47980-9c8c-11e8-2b9f-f7ca1fa99fb4"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
ITensors = "9136182c-28ba-11e9-034c-db9fb085ebd5"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Octavian = "6fd5a793-0b7e-452c-907f-f8bfe9c57db4"