Merge pull request #1230 from helmholtz-analytics/bugs/1229-_Bug_UserWarning_TypedStorage_is_deprecated

Remove calls to deprecated `Tensor.storage()` when using newer PyTorch versions
helmholtz-analytics · Oct 13, 2023 · 5beb254 · 5beb254 · github-actions · Oct 13, 2023
2 parents a32efdb + 94e2140
commit 5beb254
Show file tree

Hide file tree

Showing 9 changed files with 203 additions and 89 deletions.
diff --git a/heat/core/communication.py b/heat/core/communication.py
@@ -803,18 +803,43 @@ def __reduce_like(
             dummy = (
                 sendbuf.contiguous()
             )  # make a contiguous copy and reassign the storage, old will be collected
-            sendbuf.set_(
-                dummy.storage(), dummy.storage_offset(), size=dummy.shape, stride=dummy.stride()
-            )
+            # In PyTorch >= 2.0.0 we can use untyped_storage() instead of the deprecated storage().
+            # To keep backward compatibility with earlier PyTorch versions (where untyped_storage() does not exist), we use a try/except.
+            # (This applies to all places in Heat where untyped_storage() is used without further comment.)
+            try:
+                sendbuf.set_(
+                    dummy.untyped_storage(),
+                    dummy.storage_offset(),
+                    size=dummy.shape,
+                    stride=dummy.stride(),
+                )
+            except AttributeError:
+                sendbuf.set_(
+                    dummy.storage(),
+                    dummy.storage_offset(),
+                    size=dummy.shape,
+                    stride=dummy.stride(),
+                )
             sbuf = sendbuf if CUDA_AWARE_MPI else sendbuf.cpu()
             sendbuf = self.as_buffer(sbuf)
         if isinstance(recvbuf, torch.Tensor):
             buf = recvbuf
             # nothing matches, the buffers have to be made contiguous
             dummy = recvbuf.contiguous()
-            recvbuf.set_(
-                dummy.storage(), dummy.storage_offset(), size=dummy.shape, stride=dummy.stride()
-            )
+            try:
+                recvbuf.set_(
+                    dummy.untyped_storage(),
+                    dummy.storage_offset(),
+                    size=dummy.shape,
+                    stride=dummy.stride(),
+                )
+            except AttributeError:
+                recvbuf.set_(
+                    dummy.storage(),
+                    dummy.storage_offset(),
+                    size=dummy.shape,
+                    stride=dummy.stride(),
+                )
             rbuf = recvbuf if CUDA_AWARE_MPI else recvbuf.cpu()
             if sendbuf is MPI.IN_PLACE:
                 recvbuf = self.as_buffer(rbuf)
@@ -1340,7 +1365,7 @@ def __alltoall_like(
             mpi_recvbuf = self.alltoall_recvbuffer(rbuf)
 
             exit_code = self.handle.Alltoallw(mpi_sendbuf, mpi_recvbuf, **kwargs)
-            # original_recvbuf.set_(recvbuf.storage(), recvbuf.storage_offset(), original_recvbuf.shape, original_recvbuf.stride())
+            # original_recvbuf.set_(recvbuf.untyped_storage(), recvbuf.storage_offset(), original_recvbuf.shape, original_recvbuf.stride())
             recv_axis_permutation = list(np.argsort(np.array(axis_permutation)))
 
         return exit_code, sbuf, rbuf, original_recvbuf, recv_axis_permutation
@@ -1570,7 +1595,7 @@ def __gather_like(
         # undo the recvbuf permutation and assign the temporary buffer to the original recvbuf
         # if recv_axis != 0:
         #    recvbuf = recvbuf.permute(*recv_axis_permutation)
-        #    original_recvbuf.set_(recvbuf.storage(), recvbuf.storage_offset(), recvbuf.shape, recvbuf.stride())
+        #    original_recvbuf.set_(recvbuf.untyped_storage(), recvbuf.storage_offset(), recvbuf.shape, recvbuf.stride())
 
         return exit_code, sbuf, rbuf, original_recvbuf, recv_axis_permutation
 
@@ -1812,7 +1837,7 @@ def __scatter_like(
         # undo the recvbuf permutation and assign the temporary buffer to the original recvbuf
         # if recv_axis != 0:
         #    recvbuf = recvbuf.permute(*recv_axis_permutation)
-        #    original_recvbuf.set_(recvbuf.storage(), recvbuf.storage_offset(), recvbuf.shape, recvbuf.stride())
+        #    original_recvbuf.set_(recvbuf.untyped_storage(), recvbuf.storage_offset(), recvbuf.shape, recvbuf.stride())
 
         return exit_code, sbuf, rbuf, original_recvbuf, recv_axis_permutation
 

diff --git a/heat/core/dndarray.py b/heat/core/dndarray.py
@@ -340,7 +340,10 @@ def strides(self) -> Tuple[int]:
         Returns bytes to step in each dimension when traversing a ``DNDarray``. numpy-like usage: ``self.strides()``
         """
         steps = list(self.larray.stride())
-        itemsize = self.larray.storage().element_size()
+        try:
+            itemsize = self.larray.untyped_storage().element_size()
+        except AttributeError:
+            itemsize = self.larray.storage().element_size()
         strides = tuple(step * itemsize for step in steps)
         return strides
 

diff --git a/heat/core/factories.py b/heat/core/factories.py
@@ -237,7 +237,7 @@ def array(
               [3, 4, 5]], dtype=ht.int64, device=cpu:0, split=None)
     >>> b.strides
     (24, 8)
-    >>> b.larray.storage()
+    >>> b.larray.untyped_storage()
      0
      1
      2
@@ -251,7 +251,7 @@ def array(
               [3, 4, 5]], dtype=ht.int64, device=cpu:0, split=None)
     >>> c.strides
     (8, 16)
-    >>> c.larray.storage()
+    >>> c.larray.untyped_storage()
      0
      3
      1
@@ -271,7 +271,7 @@ def array(
     >>> b.strides
     [0/2] (8, 16)
     [1/2] (8, 16)
-    >>> b.larray.storage()
+    >>> b.larray.untyped_storage()
     [0/2] 0
           3
           1

diff --git a/heat/core/manipulations.py b/heat/core/manipulations.py
@@ -4120,8 +4120,12 @@ def local_topk(*args, **kwargs):
                     gres.shape, gindices.shape, out[0].shape, out[1].shape
                 )
             )
-        out[0].larray.storage().copy_(final_array.larray.storage())
-        out[1].larray.storage().copy_(final_indices.larray.storage())
+        try:
+            out[0].larray.untyped_storage().copy_(final_array.larray.untyped_storage())
+            out[1].larray.untyped_storage().copy_(final_indices.larray.untyped_storage())
+        except AttributeError:
+            out[0].larray.storage().copy_(final_array.larray.storage())
+            out[1].larray.storage().copy_(final_indices.larray.storage())
 
         out[0]._DNDarray__dtype = a.dtype
         out[1]._DNDarray__dtype = types.int64

diff --git a/heat/core/memory.py b/heat/core/memory.py
@@ -74,12 +74,20 @@ def sanitize_memory_layout(x: torch.Tensor, order: str = "C") -> torch.Tensor:
         dims = tuple(reversed(dims))
         y = torch.empty_like(x)
         permutation = x.permute(dims).contiguous()
-        y = y.set_(
-            permutation.storage(),
-            x.storage_offset(),
-            x.shape,
-            tuple(reversed(permutation.stride())),
-        )
+        try:
+            y = y.set_(
+                permutation.untyped_storage(),
+                x.storage_offset(),
+                x.shape,
+                tuple(reversed(permutation.stride())),
+            )
+        except AttributeError:
+            y = y.set_(
+                permutation.storage(),
+                x.storage_offset(),
+                x.shape,
+                tuple(reversed(permutation.stride())),
+            )
         del permutation, dims, column_major, row_major, x
         return y
     else:

diff --git a/heat/core/tests/test_dndarray.py b/heat/core/tests/test_dndarray.py
@@ -4,6 +4,8 @@
 import heat as ht
 from .test_suites.basic_test import TestCase
 
+pytorch_major_version = int(torch.__version__.split(".")[0])
+
 
 class TestDNDarray(TestCase):
     @classmethod
@@ -1593,7 +1595,12 @@ def test_stride_and_strides(self):
         heat_int16 = ht.array(torch_int16)
         numpy_int16 = torch_int16.cpu().numpy()
         self.assertEqual(heat_int16.stride(), torch_int16.stride())
-        self.assertEqual(heat_int16.strides, numpy_int16.strides)
+        if pytorch_major_version >= 2:
+            self.assertTrue(
+                (np.asarray(heat_int16.strides) * 2 == np.asarray(numpy_int16.strides)).all()
+            )
+        else:
+            self.assertEqual(heat_int16.strides, numpy_int16.strides)
 
         # Local, float32, row-major memory layout
         torch_float32 = torch.arange(
@@ -1602,7 +1609,12 @@ def test_stride_and_strides(self):
         heat_float32 = ht.array(torch_float32)
         numpy_float32 = torch_float32.cpu().numpy()
         self.assertEqual(heat_float32.stride(), torch_float32.stride())
-        self.assertEqual(heat_float32.strides, numpy_float32.strides)
+        if pytorch_major_version >= 2:
+            self.assertTrue(
+                (np.asarray(heat_float32.strides) * 4 == np.asarray(numpy_float32.strides)).all()
+            )
+        else:
+            self.assertEqual(heat_float32.strides, numpy_float32.strides)
 
         # Local, float64, column-major memory layout
         torch_float64 = torch.arange(
@@ -1611,7 +1623,14 @@ def test_stride_and_strides(self):
         heat_float64_F = ht.array(torch_float64, order="F")
         numpy_float64_F = np.array(torch_float64.cpu().numpy(), order="F")
         self.assertNotEqual(heat_float64_F.stride(), torch_float64.stride())
-        self.assertEqual(heat_float64_F.strides, numpy_float64_F.strides)
+        if pytorch_major_version >= 2:
+            self.assertTrue(
+                (
+                    np.asarray(heat_float64_F.strides) * 8 == np.asarray(numpy_float64_F.strides)
+                ).all()
+            )
+        else:
+            self.assertEqual(heat_float64_F.strides, numpy_float64_F.strides)
 
         # Distributed, int16, row-major memory layout
         size = ht.communication.MPI_WORLD.size
@@ -1626,7 +1645,15 @@ def test_stride_and_strides(self):
         numpy_int16_split_strides = (
             tuple(np.array(numpy_int16.strides[:split]) / size) + numpy_int16.strides[split:]
         )
-        self.assertEqual(heat_int16_split.strides, numpy_int16_split_strides)
+        if pytorch_major_version >= 2:
+            self.assertTrue(
+                (
+                    np.asarray(heat_int16_split.strides) * 2
+                    == np.asarray(numpy_int16_split_strides)
+                ).all()
+            )
+        else:
+            self.assertEqual(heat_int16_split.strides, numpy_int16_split_strides)
 
         # Distributed, float32, row-major memory layout
         split = -1
@@ -1638,7 +1665,15 @@ def test_stride_and_strides(self):
         numpy_float32_split_strides = (
             tuple(np.array(numpy_float32.strides[:split]) / size) + numpy_float32.strides[split:]
         )
-        self.assertEqual(heat_float32_split.strides, numpy_float32_split_strides)
+        if pytorch_major_version >= 2:
+            self.assertTrue(
+                (
+                    np.asarray(heat_float32_split.strides) * 4
+                    == np.asarray(numpy_float32_split_strides)
+                ).all()
+            )
+        else:
+            self.assertEqual(heat_float32_split.strides, numpy_float32_split_strides)
 
         # Distributed, float64, column-major memory layout
         split = -2
@@ -1650,7 +1685,15 @@ def test_stride_and_strides(self):
         numpy_float64_F_split_strides = numpy_float64_F.strides[: split + 1] + tuple(
             np.array(numpy_float64_F.strides[split + 1 :]) / size
         )
-        self.assertEqual(heat_float64_F_split.strides, numpy_float64_F_split_strides)
+        if pytorch_major_version >= 2:
+            self.assertTrue(
+                (
+                    np.asarray(heat_float64_F_split.strides) * 8
+                    == np.asarray(numpy_float64_F_split_strides)
+                ).all()
+            )
+        else:
+            self.assertEqual(heat_float64_F_split.strides, numpy_float64_F_split_strides)
 
     def test_tolist(self):
         a = ht.zeros([ht.MPI_WORLD.size, ht.MPI_WORLD.size, ht.MPI_WORLD.size], dtype=ht.int32)
@@ -1691,16 +1734,30 @@ def test_torch_proxy(self):
         scalar_array = ht.array(1)
         scalar_proxy = scalar_array.__torch_proxy__()
         self.assertTrue(scalar_proxy.ndim == 0)
-        scalar_proxy_nbytes = scalar_proxy.storage().size() * scalar_proxy.storage().element_size()
+        if pytorch_major_version >= 2:
+            scalar_proxy_nbytes = (
+                scalar_proxy.untyped_storage().size()
+                * scalar_proxy.untyped_storage().element_size()
+            )
+        else:
+            scalar_proxy_nbytes = (
+                scalar_proxy.storage().size() * scalar_proxy.storage().element_size()
+            )
         self.assertTrue(scalar_proxy_nbytes == 1)
 
         dndarray = ht.zeros((4, 7, 6), split=1)
         dndarray_proxy = dndarray.__torch_proxy__()
         self.assertTrue(dndarray_proxy.ndim == dndarray.ndim)
         self.assertTrue(tuple(dndarray_proxy.shape) == dndarray.gshape)
-        dndarray_proxy_nbytes = (
-            dndarray_proxy.storage().size() * dndarray_proxy.storage().element_size()
-        )
+        if pytorch_major_version >= 2:
+            dndarray_proxy_nbytes = (
+                dndarray_proxy.untyped_storage().size()
+                * dndarray_proxy.untyped_storage().element_size()
+            )
+        else:
+            dndarray_proxy_nbytes = (
+                dndarray_proxy.storage().size() * dndarray_proxy.storage().element_size()
+            )
         self.assertTrue(dndarray_proxy_nbytes == 1)
 
     def test_xor(self):

diff --git a/heat/optim/dp_optimizer.py b/heat/optim/dp_optimizer.py
@@ -20,8 +20,17 @@
 
 def __sum_f16_cb(buffer_a, buffer_b, _):
     # MPI custom sum function to use torch.half
-    tens_a = torch.HalfTensor().set_(torch.HalfStorage.from_buffer(buffer_a, "native"))
-    tens_b = torch.HalfTensor().set_(torch.HalfStorage.from_buffer(buffer_b, "native"))
+    # try/except: use UntypedStorage on PyTorch >= 2.0.0 and fall back to HalfStorage on older versions (backward compatibility)
+    try:
+        tens_a = torch.HalfTensor().set_(
+            torch.UntypedStorage.from_buffer(buffer_a, "native", dtype=torch.half)
+        )
+        tens_b = torch.HalfTensor().set_(
+            torch.UntypedStorage.from_buffer(buffer_b, "native", dtype=torch.half)
+        )
+    except AttributeError:
+        tens_a = torch.HalfTensor().set_(torch.HalfStorage.from_buffer(buffer_a, "native"))
+        tens_b = torch.HalfTensor().set_(torch.HalfStorage.from_buffer(buffer_b, "native"))
     tens_b += tens_a
     nelem = torch.prod(torch.tensor(tens_b.shape)).item()
     new_buff = MPI.memory.fromaddress(tens_b.data_ptr(), nbytes=tens_b.element_size() * nelem)
@@ -30,8 +39,17 @@ def __sum_f16_cb(buffer_a, buffer_b, _):
 
 def __sum_bfloat_cb(buffer_a, buffer_b, _):
     # MPI custom sum function to use torch.bfloat16
-    tens_a = torch.BFloat16Tensor().set_(torch.BFloat16Storage.from_buffer(buffer_a, "native"))
-    tens_b = torch.BFloat16Tensor().set_(torch.BFloat16Storage.from_buffer(buffer_b, "native"))
+    # try/except: use UntypedStorage on PyTorch >= 2.0.0 and fall back to BFloat16Storage on older versions (backward compatibility)
+    try:
+        tens_a = torch.BFloat16Tensor().set_(
+            torch.UntypedStorage.from_buffer(buffer_a, "native", dtype=torch.bfloat16)
+        )
+        tens_b = torch.BFloat16Tensor().set_(
+            torch.UntypedStorage.from_buffer(buffer_b, "native", dtype=torch.bfloat16)
+        )
+    except AttributeError:
+        tens_a = torch.BFloat16Tensor().set_(torch.BFloat16Storage.from_buffer(buffer_a, "native"))
+        tens_b = torch.BFloat16Tensor().set_(torch.BFloat16Storage.from_buffer(buffer_b, "native"))
     tens_b += tens_a
     nelem = int(tens_b.numel())
     new_buff = MPI.memory.fromaddress(tens_b.data_ptr(), nbytes=nelem * tens_b.element_size())
Benchmark suite	Current: `5beb254`	Previous: `a32efdb`	Ratio
`matmul_split_0_N1_GPU - CPU_UTIL`	`99.14675481122748` % (`0.6588248042098502`)	`24.1721173698035` % (`3.1628969431341782`)	`4.10`
`matmul_split_1_N1_GPU - CPU_UTIL`	`99.08751676039331` % (`0.7573631770985972`)	`24.290434500686892` % (`3.21988751431144`)	`4.08`
`qr_split_0_N1_GPU - CPU_UTIL`	`98.7932449474676` % (`1.4823988295190613`)	`25.003572445928643` % (`3.5402755909495855`)	`3.95`
`qr_split_1_N1_GPU - CPU_UTIL`	`98.29783746613725` % (`2.845840735157734`)	`26.1223368365623` % (`4.0695526843861956`)	`3.76`
`lanczos_N1_GPU - CPU_UTIL`	`97.65947338526752` % (`4.53578742837434`)	`27.092713952064663` % (`4.616533756013383`)	`3.60`
`hierachical_svd_rank_N1_GPU - CPU_UTIL`	`97.5133299941763` % (`4.953962321930678`)	`27.533158800419102` % (`4.920966610996647`)	`3.54`
`hierachical_svd_tol_N1_GPU - CPU_UTIL`	`97.54257474442564` % (`4.898167183945299`)	`27.623730917864407` % (`4.997210671930903`)	`3.53`
`kmeans_N1_GPU - CPU_UTIL`	`97.88984404012716` % (`4.206034771227191`)	`28.089565961523135` % (`4.467811764602459`)	`3.48`
`kmedians_N1_GPU - CPU_UTIL`	`96.90191935589496` % (`4.942240609373106`)	`28.43067134765505` % (`4.342067047643432`)	`3.41`
`kmedoids_N1_GPU - CPU_UTIL`	`96.58130653547907` % (`4.935108310596944`)	`29.3411847893787` % (`3.9011727976445263`)	`3.29`
`reshape_N1_GPU - CPU_UTIL`	`96.20281409934653` % (`5.851529513589717`)	`28.899277628235303` % (`4.178183664618654`)	`3.33`
`concatenate_N1_GPU - CPU_UTIL`	`96.20251001904393` % (`5.852397932189008`)	`28.89971835734907` % (`4.178530382178055`)	`3.33`
`matmul_split_0_N4_CPU - POWER`	`26.16773420125101` W (`20.37569454671251`)	`12.571030422055518` W (`10.932127495992125`)	`2.08`
`matmul_split_0_N4_CPU - CPU_UTIL`	`98.07799321430363` % (`1.7875665146533668`)	`31.40404209271974` % (`0.11607318805195059`)	`3.12`
`matmul_split_0_N4_CPU - GPU_UTIL`	`10.170251655578614` % (`11.839158558418474`)	`1.35162353515625` % (`0.216064453125`)	`7.52`
`matmul_split_1_N4_CPU - CPU_UTIL`	`98.45733172054341` % (`2.3526743055678416`)	`31.257841524224023` % (`0.12291488170155088`)	`3.15`
`matmul_split_1_N4_CPU - GPU_UTIL`	`9.59802703857422` % (`10.778778688417491`)	`1.3426623851060868` % (`0.21474584312521502`)	`7.15`
`qr_split_0_N4_CPU - CPU_UTIL`	`99.02590151420495` % (`1.4754035690183123`)	`31.7698472149623` % (`1.5093889488426258`)	`3.12`
`qr_split_0_N4_CPU - GPU_UTIL`	`7.1474112391471865` % (`8.95209400906877`)	`1.2875876486301423` % (`0.2726999845454847`)	`5.55`
`qr_split_1_N4_CPU - CPU_UTIL`	`99.19172585255654` % (`1.2977801352964295`)	`32.375693861338064` % (`1.9918572054747112`)	`3.06`
`qr_split_1_N4_CPU - GPU_UTIL`	`6.87042236328125` % (`9.103068607782923`)	`1.2879200279712677` % (`0.272086451442363`)	`5.33`
`lanczos_N4_CPU - CPU_UTIL`	`99.29514911613906` % (`1.2135127691844967`)	`32.78580696775307` % (`2.390155261761452`)	`3.03`
`lanczos_N4_CPU - GPU_UTIL`	`6.87042236328125` % (`9.103068607782923`)	`1.3358707278966904` % (`0.21598177054882042`)	`5.14`
`hierachical_svd_rank_N4_CPU - CPU_UTIL`	`99.28832271028884` % (`1.2750607966554037`)	`32.70382876882083` % (`2.1836661542611058`)	`3.04`
`hierachical_svd_rank_N4_CPU - GPU_UTIL`	`6.87042236328125` % (`9.103068607782923`)	`1.34307861328125` % (`0.21473274831343772`)	`5.12`
`hierachical_svd_tol_N4_CPU - CPU_UTIL`	`99.22322318594212` % (`1.2606963826516075`)	`32.63882381640006` % (`2.048723859712222`)	`3.04`
`hierachical_svd_tol_N4_CPU - GPU_UTIL`	`6.87042236328125` % (`9.103068607782923`)	`1.34307861328125` % (`0.21473274831343772`)	`5.12`
`kmeans_N4_CPU - CPU_UTIL`	`98.62073651740255` % (`1.720607991414032`)	`32.58798650586897` % (`2.403376559959448`)	`3.03`
`kmeans_N4_CPU - GPU_UTIL`	`6.87042236328125` % (`9.103068607782923`)	`1.34307861328125` % (`0.21473274831343772`)	`5.12`
`kmedians_N4_CPU - CPU_UTIL`	`98.99652634140178` % (`1.3026842418690099`)	`32.3447531751636` % (`2.2699938816082343`)	`3.06`
`kmedians_N4_CPU - GPU_UTIL`	`6.87042236328125` % (`9.103068607782923`)	`1.4093509674072267` % (`0.029263662767613337`)	`4.87`
`kmedoids_N4_CPU - CPU_UTIL`	`98.99230606474273` % (`1.2180512805911463`)	`31.918579446000745` % (`1.869685884222718`)	`3.10`
`kmedoids_N4_CPU - GPU_UTIL`	`7.032718563079834` % (`9.006253426866035`)	`1.4785373151302337` % (`0.1648080786300417`)	`4.76`
`reshape_N4_CPU - CPU_UTIL`	`99.28722907648037` % (`1.2057748680185831`)	`31.9860676779026` % (`1.8982050970980562`)	`3.10`
`reshape_N4_CPU - GPU_UTIL`	`7.221616578102112` % (`8.924363871364994`)	`1.47857666015625` % (`0.164794921875`)	`4.88`
`concatenate_N4_CPU - CPU_UTIL`	`99.03047626530702` % (`2.1927199492196534`)	`31.957383070250295` % (`1.8618385704070273`)	`3.10`
`concatenate_N4_CPU - GPU_UTIL`	`7.23052978515625` % (`8.921400952814546`)	`1.47857666015625` % (`0.164794921875`)	`4.89`