Skip to content

Commit

Permalink
Merge pull request #1230 from helmholtz-analytics/bugs/1229-_Bug_UserWarning_TypedStorage_is_deprecated
Browse files Browse the repository at this point in the history

Remove calls to deprecated `Tensor.storage()` when using newer PyTorch versions
  • Loading branch information
mrfh92 authored Oct 13, 2023
2 parents a32efdb + 94e2140 commit 5beb254
Show file tree
Hide file tree
Showing 9 changed files with 203 additions and 89 deletions.
43 changes: 34 additions & 9 deletions heat/core/communication.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,18 +803,43 @@ def __reduce_like(
dummy = (
sendbuf.contiguous()
) # make a contiguous copy and reassign the storage, old will be collected
sendbuf.set_(
dummy.storage(), dummy.storage_offset(), size=dummy.shape, stride=dummy.stride()
)
# In PyTorch >= 2.0.0 we can use untyped_storage() instead of the deprecated storage().
# To keep backward compatibility with earlier PyTorch versions (where untyped_storage() does not exist), we use try/except.
# (This applies to all places in Heat where untyped_storage() is used without further comment.)
try:
sendbuf.set_(
dummy.untyped_storage(),
dummy.storage_offset(),
size=dummy.shape,
stride=dummy.stride(),
)
except AttributeError:
sendbuf.set_(
dummy.storage(),
dummy.storage_offset(),
size=dummy.shape,
stride=dummy.stride(),
)
sbuf = sendbuf if CUDA_AWARE_MPI else sendbuf.cpu()
sendbuf = self.as_buffer(sbuf)
if isinstance(recvbuf, torch.Tensor):
buf = recvbuf
# nothing matches, the buffers have to be made contiguous
dummy = recvbuf.contiguous()
recvbuf.set_(
dummy.storage(), dummy.storage_offset(), size=dummy.shape, stride=dummy.stride()
)
try:
recvbuf.set_(
dummy.untyped_storage(),
dummy.storage_offset(),
size=dummy.shape,
stride=dummy.stride(),
)
except AttributeError:
recvbuf.set_(
dummy.storage(),
dummy.storage_offset(),
size=dummy.shape,
stride=dummy.stride(),
)
rbuf = recvbuf if CUDA_AWARE_MPI else recvbuf.cpu()
if sendbuf is MPI.IN_PLACE:
recvbuf = self.as_buffer(rbuf)
Expand Down Expand Up @@ -1340,7 +1365,7 @@ def __alltoall_like(
mpi_recvbuf = self.alltoall_recvbuffer(rbuf)

exit_code = self.handle.Alltoallw(mpi_sendbuf, mpi_recvbuf, **kwargs)
# original_recvbuf.set_(recvbuf.storage(), recvbuf.storage_offset(), original_recvbuf.shape, original_recvbuf.stride())
# original_recvbuf.set_(recvbuf.untyped_storage(), recvbuf.storage_offset(), original_recvbuf.shape, original_recvbuf.stride())
recv_axis_permutation = list(np.argsort(np.array(axis_permutation)))

return exit_code, sbuf, rbuf, original_recvbuf, recv_axis_permutation
Expand Down Expand Up @@ -1570,7 +1595,7 @@ def __gather_like(
# undo the recvbuf permutation and assign the temporary buffer to the original recvbuf
# if recv_axis != 0:
# recvbuf = recvbuf.permute(*recv_axis_permutation)
# original_recvbuf.set_(recvbuf.storage(), recvbuf.storage_offset(), recvbuf.shape, recvbuf.stride())
# original_recvbuf.set_(recvbuf.untyped_storage(), recvbuf.storage_offset(), recvbuf.shape, recvbuf.stride())

return exit_code, sbuf, rbuf, original_recvbuf, recv_axis_permutation

Expand Down Expand Up @@ -1812,7 +1837,7 @@ def __scatter_like(
# undo the recvbuf permutation and assign the temporary buffer to the original recvbuf
# if recv_axis != 0:
# recvbuf = recvbuf.permute(*recv_axis_permutation)
# original_recvbuf.set_(recvbuf.storage(), recvbuf.storage_offset(), recvbuf.shape, recvbuf.stride())
# original_recvbuf.set_(recvbuf.untyped_storage(), recvbuf.storage_offset(), recvbuf.shape, recvbuf.stride())

return exit_code, sbuf, rbuf, original_recvbuf, recv_axis_permutation

Expand Down
5 changes: 4 additions & 1 deletion heat/core/dndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,10 @@ def strides(self) -> Tuple[int]:
Returns bytes to step in each dimension when traversing a ``DNDarray``. numpy-like usage: ``self.strides()``
"""
steps = list(self.larray.stride())
itemsize = self.larray.storage().element_size()
try:
itemsize = self.larray.untyped_storage().element_size()
except AttributeError:
itemsize = self.larray.storage().element_size()
strides = tuple(step * itemsize for step in steps)
return strides

Expand Down
6 changes: 3 additions & 3 deletions heat/core/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def array(
[3, 4, 5]], dtype=ht.int64, device=cpu:0, split=None)
>>> b.strides
(24, 8)
>>> b.larray.storage()
>>> b.larray.untyped_storage()
0
1
2
Expand All @@ -251,7 +251,7 @@ def array(
[3, 4, 5]], dtype=ht.int64, device=cpu:0, split=None)
>>> c.strides
(8, 16)
>>> c.larray.storage()
>>> c.larray.untyped_storage()
0
3
1
Expand All @@ -271,7 +271,7 @@ def array(
>>> b.strides
[0/2] (8, 16)
[1/2] (8, 16)
>>> b.larray.storage()
>>> b.larray.untyped_storage()
[0/2] 0
3
1
Expand Down
8 changes: 6 additions & 2 deletions heat/core/manipulations.py
Original file line number Diff line number Diff line change
Expand Up @@ -4120,8 +4120,12 @@ def local_topk(*args, **kwargs):
gres.shape, gindices.shape, out[0].shape, out[1].shape
)
)
out[0].larray.storage().copy_(final_array.larray.storage())
out[1].larray.storage().copy_(final_indices.larray.storage())
try:
out[0].larray.untyped_storage().copy_(final_array.larray.untyped_storage())
out[1].larray.untyped_storage().copy_(final_indices.larray.untyped_storage())
except AttributeError:
out[0].larray.storage().copy_(final_array.larray.storage())
out[1].larray.storage().copy_(final_indices.larray.storage())

out[0]._DNDarray__dtype = a.dtype
out[1]._DNDarray__dtype = types.int64
Expand Down
20 changes: 14 additions & 6 deletions heat/core/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,20 @@ def sanitize_memory_layout(x: torch.Tensor, order: str = "C") -> torch.Tensor:
dims = tuple(reversed(dims))
y = torch.empty_like(x)
permutation = x.permute(dims).contiguous()
y = y.set_(
permutation.storage(),
x.storage_offset(),
x.shape,
tuple(reversed(permutation.stride())),
)
try:
y = y.set_(
permutation.untyped_storage(),
x.storage_offset(),
x.shape,
tuple(reversed(permutation.stride())),
)
except AttributeError:
y = y.set_(
permutation.storage(),
x.storage_offset(),
x.shape,
tuple(reversed(permutation.stride())),
)
del permutation, dims, column_major, row_major, x
return y
else:
Expand Down
77 changes: 67 additions & 10 deletions heat/core/tests/test_dndarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import heat as ht
from .test_suites.basic_test import TestCase

pytorch_major_version = int(torch.__version__.split(".")[0])


class TestDNDarray(TestCase):
@classmethod
Expand Down Expand Up @@ -1593,7 +1595,12 @@ def test_stride_and_strides(self):
heat_int16 = ht.array(torch_int16)
numpy_int16 = torch_int16.cpu().numpy()
self.assertEqual(heat_int16.stride(), torch_int16.stride())
self.assertEqual(heat_int16.strides, numpy_int16.strides)
if pytorch_major_version >= 2:
self.assertTrue(
(np.asarray(heat_int16.strides) * 2 == np.asarray(numpy_int16.strides)).all()
)
else:
self.assertEqual(heat_int16.strides, numpy_int16.strides)

# Local, float32, row-major memory layout
torch_float32 = torch.arange(
Expand All @@ -1602,7 +1609,12 @@ def test_stride_and_strides(self):
heat_float32 = ht.array(torch_float32)
numpy_float32 = torch_float32.cpu().numpy()
self.assertEqual(heat_float32.stride(), torch_float32.stride())
self.assertEqual(heat_float32.strides, numpy_float32.strides)
if pytorch_major_version >= 2:
self.assertTrue(
(np.asarray(heat_float32.strides) * 4 == np.asarray(numpy_float32.strides)).all()
)
else:
self.assertEqual(heat_float32.strides, numpy_float32.strides)

# Local, float64, column-major memory layout
torch_float64 = torch.arange(
Expand All @@ -1611,7 +1623,14 @@ def test_stride_and_strides(self):
heat_float64_F = ht.array(torch_float64, order="F")
numpy_float64_F = np.array(torch_float64.cpu().numpy(), order="F")
self.assertNotEqual(heat_float64_F.stride(), torch_float64.stride())
self.assertEqual(heat_float64_F.strides, numpy_float64_F.strides)
if pytorch_major_version >= 2:
self.assertTrue(
(
np.asarray(heat_float64_F.strides) * 8 == np.asarray(numpy_float64_F.strides)
).all()
)
else:
self.assertEqual(heat_float64_F.strides, numpy_float64_F.strides)

# Distributed, int16, row-major memory layout
size = ht.communication.MPI_WORLD.size
Expand All @@ -1626,7 +1645,15 @@ def test_stride_and_strides(self):
numpy_int16_split_strides = (
tuple(np.array(numpy_int16.strides[:split]) / size) + numpy_int16.strides[split:]
)
self.assertEqual(heat_int16_split.strides, numpy_int16_split_strides)
if pytorch_major_version >= 2:
self.assertTrue(
(
np.asarray(heat_int16_split.strides) * 2
== np.asarray(numpy_int16_split_strides)
).all()
)
else:
self.assertEqual(heat_int16_split.strides, numpy_int16_split_strides)

# Distributed, float32, row-major memory layout
split = -1
Expand All @@ -1638,7 +1665,15 @@ def test_stride_and_strides(self):
numpy_float32_split_strides = (
tuple(np.array(numpy_float32.strides[:split]) / size) + numpy_float32.strides[split:]
)
self.assertEqual(heat_float32_split.strides, numpy_float32_split_strides)
if pytorch_major_version >= 2:
self.assertTrue(
(
np.asarray(heat_float32_split.strides) * 4
== np.asarray(numpy_float32_split_strides)
).all()
)
else:
self.assertEqual(heat_float32_split.strides, numpy_float32_split_strides)

# Distributed, float64, column-major memory layout
split = -2
Expand All @@ -1650,7 +1685,15 @@ def test_stride_and_strides(self):
numpy_float64_F_split_strides = numpy_float64_F.strides[: split + 1] + tuple(
np.array(numpy_float64_F.strides[split + 1 :]) / size
)
self.assertEqual(heat_float64_F_split.strides, numpy_float64_F_split_strides)
if pytorch_major_version >= 2:
self.assertTrue(
(
np.asarray(heat_float64_F_split.strides) * 8
== np.asarray(numpy_float64_F_split_strides)
).all()
)
else:
self.assertEqual(heat_float64_F_split.strides, numpy_float64_F_split_strides)

def test_tolist(self):
a = ht.zeros([ht.MPI_WORLD.size, ht.MPI_WORLD.size, ht.MPI_WORLD.size], dtype=ht.int32)
Expand Down Expand Up @@ -1691,16 +1734,30 @@ def test_torch_proxy(self):
scalar_array = ht.array(1)
scalar_proxy = scalar_array.__torch_proxy__()
self.assertTrue(scalar_proxy.ndim == 0)
scalar_proxy_nbytes = scalar_proxy.storage().size() * scalar_proxy.storage().element_size()
if pytorch_major_version >= 2:
scalar_proxy_nbytes = (
scalar_proxy.untyped_storage().size()
* scalar_proxy.untyped_storage().element_size()
)
else:
scalar_proxy_nbytes = (
scalar_proxy.storage().size() * scalar_proxy.storage().element_size()
)
self.assertTrue(scalar_proxy_nbytes == 1)

dndarray = ht.zeros((4, 7, 6), split=1)
dndarray_proxy = dndarray.__torch_proxy__()
self.assertTrue(dndarray_proxy.ndim == dndarray.ndim)
self.assertTrue(tuple(dndarray_proxy.shape) == dndarray.gshape)
dndarray_proxy_nbytes = (
dndarray_proxy.storage().size() * dndarray_proxy.storage().element_size()
)
if pytorch_major_version >= 2:
dndarray_proxy_nbytes = (
dndarray_proxy.untyped_storage().size()
* dndarray_proxy.untyped_storage().element_size()
)
else:
dndarray_proxy_nbytes = (
dndarray_proxy.storage().size() * dndarray_proxy.storage().element_size()
)
self.assertTrue(dndarray_proxy_nbytes == 1)

def test_xor(self):
Expand Down
26 changes: 22 additions & 4 deletions heat/optim/dp_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,17 @@

def __sum_f16_cb(buffer_a, buffer_b, _):
# MPI custom sum function to use torch.half
tens_a = torch.HalfTensor().set_(torch.HalfStorage.from_buffer(buffer_a, "native"))
tens_b = torch.HalfTensor().set_(torch.HalfStorage.from_buffer(buffer_b, "native"))
# try/except lets us use UntypedStorage on PyTorch >= 2.0.0 while keeping backward compatibility with earlier versions
try:
tens_a = torch.HalfTensor().set_(
torch.UntypedStorage.from_buffer(buffer_a, "native", dtype=torch.half)
)
tens_b = torch.HalfTensor().set_(
torch.UntypedStorage.from_buffer(buffer_b, "native", dtype=torch.half)
)
except AttributeError:
tens_a = torch.HalfTensor().set_(torch.HalfStorage.from_buffer(buffer_a, "native"))
tens_b = torch.HalfTensor().set_(torch.HalfStorage.from_buffer(buffer_b, "native"))
tens_b += tens_a
nelem = torch.prod(torch.tensor(tens_b.shape)).item()
new_buff = MPI.memory.fromaddress(tens_b.data_ptr(), nbytes=tens_b.element_size() * nelem)
Expand All @@ -30,8 +39,17 @@ def __sum_f16_cb(buffer_a, buffer_b, _):

def __sum_bfloat_cb(buffer_a, buffer_b, _):
# MPI custom sum function to use torch.bfloat16
tens_a = torch.BFloat16Tensor().set_(torch.BFloat16Storage.from_buffer(buffer_a, "native"))
tens_b = torch.BFloat16Tensor().set_(torch.BFloat16Storage.from_buffer(buffer_b, "native"))
# try/except lets us use UntypedStorage on PyTorch >= 2.0.0 while keeping backward compatibility with earlier versions
try:
tens_a = torch.BFloat16Tensor().set_(
torch.UntypedStorage.from_buffer(buffer_a, "native", dtype=torch.bfloat16)
)
tens_b = torch.BFloat16Tensor().set_(
torch.UntypedStorage.from_buffer(buffer_b, "native", dtype=torch.bfloat16)
)
except AttributeError:
tens_a = torch.BFloat16Tensor().set_(torch.BFloat16Storage.from_buffer(buffer_a, "native"))
tens_b = torch.BFloat16Tensor().set_(torch.BFloat16Storage.from_buffer(buffer_b, "native"))
tens_b += tens_a
nelem = int(tens_b.numel())
new_buff = MPI.memory.fromaddress(tens_b.data_ptr(), nbytes=nelem * tens_b.element_size())
Expand Down
Loading

1 comment on commit 5beb254

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark.
The benchmark result of this commit is worse than the previous benchmark result by more than the threshold ratio of 2.

Benchmark suite Current: 5beb254 Previous: a32efdb Ratio
matmul_split_0_N1_GPU - CPU_UTIL 99.14675481122748 % (0.6588248042098502) 24.1721173698035 % (3.1628969431341782) 4.10
matmul_split_1_N1_GPU - CPU_UTIL 99.08751676039331 % (0.7573631770985972) 24.290434500686892 % (3.21988751431144) 4.08
qr_split_0_N1_GPU - CPU_UTIL 98.7932449474676 % (1.4823988295190613) 25.003572445928643 % (3.5402755909495855) 3.95
qr_split_1_N1_GPU - CPU_UTIL 98.29783746613725 % (2.845840735157734) 26.1223368365623 % (4.0695526843861956) 3.76
lanczos_N1_GPU - CPU_UTIL 97.65947338526752 % (4.53578742837434) 27.092713952064663 % (4.616533756013383) 3.60
hierachical_svd_rank_N1_GPU - CPU_UTIL 97.5133299941763 % (4.953962321930678) 27.533158800419102 % (4.920966610996647) 3.54
hierachical_svd_tol_N1_GPU - CPU_UTIL 97.54257474442564 % (4.898167183945299) 27.623730917864407 % (4.997210671930903) 3.53
kmeans_N1_GPU - CPU_UTIL 97.88984404012716 % (4.206034771227191) 28.089565961523135 % (4.467811764602459) 3.48
kmedians_N1_GPU - CPU_UTIL 96.90191935589496 % (4.942240609373106) 28.43067134765505 % (4.342067047643432) 3.41
kmedoids_N1_GPU - CPU_UTIL 96.58130653547907 % (4.935108310596944) 29.3411847893787 % (3.9011727976445263) 3.29
reshape_N1_GPU - CPU_UTIL 96.20281409934653 % (5.851529513589717) 28.899277628235303 % (4.178183664618654) 3.33
concatenate_N1_GPU - CPU_UTIL 96.20251001904393 % (5.852397932189008) 28.89971835734907 % (4.178530382178055) 3.33
matmul_split_0_N4_CPU - POWER 26.16773420125101 W (20.37569454671251) 12.571030422055518 W (10.932127495992125) 2.08
matmul_split_0_N4_CPU - CPU_UTIL 98.07799321430363 % (1.7875665146533668) 31.40404209271974 % (0.11607318805195059) 3.12
matmul_split_0_N4_CPU - GPU_UTIL 10.170251655578614 % (11.839158558418474) 1.35162353515625 % (0.216064453125) 7.52
matmul_split_1_N4_CPU - CPU_UTIL 98.45733172054341 % (2.3526743055678416) 31.257841524224023 % (0.12291488170155088) 3.15
matmul_split_1_N4_CPU - GPU_UTIL 9.59802703857422 % (10.778778688417491) 1.3426623851060868 % (0.21474584312521502) 7.15
qr_split_0_N4_CPU - CPU_UTIL 99.02590151420495 % (1.4754035690183123) 31.7698472149623 % (1.5093889488426258) 3.12
qr_split_0_N4_CPU - GPU_UTIL 7.1474112391471865 % (8.95209400906877) 1.2875876486301423 % (0.2726999845454847) 5.55
qr_split_1_N4_CPU - CPU_UTIL 99.19172585255654 % (1.2977801352964295) 32.375693861338064 % (1.9918572054747112) 3.06
qr_split_1_N4_CPU - GPU_UTIL 6.87042236328125 % (9.103068607782923) 1.2879200279712677 % (0.272086451442363) 5.33
lanczos_N4_CPU - CPU_UTIL 99.29514911613906 % (1.2135127691844967) 32.78580696775307 % (2.390155261761452) 3.03
lanczos_N4_CPU - GPU_UTIL 6.87042236328125 % (9.103068607782923) 1.3358707278966904 % (0.21598177054882042) 5.14
hierachical_svd_rank_N4_CPU - CPU_UTIL 99.28832271028884 % (1.2750607966554037) 32.70382876882083 % (2.1836661542611058) 3.04
hierachical_svd_rank_N4_CPU - GPU_UTIL 6.87042236328125 % (9.103068607782923) 1.34307861328125 % (0.21473274831343772) 5.12
hierachical_svd_tol_N4_CPU - CPU_UTIL 99.22322318594212 % (1.2606963826516075) 32.63882381640006 % (2.048723859712222) 3.04
hierachical_svd_tol_N4_CPU - GPU_UTIL 6.87042236328125 % (9.103068607782923) 1.34307861328125 % (0.21473274831343772) 5.12
kmeans_N4_CPU - CPU_UTIL 98.62073651740255 % (1.720607991414032) 32.58798650586897 % (2.403376559959448) 3.03
kmeans_N4_CPU - GPU_UTIL 6.87042236328125 % (9.103068607782923) 1.34307861328125 % (0.21473274831343772) 5.12
kmedians_N4_CPU - CPU_UTIL 98.99652634140178 % (1.3026842418690099) 32.3447531751636 % (2.2699938816082343) 3.06
kmedians_N4_CPU - GPU_UTIL 6.87042236328125 % (9.103068607782923) 1.4093509674072267 % (0.029263662767613337) 4.87
kmedoids_N4_CPU - CPU_UTIL 98.99230606474273 % (1.2180512805911463) 31.918579446000745 % (1.869685884222718) 3.10
kmedoids_N4_CPU - GPU_UTIL 7.032718563079834 % (9.006253426866035) 1.4785373151302337 % (0.1648080786300417) 4.76
reshape_N4_CPU - CPU_UTIL 99.28722907648037 % (1.2057748680185831) 31.9860676779026 % (1.8982050970980562) 3.10
reshape_N4_CPU - GPU_UTIL 7.221616578102112 % (8.924363871364994) 1.47857666015625 % (0.164794921875) 4.88
concatenate_N4_CPU - CPU_UTIL 99.03047626530702 % (2.1927199492196534) 31.957383070250295 % (1.8618385704070273) 3.10
concatenate_N4_CPU - GPU_UTIL 7.23052978515625 % (8.921400952814546) 1.47857666015625 % (0.164794921875) 4.89

This comment was automatically generated by workflow using github-action-benchmark.

CC: @web-flow

Please sign in to comment.