Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add get_size_on_disk method to RemoteData #6584

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions src/aiida/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,3 +572,37 @@ def __init__(self, dtobj, precision):

self.dtobj = dtobj
self.precision = precision


def format_directory_size(size_in_bytes: int) -> str:
    """Convert a size in bytes to a human-readable string with a binary (base-1024) unit suffix.

    The value is divided by 1024 until the number that will be displayed drops below 1024, or until the largest
    known unit (``YB``) is reached. It is then formatted with two decimal places.

    :param size_in_bytes: Size in bytes.
    :raises ValueError: If the size is negative.
    :return: Human-readable size string with a unit suffix (e.g., ``'1.23 KB'``, ``'5.67 MB'``).

    Example:
        >>> format_directory_size(123456789)
        '117.74 MB'
    """
    if size_in_bytes < 0:
        raise ValueError('Size cannot be negative.')

    prefixes = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB')
    factor = 1024  # 1 KB = 1024 B
    last_index = len(prefixes) - 1

    # Work on a local copy so the ``int`` argument is not rebound to a ``float``.
    size = size_in_bytes
    index = 0

    # Compare the value as it will be *displayed* (two decimals): 1048575 B is 1023.999 KB, which would otherwise be
    # printed as the misleading '1024.00 KB' instead of '1.00 MB'.
    while round(size, 2) >= factor and index < last_index:
        size /= factor
        index += 1

    return f'{size:.2f} {prefixes[index]}'
119 changes: 117 additions & 2 deletions src/aiida/orm/nodes/data/remote/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,20 @@
###########################################################################
"""Data plugin that models a folder on a remote computer."""

from __future__ import annotations

import logging
import os
from pathlib import Path

from aiida.orm import AuthInfo
from aiida.orm.fields import add_field
from aiida.transports import Transport

from ..data import Data

_logger = logging.getLogger(__name__)

__all__ = ('RemoteData',)


Expand Down Expand Up @@ -103,7 +110,10 @@ def listdir(self, relpath='.'):
try:
return transport.listdir(full_path)
except OSError as exception:
if exception.errno in (2, 20): # directory not existing or not a directory
if exception.errno in (
2,
20,
): # directory not existing or not a directory
exc = OSError(
f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a '
'directory or has been deleted.'
Expand Down Expand Up @@ -132,7 +142,10 @@ def listdir_withattributes(self, path='.'):
try:
return transport.listdir_withattributes(full_path)
except OSError as exception:
if exception.errno in (2, 20): # directory not existing or not a directory
if exception.errno in (
2,
20,
): # directory not existing or not a directory
exc = OSError(
f'The required remote folder {full_path} on {self.computer.label} does not exist, is not a '
'directory or has been deleted.'
Expand Down Expand Up @@ -185,3 +198,105 @@ def _validate(self):

def get_authinfo(self):
    """Return the ``AuthInfo`` entry matching this node's computer and user.

    The lookup goes through the ``AuthInfo`` collection of the node's backend.
    """
    collection = AuthInfo.get_collection(self.backend)
    return collection.get(dbcomputer=self.computer, aiidauser=self.user)

def get_size_on_disk(self, relpath: Path | None = None) -> str:
    """Connect to the remote folder and return the total size of a file or directory in human-readable format.

    The size is first evaluated with ``du``. If that is not possible (the command fails, or the transport cannot
    execute commands), the size is computed from the ``lstat`` attributes of the directory contents instead.

    :param relpath: File or directory path for which the total size should be returned, relative to
        ``self.get_remote_path``. Defaults to the remote folder itself.
    :return: Total size of file or directory in human-readable format.

    :raises FileNotFoundError: If the file or directory does not exist.
    :raises OSError: If the size can be evaluated neither with ``du`` nor with ``lstat``.
    """

    from aiida.common.utils import format_directory_size

    if relpath is None:
        relpath = Path('.')

    authinfo = self.get_authinfo()
    full_path = Path(self.get_remote_path()) / relpath
    computer_label = self.computer.label if self.computer is not None else ''

    with authinfo.get_transport() as transport:
        if not transport.path_exists(str(full_path)):
            raise FileNotFoundError(
                f'The required remote folder {full_path} on Computer <{computer_label}> '
                'does not exist, is not a directory or has been deleted.'
            )

        try:
            total_size = self._get_size_on_disk_du(full_path, transport)
        except (RuntimeError, NotImplementedError, OSError):
            # ``NotImplementedError`` covers transports that do not support command execution at all.
            _logger.warning(
                'Problem executing `du` command. Will return total file size based on `lstat`. '
                'Take the result with a grain of salt, as `lstat` does not consider the file system block size, '
                'but instead returns the true size of the files in bytes, which differs from the actual space '
                'requirements on disk.'
            )

            # The fallback needs its own ``try``: an exception raised inside an ``except`` clause is not seen by
            # sibling ``except`` clauses of the same ``try`` statement.
            try:
                total_size = self._get_size_on_disk_lstat(full_path, transport)
            except OSError:
                _logger.critical('Could not evaluate directory size using either `du` or `lstat`.')
                # Re-raise instead of continuing with an unbound ``total_size``.
                raise

    return format_directory_size(size_in_bytes=total_size)

def _get_size_on_disk_du(self, full_path: Path, transport: 'Transport') -> int:
    """Return the total size in bytes of ``full_path``, including all its contents, using ``du``.

    :param full_path: Full path of which the size should be evaluated.
    :param transport: Open transport instance.
    :raises RuntimeError: When the ``du`` command cannot be successfully executed, i.e. it returns a non-zero exit
        code, writes to stderr, or produces output that cannot be parsed.
    :return: Total size of the file or directory in bytes (including all its contents).
    """
    import shlex

    # ``-s`` makes ``du`` print a single summary line for ``full_path``. Without it, one line is printed per
    # subdirectory (deepest first), so the first tab-separated field would be the size of some subdirectory rather
    # than the total. The path is quoted so that spaces or shell metacharacters cannot break the command.
    command = f'du -s --bytes {shlex.quote(str(full_path))}'
    retval, stdout, stderr = transport.exec_command_wait(command)

    if stderr or retval != 0:
        raise RuntimeError(f'Error executing `du` command: {stderr}')

    # Expected output: ``<size>\t<path>``.
    try:
        return int(stdout.split('\t')[0])
    except ValueError as exception:
        # Keep the documented contract: callers only have to handle ``RuntimeError``.
        raise RuntimeError(f'Error executing `du` command: unexpected output {stdout!r}') from exception

def _get_size_on_disk_lstat(self, full_path: Path, transport: 'Transport') -> int:
    """Return the total size in bytes of the contents of ``full_path``, using the ``lstat`` file attributes.

    Note that even if a file is only 1 byte, on disk, it still occupies one full disk block size. As such, getting
    accurate measures of the total expected size on disk when retrieving a ``RemoteData`` is not straightforward
    with ``lstat``, as one would need to consider the occupied block sizes for each file, as well as repository
    metadata. Thus, this function only serves as a fallback in the absence of the ``du`` command.

    The ``st_size`` of every entry is summed, including the entries of the subdirectories themselves.

    :param full_path: Full path of the directory of which the size should be evaluated.
    :param transport: Open transport instance, used for all directory listings.
    :raises OSError: Propagated from the transport; presumably raised when a directory cannot be listed.
    :return: Total size of the directory in bytes (including all its contents).
    """
    total_size = 0

    # Walk the tree with an explicit stack on the already open ``transport``, instead of opening a new connection
    # for every directory and recursing once per nesting level.
    pending = [full_path]

    while pending:
        current = pending.pop()

        for item in transport.listdir_withattributes(str(current)):
            # Add size of current item (file or directory metadata)
            total_size += item['attributes']['st_size']

            # Directories are queued so that their contents are added as well
            if item['isdir']:
                pending.append(current / item['name'])

    return total_size
114 changes: 111 additions & 3 deletions tests/orm/nodes/data/test_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,133 @@
###########################################################################
"""Tests for the :mod:`aiida.orm.nodes.data.remote.base.RemoteData` module."""

from pathlib import Path

import pytest
from aiida.orm import RemoteData


@pytest.fixture
def remote_data_local(tmp_path, aiida_localhost, num_char: 'int | None' = None):
    """Return a stored, non-empty ``RemoteData`` instance on the ``aiida_localhost`` computer.

    The remote path of the node is ``tmp_path``, which is populated with a single file ``file.txt``.

    :param num_char: If given, ``file.txt`` contains this many ``a`` bytes, otherwise the bytes ``some content``.
    """
    # The annotation is a string on purpose: the visible module header has no ``from __future__ import annotations``,
    # so a bare ``int | None`` would be evaluated at definition time and fail on Python versions before 3.10.
    # NOTE(review): pytest does not appear to inject values for fixture arguments that have a default, so
    # ``num_char`` presumably always stays ``None`` here -- confirm, or switch to indirect parametrization.
    node = RemoteData(computer=aiida_localhost)
    node.set_remote_path(str(tmp_path))
    node.store()

    content = b'some content' if num_char is None else b'a' * num_char
    (tmp_path / 'file.txt').write_bytes(content)
    return node


@pytest.fixture
def remote_data_ssh(tmp_path, aiida_computer_ssh):
    """Return a stored, non-empty ``RemoteData`` instance on a computer created through ``aiida_computer_ssh``."""
    # `aiida_computer_ssh` doesn't return an actual `Computer`, but just a factory.
    # Thus, we need to call the factory here, passing the label, to actually create the `Computer` instance.
    computer = aiida_computer_ssh(label='localhost-ssh')

    remote = RemoteData(computer=computer)
    remote.set_remote_path(str(tmp_path))
    remote.store()

    target = tmp_path / 'file.txt'
    target.write_bytes(b'some content')
    return remote


@pytest.mark.parametrize('fixture', ['remote_data_local', 'remote_data_ssh'])
def test_clean(request, fixture):
    """Test the :meth:`aiida.orm.nodes.data.remote.base.RemoteData.clean` method."""
    # The fixture is resolved by name so the same test runs against both the local and the SSH node.
    node = request.getfixturevalue(fixture)

    # Before cleaning: the remote folder has content and is not flagged as cleaned.
    assert not node.is_empty
    assert not node.is_cleaned

    node._clean()

    # After cleaning: both properties have flipped.
    assert node.is_empty
    assert node.is_cleaned


@pytest.mark.parametrize('fixture', ['remote_data_local', 'remote_data_ssh'])
def test_get_size_on_disk(request, fixture):
    """Test the :meth:`aiida.orm.nodes.data.remote.base.RemoteData.get_size_on_disk` method."""

    remote_data = request.getfixturevalue(fixture)

    # Check here for human-readable output string, as integer byte values are checked in
    # `test_get_size_on_disk_[du|lstat]`
    size_on_disk = remote_data.get_size_on_disk()
    assert size_on_disk == '4.01 KB'

    # Path/file non-existent
    with pytest.raises(FileNotFoundError, match='.*does not exist, is not a directory.*'):
        remote_data.get_size_on_disk(relpath=Path('non-existent'))


@pytest.mark.parametrize(
    'num_char, sizes',
    (
        (1, {'du': 4097, 'lstat': 1, 'human': '4.00 KB'}),
        (10, {'du': 4106, 'lstat': 10, 'human': '4.01 KB'}),
        (100, {'du': 4196, 'lstat': 100, 'human': '4.10 KB'}),
        (1000, {'du': 5096, 'lstat': 1000, 'human': '4.98 KB'}),
        (int(1e6), {'du': 1004096, 'lstat': int(1e6), 'human': '980.56 KB'}),
    ),
)
def test_get_size_on_disk_sizes(aiida_localhost, tmp_path, remote_data_local, num_char, sizes):
    """Test the sizes returned by ``RemoteData.get_size_on_disk`` and its ``du`` and ``lstat`` helpers.

    For a single file of ``num_char`` bytes, the ``du`` result, the ``lstat`` result and the human-readable string
    are each compared against the expected values in ``sizes``.
    """
    # NOTE(review): `remote_data_local` is requested but its node is not used; the test builds its own node below
    # and overwrites the `file.txt` the fixture wrote into the shared `tmp_path`.
    remote_data = RemoteData(computer=aiida_localhost)
    remote_data.set_remote_path(str(tmp_path))
    remote_data.store()

    (tmp_path / 'file.txt').write_bytes(b'a' * num_char)

    authinfo = remote_data.get_authinfo()
    full_path = Path(remote_data.get_remote_path())

    with authinfo.get_transport() as transport:
        size_on_disk_du = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path)
        size_on_disk_lstat = remote_data._get_size_on_disk_lstat(transport=transport, full_path=full_path)
        size_on_disk_human = remote_data.get_size_on_disk()

    assert size_on_disk_du == sizes['du']
    assert size_on_disk_lstat == sizes['lstat']
    assert size_on_disk_human == sizes['human']


@pytest.mark.parametrize('fixture', ['remote_data_local', 'remote_data_ssh'])
def test_get_size_on_disk_du(request, fixture, monkeypatch):
    """Test the :meth:`aiida.orm.nodes.data.remote.base.RemoteData._get_size_on_disk_du` method."""

    remote_data = request.getfixturevalue(fixture)

    # Normal call
    authinfo = remote_data.get_authinfo()
    full_path = Path(remote_data.get_remote_path())

    with authinfo.get_transport() as transport:
        size_on_disk = remote_data._get_size_on_disk_du(transport=transport, full_path=full_path)
        assert size_on_disk == 4108

        # Monkeypatch transport exec_command_wait command to simulate `du` failure
        def mock_exec_command_wait(command):
            return (1, '', 'Error executing `du` command')

        monkeypatch.setattr(transport, 'exec_command_wait', mock_exec_command_wait)
        with pytest.raises(RuntimeError, match='Error executing `du`.*'):
            remote_data._get_size_on_disk_du(full_path, transport)


@pytest.mark.parametrize('fixture', ['remote_data_local', 'remote_data_ssh'])
def test_get_size_on_disk_lstat(request, fixture):
    """Test the :meth:`aiida.orm.nodes.data.remote.base.RemoteData._get_size_on_disk_lstat` method."""

    remote_data = request.getfixturevalue(fixture)

    authinfo = remote_data.get_authinfo()
    full_path = Path(remote_data.get_remote_path())

    with authinfo.get_transport() as transport:
        size_on_disk = remote_data._get_size_on_disk_lstat(transport=transport, full_path=full_path)
        # The fixtures write the 12 bytes of b'some content' into a single file.
        assert size_on_disk == 12
Loading