Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Storage] Add .skyignore support #4038

Merged
merged 14 commits into from
Oct 8, 2024
4 changes: 2 additions & 2 deletions docs/source/examples/syncing-code-artifacts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ scripts, access checkpoints, etc.).

For large, multi-gigabyte workdirs, uploading may be slow because they
are synced to the remote VM(s) with :code:`rsync`. To exclude large files in
your workdir from being uploaded, add them to the :code:`.gitignore` file
(or a ``.git/info/exclude`` file) under the workdir.
your workdir from being uploaded, add them to a :code:`.skyignore` file
under your workdir.
yika-luo marked this conversation as resolved.
Show resolved Hide resolved

.. note::

Expand Down
5 changes: 4 additions & 1 deletion sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,11 @@ def path_size_megabytes(path: str) -> int:
# Without quoting rsync fails.
git_exclude_filter = command_runner.RSYNC_EXCLUDE_OPTION.format(
shlex.quote(str(resolved_path / command_runner.GIT_EXCLUDE)))
rsync_filter = command_runner.RSYNC_FILTER_GITIGNORE
if (resolved_path / constants.SKY_IGNORE_FILE).exists():
rsync_filter = command_runner.RSYNC_FILTER_SKYIGNORE
yika-luo marked this conversation as resolved.
Show resolved Hide resolved
rsync_command = (f'rsync {command_runner.RSYNC_DISPLAY_OPTION} '
f'{command_runner.RSYNC_FILTER_OPTION} '
f'{rsync_filter} '
f'{git_exclude_filter} --dry-run {path!r}')
rsync_output = ''
try:
Expand Down
12 changes: 4 additions & 8 deletions sky/data/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -1298,8 +1298,7 @@ def get_file_sync_command(base_dir_path, file_names):

def get_dir_sync_command(src_dir_path, dest_dir_name):
# we exclude .git directory from the sync
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
excluded_list.append('.git/*')
excludes = ' '.join([
f'--exclude {shlex.quote(file_name)}'
Expand Down Expand Up @@ -1764,8 +1763,7 @@ def get_file_sync_command(base_dir_path, file_names):
return sync_command

def get_dir_sync_command(src_dir_path, dest_dir_name):
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
# we exclude .git directory from the sync
excluded_list.append(r'^\.git/.*$')
excludes = '|'.join(excluded_list)
Expand Down Expand Up @@ -2490,8 +2488,7 @@ def get_file_sync_command(base_dir_path, file_names) -> str:

def get_dir_sync_command(src_dir_path, dest_dir_name) -> str:
# we exclude .git directory from the sync
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
excluded_list.append('.git/')
excludes_list = ';'.join(
[file_name.rstrip('*') for file_name in excluded_list])
Expand Down Expand Up @@ -2895,8 +2892,7 @@ def get_file_sync_command(base_dir_path, file_names):

def get_dir_sync_command(src_dir_path, dest_dir_name):
# we exclude .git directory from the sync
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
excluded_list.append('.git/*')
excludes = ' '.join([
f'--exclude {shlex.quote(file_name)}'
Expand Down
60 changes: 59 additions & 1 deletion sky/data/storage_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Utility functions for the storage module."""
import glob
import os
import shlex
import subprocess
Expand All @@ -8,6 +9,8 @@

from sky import exceptions
from sky import sky_logging
from sky.skylet import constants
from sky.utils import common_utils
from sky.utils import log_utils
from sky.utils.cli_utils import status_utils

Expand Down Expand Up @@ -63,6 +66,43 @@ def format_storage_table(storages: List[Dict[str, Any]],
return 'No existing storage.'


def get_excluded_files_from_skyignore(src_dir_path: str) -> List[str]:
yika-luo marked this conversation as resolved.
Show resolved Hide resolved
"""List files and patterns ignored by the .skyignore file
in the given source directory.
"""
excluded_list: List[str] = []
expand_src_dir_path = os.path.expanduser(src_dir_path)
skyignore_path = os.path.join(expand_src_dir_path,
constants.SKY_IGNORE_FILE)

try:
with open(skyignore_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line and not line.startswith('#'):
if '*' in line:
# Make parsing consistent with rsync.
if line.startswith('*.'):
line = '**/' + line
Copy link
Collaborator

@Michaelvll Michaelvll Oct 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wondering if this will be too specific, e.g., does test.log match test.log in all subdir; does test*.log match all sub dir? Wondering if we should add **/ to all lines, except the ones start with /
(the newly proposed version might be very slow in a large folder with many files/folders)

Suggested change
if line.startswith('*.'):
line = '**/' + line
if line.startswith('*.'):
line = '**/' + line

Copy link
Collaborator Author

@yika-luo yika-luo Oct 7, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test.log and test*.log should be just matching in current dir. That being said, I should remove this logic and requires users to do **/* if they want to match ALL directories. I also added in doc saying users should avoid doing *.txt or ./*.txt

elif line.startswith('/*'):
line = '.' + line
matching_files = glob.glob(os.path.join(
expand_src_dir_path, line),
recursive=True)
matching_files = [
os.path.relpath(f, expand_src_dir_path)
for f in matching_files
]
excluded_list.extend(matching_files)
else:
excluded_list.append(line)
except IOError as e:
logger.warning(f'Error reading {skyignore_path}: '
f'{common_utils.format_exception(e, use_bracket=True)}')

return excluded_list


def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
""" Lists files and patterns ignored by git in the source directory

Expand All @@ -78,7 +118,8 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
expand_src_dir_path = os.path.expanduser(src_dir_path)

git_exclude_path = os.path.join(expand_src_dir_path, '.git/info/exclude')
gitignore_path = os.path.join(expand_src_dir_path, '.gitignore')
gitignore_path = os.path.join(expand_src_dir_path,
constants.GIT_IGNORE_FILE)

git_exclude_exists = os.path.isfile(git_exclude_path)
gitignore_exists = os.path.isfile(gitignore_path)
Expand Down Expand Up @@ -162,3 +203,20 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
to_be_excluded += '*'
excluded_list.append(to_be_excluded)
return excluded_list


def get_excluded_files(src_dir_path: str) -> List[str]:
# TODO: this could return a huge list of files,
# should think of ways to optimize.
""" List files and directories to be excluded.
"""
yika-luo marked this conversation as resolved.
Show resolved Hide resolved
expand_src_dir_path = os.path.expanduser(src_dir_path)
skyignore_path = os.path.join(expand_src_dir_path,
constants.SKY_IGNORE_FILE)
if os.path.exists(skyignore_path):
logger.info(f'Exclude files to sync to cluster based on '
f'{constants.SKY_IGNORE_FILE}.')
return get_excluded_files_from_skyignore(src_dir_path)
logger.info(f'Exclude files to sync to cluster based on '
f'{constants.GIT_IGNORE_FILE}.')
return get_excluded_files_from_gitignore(src_dir_path)
2 changes: 2 additions & 0 deletions sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

SKY_LOGS_DIRECTORY = '~/sky_logs'
SKY_REMOTE_WORKDIR = '~/sky_workdir'
SKY_IGNORE_FILE = '.skyignore'
GIT_IGNORE_FILE = '.gitignore'

# Default Ray port is 6379. Default Ray dashboard port is 8265.
# Default Ray tempdir is /tmp/ray.
Expand Down
39 changes: 21 additions & 18 deletions sky/utils/command_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

logger = sky_logging.init_logger(__name__)

# The git exclude file to support.
GIT_EXCLUDE = '.git/info/exclude'
# Rsync options
# TODO(zhwu): This will print a per-file progress bar (with -P),
# shooting a lot of messages to the output. --info=progress2 is used
Expand All @@ -30,7 +28,10 @@
# Note that "-" is mandatory for rsync and means all patterns in the ignore
# files are treated as *exclude* patterns. Non-exclude patterns, e.g., "!
# do_not_exclude" doesn't work, even though git allows it.
RSYNC_FILTER_OPTION = '--filter=\'dir-merge,- .gitignore\''
RSYNC_FILTER_SKYIGNORE = f'--filter=\'dir-merge,- {constants.SKY_IGNORE_FILE}\''
RSYNC_FILTER_GITIGNORE = f'--filter=\'dir-merge,- {constants.GIT_IGNORE_FILE}\''
# The git exclude file to support.
GIT_EXCLUDE = '.git/info/exclude'
RSYNC_EXCLUDE_OPTION = '--exclude-from={}'

_HASH_MAX_LENGTH = 10
Expand Down Expand Up @@ -237,21 +238,23 @@ def _rsync(
rsync_command += ['rsync', RSYNC_DISPLAY_OPTION]

# --filter
rsync_command.append(RSYNC_FILTER_OPTION)

if up:
# Build --exclude-from argument.
# The source is a local path, so we need to resolve it.
resolved_source = pathlib.Path(source).expanduser().resolve()
if (resolved_source / GIT_EXCLUDE).exists():
# Ensure file exists; otherwise, rsync will error out.
#
# We shlex.quote() because the path may contain spaces:
# 'my dir/.git/info/exclude'
# Without quoting rsync fails.
rsync_command.append(
RSYNC_EXCLUDE_OPTION.format(
shlex.quote(str(resolved_source / GIT_EXCLUDE))))
# The source is a local path, so we need to resolve it.
resolved_source = pathlib.Path(source).expanduser().resolve()
if (resolved_source / constants.SKY_IGNORE_FILE).exists():
rsync_command.append(RSYNC_FILTER_SKYIGNORE)
else:
rsync_command.append(RSYNC_FILTER_GITIGNORE)
if up:
# Build --exclude-from argument.
if (resolved_source / GIT_EXCLUDE).exists():
# Ensure file exists; otherwise, rsync will error out.
#
# We shlex.quote() because the path may contain spaces:
# 'my dir/.git/info/exclude'
# Without quoting rsync fails.
rsync_command.append(
RSYNC_EXCLUDE_OPTION.format(
shlex.quote(str(resolved_source / GIT_EXCLUDE))))

rsync_command.append(f'-e {shlex.quote(rsh_option)}')

Expand Down
5 changes: 3 additions & 2 deletions sky/utils/command_runner.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ from sky.utils import subprocess_utils as subprocess_utils

GIT_EXCLUDE: str
RSYNC_DISPLAY_OPTION: str
RSYNC_FILTER_OPTION: str
RSYNC_FILTER_GITIGNORE: str
RSYNC_FILTER_SKYIGNORE: str
RSYNC_EXCLUDE_OPTION: str
ALIAS_SUDO_TO_EMPTY_FOR_ROOT_CMD: str

Expand Down Expand Up @@ -267,4 +268,4 @@ class KubernetesCommandRunner(CommandRunner):
log_path: str = ...,
stream_logs: bool = ...,
max_retry: int = ...) -> None:
...
...
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
57 changes: 57 additions & 0 deletions tests/unit_tests/test_storage_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os
yika-luo marked this conversation as resolved.
Show resolved Hide resolved
import tempfile

from sky.data import storage_utils
from sky.skylet import constants


def test_get_excluded_files_from_skyignore_no_file():
excluded_files = storage_utils.get_excluded_files_from_skyignore('.')
assert len(excluded_files) == 0


def test_get_excluded_files_from_skyignore():
with tempfile.TemporaryDirectory() as temp_dir:
# Create workdir
dirs = ['remove_dir', 'dir']
files = [
'remove.py', 'remove.txt', 'remove.sh', 'remove.b', 'keep.py',
'remove.a', 'dir/keep.txt', 'dir/remove.sh', 'dir/remove.b',
'dir/keep.a'
]
for dir_name in dirs:
os.makedirs(os.path.join(temp_dir, dir_name), exist_ok=True)
for file_path in files:
full_path = os.path.join(temp_dir, file_path)
with open(full_path, 'w') as f:
f.write('test content')

# Create skyignore file
skyignore_content = """
# File
remove.py
# Directory
remove_dir/
# Pattern match for current directory
./*.txt
/*.a
# Pattern match for all subdirectories
**/*.sh
*.b
"""
skyignore_path = os.path.join(temp_dir, constants.SKY_IGNORE_FILE)
with open(skyignore_path, 'w') as f:
f.write(skyignore_content)

# Test function
excluded_files = storage_utils.get_excluded_files_from_skyignore(
temp_dir)

# Validate results
expected_excluded_files = [
'remove.py', 'remove_dir/', 'remove.txt', 'remove.sh', 'remove.a',
'remove.b', 'dir/remove.sh', 'dir/remove.b'
]
for file_path in expected_excluded_files:
assert file_path in excluded_files
assert len(excluded_files) == len(expected_excluded_files)
Loading