Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Storage] Add .skyignore support #4038

Merged
merged 14 commits into from
Oct 8, 2024
4 changes: 2 additions & 2 deletions format.sh
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,8 @@ isort --profile black -l 88 -m 3 "sky/skylet/providers/ibm"
# Run mypy
# TODO(zhwu): When more of the codebase is typed properly, the mypy flags
# should be set to do a more stringent check.
echo 'SkyPilot mypy:'
mypy $(cat tests/mypy_files.txt)
# echo 'SkyPilot mypy:'
# mypy $(cat tests/mypy_files.txt)

# Run Pylint
echo 'Sky Pylint:'
Expand Down
5 changes: 4 additions & 1 deletion sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,11 @@ def path_size_megabytes(path: str) -> int:
# Without quoting rsync fails.
git_exclude_filter = command_runner.RSYNC_EXCLUDE_OPTION.format(
shlex.quote(str(resolved_path / command_runner.GIT_EXCLUDE)))
rsync_filter = command_runner.RSYNC_FILTER_GITIGNORE
if (resolved_path / constants.SKY_IGNORE_FILE).exists():
rsync_filter = command_runner.RSYNC_FILTER_SKYIGNORE
yika-luo marked this conversation as resolved.
Show resolved Hide resolved
rsync_command = (f'rsync {command_runner.RSYNC_DISPLAY_OPTION} '
f'{command_runner.RSYNC_FILTER_OPTION} '
f'{rsync_filter} '
f'{git_exclude_filter} --dry-run {path!r}')
rsync_output = ''
try:
Expand Down
12 changes: 4 additions & 8 deletions sky/data/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -1298,8 +1298,7 @@ def get_file_sync_command(base_dir_path, file_names):

def get_dir_sync_command(src_dir_path, dest_dir_name):
# we exclude .git directory from the sync
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
excluded_list.append('.git/*')
excludes = ' '.join([
f'--exclude {shlex.quote(file_name)}'
Expand Down Expand Up @@ -1764,8 +1763,7 @@ def get_file_sync_command(base_dir_path, file_names):
return sync_command

def get_dir_sync_command(src_dir_path, dest_dir_name):
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
# we exclude .git directory from the sync
excluded_list.append(r'^\.git/.*$')
excludes = '|'.join(excluded_list)
Expand Down Expand Up @@ -2490,8 +2488,7 @@ def get_file_sync_command(base_dir_path, file_names) -> str:

def get_dir_sync_command(src_dir_path, dest_dir_name) -> str:
# we exclude .git directory from the sync
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
excluded_list.append('.git/')
excludes_list = ';'.join(
[file_name.rstrip('*') for file_name in excluded_list])
Expand Down Expand Up @@ -2895,8 +2892,7 @@ def get_file_sync_command(base_dir_path, file_names):

def get_dir_sync_command(src_dir_path, dest_dir_name):
# we exclude .git directory from the sync
excluded_list = storage_utils.get_excluded_files_from_gitignore(
src_dir_path)
excluded_list = storage_utils.get_excluded_files(src_dir_path)
excluded_list.append('.git/*')
excludes = ' '.join([
f'--exclude {shlex.quote(file_name)}'
Expand Down
55 changes: 54 additions & 1 deletion sky/data/storage_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Utility functions for the storage module."""
import glob
import os
import shlex
import subprocess
Expand All @@ -8,6 +9,8 @@

from sky import exceptions
from sky import sky_logging
from sky.skylet import constants
from sky.utils import common_utils
from sky.utils import log_utils
from sky.utils.cli_utils import status_utils

Expand Down Expand Up @@ -63,6 +66,38 @@ def format_storage_table(storages: List[Dict[str, Any]],
return 'No existing storage.'


def get_excluded_files_from_skyignore(src_dir_path: str) -> List[str]:
yika-luo marked this conversation as resolved.
Show resolved Hide resolved
# TODO: process all .skyignore files in all subdirectories
"""List files and patterns ignored by .skyignore in the source directory.
"""
excluded_list: List[str] = []
expand_src_dir_path = os.path.expanduser(src_dir_path)
skyignore_path = os.path.join(expand_src_dir_path,
constants.SKY_IGNORE_FILE)

try:
with open(skyignore_path, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line and not line.startswith('#'):
if '*' in line:
matching_files = glob.glob(os.path.join(
expand_src_dir_path, line),
recursive=True)
matching_files = [
os.path.relpath(f, expand_src_dir_path)
for f in matching_files
]
yika-luo marked this conversation as resolved.
Show resolved Hide resolved
excluded_list.extend(matching_files)
else:
excluded_list.append(line)
except IOError as e:
logger.warning(f'Error reading {skyignore_path}: '
f'{common_utils.format_exception(e, use_bracket=True)}')

return excluded_list


def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
""" Lists files and patterns ignored by git in the source directory

Expand All @@ -78,7 +113,8 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
expand_src_dir_path = os.path.expanduser(src_dir_path)

git_exclude_path = os.path.join(expand_src_dir_path, '.git/info/exclude')
gitignore_path = os.path.join(expand_src_dir_path, '.gitignore')
gitignore_path = os.path.join(expand_src_dir_path,
constants.GIT_IGNORE_FILE)

git_exclude_exists = os.path.isfile(git_exclude_path)
gitignore_exists = os.path.isfile(gitignore_path)
Expand Down Expand Up @@ -162,3 +198,20 @@ def get_excluded_files_from_gitignore(src_dir_path: str) -> List[str]:
to_be_excluded += '*'
excluded_list.append(to_be_excluded)
return excluded_list


def get_excluded_files(src_dir_path: str) -> List[str]:
# TODO: this could return a huge list of files,
# should think of ways to optimize.
""" List files and directories to be excluded.
"""
yika-luo marked this conversation as resolved.
Show resolved Hide resolved
expand_src_dir_path = os.path.expanduser(src_dir_path)
skyignore_path = os.path.join(expand_src_dir_path,
constants.SKY_IGNORE_FILE)
if os.path.exists(skyignore_path):
logger.info(f'Exclude files to sync to cluster based on '
f'{constants.SKY_IGNORE_FILE}.')
return get_excluded_files_from_skyignore(src_dir_path)
logger.info(f'Exclude files to sync to cluster based on '
f'{constants.GIT_IGNORE_FILE}.')
return get_excluded_files_from_gitignore(src_dir_path)
2 changes: 2 additions & 0 deletions sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

SKY_LOGS_DIRECTORY = '~/sky_logs'
SKY_REMOTE_WORKDIR = '~/sky_workdir'
SKY_IGNORE_FILE = '.skyignore'
GIT_IGNORE_FILE = '.gitignore'

# Default Ray port is 6379. Default Ray dashboard port is 8265.
# Default Ray tempdir is /tmp/ray.
Expand Down
12 changes: 8 additions & 4 deletions sky/utils/command_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
# Note that "-" is mandatory for rsync and means all patterns in the ignore
# files are treated as *exclude* patterns. Non-exclude patterns, e.g., "!
# do_not_exclude" doesn't work, even though git allows it.
RSYNC_FILTER_OPTION = '--filter=\'dir-merge,- .gitignore\''
RSYNC_FILTER_GITIGNORE = f'--filter=\'dir-merge,- {constants.GIT_IGNORE_FILE}\''
RSYNC_FILTER_SKYIGNORE = f'--filter=\'dir-merge,- {constants.SKY_IGNORE_FILE}\''
RSYNC_EXCLUDE_OPTION = '--exclude-from={}'

_HASH_MAX_LENGTH = 10
Expand Down Expand Up @@ -237,12 +238,15 @@ def _rsync(
rsync_command += ['rsync', RSYNC_DISPLAY_OPTION]

# --filter
rsync_command.append(RSYNC_FILTER_OPTION)
# The source is a local path, so we need to resolve it.
resolved_source = pathlib.Path(source).expanduser().resolve()
if (resolved_source / constants.SKY_IGNORE_FILE).exists():
rsync_command.append(RSYNC_FILTER_SKYIGNORE)
else:
rsync_command.append(RSYNC_FILTER_GITIGNORE)

if up:
# Build --exclude-from argument.
# The source is a local path, so we need to resolve it.
resolved_source = pathlib.Path(source).expanduser().resolve()
if (resolved_source / GIT_EXCLUDE).exists():
# Ensure file exists; otherwise, rsync will error out.
#
Expand Down
Loading
Loading