From ade89e7d20b629858cada4c2d7395a519b6e42ac Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 Jun 2022 01:28:53 +0530 Subject: [PATCH] Fix recursive symlinks when uploading to sky storage (#882) * Fix symlinks * Add tests * fix symlink * address comments * lint * fix --- docs/source/reference/storage.rst | 6 +++--- examples/using_file_mounts.yaml | 5 +++++ sky/cloud_stores.py | 7 ++++--- sky/data/storage.py | 3 ++- tests/test_smoke.py | 11 +++++++++++ 5 files changed, 25 insertions(+), 7 deletions(-) diff --git a/docs/source/reference/storage.rst b/docs/source/reference/storage.rst index 1c8eda722ba1..d68e528aa0b5 100644 --- a/docs/source/reference/storage.rst +++ b/docs/source/reference/storage.rst @@ -171,9 +171,9 @@ and storage mounting: .. note:: Symbolic links are handled differently in :code:`file_mounts` depending on whether Sky Storage is used. - For mounts backed by Sky Storage, referenced data for all symbolic links is copied to remote. - For mounts not using Sky Storage (e.g., those using rsync) the symbolic links are directly copied. - Their targets must be separately mounted or else the symlinks may break. + For mounts backed by Sky Storage, symbolic links are not copied to remote. + For mounts not using Sky Storage (e.g., those using rsync) the symbolic links are directly copied, not their target data. + The targets must be separately mounted or else the symlinks may break. Creating a shared file system ----------------------------- diff --git a/examples/using_file_mounts.yaml b/examples/using_file_mounts.yaml index edfb08a1eb15..43cdaf5d1768 100644 --- a/examples/using_file_mounts.yaml +++ b/examples/using_file_mounts.yaml @@ -116,3 +116,8 @@ run: | # sky.egg-info/ should not exists on remote due to the .gitingore cd ~/sky_workdir ! ls sky.egg-info/ + + # Symlinks should be copied, but they will be broken + # Assumes symlink named circle-link in /tmp/workdir (created by test_smoke.py) + ls /tmp/workdir/circle-link + ! readlink -f /tmp/workdir/circle-link diff --git a/sky/cloud_stores.py b/sky/cloud_stores.py index 08e551354671..dc8e695741ef 100644 --- a/sky/cloud_stores.py +++ b/sky/cloud_stores.py @@ -70,7 +70,8 @@ def make_sync_dir_command(self, source: str, destination: str) -> str: # To increase parallelism, modify max_concurrent_requests in your # aws config file (Default path: ~/.aws/config). download_via_awscli = f'mkdir -p {destination} && \ - aws s3 sync {source} {destination}' + aws s3 sync --no-follow-symlinks ' \ + f'{source} {destination}' all_commands = list(self._GET_AWSCLI) all_commands.append(download_via_awscli) @@ -78,8 +79,8 @@ def make_sync_dir_command(self, source: str, destination: str) -> str: def make_sync_file_command(self, source: str, destination: str) -> str: """Downloads a file using AWS CLI.""" - download_via_awscli = f'mkdir -p {destination} && \ - aws s3 cp {source} {destination}' + download_via_awscli = (f'mkdir -p {destination} &&' + f'aws s3 cp {source} {destination}') all_commands = list(self._GET_AWSCLI) all_commands.append(download_via_awscli) diff --git a/sky/data/storage.py b/sky/data/storage.py index 0457e72a891e..d5073a72ba31 100644 --- a/sky/data/storage.py +++ b/sky/data/storage.py @@ -754,7 +754,8 @@ def sync_local_dir(self) -> None: file (Default path: ~/.aws/config). """ source = os.path.abspath(os.path.expanduser(self.source)) - sync_command = f'aws s3 sync {source} s3://{self.name}/' + sync_command = ('aws s3 sync --no-follow-symlinks ' + f'{source} s3://{self.name}/') with backend_utils.safe_console_status( f'[bold cyan]Syncing ' f'[green]{self.source} to s3://{self.name}/'): diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 8fae2fc0d644..a84458031874 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -158,6 +158,7 @@ def test_file_mounts(): 'touch ~/tmpfile', 'mkdir -p ~/tmp-workdir', 'touch ~/tmp-workdir/foo', + 'ln -f -s ~/tmp-workdir/ ~/tmp-workdir/circle-link', f'sky launch -y -c {name} examples/using_file_mounts.yaml', f'sky logs {name} 1 --status', # Ensure the job succeeded. ], @@ -543,6 +544,8 @@ def tmp_mount(self, tmp_path): tmp_dir.mkdir() tmp_file = tmp_dir / 'tmp-file' tmp_file.write_text('test') + circle_link = tmp_dir / 'circle-link' + circle_link.symlink_to(tmp_dir, target_is_directory=True) yield str(tmp_dir) @pytest.fixture @@ -625,6 +628,14 @@ def test_upload_to_existing_bucket(self, tmp_awscli_bucket, tmp_mount): 'File not found in bucket - output was : {}'.format(out.decode ('utf-8')) + # Check symlinks - symlinks don't get copied by sky storage + assert (pathlib.Path(tmp_mount) / 'circle-link').is_symlink(), ( + 'circle-link was not found in the upload source - ' + 'are the test fixtures correct?') + assert 'circle-link' not in out.decode('utf-8'), ( + 'Symlink found in bucket - ls output was : {}'.format( + out.decode('utf-8'))) + # Run sky storage ls to check if storage object exists in the output. # It should not exist because the bucket was created externally. out = subprocess.check_output(['sky', 'storage', 'ls'])