Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add test and fix for disappearing dependency issue #3627 #3669

Merged
merged 7 commits into from
Jul 6, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions codalab/worker/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,7 @@ def initialize_run(self, bundle, resources):
is_restaged=False,
cpu_usage=0.0,
memory_usage=0.0,
paths_to_remove=[],
)
# Start measuring bundle stats for the initial bundle state.
self.start_stage_stats(bundle.uuid, RunStage.PREPARING)
Expand Down
19 changes: 12 additions & 7 deletions codalab/worker/worker_run_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ class RunStage(object):
'cpu_usage', # float
'memory_usage', # float
'bundle_profile_stats', # dict
'paths_to_remove', # list[str]. Stores paths to be removed after the worker run.
],
)

Expand Down Expand Up @@ -193,7 +194,6 @@ def __init__(
self.upload_bundle_callback = upload_bundle_callback
self.assign_cpu_and_gpu_sets_fn = assign_cpu_and_gpu_sets_fn
self.shared_file_system = shared_file_system
self.paths_to_remove = []

def stop(self):
for uuid in self.disk_utilization.keys():
Expand Down Expand Up @@ -354,7 +354,9 @@ def mount_dependency(dependency, shared_file_system):
parent_path=os.path.join(dependency_path, child),
)
)
self.paths_to_remove.append(child_path)
run_state = run_state._replace(
paths_to_remove=(run_state.paths_to_remove or []) + [child_path]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Included or [] so we don't break existing runs on prod (which may not have paths_to_remove defined yet).

)
else:
to_mount.append(
DependencyToMount(
Expand All @@ -366,11 +368,14 @@ def mount_dependency(dependency, shared_file_system):

first_element_of_path = Path(dep.child_path).parts[0]
if first_element_of_path == RunStateMachine._ROOT:
self.paths_to_remove.append(full_child_path)
run_state = run_state._replace(
paths_to_remove=(run_state.paths_to_remove or []) + [full_child_path]
)
else:
# child_path can be a nested path, so later remove everything from the first element of the path
self.paths_to_remove.append(
os.path.join(run_state.bundle_path, first_element_of_path)
path_to_remove = os.path.join(run_state.bundle_path, first_element_of_path)
run_state = run_state._replace(
paths_to_remove=(run_state.paths_to_remove or []) + [path_to_remove]
)
for dependency in to_mount:
try:
Expand Down Expand Up @@ -607,9 +612,9 @@ def remove_path_no_fail(path):
self.dependency_manager.release(run_state.bundle.uuid, dep_key)

# Clean up dependencies paths
for path in self.paths_to_remove:
for path in run_state.paths_to_remove or []:
remove_path_no_fail(path)
self.paths_to_remove = []
run_state = run_state._replace(paths_to_remove=[])

if run_state.is_restaged:
log_bundle_transition(
Expand Down
14 changes: 14 additions & 0 deletions tests/cli/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -1507,6 +1507,20 @@ def test_link(ctx):

@TestModule.register('run2')
def test_run2(ctx):
# Test that finishing a bundle (uuid2) with a dependency doesn't delete the dependency
# for another running bundle (uuid1) dependent on that same dependency; this is the
# scenario reported in https://github.com/codalab/codalab-worksheets/issues/3627.
dir3 = _run_command([cl, 'upload', test_path('dir3')])
uuid1 = _run_command(
[cl, 'run', 'dir3:%s' % dir3, 'for x in {1..100}; do ls dir3 && sleep 1; done']
)
uuid2 = _run_command(
[cl, 'run', 'dir3:%s' % dir3, 'for x in {1..10}; do ls dir3 && sleep 1; done']
)
wait(uuid2)
check_equals(State.RUNNING, get_info(uuid1, 'state'))
wait(uuid1)

# Test that content of dependency is mounted at the top when . is specified as the dependency key
dir3 = _run_command([cl, 'upload', test_path('dir3')])
uuid = _run_command([cl, 'run', '.:%s' % dir3, 'cat f1'])
Expand Down