Skip to content

Commit

Permalink
Move glob_run_dir() to workflow_files
Browse files Browse the repository at this point in the history
Add unit tests
  • Loading branch information
MetRonnie committed Jun 11, 2021
1 parent 2300f4d commit 59ba89e
Show file tree
Hide file tree
Showing 5 changed files with 368 additions and 187 deletions.
54 changes: 1 addition & 53 deletions cylc/flow/pathutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,11 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""Functions to return paths to common workflow files and directories."""

import glob
import os
from pathlib import Path
import re
from shutil import rmtree
from typing import Container, Dict, Iterable, List, Set, Union
from typing import Dict, Iterable, Set, Union

from cylc.flow import LOG
from cylc.flow.cfgspec.glbl_cfg import glbl_cfg
Expand Down Expand Up @@ -314,57 +313,6 @@ def parse_rm_dirs(rm_dirs: Iterable[str]) -> Set[str]:
return result


def glob_in_run_dir(
    run_dir: Union[Path, str], pattern: str, symlink_dirs: Container[Path]
) -> List[Path]:
    """Execute a (recursive) glob search in the given run directory.

    Returns list of any absolute paths that match the pattern. However:
    * Does not follow symlinks (apart from the specified symlink dirs).
    * Also does not return matching subpaths of matching directories (because
      that would be redundant).

    Args:
        run_dir: Absolute path of the workflow run dir.
        pattern: The glob pattern.
        symlink_dirs: Absolute paths to the workflow's symlink dirs.

    Returns:
        The surviving matches, in sorted order.
    """
    # Accept str as well as Path; everything below works on Path objects
    run_dir = Path(run_dir)
    # Note: use os.path.join, not pathlib, to preserve trailing slash if
    # present in pattern
    pattern = os.path.join(glob.escape(str(run_dir)), pattern)
    # Note: don't use pathlib.Path.glob() because when you give it an exact
    # filename instead of pattern, it doesn't return broken symlinks
    matches = sorted(Path(i) for i in glob.iglob(pattern, recursive=True))
    # sort guarantees parents come before their children
    if len(matches) == 1 and not os.path.lexists(matches[0]):
        # https://bugs.python.org/issue35201
        return []
    # Sets mirror the lists purely for O(1) membership tests in the loop
    matched: Set[Path] = set(matches)
    results: List[Path] = []
    accepted: Set[Path] = set()
    subpath_excludes: Set[Path] = set()
    for path in matches:
        # Walk the ancestors from the run dir itself down to path.parent
        for rel_ancestor in reversed(path.relative_to(run_dir).parents):
            ancestor = run_dir / rel_ancestor
            if ancestor in subpath_excludes:
                break
            if ancestor.is_symlink() and ancestor not in symlink_dirs:
                # Do not follow non-standard symlinks
                subpath_excludes.add(ancestor)
                break
            if (not symlink_dirs) and ancestor in accepted:
                # We can be sure all subpaths of this ancestor are redundant
                subpath_excludes.add(ancestor)
                break
            if ancestor == path.parent:
                # Final iteration over ancestors
                if ancestor in matched and path not in symlink_dirs:
                    # Redundant (but don't exclude subpaths in case any of the
                    # subpaths are std symlink dirs)
                    break
        else:  # no break
            results.append(path)
            accepted.add(path)
    return results


def is_relative_to(path1: Union[Path, str], path2: Union[Path, str]) -> bool:
"""Return whether or not path1 is relative to path2."""
# In future, we can just use pathlib.Path.is_relative_to()
Expand Down
57 changes: 54 additions & 3 deletions cylc/flow/workflow_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from collections import deque
from enum import Enum
from functools import partial
import glob
import logging
import os
from pathlib import Path
Expand All @@ -29,8 +30,8 @@
from subprocess import Popen, PIPE, DEVNULL
import time
from typing import (
Any, Deque, Dict, Iterable, List, NamedTuple, Optional, Set, Tuple,
TYPE_CHECKING, Union
Any, Container, Deque, Dict, Iterable, List, NamedTuple, Optional, Set,
Tuple, TYPE_CHECKING, Union
)
import zmq.auth

Expand All @@ -47,7 +48,6 @@
from cylc.flow.pathutil import (
expand_path,
get_workflow_run_dir,
glob_in_run_dir,
make_localhost_symlinks,
parse_rm_dirs,
remove_dir_and_target,
Expand Down Expand Up @@ -723,6 +723,57 @@ def get_symlink_dirs(reg: str, run_dir: Union[Path, str]) -> Dict[str, Path]:
return ret


def glob_in_run_dir(
    run_dir: Union[Path, str], pattern: str, symlink_dirs: Container[Path]
) -> List[Path]:
    """Execute a (recursive) glob search in the given run directory.

    Returns list of any absolute paths that match the pattern. However:
    * Does not follow symlinks (apart from the specified symlink dirs).
    * Also does not return matching subpaths of matching directories (because
      that would be redundant).

    Args:
        run_dir: Absolute path of the workflow run dir.
        pattern: The glob pattern.
        symlink_dirs: Absolute paths to the workflow's symlink dirs.

    Returns:
        The surviving matches, in sorted order.
    """
    # Accept str as well as Path; everything below works on Path objects
    run_dir = Path(run_dir)
    # Note: use os.path.join, not pathlib, to preserve trailing slash if
    # present in pattern
    pattern = os.path.join(glob.escape(str(run_dir)), pattern)
    # Note: don't use pathlib.Path.glob() because when you give it an exact
    # filename instead of pattern, it doesn't return broken symlinks
    matches = sorted(Path(i) for i in glob.iglob(pattern, recursive=True))
    # sort guarantees parents come before their children
    if len(matches) == 1 and not os.path.lexists(matches[0]):
        # https://bugs.python.org/issue35201
        return []
    # Sets mirror the lists purely for O(1) membership tests in the loop
    matched: Set[Path] = set(matches)
    results: List[Path] = []
    accepted: Set[Path] = set()
    subpath_excludes: Set[Path] = set()
    for path in matches:
        # Walk the ancestors from the run dir itself down to path.parent
        for rel_ancestor in reversed(path.relative_to(run_dir).parents):
            ancestor = run_dir / rel_ancestor
            if ancestor in subpath_excludes:
                break
            if ancestor.is_symlink() and ancestor not in symlink_dirs:
                # Do not follow non-standard symlinks
                subpath_excludes.add(ancestor)
                break
            if (not symlink_dirs) and ancestor in accepted:
                # We can be sure all subpaths of this ancestor are redundant
                subpath_excludes.add(ancestor)
                break
            if ancestor == path.parent:
                # Final iteration over ancestors
                if ancestor in matched and path not in symlink_dirs:
                    # Redundant (but don't exclude subpaths in case any of the
                    # subpaths are std symlink dirs)
                    break
        else:  # no break
            results.append(path)
            accepted.add(path)
    return results


def _clean_using_glob(
run_dir: Path, pattern: str, symlink_dirs: Iterable[str]
) -> None:
Expand Down
167 changes: 167 additions & 0 deletions tests/unit/filetree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE.
# Copyright (C) NIWA & British Crown (Met Office) & Contributors.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Utilities for testing workflow directory structure.

(There should be no tests in this module.)

A filetree is represented by a dict like so:
    {
        # Dirs are represented by dicts (which are also sub-filetrees):
        'dir': {
            'another-dir': {
                # Files are represented by None:
                'file.txt': None
            }
        },
        # Symlinks are represented by pathlib.Path, with the target represented
        # by the relative path from the tmp_path directory:
        'symlink': Path('dir/another-dir')
    }
"""

from pathlib import Path
from typing import Any, Dict, List


def create_filetree(
    filetree: Dict[str, Any], location: Path, root: Path
) -> None:
    """Create the directory structure represented by the filetree dict.

    Each entry is created under ``location`` according to its value:
    * dict: a directory, whose contents are created recursively.
    * pathlib.Path: a symlink pointing at ``root / <that path>``.
    * anything else (conventionally None): an empty file.

    Args:
        filetree: The filetree to create.
        location: The absolute path in which to create the filetree.
        root: The top-level dir from which relative symlink targets are
            located (typically tmp_path).
    """
    for name, entry in filetree.items():
        path = location / name
        if isinstance(entry, dict):
            # exist_ok so that overlapping filetrees can share directories
            path.mkdir(exist_ok=True)
            create_filetree(entry, path, root)
        elif isinstance(entry, Path):
            # The target is not checked here, so it may be created later
            path.symlink_to(root / entry)
        else:
            path.touch()


def get_filetree_as_list(
    filetree: Dict[str, Any], location: Path
) -> List[str]:
    """Return a list of the paths in a filetree.

    Paths are listed depth-first in dict order: each entry is followed
    immediately by the contents of that entry if it is a directory (dict).
    Symlink and file entries are listed but not descended into.

    Args:
        filetree: The filetree to listify.
        location: The absolute path to the filetree.

    Returns:
        The paths, as strings, of every entry in the filetree.
    """
    ret: List[str] = []
    for name, entry in filetree.items():
        path = location / name
        ret.append(str(path))
        if isinstance(entry, dict):
            ret.extend(get_filetree_as_list(entry, path))
    return ret


# Run dir is a real directory; 'log' is symlinked into the 'sym' tree, and
# several other symlinks point into 'you-shall-not-pass'.
FILETREE_1 = {
    'cylc-run': {
        'foo': {
            'bar': {
                '.service': {
                    'db': None,
                },
                'flow.cylc': None,
                'log': Path('sym/cylc-run/foo/bar/log'),
                'mirkwood': Path('you-shall-not-pass/mirkwood'),
                'rincewind.txt': Path('you-shall-not-pass/rincewind.txt'),
            },
        },
    },
    'sym': {
        'cylc-run': {
            'foo': {
                'bar': {
                    'log': {
                        'darmok': Path('you-shall-not-pass/darmok'),
                        'temba.txt': Path('you-shall-not-pass/temba.txt'),
                        'bib': {
                            'fortuna.txt': None,
                        },
                    },
                },
            },
        },
    },
    # Nothing in here should get deleted
    'you-shall-not-pass': {
        'darmok': {
            'jalad.txt': None,
        },
        'mirkwood': {
            'spiders.txt': None,
        },
        'rincewind.txt': None,
        'temba.txt': None,
    },
}

# Chained symlinks: the run dir itself links into 'sym-run', its 'share'
# links into 'sym-share', and 'share/cycle' there links into 'sym-cycle'.
FILETREE_2 = {
    'cylc-run': {
        'foo': {
            'bar': Path('sym-run/cylc-run/foo/bar'),
        },
    },
    'sym-run': {
        'cylc-run': {
            'foo': {
                'bar': {
                    '.service': {
                        'db': None,
                    },
                    'flow.cylc': None,
                    'share': Path('sym-share/cylc-run/foo/bar/share'),
                },
            },
        },
    },
    'sym-share': {
        'cylc-run': {
            'foo': {
                'bar': {
                    'share': {
                        'cycle': Path(
                            'sym-cycle/cylc-run/foo/bar/share/cycle'
                        ),
                    },
                },
            },
        },
    },
    'sym-cycle': {
        'cylc-run': {
            'foo': {
                'bar': {
                    'share': {
                        'cycle': {
                            'macklunkey.txt': None,
                        },
                    },
                },
            },
        },
    },
    'you-shall-not-pass': {},
}

# The run dir links into 'sym-run'; 'share' is a real directory there, but
# 'share/cycle' links into 'sym-cycle'.
FILETREE_3 = {
    'cylc-run': {
        'foo': {
            'bar': Path('sym-run/cylc-run/foo/bar'),
        },
    },
    'sym-run': {
        'cylc-run': {
            'foo': {
                'bar': {
                    '.service': {
                        'db': None,
                    },
                    'flow.cylc': None,
                    'share': {
                        'cycle': Path(
                            'sym-cycle/cylc-run/foo/bar/share/cycle'
                        ),
                    },
                },
            },
        },
    },
    'sym-cycle': {
        'cylc-run': {
            'foo': {
                'bar': {
                    'share': {
                        'cycle': {
                            'sokath.txt': None,
                        },
                    },
                },
            },
        },
    },
    'you-shall-not-pass': {},
}

# The run dir and its 'share' are real directories; only 'share/cycle' is a
# symlink, pointing into 'sym-cycle'.
FILETREE_4 = {
    'cylc-run': {
        'foo': {
            'bar': {
                '.service': {
                    'db': None,
                },
                'flow.cylc': None,
                'share': {
                    'cycle': Path('sym-cycle/cylc-run/foo/bar/share/cycle'),
                },
            },
        },
    },
    'sym-cycle': {
        'cylc-run': {
            'foo': {
                'bar': {
                    'share': {
                        'cycle': {
                            'kiazi.txt': None,
                        },
                    },
                },
            },
        },
    },
    'you-shall-not-pass': {},
}
2 changes: 1 addition & 1 deletion tests/unit/test_pathutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ def test_parse_rm_dirs(dirs: List[str], expected: Set[str]):
"cannot take paths that point to the run directory or above"),
]
)
def test_parse_rm_dirs_bad(dirs: List[str], err_msg: str):
def test_parse_rm_dirs__bad(dirs: List[str], err_msg: str):
"""Test parse_dirs() with bad inputs"""
with pytest.raises(UserInputError) as exc:
parse_rm_dirs(dirs)
Expand Down
Loading

0 comments on commit 59ba89e

Please sign in to comment.