Merge pull request #50 from tanbro/develop
Develop
tanbro authored Sep 22, 2024
2 parents bc1cf7b + da96563 commit 14d3e40
Showing 13 changed files with 202 additions and 37 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -20,7 +20,7 @@ repos:
- id: check-docstring-first

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.2
rev: v0.6.5
hooks:
- id: ruff # Run the linter.
types_or: [python, pyi, jupyter]
@@ -36,7 +36,7 @@ repos:
additional_dependencies: [types-PyYAML]

- repo: https://github.com/python-jsonschema/check-jsonschema
rev: "0.28.6"
rev: "0.29.2"
hooks:
- id: check-github-workflows
- id: check-readthedocs
49 changes: 47 additions & 2 deletions README.md
@@ -414,11 +414,56 @@ xyz: !http-include xyz.yml

the actual URL to access is `http://$HOST:$PORT/sub_1/sub_1_1/xyz.yml`

### Flatten sequence object in multiple matched files

Consider the following YAML:

```yaml
items: !include "*.yaml"
```

If every file matched by `*.yaml` contains a sequence object at its top level, the parsed and loaded result will be:

```yaml
items: [
[item 0 of 1st file, item 1 of 1st file, ... , item n of 1st file, ...],
[item 0 of 2nd file, item 1 of 2nd file, ... , item n of 2nd file, ...],
# ....
[item 0 of nth file, item 1 of nth file, ... , item n of nth file, ...],
# ...
]
```

It's a 2-dimensional array, because the YAML content of each matched file becomes one member of the resulting list (sequence).

But if the `flatten` parameter is set to `true`, like:

```yaml
items: !include {urlpath: "*.yaml", flatten: true}
```

we'll get:

```yaml
items: [
item 0 of 1st file, item 1 of 1st file, ... , item n of 1st file, # ...
item 0 of 2nd file, item 1 of 2nd file, ... , item n of 2nd file, # ...
# ....
item 0 of n-th file, item 1 of n-th file, ... , item n of n-th file, # ...
# ...
]
```

> ℹ️ **Note**
>
> - Only available when multiple files are matched.
> - **Every matched file should have a sequence object at its top level**, or a `TypeError` exception may be raised.
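
Under the hood, flattening amounts to a single list comprehension over the per-file results (a sketch mirroring the constructor's behavior in this commit; the file contents below are invented for illustration):

```python
# Without flatten: each matched file's top-level sequence is one member
# of the outer list, in the order the files were matched.
result = [
    [1, 2, 3],  # parsed content of the 1st matched file
    [4, 5, 6],  # parsed content of the 2nd matched file
]

# With flatten=true: the nested sequences are concatenated in match order.
flattened = [child for item in result for child in item]
print(flattened)  # [1, 2, 3, 4, 5, 6]
```

This is also why a non-sequence top level in any matched file breaks flattening: iterating the children of a scalar raises an error.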

### Serialization

When load [YAML][] string with include statement, the including files are default parsed into python objects. Thant is, if we call `yaml.dump()` on the object, what dumped is the parsed python object, and can not serialize the include statement itself.
When loading a [YAML][] string with an include statement, the included files are parsed into Python objects by default. That is, if we call `yaml.dump()` on the object, what is dumped is the parsed Python object; the include statement itself cannot be serialized.

To serialize the statement, we shall first create an `yaml_include.Constructor` object whose **`autoload` is `False`**:
To serialize the statement, we shall first create a `yaml_include.Constructor` object whose **`autoload` attribute is `False`**:

```python
import yaml
19 changes: 16 additions & 3 deletions docs/README.rst
@@ -23,11 +23,19 @@ How to build docs
#. Build HTML documentation:

   * Make tool:

     .. code:: sh

        make -C docs html

   * Windows:

     .. code:: bat

        docs\make html

The built static web site is output to ``docs/_build/html``; we can serve it:

.. code:: sh
@@ -44,3 +52,8 @@ then open http://localhost:8000/ in a web browser.
python -m http.server -d docs/_build/html 8080
.. seealso:: Python ``stdlib``'s :mod:`http.server`

.. tip::
If you want to build a PDF, use ``make rinoh`` instead.

.. seealso:: <https://www.sphinx-doc.org/en/master/usage/builders/index.html#sphinx.builders.latex.LaTeXBuilder>
8 changes: 0 additions & 8 deletions docs/apidocs/yaml_include.rst
@@ -1,14 +1,6 @@
yaml\_include package
=====================

.. automodule:: yaml_include
:members:
:undoc-members:
:show-inheritance:

Submodules
----------

.. toctree::
:maxdepth: 4

3 changes: 3 additions & 0 deletions docs/conf.py
@@ -106,3 +106,6 @@
napoleon_use_admonition_for_examples = True
napoleon_use_admonition_for_notes = True
napoleon_use_admonition_for_references = True


latex_engine = "xelatex"
4 changes: 4 additions & 0 deletions docs/requirements.txt
@@ -11,3 +11,7 @@ sphinx-copybutton
sphinx-inline-tabs
sphinx_tippy
sphinx-version-warning

# for direct pdf generate
rinohtype
pillow
2 changes: 0 additions & 2 deletions pyproject.toml
@@ -33,8 +33,6 @@ classifiers = [
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
2 changes: 0 additions & 2 deletions requirements.txt
@@ -6,8 +6,6 @@
-r docs/requirements.txt
-r tests/requirements.txt

setuptools_scm

coverage

mypy
56 changes: 39 additions & 17 deletions src/yaml_include/constructor.py
@@ -11,7 +11,7 @@
from itertools import chain
from os import PathLike
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Generator, Iterable, Mapping, Optional, Sequence, Type, TypeVar, Union
from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Mapping, Optional, Sequence, Type, TypeVar, Union
from urllib.parse import urlsplit, urlunsplit

if sys.version_info >= (3, 10): # pragma: no cover
@@ -31,13 +31,13 @@

if TYPE_CHECKING: # pragma: no cover
from yaml import Node
from yaml.constructor import _Scalar
from yaml.constructor import _Scalar # type: ignore[attr-defined]
from yaml.cyaml import _CLoader
from yaml.loader import _Loader
from yaml.reader import _ReadStream

_TOpenFile = TypeVar("_TOpenFile", bound=_ReadStream)
_TLoaderType = TypeVar("_TLoaderType", bound=Type[Union[_Loader, _CLoader]])
OpenFileT = TypeVar("OpenFileT", bound=_ReadStream)
LoaderTypeT = TypeVar("LoaderTypeT", bound=Type[Union[_Loader, _CLoader]])


__all__ = ["Constructor"]
@@ -47,11 +47,17 @@
) # We support "**", "?" and "[..]". We do not support "^" for pattern negation.


if yaml.__with_libyaml__: # pragma: no cover
DEFAULT_YAML_LOAD_FUNCTION = lambda x: yaml.load(x, yaml.CSafeLoader) # noqa: E731
else: # pragma: no cover
DEFAULT_YAML_LOAD_FUNCTION = yaml.safe_load


def load_open_file(
file: _TOpenFile,
loader_type: _TLoaderType,
file: OpenFileT,
loader_type: LoaderTypeT,
path: str,
custom_loader: Optional[Callable[[str, _TOpenFile, _TLoaderType], Any]] = None,
custom_loader: Optional[Callable[[str, OpenFileT, LoaderTypeT], Any]] = None,
) -> Any:
if custom_loader is None:
return yaml.load(file, loader_type)
@@ -181,7 +187,7 @@ def read(self, length: int = ..., /) -> bytes | str: ...
"""

@contextmanager
def managed_autoload(self, autoload: bool) -> Generator[Self, None, None]:
def managed_autoload(self, autoload: bool) -> Iterator[Self]:
"""``with`` statement context manager for :attr:`autoload`
Args:
@@ -207,9 +213,19 @@ def __call__(self, loader: Union[_Loader, _CLoader], node: Node) -> Union[Data,
elif is_yaml_mapping_node(node):
val = loader.construct_mapping(node)
if is_kwds(val):
data = Data(val["urlpath"], mapping_params={k: v for k, v in val.items() if k != "urlpath"})
kdargs = {
"urlpath": val["urlpath"],
"mapping_params": {k: v for k, v in val.items() if k not in ("urlpath", "flatten")},
}
if (flatten := val.get("flatten")) is not None:
if isinstance(flatten, str):
flatten = DEFAULT_YAML_LOAD_FUNCTION(flatten)
if not isinstance(flatten, bool): # pragma: no cover
raise ValueError("`flatten` must be a boolean")
kdargs["flatten"] = flatten
data = Data(**kdargs)
else: # pragma: no cover
raise ValueError("not all key of the YAML mapping node is `str`")
raise ValueError("not all keys of the YAML mapping node are identifier strings")
else: # pragma: no cover
raise TypeError(f"{type(node)}")
if self.autoload:
@@ -333,8 +349,8 @@ def load(self, loader_type: Type[Union[_Loader, _CLoader]], data: Data) -> Any:
result = []
with fsspec.open_files(urlpath, *data.sequence_params, **data.mapping_params) as ofs:
for of_ in ofs:
data = load_open_file(of_, loader_type, urlpath, self.custom_loader)
result.append(data)
loaded_data = load_open_file(of_, loader_type, urlpath, self.custom_loader)
result.append(loaded_data)
return result
# else if no wildcard, returns a single object
with fsspec.open(urlpath, *data.sequence_params, **data.mapping_params) as of_:
@@ -374,7 +390,10 @@ def load(self, loader_type: Type[Union[_Loader, _CLoader]], data: Data) -> Any:
glob_fn = lambda: self.fs.glob(urlpath, *pos_args) # noqa: E731
else:
# special for maxdepth, because PyYAML sometimes treat number as string for constructor's parameter
maxdepth = int(glob_params)
try:
maxdepth = int(glob_params)
except ValueError:
maxdepth = None
glob_fn = lambda: self.fs.glob(urlpath, maxdepth=maxdepth) # noqa: E731

if open_params is None:
@@ -392,9 +411,12 @@ def load(self, loader_type: Type[Union[_Loader, _CLoader]], data: Data) -> Any:
if not isinstance(file, str): # pragma: no cover
raise RuntimeError(f"`fs.glob()` function does not return a `str` ({file})")
with open_fn(file) as of_:
data = load_open_file(of_, loader_type, file, self.custom_loader)
result.append(data)
return result
loaded_data = load_open_file(of_, loader_type, file, self.custom_loader)
result.append(loaded_data)
if data.flatten:
return [child for item in result for child in item]
else:
return result

# else if no wildcards, return a single object
with self.fs.open(urlpath, *data.sequence_params, **data.mapping_params) as of_:
Expand All @@ -415,4 +437,4 @@ def is_yaml_mapping_node(node) -> TypeGuard[yaml.MappingNode]:


def is_kwds(val) -> TypeGuard[Mapping[str, Any]]:
return isinstance(val, Mapping) and all(isinstance(k, str) for k in val)
return isinstance(val, Mapping) and all(isinstance(k, str) and k.isidentifier() for k in val)
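
One detail worth calling out from the hunks above: PyYAML may hand a constructor's mapping parameters over as strings, so the commit re-parses the `flatten` value with a plain YAML load to recover a real boolean. A standalone sketch of that coercion:

```python
import yaml

# PyYAML sometimes delivers constructor parameters as strings, so the
# string forms of YAML booleans are re-parsed before the type check.
for raw in ("true", "false"):
    value = yaml.safe_load(raw)
    print(raw, "->", value)  # each YAML scalar parses to a Python bool
```

Anything that does not parse to a `bool` (e.g. `"yes please"`) then fails the subsequent `isinstance(flatten, bool)` check with a `ValueError`.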
48 changes: 47 additions & 1 deletion src/yaml_include/data.py
@@ -11,7 +11,7 @@ class Data:
urlpath: str
"""url/path of the YAML include statement
urlpath can be either absolute (like `/usr/src/Python-1.5/*.yml`) or relative (like `../../Tools/*/*.yml`), and can contain shell-style wildcards.
``urlpath`` can be either absolute (like `/usr/src/Python-1.5/*.yml`) or relative (like `../../Tools/*/*.yml`), and can contain shell-style wildcards.
We support ``"**"``, ``"?"`` and ``"[..]"``. We do not support ``"^"`` for pattern negation.
The ``maxdepth`` option is applied on the first ``"**"`` found in the path.
@@ -20,6 +20,52 @@ class Data:
Using the ``"**"`` pattern in large directory trees or remote files may consume an inordinate amount of time.
"""

flatten: bool = False
"""Whether to flatten the sequence objects parsed from multiple matched YAML files.
* Only available when multiple files were matched
* **Every matched file should have a sequence object at its top level**, or a :class:`TypeError` exception may be raised.
Example:
Consider the following YAML:
.. code-block:: yaml
items: !include "*.yaml"
If every file matched by `*.yaml` contains a sequence object at its top level, the parsed and loaded result will be:
.. code-block:: yaml
items: [
[item 0 of 1st file, item 1 of 1st file, ... , item n of 1st file, ...],
[item 0 of 2nd file, item 1 of 2nd file, ... , item n of 2nd file, ...],
# ....
[item 0 of nth file, item 1 of nth file, ... , item n of nth file, ...],
# ...
]
It's a 2-dimensional array, because the YAML content of each matched file becomes one member of the resulting list (sequence).
But if the ``flatten`` parameter is set to ``true``, like:
.. code-block:: yaml
items: !include {urlpath: "*.yaml", flatten: true}
we'll get:
.. code-block:: yaml
items: [
item 0 of 1st file, item 1 of 1st file, ... , item n of 1st file, # ...
item 0 of 2nd file, item 1 of 2nd file, ... , item n of 2nd file, # ...
# ....
item 0 of n-th file, item 1 of n-th file, ... , item n of n-th file, # ...
# ...
]
"""

sequence_params: Sequence[Any] = field(default_factory=list)
"""sequence parameters of the YAML include statement.
"""
1 change: 1 addition & 0 deletions tests/data/include3.d/1.yml
@@ -0,0 +1 @@
[1, 2, 3]
1 change: 1 addition & 0 deletions tests/data/include3.d/2.yml
@@ -0,0 +1 @@
[4, 5, 6]
42 changes: 42 additions & 0 deletions tests/test_basic.py
@@ -162,6 +162,48 @@ def test_include_wildcards_5(self):
data = yaml.load(StringIO(yml), loader_cls)
self.assertListEqual(sorted(data["files"], key=lambda m: m["name"]), [YAML1, YAML2])

def test_flatten_true(self):
yml = dedent(
"""
items: !inc {urlpath: "include3.d/*.yml", flatten: true}
"""
)

for loader_cls in YAML_LOADERS:
two_dim_sequence = []
for pth in Path().glob("tests/data/include3.d/*.yml"):
two_dim_sequence.append(yaml.load(pth.read_bytes(), loader_cls))
flattened_sequence = sorted([member for data in two_dim_sequence for member in data])

data = yaml.load(StringIO(yml), loader_cls)
result = sorted(data["items"])
self.assertListEqual(result, flattened_sequence)

def test_flatten_false_or_default(self):
yml1 = dedent(
"""
items: !inc {urlpath: "include3.d/*.yml", flatten: false}
"""
)
yml2 = dedent(
"""
items: !inc "include3.d/*.yml"
"""
)
for loader_cls in YAML_LOADERS:
two_dim_sequence = []
for pth in Path().glob("tests/data/include3.d/*.yml"):
two_dim_sequence.append(yaml.load(pth.read_bytes(), loader_cls))
two_dim_sequence = sorted(two_dim_sequence)

data1 = yaml.load(StringIO(yml1), loader_cls)
result1 = data1["items"]
self.assertListEqual(result1, two_dim_sequence)

data2 = yaml.load(StringIO(yml2), loader_cls)
result2 = data2["items"]
self.assertListEqual(result2, two_dim_sequence)


class DefaultFsBasicTestCase(BaseTestCase):
@classmethod
