Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce asdf.util.load_yaml #1700

Merged
merged 7 commits into from
Jan 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ The ASDF Standard is at v1.6.0
is removed in an upcoming asdf release will be ``False`` and
asdf will no longer by-default memory map arrays. [#1667]

- Introduce ``asdf.util.load_yaml`` to load just the YAML contents
of an ASDF file (with the option ``tagged`` to load the contents
as a tree of ``asdf.tagged.Tagged`` instances to preserve tags) [#1700]

3.0.1 (2023-10-30)
------------------

Expand Down
20 changes: 10 additions & 10 deletions asdf/_tests/tags/core/tests/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,16 @@ def test_integer_storage(tmpdir, inline):
with asdf.AsdfFile(tree) as af:
af.write_to(tmpfile)

with asdf.open(tmpfile, _force_raw_types=True) as rf:
if inline:
assert "source" not in rf.tree["integer"]["words"]
assert "data" in rf.tree["integer"]["words"]
else:
assert "source" in rf.tree["integer"]["words"]
assert "data" not in rf.tree["integer"]["words"]

assert "string" in rf.tree["integer"]
assert rf.tree["integer"]["string"] == str(value)
tree = asdf.util.load_yaml(tmpfile, tagged=True)
if inline:
assert "source" not in tree["integer"]["words"]
assert "data" in tree["integer"]["words"]
else:
assert "source" in tree["integer"]["words"]
assert "data" not in tree["integer"]["words"]

assert "string" in tree["integer"]
assert tree["integer"]["string"] == str(value)


def test_integer_conversion():
Expand Down
3 changes: 1 addition & 2 deletions asdf/_tests/test_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,7 @@ def test_internal_reference(tmp_path):
ff.write_to(buff)
buff.seek(0)
ff = asdf.AsdfFile()
content = asdf.AsdfFile()._open_impl(ff, buff, _get_yaml_content=True)
assert b"{$ref: ''}" in content
assert b"{$ref: ''}" in buff.getvalue()


def test_implicit_internal_reference(tmp_path):
Expand Down
27 changes: 27 additions & 0 deletions asdf/_tests/test_util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import contextlib
import io
import warnings

import numpy as np
import pytest

import asdf
from asdf import generic_io, util
from asdf.exceptions import AsdfDeprecationWarning

Expand Down Expand Up @@ -118,3 +121,27 @@ def test_minversion():

assert util.minversion(yaml, "3.1")
assert util.minversion("yaml", "3.1")


@pytest.mark.parametrize("input_type", ["filename", "binary_file", "generic_file"])
@pytest.mark.parametrize("tagged", [True, False])
def test_load_yaml(tmp_path, input_type, tagged):
    """
    Check ``util.load_yaml`` against each supported kind of input.

    A small ASDF file is written to ``tmp_path`` and then handed to
    ``util.load_yaml`` as a path, as a file object opened in binary mode,
    or as a generic file. The ``tagged`` flag must decide whether the
    ``"a"`` entry comes back as an ``asdf.tagged.TaggedDict``.
    """
    path = tmp_path / "test.asdf"
    asdf.AsdfFile({"a": np.zeros(3)}).write_to(path)

    # The ExitStack closes whichever file-like object (if any) was opened
    # for this parametrization; a plain path needs no cleanup.
    with contextlib.ExitStack() as stack:
        if input_type == "filename":
            source = path
        elif input_type == "binary_file":
            source = stack.enter_context(open(path, "rb"))
        elif input_type == "generic_file":
            source = stack.enter_context(generic_io.get_file(path, "r"))

        tree = util.load_yaml(source, tagged=tagged)
        is_tagged_dict = isinstance(tree["a"], asdf.tagged.TaggedDict)
        if tagged:
            assert is_tagged_dict
        else:
            assert not is_tagged_dict
16 changes: 10 additions & 6 deletions asdf/commands/edit.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import yaml

from asdf import constants, generic_io, schema, util
from asdf._asdf import AsdfFile, open_asdf
from asdf._asdf import AsdfFile
from asdf._block import io as bio
from asdf._block.exceptions import BlockIndexError

Expand Down Expand Up @@ -259,11 +259,15 @@ def edit(path):
continue

try:
# Blocks are not read during validation, so this will not raise
# an error even though we're only opening the YAML portion of
# the file.
with open_asdf(io.BytesIO(new_content), _force_raw_types=True):
pass
# check this is an ASDF file
if new_content[: len(constants.ASDF_MAGIC)] != constants.ASDF_MAGIC:
msg = "Does not appear to be a ASDF file."
raise ValueError(msg)
# read the tagged tree (which also checks if the YAML is valid)
tagged_tree = util.load_yaml(io.BytesIO(new_content), tagged=True)
# validate the tagged tree
ctx = AsdfFile(version=new_asdf_version)
schema.validate(tagged_tree, ctx=ctx, reading=True)
except yaml.YAMLError as e:
print("Error: failed to parse updated YAML:")
print_exception(e)
Expand Down
46 changes: 46 additions & 0 deletions asdf/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from urllib.request import pathname2url

import numpy as np
import yaml

# The standard library importlib.metadata returns duplicate entrypoints
# for all python versions up to and including 3.11
Expand Down Expand Up @@ -39,6 +40,7 @@


__all__ = [
"load_yaml",
"human_list",
"get_array_base",
"get_base_uri",
Expand All @@ -55,6 +57,50 @@
]


def load_yaml(init, tagged=False):
"""
Load just the yaml portion of an ASDF file

Parameters
----------

init : filename or file-like
If file-like this must be opened in binary mode.

tagged: bool, optional
Return tree with instances of `asdf.tagged.Tagged` this
can be helpful if the yaml tags are of interest.
If False, the tree will only contain basic python types
(see the pyyaml ``BaseLoader`` documentation).

Returns
-------

tree : dict
Dictionary representing the ASDF tree
"""

from .generic_io import get_file
from .yamlutil import AsdfLoader

if tagged:
loader = AsdfLoader
else:
loader = yaml.CBaseLoader if getattr(yaml, "__with_libyaml__", None) else yaml.BaseLoader
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does BaseLoader do when it encounters a tag that it doesn't know how to handle? I thought I remembered that it throws an error in that case, but clearly it does not...

Copy link
Contributor Author

@braingram braingram Dec 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for taking a look!

My understanding is that BaseLoader ignores all tags. The PyYAML documentation gives only a brief description:

BaseLoader(stream) does not resolve or support any tags and construct only basic Python objects: lists, dictionaries and Unicode strings.

Throwing an empty ASDF file at it:

#ASDF 1.0.0
#ASDF_STANDARD 1.5.0
%YAML 1.1
%TAG ! tag:stsci.edu:asdf/
--- !core/asdf-1.1.0
asdf_library: !core/software-1.0.0 {author: The ASDF Developers, homepage: 'http://github.com/asdf-format/asdf',
  name: asdf, version: 3.0.2.dev46+gfc646c3c}
history:
  extensions:
  - !core/extension_metadata-1.0.0
    extension_class: asdf.extension._manifest.ManifestExtension
    extension_uri: asdf://asdf-format.org/core/extensions/core-1.5.0
    software: !core/software-1.0.0 {name: asdf, version: 3.0.2.dev46+gfc646c3c}
...

with:

d = yaml.load(open('foo.asdf').read(), Loader=yaml.BaseLoader)

yields:

> d
{'asdf_library': {'author': 'The ASDF Developers',
  'homepage': 'http://github.com/asdf-format/asdf',
  'name': 'asdf',
  'version': '3.0.2.dev46+gfc646c3c'},
 'history': {'extensions': [{'extension_class': 'asdf.extension._manifest.ManifestExtension',
    'extension_uri': 'asdf://asdf-format.org/core/extensions/core-1.5.0',
    'software': {'name': 'asdf', 'version': '3.0.2.dev46+gfc646c3c'}}]}}

inspecting the extension item:

> type(d['history']['extensions'][0])
dict

In the PyYAML source, BaseLoader uses BaseConstructor to build objects, and BaseConstructor is the parent class of SafeConstructor.


with get_file(init, "r") as gf:
reader = gf.reader_until(
constants.YAML_END_MARKER_REGEX,
7,
"End of YAML marker",
include=True,
)
# The following call to yaml.load is safe because we're
# using only loaders that don't create custom python objects
content = yaml.load(reader, Loader=loader) # noqa: S506
return content


def human_list(line, separator="and"):
"""
Formats a list for human readability.
Expand Down
Loading