Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix embedded metadata elements #188

Merged
merged 1 commit into from
Feb 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions siphon/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
log.setLevel(logging.ERROR)
log.addHandler(logging.StreamHandler())

# XLink attribute names written in the '{namespace-uri}local-name' form, so they
# can be used directly as keys when looking up attributes in ``element.attrib``.
xlink_href_attr = '{http://www.w3.org/1999/xlink}href'
xlink_title_attr = '{http://www.w3.org/1999/xlink}title'


class _SimpleTypes(object):
def __init__(self):
Expand Down Expand Up @@ -473,9 +476,11 @@ def __init__(self, element, metadata_in=None):
else:
inherited = False

if metadata_in and inherited:
if metadata_in and (inherited or self._is_external_metadata_doc(element)):
# only inherit metadata passed in if the new metadata
# element has inherit set to True
# element has inherit set to True or if the new
# metadata element is pointing to an external metadata
# document using an xlink
self.metadata = metadata_in
else:
self.metadata = {'inherited': inherited}
Expand All @@ -495,6 +500,13 @@ def _get_tag_name(element):
element_name = element.tag
return element_name

@staticmethod
def _is_external_metadata_doc(element):
    """Tell whether ``element`` links to an external metadata document.

    Returns True only when the element carries both the xlink ``title``
    and the xlink ``href`` attribute; False otherwise.
    """
    required = (xlink_title_attr, xlink_href_attr)
    return all(name in element.attrib for name in required)

def _get_handler(self, handler_name):
handler_name = 'handle_' + handler_name
if handler_name in self._cts:
Expand Down Expand Up @@ -524,7 +536,8 @@ def _parse_element(self, element):
'date': self._parse_date,
'timeCoverage': self._parse_timeCoverage,
'variableMap': self._parse_variableMap,
'variables': self._parse_variables}
'variables': self._parse_variables,
'metadata': self._parse_embedded_metadata}

try:
parser[element_name](element)
Expand Down Expand Up @@ -554,8 +567,6 @@ def _parse_documentation(self, element):
# <xsd:attribute name="type" type="documentationEnumTypes"/>
# <xsd:attributeGroup ref="XLink" />
# </xsd:complexType>
xlink_href_attr = '{http://www.w3.org/1999/xlink}href'
xlink_title_attr = '{http://www.w3.org/1999/xlink}title'

# doc_enum_types = ("funding", "history", "processing_level", "rights",
# "summary")
Expand Down Expand Up @@ -711,3 +722,14 @@ def _parse_variables(self, element):
var_name = variable['name']
variable.pop('name', None)
self.metadata.setdefault(element_type, {})[var_name] = variable

def _parse_embedded_metadata(self, element):
    """Record an embedded ``metadata`` element that links to an external document.

    When the element carries both the xlink ``title`` and xlink ``href``
    attributes, the link is stored as
    ``self.metadata['external_metadata'][title] = href``. Otherwise nothing
    is stored and a warning naming the element's tag and attributes is logged.

    Parameters
    ----------
    element
        The embedded ``metadata`` element to inspect; only its ``tag`` and
        ``attrib`` are read.

    """
    attributes = element.attrib
    href = attributes.get(xlink_href_attr)
    title = attributes.get(xlink_title_attr)

    # Both attributes are required: the title is the dictionary key and the
    # href its value. Checking only for the href would let an untitled link
    # fall through to a KeyError instead of the warning below.
    if href is None or title is None:
        log.warning('Cannot parse embedded metadata element %s: %s',
                    element.tag, element.attrib)
        return

    self.metadata.setdefault('external_metadata', {})[title] = href
49 changes: 49 additions & 0 deletions siphon/tests/fixtures/ncei_embedded_metadata
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
interactions:
- request:
body: null
headers:
Accept: ['*/*']
Accept-Encoding: ['gzip, deflate']
Connection: [keep-alive]
User-Agent: [Siphon (0.4.0+209.gde8764c.dirty)]
method: GET
uri: https://www.ncei.noaa.gov/thredds/catalog/namanl/201802/20180220/catalog.xml
response:
body:
string: !!binary |
H4sIAAAAAAAAA8xY32/iOBB+P+n+BytPu1JJnPBjoQL2uBC6SBBQw+7q7qVyEwO+TeLIdqDc6f73
mwQIhbaoNJVuXygZj7+Z+TzzObT9+SEK0YoKyXjc0Uwda4jGPg9YvOhoX2eDSlP73P31l7ZPFAn5
AoF7LDvaUqnk2jDW67WexiyART31idBpkBoxiahMiE+loZaCBoE0hvHK3gIYqzxGDnP9ELL4xxHY
uqpzsTDMVqtl5KvaUXK6qUEyCLUlFSvmU5TF6mi90UhDO9Nsk4DF5lHC0zjQ0D2R8Lzd9mRj7Ac8
kCd7J1PH7fem+61FEZmrbWjIeB7ry2w29cBCxQne44UTyDkL6XbpZdy1f5rgd9t7ggReZyCiJxDj
ZyCiMxCxL5+rzaXKD+YvVAd7pLEQLABYmc7n7AEWs16RVOlLFYVnomWLx5Hs8eiZCFH4cs5pEPgn
KF/7ffsJSub3MgqT/ARk6E2eYIBXAdE2du7bp13FOzgLm01sWTADwz4USiISh8bWaBRr+0wiqki2
HbF4SQVTNOhoSqR073BI1gXwLsxBETs3FF4kVUsOAJvugq/0mBOiQ+/7beOwUPhmAbNCuze3w37b
KB4PDtxPIxoromAukco5mcOwgWZo3a+6pyM7ZBFRFE0FXwgSoQkcPhD6QVASk3AjmURgj9XHK+SM
7StkT+HDtZ0psgFXkBBNEipyfIk+uPYEHEkcINfx+kMPcnqcwfnEEsFBiSTkdhfSFQ217ij7g2oX
oQi2WCqpdZ2Q+krwmPko4Os45CSQiM+RWlKUHxQRFM0FpVdoydcQSKA5pRKRJAk3aM7F1sunQjGg
ZBsiqyxgUgl2n+aGx4DwmCyBMR9IiWjAiI4GGeKKiE3ehEHmwkUAoWRC/QJW6i9W6MM5KC4KA5iy
5kQr7pP7NATkjnZjj/tatz+xDXfS6xnud8/IDqidy/vjnT4HfF+hVIRHUh77NNl2GrQczAaNCDvn
kS3fhYzAJMWPpjEbp9N82z/oZg0lP5PwzeSbc+uOHXeGejeOaw8dr5K35MDpO7e9UWG9QlAbNB0U
l3/dlZk3WFbpENZy2oD3rCnhFsrPz4lXDBogZzVEwxhsUe53tevOK5TH69OECJW5ZccJ91FEhQ9t
4fRuZ1+QBylAEOQ5t9+GdpbNeNJ3Rt7p+q3Tc3ujPzzHM3qeNxwPR73ZcOLuvNvGjogDM4Vi5Lfn
NajTvOB8wWQ+9QfOF5QnXEAhhqBSFe3ymQWdfxq1asu2a9UKtnCzUuv1G5XfB41BpdYc1JuNwad6
zez/q+3iKKbCrTIis2Wa9Yp1bWHc+uB8RBUoPszGJuvs7eyNd0kenTIM6l/g1XUhoSXqAV3QyDH4
Si6h+Wnb2HsctqT3IZOgjBd18uGIX9HM8tCr7Jluzs0MmuC3/dpx555m2Db2B7Q3HN8O2/vgzjKb
d/vL4A43MHzghr4Q99a5a8N4xW4obErUsgTE8UXhsb8pgldABS+F4/uNoiCS9aZu1bYXR7Z8tIHu
BDWCV8w5o4HWzcJAj1Usa2Y2r2v4ulr7M99MC852HF1MWbUUZdXylFVfTdknvVX9GSizSlFmlafM
uoCyqlWCsup7UWaWoswsT5n5asoaeq3xM1CGS1GGy1OGX01ZVa/j/5cyXEr+cXn5xxfLf918O2X1
d6PsrfKPy8s/vlj+GyW6rG69F2VvlX9cXv7xxfKPS2jZ+1H2VvnH5eUfXyz/ZpnBNN+LsrfKPy4v
//hi+a+WGcyzlD1+gh+523+Qwvf/AAAA//8DAGF1zKCVFQAA
headers:
Access-Control-Allow-Headers: ['X-Requested-With, Content-Type']
Access-Control-Allow-Origin: ['*']
Connection: [close]
Content-Encoding: [gzip]
Content-Language: [en-US]
Content-Type: [application/xml;charset=UTF-8]
Date: ['Sat, 24 Feb 2018 02:16:17 GMT']
Strict-Transport-Security: [max-age=31536000]
Vary: [Accept-Encoding]
status: {code: 200, message: OK}
version: 1
10 changes: 10 additions & 0 deletions siphon/tests/test_catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,3 +302,13 @@ def test_catalog_ref_str():
url = 'http://thredds.ucar.edu/thredds/catalog.xml'
cat = TDSCatalog(url)
assert str(cat.catalog_refs[0]) == 'Forecast Model Data'


@recorder.use_cassette('ncei_embedded_metadata')
def test_catalog_with_embedded_metadata_elements():
    """Check that a catalog with embedded metadata elements is parsed completely."""
    metadata = TDSCatalog(
        'https://www.ncei.noaa.gov/thredds/catalog/namanl/201802/20180220/catalog.xml'
    ).metadata
    # The embedded metadata element was understood and recorded ...
    assert 'external_metadata' in metadata
    # ... and the other metadata of the catalog was still added alongside it.
    assert 'serviceName' in metadata
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this sufficient to prove that it's working?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the embedded metadata element is successfully parsed and understood, then the external_metadata keyword will be added to the metadata dict. If the other metadata isn't added (because the embedded metadata element does not have inherited=true, which is currently the case), then the second assert will fail.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, works for me.

35 changes: 35 additions & 0 deletions siphon/tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,3 +580,38 @@ def test_time_coverage(self):
assert len(md['timeCoverage']) == 1
assert md['timeCoverage'][0]['end'] == 'present'
assert md['timeCoverage'][0]['duration'] == '45 days'

def test_external_metadata(self):
    """Check an embedded metadata element that links to an external document."""
    link_title = 'ISO 19115-2:2009(E) - Collection Level Metadata'
    link_href = ('http://gis.ncdc.noaa.gov/geoportal/rest/document?'
                 'id={6439CC43-0208-4AD6-BF6F-48F586F7541D}')
    document = ET.fromstring(
        '<metadata inherited="true">'
        '<serviceName>ALL</serviceName>'
        '<metadata xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href='
        '"http://gis.ncdc.noaa.gov/geoportal/rest/document?'
        'id={6439CC43-0208-4AD6-BF6F-48F586F7541D}" '
        'xlink:title="ISO 19115-2:2009(E) - Collection Level Metadata"/>'
        '</metadata>'
    )
    parsed = TDSCatalogMetadata(document).metadata

    # The sibling metadata must survive alongside the embedded element.
    assert 'serviceName' in parsed

    # The embedded element is stored as a title -> href entry.
    assert 'external_metadata' in parsed
    external = parsed['external_metadata']
    assert link_title in external
    assert external[link_title] == link_href

def test_external_metadata_non_xlink(self, caplog):
    """Test a non-xlink embedded external metadata element."""
    document = ET.fromstring(
        '<metadata inherited="true">'
        '<serviceName>ALL</serviceName>'
        '<metadata url="http://gis.ncdc.noaa.gov/geoportal/rest/document?'
        'id={6439CC43-0208-4AD6-BF6F-48F586F7541D}" '
        'name="ISO 19115-2:2009(E) - Collection Level Metadata"/>'
        '</metadata>'
    )
    parsed = TDSCatalogMetadata(document).metadata

    # The sibling metadata is still collected.
    assert 'serviceName' in parsed
    # Without xlink attributes nothing is recorded; a warning is logged instead.
    assert 'external_metadata' not in parsed
    assert 'Cannot parse embedded metadata element' in caplog.text