Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve reading facets from ESGF search results #1920

Merged
merged 3 commits into from
Feb 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 69 additions & 25 deletions esmvalcore/esgf/_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import yaml
from humanfriendly import format_size, format_timespan

from esmvalcore.typing import Facets

from ..local import LocalFile
from ._logon import get_credentials
from .facets import DATASET_MAP, FACETS
Expand Down Expand Up @@ -113,8 +115,7 @@ def get_preferred_hosts():

# Hosts from which no data has been downloaded yet get median speed; if no
# host with non-zero entries is found assign a value of 0.0
speeds_list = [speeds[h][SPEED] for h in speeds if
speeds[h][SPEED] != 0.0]
speeds_list = [speeds[h][SPEED] for h in speeds if speeds[h][SPEED] != 0.0]
if not speeds_list:
median_speed = 0.0
else:
Expand Down Expand Up @@ -234,8 +235,48 @@ def same_file(result):

return files

def _get_facets(self, results):
schlunma marked this conversation as resolved.
Show resolved Hide resolved
"""Read the facets.

This works by first reading the facets from the json response of
the first search result. Next, an alternative set of facets is
read from the `dataset_id` and filename and used to correct any
wrong facets values.
"""
# Only the first search result is consulted for the project name.
project = results[0].json['project'][0]

# Read the facets from the metadata
# FACETS[project] maps our facet names to the names used in the search
# result; facets that are absent from the result are skipped.
facets = {
our_facet: results[0].json[their_facet]
for our_facet, their_facet in FACETS[project].items()
if their_facet in results[0].json
}
# Unwrap single-element lists so those facets hold the bare value;
# longer lists and non-list values are kept as they are.
facets = {
facet:
value[0] if isinstance(value, list) and len(value) == 1 else value
for facet, value in facets.items()
}
facets['project'] = project
if 'dataset' in facets:
# Invert DATASET_MAP for this project so the dataset name found in the
# result is translated back to its key; unmapped names stay unchanged.
reverse_dataset_map = {
v: k
for k, v in DATASET_MAP.get(project, {}).items()
}
facets['dataset'] = reverse_dataset_map.get(
facets['dataset'], facets['dataset'])

# Update the facets with information from the dataset_id and filename
more_reliable_facets = self._get_facets_from_dataset_id(results)
for facet, value in more_reliable_facets.items():
# A facet that is missing or differs is reported at debug level before
# the value from the dataset_id/filename is stored.
if facet not in facets or facets[facet] != value:
logger.debug(
"Correcting facet '%s' from '%s' to '%s' for %s.%s", facet,
facets.get(facet), value, self.dataset, self.name)
facets[facet] = value
return facets

@staticmethod
def _get_facets(results):
def _get_facets_from_dataset_id(results) -> Facets:
"""Read the facets from the `dataset_id`."""
# This reads the facets from the dataset_id because the facets
# provided by ESGF are unreliable.
Expand Down Expand Up @@ -268,27 +309,24 @@ def _get_facets(results):
if keys[0] == 'project':
# The project is sometimes hardcoded all lowercase in the template
keys = keys[1:]

# Read values from dataset_id
# Pick the first dataset_id if there are differences in case
dataset_id = sorted(r.json['dataset_id'].split('|')[0]
for r in results)[0]
values = dataset_id.split('.')[1:]

# Compose facets
facets = {
'project': project,
}
facets = {}
if len(keys) == len(values):
for idx, key in enumerate(keys):
facets[key] = values[idx]
else:
logger.debug("Wrong dataset_id_template_ %s for dataset %s",
template, dataset_id)
logger.debug(
"Wrong dataset_id_template_ %s or facet values containing '.' "
"for dataset %s", template, dataset_id)
facets['version'] = dataset_id.split('.')[-1]

# The dataset_id does not contain the short_name for all projects,
# so get it from the filename if needed:
if 'short_name' not in facets:
facets['short_name'] = results[0].json['title'].split('_')[0]
# so get it from the filename:
facets['short_name'] = results[0].json['title'].split('_')[0]

return facets

Expand All @@ -311,18 +349,26 @@ def _get_dataset_id(results):
dataset_name = DATASET_MAP[project].get(dataset_name, dataset_name)
return f"{project}.{dataset_name}.{version}"

def _get_relative_path(self) -> Path:
"""Get the subdirectories."""
# NOTE: the returned path ends with the file name (self.name), so it is the
# relative path of the file itself rather than only its directories.
if self.facets['project'] == 'obs4MIPs':
# Avoid errors due to a `.` in the dataset name
facets = ['project', 'dataset', 'version']
path = Path(*[self.facets[f] for f in facets])
else:
# Each '.'-separated part of self.dataset becomes one path component.
path = Path(*self.dataset.split('.'))
return path / self.name

def __repr__(self):
"""Represent the file as a string."""
# Hostname of every download URL, in the order of self.urls.
hosts = [urlparse(u).hostname for u in self.urls]
# NOTE(review): the two `return (` lines below look like the removed and the
# added version of the same statement in this diff rendering - confirm
# against the actual file before relying on either.
return (f"ESGFFile:{self.dataset.replace('.', '/')}/{self.name}"
return (f"ESGFFile:{self._get_relative_path()}"
f" on hosts {hosts}")

def __eq__(self, other):
"""Compare `self` to `other`."""
# Equal only to an instance of the same class (or a subclass) whose
# (dataset, name) pair matches.
# NOTE(review): the two `return` statements below appear to be the old and
# the reformatted version of the same expression in this diff rendering -
# confirm against the actual file.
return (
isinstance(other, self.__class__)
and (self.dataset, self.name) == (other.dataset, other.name)
)
return (isinstance(other, self.__class__)
and (self.dataset, self.name) == (other.dataset, other.name))

def __lt__(self, other):
"""Compare `self` to `other`."""
Expand All @@ -345,11 +391,7 @@ def local_file(self, dest_folder):
LocalFile
The path where the file will be located after download.
"""
file = LocalFile(
dest_folder,
*self.dataset.split('.'),
self.name,
).absolute()
file = LocalFile(dest_folder, self._get_relative_path())
file.facets = self.facets
return file

Expand Down Expand Up @@ -457,7 +499,9 @@ def get_download_message(files):
lines = []
for file in files:
total_size += file.size
lines.append(f"{format_size(file.size)}" "\t" f"{file}")
lines.append(f"{format_size(file.size)}"
"\t"
f"{file}")

lines.insert(0, "Will download the following files:")
lines.insert(0, f"Will download {format_size(total_size)}")
Expand Down
7 changes: 7 additions & 0 deletions esmvalcore/esgf/facets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,18 @@
'ensemble': 'ensemble',
'exp': 'experiment',
'frequency': 'time_frequency',
'realm': 'realm',
'short_name': 'variable',
},
'CMIP5': {
'dataset': 'model',
'ensemble': 'ensemble',
'exp': 'experiment',
'frequency': 'time_frequency',
'institute': 'institute',
'mip': 'cmor_table',
'product': 'product',
'realm': 'realm',
'short_name': 'variable',
},
'CMIP6': {
Expand All @@ -38,11 +41,15 @@
'ensemble': 'ensemble',
'exp': 'experiment',
'frequency': 'time_frequency',
'institute': 'institute',
'product': 'product',
'short_name': 'variable',
},
'obs4MIPs': {
'dataset': 'source_id',
'frequency': 'time_frequency',
'institute': 'institute',
'realm': 'realm',
'short_name': 'variable',
}
}
Expand Down
Loading