Added input file sorting to many diagnostics to make output exactly reproducible #2710

Merged (4 commits) on Jul 8, 2022
5 changes: 4 additions & 1 deletion esmvaltool/diag_scripts/climate_metrics/ecs.py
@@ -71,6 +71,7 @@
     io,
     run_diagnostic,
     select_metadata,
+    sorted_metadata,
     variables_available,
 )

@@ -309,6 +310,7 @@ def check_input_data(cfg):
 def preprocess_data(cfg):
     """Extract input data."""
     input_data = deepcopy(list(cfg['input_data'].values()))
+    input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset'])
     if not input_data:
         return ([], [])

@@ -471,8 +473,9 @@ def write_data(cfg, ecs_data, feedback_parameter_data, ancestor_files):
     else:
         attrs = {}
     if RTMT_DATASETS:
+        rtmt_datasets = sorted(list(RTMT_DATASETS))
         attrs['net_toa_radiation'] = (
-            f"For datasets {RTMT_DATASETS}, 'rtmt' (net top of model "
+            f"For datasets {rtmt_datasets}, 'rtmt' (net top of model "
             f"radiation) instead of 'rtnt' (net top of atmosphere radiation) "
             f"is used due to lack of data. These two variables might differ.")
     attrs.update(cfg.get('output_attributes', {}))
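Note: the fix above rests on sorted_metadata from esmvaltool.diag_scripts.shared, which orders a list of metadata dictionaries by a sequence of keys. A minimal sketch of the behaviour relied on here (the real helper may handle missing keys differently):

def sorted_metadata_sketch(metadata, sort_keys):
    """Sort metadata dicts by several keys (illustrative sketch only)."""
    # str() guards against non-string values; missing keys sort first as ''.
    return sorted(
        metadata, key=lambda m: tuple(str(m.get(k, '')) for k in sort_keys))

input_data = [
    {'short_name': 'tas', 'exp': 'historical', 'dataset': 'MIROC6'},
    {'short_name': 'rtnt', 'exp': '1pctCO2', 'dataset': 'CanESM5'},
    {'short_name': 'rtnt', 'exp': '1pctCO2', 'dataset': 'ACCESS-CM2'},
]
input_data = sorted_metadata_sketch(input_data, ['short_name', 'exp', 'dataset'])
# The resulting order no longer depends on how cfg['input_data'] was populated:
# rtnt/1pctCO2/ACCESS-CM2, rtnt/1pctCO2/CanESM5, tas/historical/MIROC6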
17 changes: 13 additions & 4 deletions esmvaltool/diag_scripts/climate_metrics/feedback_parameters.py
@@ -55,6 +55,7 @@
     plot,
     run_diagnostic,
     select_metadata,
+    sorted_metadata,
     variables_available,
 )

@@ -465,8 +466,13 @@ def _create_table(table, cfg, description=None):
         'SW = short wave, LW = long wave, cs = clear sky, CRE = cloud '
         'radiative effect (similar to Andrews et al., Geophys. Res. Lett., '
         '39, 2012).')
-    _write_provenance(netcdf_path, plot_path, caption,
-                      [d['filename'] for d in cfg['input_data'].values()], cfg)
+    _write_provenance(
+        netcdf_path,
+        plot_path,
+        caption,
+        sorted([d['filename'] for d in cfg['input_data'].values()]),
+        cfg,
+    )


def _dict_to_array(dict_):
@@ -533,7 +539,8 @@ def _get_cube_list_for_table(cell_data, row_labels, col_labels, col_units):
     cubes = iris.cube.CubeList()
     for (idx, label) in enumerate(col_labels):
         if label in ('ECS', 'F', 'rtnt') and RTMT_DATASETS:
-            attrs = {'net_toa_radiation': RTMT_TEXT.format(RTMT_DATASETS)}
+            rtmt_datasets = sorted(list(RTMT_DATASETS))
+            attrs = {'net_toa_radiation': RTMT_TEXT.format(rtmt_datasets)}
         else:
             attrs = {}
         cube = iris.cube.Cube(
@@ -711,7 +718,8 @@ def _write_scalar_data(data, ancestor_files, cfg, description=None):
     ]
     global_attrs = {'project': list(cfg['input_data'].values())[0]['project']}
     if RTMT_DATASETS:
-        global_attrs['net_toa_radiation'] = RTMT_TEXT.format(RTMT_DATASETS)
+        rtmt_datasets = sorted(list(RTMT_DATASETS))
+        global_attrs['net_toa_radiation'] = RTMT_TEXT.format(rtmt_datasets)
     for (idx, var_attr) in enumerate(var_attrs):
         caption = '{long_name} for multiple climate models'.format(**var_attr)
         if description is not None:
@@ -967,6 +975,7 @@ def plot_regressions(input_data, cfg, description=None):
 def preprocess_data(cfg, year_idx=None):
     """Calculate anomalies and multi-model mean."""
     input_data = deepcopy(list(cfg['input_data'].values()))
+    input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset'])

     # Use 'rtmt' instead of 'rtnt' if necessary
     for dataset in input_data:
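Note: the sorted(list(RTMT_DATASETS)) lines fix a subtle source of irreproducibility. RTMT_DATASETS is a set of dataset names, and CPython salts string hashes per interpreter process (unless PYTHONHASHSEED is pinned), so formatting a set straight into an attribute string can yield a different element order on every run. A small illustration with made-up dataset names:

RTMT_DATASETS = {'MIROC-ESM', 'CanESM2', 'bcc-csm1-1'}

# Iteration order of a set of strings can differ between interpreter runs:
attr_flaky = f"For datasets {RTMT_DATASETS}, 'rtmt' is used instead of 'rtnt'."

# Sorting pins the order down; sorted() already returns a list, so
# sorted(RTMT_DATASETS) would suffice without the extra list() call:
attr_stable = f"For datasets {sorted(RTMT_DATASETS)}, 'rtmt' is used instead of 'rtnt'."

This run-to-run variability is exactly what made byte-for-byte comparison of the output files fail before this change.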
25 changes: 17 additions & 8 deletions esmvaltool/diag_scripts/climate_metrics/psi.py
@@ -35,10 +35,15 @@
 import numpy as np
 from scipy import stats

-from esmvaltool.diag_scripts.shared import (ProvenanceLogger,
-                                            get_diagnostic_filename,
-                                            group_metadata, io, run_diagnostic,
-                                            select_metadata)
+from esmvaltool.diag_scripts.shared import (
+    ProvenanceLogger,
+    get_diagnostic_filename,
+    group_metadata,
+    io,
+    run_diagnostic,
+    select_metadata,
+    sorted_metadata,
+)

logger = logging.getLogger(os.path.basename(__file__))

@@ -102,9 +107,12 @@ def get_provenance_record(caption, ancestor_files):

 def get_attributes(cfg, single_psi_cube, input_data):
     """Get attributes for psi cube for all datasets."""
-    datasets = "|".join({str(d['dataset']) for d in input_data})
-    projects = "|".join({str(d['project']) for d in input_data})
-    ref = "|".join({str(d.get('reference_dataset')) for d in input_data})
+    datasets = sorted(list({str(d['dataset']) for d in input_data}))
+    projects = sorted(list({str(d['project']) for d in input_data}))
+    ref = sorted(list({str(d.get('reference_dataset')) for d in input_data}))
+    datasets = "|".join(datasets)
+    projects = "|".join(projects)
+    ref = "|".join(ref)
     attrs = single_psi_cube.attributes
     attrs.update({
         'dataset': datasets,
@@ -120,6 +128,7 @@ def main(cfg):
     input_data = (
         select_metadata(cfg['input_data'].values(), short_name='tas') +
         select_metadata(cfg['input_data'].values(), short_name='tasa'))
+    input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset'])
     if not input_data:
         raise ValueError("This diagnostics needs 'tas' or 'tasa' variable")

@@ -162,7 +171,7 @@ def main(cfg):
     io.save_scalar_data(psis, out_path, psi_attrs, attributes=attrs)

     # Provenance
-    caption = "{long_name} for mutliple climate models.".format(**psi_attrs)
+    caption = "{long_name} for multiple climate models.".format(**psi_attrs)
     ancestor_files = [d['filename'] for d in input_data]
     provenance_record = get_provenance_record(caption, ancestor_files)
     with ProvenanceLogger(cfg) as provenance_logger:
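Note: joining a set directly, as the old get_attributes did, bakes the run-dependent iteration order into the '|'-separated NetCDF attributes. The new code could equally be written with a small helper; join_unique below is hypothetical, not part of this PR, and just shows the idiom:

def join_unique(values):
    """Join unique stringified values into a stable '|'-separated string."""
    return "|".join(sorted({str(v) for v in values}))

datasets = join_unique(d['dataset'] for d in input_data)
projects = join_unique(d['project'] for d in input_data)
ref = join_unique(d.get('reference_dataset') for d in input_data)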
4 changes: 3 additions & 1 deletion esmvaltool/diag_scripts/climate_metrics/tcr.py
@@ -52,6 +52,7 @@
     io,
     run_diagnostic,
     select_metadata,
+    sorted_metadata,
     variables_available,
 )

@@ -108,6 +109,7 @@ def _get_anomaly_cubes(cfg):
     cubes = {}
     ancestors = {}
     input_data = cfg['input_data'].values()
+    input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset'])
     onepct_data = select_metadata(input_data, short_name='tas', exp='1pctCO2')

     # Process data
@@ -329,7 +331,7 @@ def write_data(cfg, tcr, external_file=None):
     for dataset_name in tcr.keys():
         datasets = select_metadata(cfg['input_data'].values(),
                                    dataset=dataset_name)
-        ancestor_files.extend([d['filename'] for d in datasets])
+        ancestor_files.extend(sorted([d['filename'] for d in datasets]))
     if external_file is not None:
         ancestor_files.append(external_file)
     provenance_record['ancestors'] = ancestor_files
7 changes: 5 additions & 2 deletions esmvaltool/diag_scripts/emergent_constraints/cox18nature.py
@@ -42,6 +42,7 @@
     plot,
     run_diagnostic,
     select_metadata,
+    sorted_metadata,
 )

logger = logging.getLogger(os.path.basename(__file__))
@@ -80,7 +81,7 @@ def _get_ancestor_files(cfg, obs_name, projects=None):
             select_metadata(cfg['input_data'].values(), project=project))
     datasets.extend(
         select_metadata(cfg['input_data'].values(), dataset=obs_name))
-    return [d['filename'] for d in datasets]
+    return sorted([d['filename'] for d in datasets])


def _get_model_color(model, lambda_cube):
@@ -133,7 +134,7 @@ def _get_project(cfg):
     projects = [p for p in projects if 'obs' not in p.lower()]
     if len(projects) == 1:
         return projects[0]
-    return projects
+    return sorted(projects)


def _save_fig(cfg, basename, legend=None):
@@ -156,6 +157,7 @@ def get_external_cubes(cfg):
"""Get external cubes for psi, ECS and lambda."""
cubes = iris.cube.CubeList()
input_data = list(cfg['input_data'].values())
input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset'])
for filename in ('psi.nc', 'ecs.nc', 'lambda.nc'):
filepath = io.get_ancestor_file(cfg, filename)
cube = iris.load_cube(filepath)
@@ -509,6 +511,7 @@ def main(cfg):
     input_data = (
         select_metadata(cfg['input_data'].values(), short_name='tas') +
         select_metadata(cfg['input_data'].values(), short_name='tasa'))
+    input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset'])
     if not input_data:
         raise ValueError("This diagnostics needs 'tas' or 'tasa' variable")

13 changes: 9 additions & 4 deletions esmvaltool/diag_scripts/emergent_constraints/ecs_scatter.py
@@ -64,6 +64,7 @@
     io,
     run_diagnostic,
     select_metadata,
+    sorted_metadata,
 )

logger = logging.getLogger(os.path.basename(__file__))
@@ -276,7 +277,7 @@ def _get_su_cube_dict(grouped_data, var_name, reference_datasets):
         ref_filenames.append(grouped_data[ref_dataset_name][0]['filename'])
     ref_cube = cube.copy(ref_data)
     ref_cube.attributes['dataset'] = reference_datasets
-    ref_cube.attributes['ancestors'] = '|'.join(ref_filenames)
+    ref_cube.attributes['ancestors'] = '|'.join(sorted(ref_filenames))
     ref_cube.coord('air_pressure').attributes['positive'] = 'down'

     # All other cubes
@@ -772,9 +773,12 @@ def get_default_settings(cfg):

 def get_global_attributes(input_data, cfg):
     """Get attributes for psi cube for all datasets."""
-    datasets = "|".join({str(d['dataset']) for d in input_data})
-    projects = "|".join({str(d['project']) for d in input_data})
-    ref = "|".join({str(d.get('reference_dataset')) for d in input_data})
+    datasets = sorted(list({str(d['dataset']) for d in input_data}))
+    projects = sorted(list({str(d['project']) for d in input_data}))
+    ref = sorted(list({str(d.get('reference_dataset')) for d in input_data}))
+    datasets = "|".join(datasets)
+    projects = "|".join(projects)
+    ref = "|".join(ref)
     attrs = {
         'dataset': datasets,
         'project': projects,
@@ -794,6 +798,7 @@ def main(cfg):
     input_data = list(cfg['input_data'].values())
     input_data.extend(io.netcdf_to_metadata(cfg, pattern=cfg.get('pattern')))
     input_data = deepcopy(input_data)
+    input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset'])
     check_input_data(input_data)
     grouped_data = group_metadata(input_data, 'dataset')

5 changes: 4 additions & 1 deletion esmvaltool/diag_scripts/ipcc_ar5/ch09_fig09_42a.py
@@ -34,6 +34,7 @@

 import logging
 import os
+from copy import deepcopy

 import iris
 import seaborn as sns
@@ -47,6 +48,7 @@
     io,
     plot,
     run_diagnostic,
+    sorted_metadata,
     variables_available,
 )

@@ -163,7 +165,8 @@ def write_data(cfg, hist_cubes, pi_cubes, ecs_cube):
 def main(cfg):
     """Run the diagnostic."""
     sns.set(**cfg.get('seaborn_settings', {}))
-    input_data = cfg['input_data'].values()
+    input_data = deepcopy(list(cfg['input_data'].values()))
+    input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset'])
     project = list(group_metadata(input_data, 'project').keys())
     project = [p for p in project if 'obs' not in p.lower()]
     if len(project) == 1:
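Note: two things change in main here. The metadata list is deep-copied so that sorting and later in-place edits cannot leak back into the shared cfg['input_data'] mapping, and it is then sorted for a deterministic processing order. A toy demonstration of the aliasing hazard that deepcopy avoids (the config content is invented):

from copy import deepcopy

cfg = {'input_data': {'a.nc': {'dataset': 'X'}}}

aliased = list(cfg['input_data'].values())
aliased[0]['dataset'] = 'Y'  # list() copies the list, not the dicts inside
assert cfg['input_data']['a.nc']['dataset'] == 'Y'  # cfg was mutated

cfg['input_data']['a.nc']['dataset'] = 'X'
detached = deepcopy(list(cfg['input_data'].values()))
detached[0]['dataset'] = 'Y'  # deep copy: cfg stays untouched
assert cfg['input_data']['a.nc']['dataset'] == 'X'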
2 changes: 2 additions & 0 deletions esmvaltool/diag_scripts/mlr/__init__.py
@@ -20,6 +20,7 @@
     get_diagnostic_filename,
     io,
     select_metadata,
+    sorted_metadata,
 )

logger = logging.getLogger(os.path.basename(__file__))
@@ -542,6 +543,7 @@ def get_input_data(cfg, pattern=None, check_mlr_attributes=True, ignore=None):
     if not datasets_have_mlr_attributes(valid_data, log_level='error'):
         raise ValueError("At least one input dataset does not have valid "
                          "MLR attributes")
+    valid_data = sorted_metadata(valid_data, ['var_type', 'tag', 'dataset'])
     logger.debug("Found files:")
     logger.debug(pformat([d['filename'] for d in valid_data]))
     return valid_data
3 changes: 2 additions & 1 deletion esmvaltool/diag_scripts/shared/io.py
@@ -66,7 +66,7 @@ def get_all_ancestor_files(cfg, pattern=None):
             files = fnmatch.filter(files, pattern)
         files = [os.path.join(root, f) for f in files]
         ancestor_files.extend(files)
-    return ancestor_files
+    return sorted(ancestor_files)


def get_ancestor_file(cfg, pattern):
@@ -131,6 +131,7 @@ def netcdf_to_metadata(cfg, pattern=None, root=None):
         files = [os.path.join(base, f) for f in files]
         all_files.extend(files)
     all_files = fnmatch.filter(all_files, '*.nc')
+    all_files = sorted(all_files)

     # Iterate over netcdf files
     metadata = []
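Note: the sorting in get_all_ancestor_files and netcdf_to_metadata matters because os.walk yields directories and files in arbitrary, filesystem-dependent order, so two machines can traverse the same tree differently. An alternative, shown here only as a sketch and not what the PR does, is to sort during the walk, which also keeps files grouped by directory:

import os

def walk_sorted(top):
    """Yield (root, dirs, files) from os.walk in a deterministic order."""
    for root, dirs, files in os.walk(top):
        dirs.sort()   # in-place sort steers os.walk's own traversal order
        files.sort()
        yield root, dirs, files

The PR instead sorts the collected list once at the end, which is simpler and gives one global lexicographic order across directories; that order is exactly what the updated expectations in tests/unit/diag_scripts/shared/test_io.py below encode.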
4 changes: 2 additions & 2 deletions tests/unit/diag_scripts/mlr/test_helpers.py
@@ -390,15 +390,15 @@ def test_get_datasets(input_data, kwargs, output):
     (CFG_3, [], False, IGNORE, [D_3], 0),
     (CFG_3, [D_1], True, None, ValueError, 1),
     (CFG_3, [D_1], True, IGNORE, ValueError, 1),
-    (CFG_3, [D_1], False, None, [D_1, D_3, D_1], 0),
+    (CFG_3, [D_1], False, None, [D_1, D_1, D_3], 0),
     (CFG_3, [D_1], False, IGNORE, [D_3], 0),
     (CFG_4, [], True, None, ValueError, 2),
     (CFG_4, [], True, IGNORE, ValueError, 1),
     (CFG_4, [], False, None, [D_1, D_2, D_3], 0),
     (CFG_4, [], False, IGNORE, [D_3], 0),
     (CFG_4, [D_1], True, None, ValueError, 2),
     (CFG_4, [D_1], True, IGNORE, ValueError, 1),
-    (CFG_4, [D_1], False, None, [D_1, D_2, D_3, D_1], 0),
+    (CFG_4, [D_1], False, None, [D_1, D_2, D_1, D_3], 0),
     (CFG_4, [D_1], False, IGNORE, [D_3], 0),
 ]

12 changes: 6 additions & 6 deletions tests/unit/diag_scripts/shared/test_io.py
@@ -49,31 +49,31 @@ def test_has_necessary_attributes(mock_logger, data):
 ROOT_DIR = '/root/to/something'
 TEST_GET_ALL_ANCESTOR_FILES = [
     (None, [
-        os.path.join(ROOT_DIR, 'test.nc'),
         os.path.join(ROOT_DIR, 'egg.yml'),
         os.path.join(ROOT_DIR, 'root2', 'x.nc'),
         os.path.join(ROOT_DIR, 'root2', 'y.png'),
         os.path.join(ROOT_DIR, 'root3', 'egg.nc'),
+        os.path.join(ROOT_DIR, 'root4', 'egg.nc'),
+        os.path.join(ROOT_DIR, 'test.nc'),
         os.path.join(ROOT_DIR, 'test_1.nc'),
         os.path.join(ROOT_DIR, 'test_2.yml'),
-        os.path.join(ROOT_DIR, 'root4', 'egg.nc'),
     ]),
     ('*', [
-        os.path.join(ROOT_DIR, 'test.nc'),
         os.path.join(ROOT_DIR, 'egg.yml'),
         os.path.join(ROOT_DIR, 'root2', 'x.nc'),
         os.path.join(ROOT_DIR, 'root2', 'y.png'),
         os.path.join(ROOT_DIR, 'root3', 'egg.nc'),
+        os.path.join(ROOT_DIR, 'root4', 'egg.nc'),
+        os.path.join(ROOT_DIR, 'test.nc'),
         os.path.join(ROOT_DIR, 'test_1.nc'),
         os.path.join(ROOT_DIR, 'test_2.yml'),
-        os.path.join(ROOT_DIR, 'root4', 'egg.nc'),
     ]),
     ('*.nc', [
-        os.path.join(ROOT_DIR, 'test.nc'),
         os.path.join(ROOT_DIR, 'root2', 'x.nc'),
         os.path.join(ROOT_DIR, 'root3', 'egg.nc'),
-        os.path.join(ROOT_DIR, 'test_1.nc'),
         os.path.join(ROOT_DIR, 'root4', 'egg.nc'),
+        os.path.join(ROOT_DIR, 'test.nc'),
+        os.path.join(ROOT_DIR, 'test_1.nc'),
     ]),
     ('test*', [
         os.path.join(ROOT_DIR, 'test.nc'),