From 4d4e051d9c5a14273d8addcbc192e2685ecf4319 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Thu, 7 Jul 2022 17:01:00 +0200 Subject: [PATCH 1/3] Added sorting of input data to many diagnostics to make the output exactly reproducible --- .../diag_scripts/climate_metrics/ecs.py | 5 +++- .../climate_metrics/feedback_parameters.py | 17 ++++++++++--- .../diag_scripts/climate_metrics/psi.py | 25 +++++++++++++------ .../diag_scripts/climate_metrics/tcr.py | 4 ++- .../emergent_constraints/cox18nature.py | 7 ++++-- .../emergent_constraints/ecs_scatter.py | 13 +++++++--- .../diag_scripts/ipcc_ar5/ch09_fig09_42a.py | 5 +++- esmvaltool/diag_scripts/mlr/__init__.py | 2 ++ 8 files changed, 57 insertions(+), 21 deletions(-) diff --git a/esmvaltool/diag_scripts/climate_metrics/ecs.py b/esmvaltool/diag_scripts/climate_metrics/ecs.py index c74d898198..9c34879c25 100644 --- a/esmvaltool/diag_scripts/climate_metrics/ecs.py +++ b/esmvaltool/diag_scripts/climate_metrics/ecs.py @@ -71,6 +71,7 @@ io, run_diagnostic, select_metadata, + sorted_metadata, variables_available, ) @@ -309,6 +310,7 @@ def check_input_data(cfg): def preprocess_data(cfg): """Extract input data.""" input_data = deepcopy(list(cfg['input_data'].values())) + input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset']) if not input_data: return ([], []) @@ -471,8 +473,9 @@ def write_data(cfg, ecs_data, feedback_parameter_data, ancestor_files): else: attrs = {} if RTMT_DATASETS: + rtmt_datasets = sorted(list(RTMT_DATASETS)) attrs['net_toa_radiation'] = ( - f"For datasets {RTMT_DATASETS}, 'rtmt' (net top of model " + f"For datasets {rtmt_datasets}, 'rtmt' (net top of model " f"radiation) instead of 'rtnt' (net top of atmosphere radiation) " f"is used due to lack of data. These two variables might differ.") attrs.update(cfg.get('output_attributes', {})) diff --git a/esmvaltool/diag_scripts/climate_metrics/feedback_parameters.py b/esmvaltool/diag_scripts/climate_metrics/feedback_parameters.py index 59529111b5..00cdc07775 100644 --- a/esmvaltool/diag_scripts/climate_metrics/feedback_parameters.py +++ b/esmvaltool/diag_scripts/climate_metrics/feedback_parameters.py @@ -55,6 +55,7 @@ plot, run_diagnostic, select_metadata, + sorted_metadata, variables_available, ) @@ -465,8 +466,13 @@ def _create_table(table, cfg, description=None): 'SW = short wave, LW = long wave, cs = clear sky, CRE = cloud ' 'radiative effect (similar to Andrews et al., Geophys. Res. Lett., ' '39, 2012).') - _write_provenance(netcdf_path, plot_path, caption, - [d['filename'] for d in cfg['input_data'].values()], cfg) + _write_provenance( + netcdf_path, + plot_path, + caption, + sorted([d['filename'] for d in cfg['input_data'].values()]), + cfg, + ) def _dict_to_array(dict_): @@ -533,7 +539,8 @@ def _get_cube_list_for_table(cell_data, row_labels, col_labels, col_units): cubes = iris.cube.CubeList() for (idx, label) in enumerate(col_labels): if label in ('ECS', 'F', 'rtnt') and RTMT_DATASETS: - attrs = {'net_toa_radiation': RTMT_TEXT.format(RTMT_DATASETS)} + rtmt_datasets = sorted(list(RTMT_DATASETS)) + attrs = {'net_toa_radiation': RTMT_TEXT.format(rtmt_datasets)} else: attrs = {} cube = iris.cube.Cube( @@ -711,7 +718,8 @@ def _write_scalar_data(data, ancestor_files, cfg, description=None): ] global_attrs = {'project': list(cfg['input_data'].values())[0]['project']} if RTMT_DATASETS: - global_attrs['net_toa_radiation'] = RTMT_TEXT.format(RTMT_DATASETS) + rtmt_datasets = sorted(list(RTMT_DATASETS)) + global_attrs['net_toa_radiation'] = RTMT_TEXT.format(rtmt_datasets) for (idx, var_attr) in enumerate(var_attrs): caption = '{long_name} for multiple climate models'.format(**var_attr) if description is not None: @@ -967,6 +975,7 @@ def plot_regressions(input_data, cfg, description=None): def preprocess_data(cfg, year_idx=None): """Calculate anomalies and multi-model mean.""" input_data = deepcopy(list(cfg['input_data'].values())) + input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset']) # Use 'rtmt' instead of 'rtmt' if necessary for dataset in input_data: diff --git a/esmvaltool/diag_scripts/climate_metrics/psi.py b/esmvaltool/diag_scripts/climate_metrics/psi.py index 3c2cda36d4..05f06e9614 100644 --- a/esmvaltool/diag_scripts/climate_metrics/psi.py +++ b/esmvaltool/diag_scripts/climate_metrics/psi.py @@ -35,10 +35,15 @@ import numpy as np from scipy import stats -from esmvaltool.diag_scripts.shared import (ProvenanceLogger, - get_diagnostic_filename, - group_metadata, io, run_diagnostic, - select_metadata) +from esmvaltool.diag_scripts.shared import ( + ProvenanceLogger, + get_diagnostic_filename, + group_metadata, + io, + run_diagnostic, + select_metadata, + sorted_metadata, +) logger = logging.getLogger(os.path.basename(__file__)) @@ -102,9 +107,12 @@ def get_provenance_record(caption, ancestor_files): def get_attributes(cfg, single_psi_cube, input_data): """Get attributes for psi cube for all datasets.""" - datasets = "|".join({str(d['dataset']) for d in input_data}) - projects = "|".join({str(d['project']) for d in input_data}) - ref = "|".join({str(d.get('reference_dataset')) for d in input_data}) + datasets = sorted(list({str(d['dataset']) for d in input_data})) + projects = sorted(list({str(d['project']) for d in input_data})) + ref = sorted(list({str(d.get('reference_dataset')) for d in input_data})) + datasets = "|".join(datasets) + projects = "|".join(projects) + ref = "|".join(ref) attrs = single_psi_cube.attributes attrs.update({ 'dataset': datasets, @@ -120,6 +128,7 @@ def main(cfg): input_data = ( select_metadata(cfg['input_data'].values(), short_name='tas') + select_metadata(cfg['input_data'].values(), short_name='tasa')) + input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset']) if not input_data: raise ValueError("This diagnostics needs 'tas' or 'tasa' variable") @@ -162,7 +171,7 @@ def main(cfg): io.save_scalar_data(psis, out_path, psi_attrs, attributes=attrs) # Provenance - caption = "{long_name} for mutliple climate models.".format(**psi_attrs) + caption = "{long_name} for multiple climate models.".format(**psi_attrs) ancestor_files = [d['filename'] for d in input_data] provenance_record = get_provenance_record(caption, ancestor_files) with ProvenanceLogger(cfg) as provenance_logger: diff --git a/esmvaltool/diag_scripts/climate_metrics/tcr.py b/esmvaltool/diag_scripts/climate_metrics/tcr.py index 449c62a6ec..c500133e53 100644 --- a/esmvaltool/diag_scripts/climate_metrics/tcr.py +++ b/esmvaltool/diag_scripts/climate_metrics/tcr.py @@ -52,6 +52,7 @@ io, run_diagnostic, select_metadata, + sorted_metadata, variables_available, ) @@ -108,6 +109,7 @@ def _get_anomaly_cubes(cfg): cubes = {} ancestors = {} input_data = cfg['input_data'].values() + input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset']) onepct_data = select_metadata(input_data, short_name='tas', exp='1pctCO2') # Process data @@ -329,7 +331,7 @@ def write_data(cfg, tcr, external_file=None): for dataset_name in tcr.keys(): datasets = select_metadata(cfg['input_data'].values(), dataset=dataset_name) - ancestor_files.extend([d['filename'] for d in datasets]) + ancestor_files.extend(sorted([d['filename'] for d in datasets])) if external_file is not None: ancestor_files.append(external_file) provenance_record['ancestors'] = ancestor_files diff --git a/esmvaltool/diag_scripts/emergent_constraints/cox18nature.py b/esmvaltool/diag_scripts/emergent_constraints/cox18nature.py index 86b55dad6f..e071f56aaf 100644 --- a/esmvaltool/diag_scripts/emergent_constraints/cox18nature.py +++ b/esmvaltool/diag_scripts/emergent_constraints/cox18nature.py @@ -42,6 +42,7 @@ plot, run_diagnostic, select_metadata, + sorted_metadata, ) logger = logging.getLogger(os.path.basename(__file__)) @@ -80,7 +81,7 @@ def _get_ancestor_files(cfg, obs_name, projects=None): select_metadata(cfg['input_data'].values(), project=project)) datasets.extend( select_metadata(cfg['input_data'].values(), dataset=obs_name)) - return [d['filename'] for d in datasets] + return sorted([d['filename'] for d in datasets]) def _get_model_color(model, lambda_cube): @@ -133,7 +134,7 @@ def _get_project(cfg): projects = [p for p in projects if 'obs' not in p.lower()] if len(projects) == 1: return projects[0] - return projects + return sorted(projects) def _save_fig(cfg, basename, legend=None): @@ -156,6 +157,7 @@ def get_external_cubes(cfg): """Get external cubes for psi, ECS and lambda.""" cubes = iris.cube.CubeList() input_data = list(cfg['input_data'].values()) + input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset']) for filename in ('psi.nc', 'ecs.nc', 'lambda.nc'): filepath = io.get_ancestor_file(cfg, filename) cube = iris.load_cube(filepath) @@ -509,6 +511,7 @@ def main(cfg): input_data = ( select_metadata(cfg['input_data'].values(), short_name='tas') + select_metadata(cfg['input_data'].values(), short_name='tasa')) + input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset']) if not input_data: raise ValueError("This diagnostics needs 'tas' or 'tasa' variable") diff --git a/esmvaltool/diag_scripts/emergent_constraints/ecs_scatter.py b/esmvaltool/diag_scripts/emergent_constraints/ecs_scatter.py index fd5c3e48bb..23fd3a3ee4 100644 --- a/esmvaltool/diag_scripts/emergent_constraints/ecs_scatter.py +++ b/esmvaltool/diag_scripts/emergent_constraints/ecs_scatter.py @@ -64,6 +64,7 @@ io, run_diagnostic, select_metadata, + sorted_metadata, ) logger = logging.getLogger(os.path.basename(__file__)) @@ -276,7 +277,7 @@ def _get_su_cube_dict(grouped_data, var_name, reference_datasets): ref_filenames.append(grouped_data[ref_dataset_name][0]['filename']) ref_cube = cube.copy(ref_data) ref_cube.attributes['dataset'] = reference_datasets - ref_cube.attributes['ancestors'] = '|'.join(ref_filenames) + ref_cube.attributes['ancestors'] = '|'.join(sorted(ref_filenames)) ref_cube.coord('air_pressure').attributes['positive'] = 'down' # All other cubes @@ -772,9 +773,12 @@ def get_default_settings(cfg): def get_global_attributes(input_data, cfg): """Get attributes for psi cube for all datasets.""" - datasets = "|".join({str(d['dataset']) for d in input_data}) - projects = "|".join({str(d['project']) for d in input_data}) - ref = "|".join({str(d.get('reference_dataset')) for d in input_data}) + datasets = sorted(list({str(d['dataset']) for d in input_data})) + projects = sorted(list({str(d['project']) for d in input_data})) + ref = sorted(list({str(d.get('reference_dataset')) for d in input_data})) + datasets = "|".join(datasets) + projects = "|".join(projects) + ref = "|".join(ref) attrs = { 'dataset': datasets, 'project': projects, @@ -794,6 +798,7 @@ def main(cfg): input_data = list(cfg['input_data'].values()) input_data.extend(io.netcdf_to_metadata(cfg, pattern=cfg.get('pattern'))) input_data = deepcopy(input_data) + input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset']) check_input_data(input_data) grouped_data = group_metadata(input_data, 'dataset') diff --git a/esmvaltool/diag_scripts/ipcc_ar5/ch09_fig09_42a.py b/esmvaltool/diag_scripts/ipcc_ar5/ch09_fig09_42a.py index c9e7e4ce45..7f498adf5f 100644 --- a/esmvaltool/diag_scripts/ipcc_ar5/ch09_fig09_42a.py +++ b/esmvaltool/diag_scripts/ipcc_ar5/ch09_fig09_42a.py @@ -34,6 +34,7 @@ import logging import os +from copy import deepcopy import iris import seaborn as sns @@ -47,6 +48,7 @@ io, plot, run_diagnostic, + sorted_metadata, variables_available, ) @@ -163,7 +165,8 @@ def write_data(cfg, hist_cubes, pi_cubes, ecs_cube): def main(cfg): """Run the diagnostic.""" sns.set(**cfg.get('seaborn_settings', {})) - input_data = cfg['input_data'].values() + input_data = deepcopy(list(cfg['input_data'].values())) + input_data = sorted_metadata(input_data, ['short_name', 'exp', 'dataset']) project = list(group_metadata(input_data, 'project').keys()) project = [p for p in project if 'obs' not in p.lower()] if len(project) == 1: diff --git a/esmvaltool/diag_scripts/mlr/__init__.py b/esmvaltool/diag_scripts/mlr/__init__.py index 486e975030..aa89303896 100644 --- a/esmvaltool/diag_scripts/mlr/__init__.py +++ b/esmvaltool/diag_scripts/mlr/__init__.py @@ -20,6 +20,7 @@ get_diagnostic_filename, io, select_metadata, + sorted_metadata, ) logger = logging.getLogger(os.path.basename(__file__)) @@ -542,6 +543,7 @@ def get_input_data(cfg, pattern=None, check_mlr_attributes=True, ignore=None): if not datasets_have_mlr_attributes(valid_data, log_level='error'): raise ValueError("At least one input dataset does not have valid " "MLR attributes") + valid_data = sorted_metadata(valid_data, ['var_type', 'tag', 'dataset']) logger.debug("Found files:") logger.debug(pformat([d['filename'] for d in valid_data])) return valid_data From 6c3d12d1fa185ed332418f917ce8b07085122124 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Thu, 7 Jul 2022 17:09:13 +0200 Subject: [PATCH 2/3] Added sorting to basic function that retrieves input files --- esmvaltool/diag_scripts/shared/io.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/esmvaltool/diag_scripts/shared/io.py b/esmvaltool/diag_scripts/shared/io.py index 9eb7f0770e..4889f5b1c1 100644 --- a/esmvaltool/diag_scripts/shared/io.py +++ b/esmvaltool/diag_scripts/shared/io.py @@ -66,7 +66,7 @@ def get_all_ancestor_files(cfg, pattern=None): files = fnmatch.filter(files, pattern) files = [os.path.join(root, f) for f in files] ancestor_files.extend(files) - return ancestor_files + return sorted(ancestor_files) def get_ancestor_file(cfg, pattern): @@ -131,6 +131,7 @@ def netcdf_to_metadata(cfg, pattern=None, root=None): files = [os.path.join(base, f) for f in files] all_files.extend(files) all_files = fnmatch.filter(all_files, '*.nc') + all_files = sorted(all_files) # Iterate over netcdf files metadata = [] From 7ce630da7c58646ebefe10cdafab855822304e20 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Thu, 7 Jul 2022 17:30:21 +0200 Subject: [PATCH 3/3] Fixed tests --- tests/unit/diag_scripts/mlr/test_helpers.py | 4 ++-- tests/unit/diag_scripts/shared/test_io.py | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/unit/diag_scripts/mlr/test_helpers.py b/tests/unit/diag_scripts/mlr/test_helpers.py index 68d514be67..818a9daf74 100644 --- a/tests/unit/diag_scripts/mlr/test_helpers.py +++ b/tests/unit/diag_scripts/mlr/test_helpers.py @@ -390,7 +390,7 @@ def test_get_datasets(input_data, kwargs, output): (CFG_3, [], False, IGNORE, [D_3], 0), (CFG_3, [D_1], True, None, ValueError, 1), (CFG_3, [D_1], True, IGNORE, ValueError, 1), - (CFG_3, [D_1], False, None, [D_1, D_3, D_1], 0), + (CFG_3, [D_1], False, None, [D_1, D_1, D_3], 0), (CFG_3, [D_1], False, IGNORE, [D_3], 0), (CFG_4, [], True, None, ValueError, 2), (CFG_4, [], True, IGNORE, ValueError, 1), @@ -398,7 +398,7 @@ def test_get_datasets(input_data, kwargs, output): (CFG_4, [], False, IGNORE, [D_3], 0), (CFG_4, [D_1], True, None, ValueError, 2), (CFG_4, [D_1], True, IGNORE, ValueError, 1), - (CFG_4, [D_1], False, None, [D_1, D_2, D_3, D_1], 0), + (CFG_4, [D_1], False, None, [D_1, D_2, D_1, D_3], 0), (CFG_4, [D_1], False, IGNORE, [D_3], 0), ] diff --git a/tests/unit/diag_scripts/shared/test_io.py b/tests/unit/diag_scripts/shared/test_io.py index 7829cdcd22..1bac515373 100644 --- a/tests/unit/diag_scripts/shared/test_io.py +++ b/tests/unit/diag_scripts/shared/test_io.py @@ -49,31 +49,31 @@ def test_has_necessary_attributes(mock_logger, data): ROOT_DIR = '/root/to/something' TEST_GET_ALL_ANCESTOR_FILES = [ (None, [ - os.path.join(ROOT_DIR, 'test.nc'), os.path.join(ROOT_DIR, 'egg.yml'), os.path.join(ROOT_DIR, 'root2', 'x.nc'), os.path.join(ROOT_DIR, 'root2', 'y.png'), os.path.join(ROOT_DIR, 'root3', 'egg.nc'), + os.path.join(ROOT_DIR, 'root4', 'egg.nc'), + os.path.join(ROOT_DIR, 'test.nc'), os.path.join(ROOT_DIR, 'test_1.nc'), os.path.join(ROOT_DIR, 'test_2.yml'), - os.path.join(ROOT_DIR, 'root4', 'egg.nc'), ]), ('*', [ - os.path.join(ROOT_DIR, 'test.nc'), os.path.join(ROOT_DIR, 'egg.yml'), os.path.join(ROOT_DIR, 'root2', 'x.nc'), os.path.join(ROOT_DIR, 'root2', 'y.png'), os.path.join(ROOT_DIR, 'root3', 'egg.nc'), + os.path.join(ROOT_DIR, 'root4', 'egg.nc'), + os.path.join(ROOT_DIR, 'test.nc'), os.path.join(ROOT_DIR, 'test_1.nc'), os.path.join(ROOT_DIR, 'test_2.yml'), - os.path.join(ROOT_DIR, 'root4', 'egg.nc'), ]), ('*.nc', [ - os.path.join(ROOT_DIR, 'test.nc'), os.path.join(ROOT_DIR, 'root2', 'x.nc'), os.path.join(ROOT_DIR, 'root3', 'egg.nc'), - os.path.join(ROOT_DIR, 'test_1.nc'), os.path.join(ROOT_DIR, 'root4', 'egg.nc'), + os.path.join(ROOT_DIR, 'test.nc'), + os.path.join(ROOT_DIR, 'test_1.nc'), ]), ('test*', [ os.path.join(ROOT_DIR, 'test.nc'),